
    PhV                       d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZm	Z	 d dl
mZmZmZmZmZmZmZ d dlZd dlmZ d dlZd dlmZmZ d dlmZ d dlmZmZ d dlmZ d d	lm Z  d
dl!m"Z"m#Z#m$Z$ d
dl"m%Z% d
dl$m&Z&m'Z'm(Z( d
dl)m*Z+ d
dl,m-Z-m.Z.m/Z/m0Z0m1Z1 d
dl2m3Z3 ddl4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z;  e8       jx                  Z=de$j|                  fdZ?de@fdZAde@fdZBde@fdZCde@fdZDd ZEd ZFej                   G d d             ZH G d  d!      ZIej                   G d" d#             ZJ G d$ d%      ZKej                   G d& d'             ZLej                   G d( d)eL             ZMej                   G d* d+eL             ZNej                   G d, d-eL             ZO G d. d/eL      ZP G d0 d1e5      ZQ G d2 d3eQ      ZR G d4 d5eR      ZSy)6    N)chaincount)AnyDictListOptionalSetTupleUnion)Expr)countersdynamo_timed)get_cpp_wrapper_cubin_path_name)free_unbacked_symbolsSymTypes)_get_qualified_name)SingletonInt   )	codecacheconfigir)CudaKernelParamCache)ComputedBufferInputBufferReinterpretView)grid)cache_on_selfget_benchmark_nameLineContextsympy_product	sympy_str)V   )CodeGenDeferredLineIndentedBufferPythonPrinter)	config_ofsignature_to_metanodec                     | j                         | j                         t        t        j                  j
                  j                  | j                  j                                     fS N)	
get_device	get_dtyper!   r"   graphsizevarssimplifylayoutstorage_size)r*   s    jC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/_inductor/codegen/wrapper.pybuffer_reuse_keyr5   )   sL     	!''""++DKK,D,D,FGH     sc                 l    | r| d   dk(  r| d d } 	 t        |        y# t        $ r Y yt        $ r Y yw xY w)NLFT)int
ValueError	TypeErrorr7   s    r4   is_intr?   4   sL     	QrUc\crFA
 	   s    	333c                 :    	 t        |        y# t        $ r Y yw xY wNFT)floatr<   r>   s    r4   is_floatrC   B   s'    a   s    	python_typec                    ddl m}m} | dk(  rd|  dS | |v r||    S |j                         D ]Q  \  }}t	        j
                  |dz   |       }t        |      dk(  s.|d   }||v sJ d| d	|        ||   }| d
| dc S  t        d|        )Nr#   )CONTAINER_PYTHON_TO_CPPPYTHON_TO_CPPTensorat::z const&z\[([a-zA-Z_]+)]r   zunsupported z type in convert_arg_type: <>zunsupport python_type: )cpprF   rG   itemsrefindalllenAssertionError)rD   rF   rG   py_containercpp_containercontainer_matchcontained_typecpp_contained_types           r4   convert_arg_typerW   J   s    ;hk]'**m#[)) (?'D'D'F#m**\4F%FT1$,Q/N-/Xl^+F~FVWX/!.~!>#_A&8%9;; (G 2;-@
AAr6   c                 L    ddd}|j                  | d       }|
J d|         |S )Nz
at::Tensorzstd::vector<at::Tensor>)rH   zList[Tensor]zNYI return type: )get)rD   python_to_cppcpp_types      r4   convert_return_typer\   b   sC     1M
   d3HB#4[M!BBOr6   c                    | j                   j                  D cg c]  }t        |j                         }}| j                   j                  D cg c]  }|j                   }}| j                   j
                  D cg c]  }t        |j                         }}t        |      }|dkD  sJ d       |dk(  rt        |d         }n3|dkD  r.dj                  |D cg c]  }t        |       c}      }d| d}t        ||      D 	
cg c]  \  }	}
t        |	       d|
  }}	}
 ddj                  |       d	S c c}w c c}w c c}w c c}w c c}
}	w )
Nr   z#must have at least one return valuer#   , zstd::tuple<rK    ())_schema	argumentsrepr	real_typenamereturnsrP   r\   joinziprW   )kernelx	arg_types	arg_namesrg   num_returnscpp_return_valuertuple_returnsarg_typearg_namecpp_arg_types               r4   get_cpp_op_schemaru   n   sY   ,2NN,D,DE,Dqakk",DIE!'!9!9:!9A!9I:*0..*@*@A*@QtAKK *@GAg,K?AAA?a.wqz:	q		7"K7a#6q#97"KL(q9 #&i";";Hh H%
&az2";   q<!8 9;;# F:A #Ls   D;E <EE
Ec           
         t               }d|  }|j                  d| d       |j                         5  t        |      dk(  r|j                  d|d           nt        |      t        |      k(  sJ t	        ||      D ]_  \  }}|j
                  j                         D  cg c]  \  } }d|  d|  }} }d	j                  |      }|j                  d
| d|        a d d d        ||j                         fS c c}} w # 1 sw Y   !xY w)Ngrid_wrapper_for_zdef z(meta):r#   return r   zmeta['z'] == z and zif z	: return )	r&   	writelineindentrP   ri   kwargsrM   rh   getvalue)	rf   configsgridsoutputfn_namer   cvalguardss	            r4    user_defined_kernel_grid_fn_coder      s   F!$(G
tG9G,-	u:?wuQxj12u:W---ug.aFGhhnnFVWFVsF4&se4FVW f-  3vhiv!>? / 
 FOO%%% X 
s   A/D$C=
6-D=DDc                   <    e Zd ZU eed<   ej                  ed<   d Zy)SymbolicCallArginner
inner_exprc                 ,    t        | j                        S r,   )strr   selfs    r4   __str__zSymbolicCallArg.__str__   s    4::r6   N)__name__
__module____qualname__r   __annotations__sympyr   r    r6   r4   r   r      s    J

r6   r   c                   4     e Zd Z fdZd ZddZddZ xZS )MemoryPlanningStatec                 ^    t         |           t        j                  t              | _        y r,   )super__init__collectionsdefaultdictlist
reuse_poolr   	__class__s    r4   r   zMemoryPlanningState.__init__   s$    @K@W@WA
r6   c                 L    t        | j                  j                  |d             S r,   )boolr   rY   )r   keys     r4   __contains__z MemoryPlanningState.__contains__   s    DOO''T233r6   c                 \    | j                   |   j                         }|j                  rJ |S r,   )r   pop	is_reusedr   r   items      r4   r   zMemoryPlanningState.pop   s+    s#'')>>!!r6   c                 \    |j                   rJ | j                  |   j                  |       y r,   )r   r   appendr   s      r4   pushzMemoryPlanningState.push   s&    >>!!##D)r6   )returnFreeIfNotReusedLine)r   r   )r   r   r   r   r   r   r   __classcell__r   s   @r4   r   r      s    
4
*r6   r   c                   L    e Zd ZU eed<   ee   ed<   dedej                  fdZ	y)!EnterCudaDeviceContextManagerLine
device_idxlast_seen_device_guard_indexcodedevice_cm_stackc                    t         j                  j                  r|j                  d       t         j                  j                  rj| j
                  >t        j                  j                  r|j                  d       y |j                  d       y | j
                  | j                  k(  sSJ d       | j
                   |j                  d| j                   d       y |j                  d| j                   d       y y |j                  d| j                   d	       |j                  |j                                |j                  d
| j                   d       y )N
z<AOTICudaStreamGuard stream_guard(stream, this->device_idx_);zcat::cuda::CUDAStreamGuard stream_guard(at::cuda::getStreamFromExternal(stream, this->device_idx_));z4AOTInductor only supports running on one CUDA devicez!at::cuda::CUDAGuard device_guard();zdevice_guard.set_index(zwith torch.cuda._DeviceGuard(z):ztorch.cuda.set_device(z) # no-op to ensure context)r"   r/   cpp_wrapperry   aot_moder   r   aot_inductorabi_compatibler   enter_contextrz   r   r   r   s      r4   codegenz)EnterCudaDeviceContextManagerLine.codegen   s+   77NN4 ww 44<**99Z ] 99T__LNMNL 44<NN;DOO;LBO NN%<T__<MR#PQ M NN:4??:K2NO))$++-8NN((99TUr6   N)
r   r   r   r;   r   r   r&   
contextlib	ExitStackr   r   r6   r4   r   r      s*    O"*3-/#N #Z=Q=Q #r6   r   c                   0    e Zd Zdedej
                  fdZy) ExitCudaDeviceContextManagerLiner   r   c                 Z    t         j                  j                  s|j                          y y r,   )r"   r/   r   closer   s      r4   r   z(ExitCudaDeviceContextManagerLine.codegen   s     ww""!!# #r6   N)r   r   r   r&   r   r   r   r   r6   r4   r   r      s    $N $Z=Q=Q $r6   r   c                   :    e Zd ZU ded<   dedd fdZdefdZd Zy	)
MemoryPlanningLineWrapperCodeGenwrapperstater   c                     | S )zFirst pass to find reuser   r   r   s     r4   planzMemoryPlanningLine.plan   s    r6   r   c                      y)zSecond pass to output codeNr   r   r   s     r4   r   zMemoryPlanningLine.codegen       r6   c                 r   g }t        j                  |       D ]t  }|j                  dk(  rt        | |j                        }|j	                  |j                   d|j
                  t        j                  u r|j                         n|        v t        |       j                   ddj                  |       dS )zF
        Emits a string representation that fits on one line.
        r   =r`   r^   ra   )dataclassesfieldsrf   getattrr   typer   Bufferget_namer   rh   )r   argsfieldr   s       r4   r   zMemoryPlanningLine.__str__   s      ''-EzzY&$

+CKK::,a%**		2IsST	 . t*%%&a		$'8::r6   N)	r   r   r   r   r   r   r&   r   r   r   r6   r4   r   r      s.    - 2F N ;r6   r   c                   D    e Zd ZU ej                  ed<   defdZdefdZ	y)AllocateLiner*   r   c                 p   | j                   j                         t        j                  j                  v rt        | j                        S t        | j                         }t        j                  rG||v rC|j                  |      }d|_        t        | j                  |j                   | j                         S | S NT)r*   r   r"   r/   removed_buffersNullLiner   r5   r   allow_buffer_reuser   r   	ReuseLine)r   r   r   	free_lines       r4   r   zAllocateLine.plan  s    99177#:#::DLL)) tyy)$$		#I"&IT\\9>>499EEr6   r   c                     | j                   j                         t        j                  j                  vsJ | j
                  j                  | j                         }|j                  |       y r,   )r*   r   r"   r/   r   r   make_buffer_allocationry   )r   r   lines      r4   r   zAllocateLine.codegen  sK    yy!!#177+B+BBBB||22499=tr6   N)
r   r   r   r   r   r   r   r   r&   r   r   r6   r4   r   r      s$    
))O- N r6   r   c                   R    e Zd ZU ej                  ed<   dZeed<   defdZ	de
fdZy)	r   r*   Fr   r   c                    t        | j                  j                  t        j                  t        j
                  f      r| S | j                  rJ | j                  j                         t        j                  j                  v rt        | j                        S t        j                  r%|j                  t!        | j                        |        | S r,   )
isinstancer*   r2   r   AliasedLayoutMultiOutputLayoutr   r   r"   r/   r   r   r   r   r   r   r5   r   s     r4   r   zFreeIfNotReusedLine.plan  s    dii&&)9)92;O;O(PQK>>!!99177#:#::DLL))$$JJ'		2D9r6   r   c                     | j                   j                         t        j                  j                  vsJ | j
                  s5|j                  | j                  j                  | j                                y y r,   )	r*   r   r"   r/   r   r   ry   r   make_buffer_freer   s     r4   r   zFreeIfNotReusedLine.codegen%  sR    yy!!#177+B+BBBB~~NN4<<88CD r6   N)r   r   r   r   r   r   r   r   r   r   r&   r   r   r6   r4   r   r     s0    
))OIt- EN Er6   r   c                   p    e Zd ZU ej                  ed<   ej                  ed<   dZeed<   defdZ	de
fdZy	)
r   r*   	reused_asT
delete_oldr   c                 p   | j                   j                         t        j                  j                  v rK| j
                  j                         t        j                  j                  v sJ t        | j                        S | j
                  j                         t        j                  j                  vsJ | S r,   )r*   r   r"   r/   r   r   r   r   r   s     r4   r   zReuseLine.plan1  s    99177#:#::>>**,0G0GGGGDLL))~~&&(0G0GGGGr6   r   c                 p   | j                   j                         t        j                  j                  vsJ | j
                  j                         t        j                  j                  vsJ |j                  | j                  j                  | j                   | j
                  | j                               y r,   )
r*   r   r"   r/   r   r   ry   r   make_buffer_reuser   r   s     r4   r   zReuseLine.codegen8  sz    yy!!#177+B+BBBB~~&&(0G0GGGGLL**499dnndooV	
r6   N)r   r   r   r   r   r   r   r   r   r   r&   r   r   r6   r4   r   r   +  s8    
))OyyJ- 
N 
r6   r   c                       e Zd Zy)r   N)r   r   r   r   r6   r4   r   r   @  s    r6   r   c                       e Zd ZdZ fdZd Zd Zed        Zd Z	ed        Z
d Zd	 Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Z	 	 	 	 dXdZd Zed        Zd Zd ZdefdZ defdZ!dede"e#e$jJ                  f   fdZ&d  Z'd! Z(d"e)d#e#fd$Z*d"e)d#e#fd%Z+d&e#d'e#d(e#d#e#fd)Z,d*e-e)d+f   d#e#fd,Z.d*e-e)d+f   d#e#fd-Z/d#e#fd.Z0d#e#fd/Z1d0 Z2d1 Z3d2 Z4d3 Z5	 dYd'e#d4e#d5e6e#   fd6Z7d7 Z8d8e#fd9Z9d: Z:d; Z;d< Z<d= Z=d'e#d>e>e?   fd?Z@	 	 	 	 dZd@ZAdA ZBdB ZCd#e#fdCZDdD ZEdE ZFdF ZGd[dGZHdH ZIdIe>e#   fdJZJdKe#dLe#dMe#fdNZKdOeLfdPZMdQ ZNdR ZOdS ZPd\dTZQdU ZRdV ZSdW ZT xZUS )]r   zB
    Generate outer wrapper in Python that calls the kernels.
    c                     t                    t                _        t	                _        t	                _        t	                _        i  _        t                _
        g  _        d _        d _        d _        d _        d _        d _        d _        d _        d _        d  _        d _        t.         _        t                _        i  _        t                _         j9                           j;                          t<        j>                  j@                  sBt<        j>                  jB                  jE                         D ]  \  }} jG                  ||        t                _$        t                _%        tM                _'         tQ        jR                  d        jT                         _*        tQ        jR                  d        fd	       }| _+        i  _,        y )
N []#Nonezsize()zstride()Tc                 <    j                   j                  |        y r,   headerry   )r   r   s    r4   add_import_oncez0WrapperCodeGen.__init__.<locals>.add_import_oncet  s    KK!!$'r6   )-r   r   r   _names_iterr&   r   prefixwrapper_callsrc_to_kernelsetkenel_numel_exprlinesdeclareendingopen_bracketclosed_bracketcomment	namespacenone_strsizestrider   supports_intermediate_hookspexprexpr_printercached_thread_localsuser_defined_kernel_cacheunbacked_symbol_declswrite_headerwrite_prefixr"   r/   r   constant_reprsrM   write_constant	allocatedfreeddictreuses	functools	lru_cachewrite_get_raw_streamr   _metas)r   rf   hashedr   r   s   `   r4   r   zWrapperCodeGen.__init__I  s    7$&$&*, #
!	 ,0)+/(!$'E!EG&%(U"ww ! 6 6 < < >f##D&1 !? "u
 f$=I$7$7$=%%%
! 
		T	"	( 
#	(  /r6   c                 D    | j                   j                  | d|        y )Nz = None  # r   r   rf   r!  s      r4   r  zWrapperCodeGen.write_constant{  s    k&:;r6   c                 ^    | j                   j                  dt        j                   d       y )Na"  
                from ctypes import c_void_p, c_long
                import torch
                import math
                import random
                import os
                import tempfile
                from math import inf, nan
                from torch._inductor.hooks import run_intermediate_hooks
                from torch._inductor.utils import maybe_profile
                from torch._inductor.codegen.memory_planning import _align as align

                from torch import device, empty, empty_strided
                from a   import AsyncCompile
                from torch._inductor.select_algorithm import extern_kernels

                aten = torch.ops.aten
                inductor_ops = torch.ops.inductor
                assert_size_stride = torch._C._dynamo.guards.assert_size_stride
                alloc_from_pool = torch.ops.inductor._alloc_from_pool
                reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
                async_compile = AsyncCompile()

            )r   splicer   r   r   s    r4   r  zWrapperCodeGen.write_header~  s1      (() 
*	
r6   c                 :    | j                   j                  d       y )Nz
            import triton
            import triton.language as tl
            from torch._inductor.triton_heuristics import grid, start_graph, end_graph
            from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
            r   r%  r   s    r4   write_triton_header_oncez'WrapperCodeGen.write_triton_header_once  s    	
r6   c                     t        |      }|| j                  vrGdt        | j                         }|| j                  |<   | j                  j	                  | d|        | j                  |   S )Nmeta = )rd   r   rP   r   ry   )r   r*  vars      r4   add_meta_oncezWrapperCodeGen.add_meta_once  sg    Dzt{{"T[[)*+C #DKKKK!!SETF"34{{4  r6   c                     t         j                  j                  D cg c]  }|j                  | j                         c}S c c}w r,   )r"   r/   graph_outputscodegen_referencer  r   rk   s     r4   get_output_refszWrapperCodeGen.get_output_refs  s7    @A@U@UV@U1##D$5$56@UVVVs   "Ac                      y r,   r   r   s    r4   mark_output_typezWrapperCodeGen.mark_output_type      r6   c           
         t         j                  j                  j                         D ]  \  }}t	        |t
        j                        r!t        |j                               dk(  r>| j                  |j                               }| j                  |j                               }| j                  j                  d| d| d| d        y )Nr   zassert_size_stride(r^   ra   )r"   r/   graph_inputsrM   r   r   r   r    get_sizecodegen_shape_tuple
get_strider   ry   )r   rf   bufr  r  s        r4   codegen_input_size_assertsz)WrapperCodeGen.codegen_input_size_asserts  s    --335ID##uzz* S\\^,1++CLLN;D--cnn.>?FKK!!$7vRvRxq"QR 6r6   c                    | j                   j                  d       | j                   j                         5  t        j                  j
                  r| j                   j                  d       t        t        j                  j                  j                               }|dk7  r{dj                  t        j                  j                  j                                |dk7  rdnd }| j                   j                  | d       | j                   j                  d	       | j                  | j                   t        j                  j                         t        j                  r| j                          d d d        y # 1 sw Y   y xY w)
Nzs

            async_compile.wait(globals())
            del async_compile

            def call(args):
            torch.cuda.synchronize()r   r^   r#   r   ,z = argszargs.clear())r   r%  rz   r   tritondebug_sync_graphry   rP   r"   r/   r7  keysrh   codegen_inputssize_assertsr<  )r   inp_lenlhss      r4   r  zWrapperCodeGen.write_prefix  s   	
 [[!}}--%%&@A!''..3356G!|177#7#7#<#<#>?@wRS|Y\@]^%%Wo6%%n5QWW-A-AB""//1 "!!s   D;E::Fc                 ^    | j                          d| }| j                  | d| d       |S )Nstreamz = get_cuda_stream(ra   )r(  ry   r   indexrf   s      r4   r  z#WrapperCodeGen.write_get_raw_stream  s9    %%'w$25';<r6   c                 .    t        | j                         S r,   )nextr   r   s    r4   next_kernel_suffixz!WrapperCodeGen.next_kernel_suffix  s    t''()*r6   c                 \    | j                  t        || j                               || _        y r,   )ry   r   r   )r   r   s     r4   codegen_device_guard_enterz)WrapperCodeGen.codegen_device_guard_enter  s+    -D==	

 -7)r6   c                 6    | j                  t                      y r,   )ry   r   r   s    r4   codegen_device_guard_exitz(WrapperCodeGen.codegen_device_guard_exit  s    79:r6   c                     |r1| j                   j                  ddj                  |      z   dz          y | j                   j                  d       y )Nzreturn (r^   , )z	return ())r  ry   rh   )r   output_refss     r4   generate_returnzWrapperCodeGen.generate_return  s@    ''
TYY{5K(Ke(ST''4r6   c                      y r,   r   )r   results     r4   generate_endzWrapperCodeGen.generate_end  r5  r6   c                 (    | j                  ||       y r,   )generate_extern_kernel_alloc)r   fallback_kernelr   s      r4   generate_fallback_kernelz'WrapperCodeGen.generate_fallback_kernel  s    ))/4@r6   c           
         | j                   }t        j                  rdt        |j                        v rd| }|j                         }|j                         }|j                         }| j                  | j                   | d| ddj                  |       d|        | j                  rKt        j                  r:|7t        d   dxx   d	z  cc<   | j                  d
|j                  d| d       y y y y )Nview_as_complexz.clone()r+  r`   r^   ra   inductorintermediate_hooksr#   zrun_intermediate_hooks()r  r   memory_planningr   rj   r   get_origin_nodecodegen_kernel_namery   r  rh   r  generate_intermediate_hooksr   rf   )r   extern_kernelr   r  output_nameorigin_nodekernel_names          r4   rZ  z+WrapperCodeGen.generate_extern_kernel_alloc  s    !!&73}?S?S;T&T  x(F#,,.#335#779||n[M[M499T?:K1VHU	
 ,,22'Z !56!;6NN)+*:*:)=R}AN ( 3 -r6   c                     |r#|j                  d|j                                 n|j                  d|        | j                  | ddj                  |       d       y )Nzout=r`   r^   ra   )r   r0  ry   rh   )r   output_viewr0  r   rj   s        r4   generate_extern_kernel_outz)WrapperCodeGen.generate_extern_kernel_out  sY    KK${<<>?@AKK$0123&499T?"3156r6   c           
         t        |||      \  }}| j                  j                         5  | j                  j                  |       d d d        | j	                  t
        j                  j                  j                  j                        }| j                  | ddj                  |       d| d| d       y # 1 sw Y   sxY w)N.run(r^   z, grid=z	, stream=ra   )r   r   rz   r%  r  r"   r/   	schedulercurrent_devicerJ  ry   rh   )r   rh  r   r}   r   r   stream_names          r4   #generate_user_defined_triton_kernelz2WrapperCodeGen.generate_user_defined_triton_kernel  s    5k7DQ
d[[!KKt$ " //0A0A0P0P0V0VWm54 1i}TUV	
	 "!s   B99Cc                     | ddj                  t        t        |             }|dk(  r|r*|dt        |       z  }n|dj                  dg|z         z  }|d| j                   z  }| j                  |       y )Nr`   r?  aten.scatter_z	, reduce=r^   r   ra   )rh   mapr   rd   r  ry   	r   r   inputsrj   fnsrc_is_tensorreducer{   r   s	            r4   generate_scatter_fallbackz(WrapperCodeGen.generate_scatter_fallback  s     388CV$4567_$)DL>22DIIrdVm,,D!DKK=!!tr6   c
           	      V    | j                  | d| ddj                  |       d       y )Nr+  r`   r^   ra   ry   rh   
r   rf   rj   codegen_argscpp_op_schemacpp_kernel_keycpp_kernel_overload_nameop_overloadraw_argsoutputss
             r4   6generate_extern_kernel_alloc_and_find_schema_if_neededzEWrapperCodeGen.generate_extern_kernel_alloc_and_find_schema_if_needed+  s-     	$s6(!DIIl,C+DAFGr6   c                      y r,   r   )r   r*   s     r4   generate_inf_and_nan_checkerz+WrapperCodeGen.generate_inf_and_nan_checker9  r   r6   c                    t         j                  r| j                          t               }|j	                  | j
                         t        j                         5 }|j                  | j                  j                                t         j                  r| j                  |       t         j                  r| j                          |r!t         j                  r| j                          n| j!                          t        j                         }| j"                  D ]|  }t%        |t&              r|j)                  | j                         /t%        |t*        t,        f      r|j)                  | j                  |       b| j                  j/                  |       ~ | j1                         }| j3                          t         j4                  j6                  r| j                  j/                  d       t         j                  r| j9                          | j;                  |       d d d        | j=                          | j?                          |j	                  | j@                         |j                         5  |j	                  | j                         d d d        | jC                  |       | jE                  |       |jG                         S # 1 sw Y   xY w# 1 sw Y   GxY w)Nr>  )$r   profile_bandwidthr(  r&   r%  r   r   r   r   r  rz   profiler_mark_wrapper_call#generate_profiler_mark_wrapper_callgenerate_start_graphra  memory_planmemory_plan_reuser  r   r   r   r   r   ry   r2  r4  r@  rA  generate_end_graphrU  "append_precomputed_sizes_to_prefixfinalize_prefixr   rX  add_benchmark_harnessgetvaluewithlinemap)r   is_inferencerW  stackr   r   rT  s          r4   generatezWrapperCodeGen.generate=  s   ##))+!dkk"!!#u 1 1 8 8 :;0088?''))+  6 6  "&&((224O

d$67LL!2!2398 LL!2!2OD%%//5 # ..0K!!#}}--!!++,FG'''')  -I $L 	//1dkk"]]_MM$++,  	&!""6*))++c $#T _s   GK0KKKc                 \    ddl m}  ||       j                  | j                        | _        y )Nr#   )MemoryPlanner)ra  r  r   r  )r   r  s     r4   r  zWrapperCodeGen.memory_planw  s     2"4(--djj9
r6   c                    t         j                  j                         }| j                  rt	        | j                  d   t
              r| j                  d   j                  j                  |vri| j                  j                          | j                  rCt	        | j                  d   t
              r&| j                  d   j                  j                  |vrit               }t        t        | j                              D ]K  }t	        | j                  |   t
              s!| j                  |   j                  |      | j                  |<   M y )Nr9   )r"   r/   get_output_namesr  r   r   r*   rf   r   r   rangerP   r   )r   	out_namesplanning_stateis       r4   r  z WrapperCodeGen.memory_plan_reuse|  s    GG,,.	 JJ4::b>+=>

2##((	9 JJNN JJ4::b>+=>

2##((	9 -.s4::'A$**Q-);< $

1 2 2> B

1 (r6   r   c           	      z    |j                  | j                   | d| d| j                   | j                          y )Nz_size = .)ry   r  r  r  r   r   rf   s      r4   codegen_input_size_var_declz*WrapperCodeGen.codegen_input_size_var_decl  s3    $,,vXdV1TYYK}UVr6   c           	      z    |j                  | j                   | d| d| j                   | j                          y )Nz
_stride = r  )ry   r  r  r  r  s      r4   codegen_input_stride_var_declz,WrapperCodeGen.codegen_input_stride_var_decl  s6    ||nTF*TF!DKK=N	
r6   r7  c                     t        j                  d       fd       }t        j                  d       fd       }t        j                  j                  j                         }d t        t        |j                                     }t        t        fd|j                                     }|D ]r  \  }}	t        j                  j                  j                  |	      }	|	|v s4|j                  |	       j                   j                   |	 d|  j                          t |D ]  \  }}
|
j                         }t        |      D ]|  \  }}	t        j                  j                  j                  |	      }	|	|v s4|j                  |	       j                   j                   |	 d ||       d| d j                          ~  |D ]  \  }}
|
j!                         }t        |      D ]|  \  }}	t        j                  j                  j                  |	      }	|	|v s4|j                  |	       j                   j                   |	 d ||       d| d j                          ~  y)	z$Assign all symbolic shapes to localsNc                 2    j                  |        |  dS )N_size)r  rf   r   r   s    r4   sizeofz-WrapperCodeGen.codegen_inputs.<locals>.sizeof  s    ,,T48V5>!r6   c                 2    j                  |        |  dS )N_stride)r  r  s    r4   strideofz/WrapperCodeGen.codegen_inputs.<locals>.strideof  s     ..tT:V7##r6   c                 <    t        | d   t        j                        S )Nr#   )r   r   r   rk   s    r4   is_exprz.WrapperCodeGen.codegen_inputs.<locals>.is_expr  s    adEJJ//r6   c                      |        S r,   r   )rk   r  s    r4   <lambda>z/WrapperCodeGen.codegen_inputs.<locals>.<lambda>  s    ^r6   r+  r   r   )r  r  r"   r/   r0   free_symbolsr   filterrM   r1   removery   r  r  r8  	enumerater:  )r   r   r7  r  r  neededgraph_inputs_exprgraph_inputs_tensorsrf   shapevalueshapesdimr  s   ``           @r4   rC  zWrapperCodeGen.codegen_inputs  s'   
 
		T	"	" 
#	" 
		T	"	$ 
#	$
 !!..0	0 !1C1C1E!FG#+\-?-?-AB 
 -KD%GG$$--e4Ee$$,,wc$}MN	 - 0KD%^^%F'/
U((11%8F?MM%(NN<<.s6$<.#a}U	 0 0 0KD%%%'F'/
U((11%8F?MM%(NN<<.s8D>2B!C5$++W	 0 0r6   c           
      f   | j                   j                         5  t        j                  j                  j
                  j                         D ]L  \  }}| j                   j                  | j                   | d| j                  |       | j                          N 	 d d d        y # 1 sw Y   y xY wNr+  )r   rz   r"   r/   r0   inv_precomputed_replacementsrM   ry   r  r  r  )r   symexprs      r4   r  z1WrapperCodeGen.append_precomputed_sizes_to_prefix  s    [[!WW--JJPPR	T%%||nSET->->t-D,Edkk]S S "!!s   BB''B0c                      y r,   r   r   s    r4   r  zWrapperCodeGen.finalize_prefix      r6   rk   r   c                 f    t        t        j                  j                  j	                  |            S r,   )r  r"   r/   r0   r1   r1  s     r4   codegen_python_sizevarz%WrapperCodeGen.codegen_python_sizevar  s"    QWW%%..q122r6   c                 $    | j                  |      S r,   )r  r1  s     r4   codegen_sizevarzWrapperCodeGen.codegen_sizevar  s    **1--r6   basenamerf   rJ  c                     | d| dS )Nr   r   r   r   r  rf   rJ  s       r4   codegen_tuple_accessz#WrapperCodeGen.codegen_tuple_access  s    1UG1%%r6   r  .c                     t        t        | j                  |            }t        |      dk(  ryt        |      dk(  r	d|d    dS ddj	                  |       dS )Nr   z()r#   r`   rS  r^   ra   )r   rt  r  rP   rh   r   r  partss      r4   codegen_python_shape_tuplez)WrapperCodeGen.codegen_python_shape_tuple  s^    S44e<=u:?u:?uQxj$$499U#$A&&r6   c                 $    | j                  |      S r,   )r  )r   r  s     r4   r9  z"WrapperCodeGen.codegen_shape_tuple  s    ..u55r6   c                     dj                  dj                  |t        |      t        |      | j	                  |      | j	                  |      g            S )Nalloc_from_pool({})r^   )formatrh   r  r   r9  )r   rf   offsetdtyper  r  s         r4   codegen_alloc_from_poolz&WrapperCodeGen.codegen_alloc_from_pool  sS    $++II&MJ,,U3,,V4

 
	
r6   c           	          | j                  |      }| j                  |      }| j                  |      }d|j                          d| d| d| d	S )Nreinterpret_tensor(r^   ra   )r9  r  r   )r   datar  r  r  writers         r4   codegen_reinterpret_viewz'WrapperCodeGen.codegen_reinterpret_view  s[    ''-))&1%%f-$T]]_$5RvRxr&QRSSr6   c                 2    | j                  | d| d       y )N.copy_(ra   ry   r   srcdsts      r4   codegen_device_copyz"WrapperCodeGen.codegen_device_copy  s    #gcU!,-r6   c                 `    | j                  | j                   | d| | j                          y r  )ry   r  r  )r   rf   r  s      r4   codegen_multi_outputz#WrapperCodeGen.codegen_multi_output  s)    $,,vS}EFr6   c           
           fd}fd}j                  g d       j                         5  j                  dd       t        j                  j
                  j                         D ]U  \  }}j                  d|         |||j                         |j                         |j                  |j                         W t        j                  j                  j                         D ]^  \  }}t        |t        j                        rCt        t        j                  j                   j"                  j%                  |d       t&              rdt        |t        j(                        r1 ||t        j                  j                   j+                  |             |j-                         D cg c]+  }t        j                  j                   j+                  |      - }}|j/                         D cg c]+  }t        j                  j                   j+                  |      - }} |||||j1                         |j3                                a dd	j5                  t        j                  j                  j7                                d
}	j                  d|	        j                  d       d d d        y c c}w c c}w # 1 sw Y   y xY w)Nc                     j                  |  dj                  |       dj                  |       d| d| d
       y )Nz = rand_strided(r^   
, device='	', dtype=ra   )ry   r  )rf   r  r  devicer  r   r   s        r4   add_fake_inputz@WrapperCodeGen.benchmark_compiled_module.<locals>.add_fake_input  sT    &(2259:"226:; <!()E7!5r6   c                 2    j                  |  d|        y r  r  )rf   r   r   s     r4   add_expr_inputz@WrapperCodeGen.benchmark_compiled_module.<locals>.add_expr_input
  s    vS./r6   )r   r   z3def benchmark_compiled_module(times=10, repeat=10):z
                from torch._dynamo.testing import rand_strided
                from torch._inductor.utils import print_performance
                Tstripzglobal zcall([r^   z])zfn = lambda: z8return print_performance(fn, times=times, repeat=repeat))
writelinesrz   r%  r"   r/   	constantsrM   ry   r  r  r  r  r7  r   r   Symbolr0   
var_to_valrY   r   r   	size_hintr8  r:  r-   r.   rh   rB  )
r   r   r  r  rf   r  rk   r  r  call_strs
   ``        r4   benchmark_compiled_modulez(WrapperCodeGen.benchmark_compiled_module  s&   		0 	K	
 ]]_MM     !ww00668e   74&!12%**,ekk	  9  !ww3399;eeU\\2zGG$$//33E4@,8 eUZZ0"4)9)9)C)CE)JKDINNDTUDTqQWW--77:DTEUEJEUEUEWXEWagg..88;EWFX"eVU-=-=-?AR  <"  		!''*>*>*C*C*E FGrJH}XJ78WXI _8 VX; _s+   E=K.0J8K20J="BK8
KKc                     t         j                  sy| j                  |       |j                  g d       |j	                         5  |j                  ddt                dg       ddd       y# 1 sw Y   yxY w)zL
        Append a benchmark harness to generated code for debugging
        N)r   r   zif __name__ == "__main__":zBfrom torch._inductor.wrapper_benchmark import compiled_module_mainzcompiled_module_main('z', benchmark_compiled_module))r   benchmark_harnessr  r  rz   r   )r   r   s     r4   r  z$WrapperCodeGen.add_benchmark_harness6  sh     ''&&v.@A]]_X,-?-A,BB_` __s    A//A8rj   metadatac                 \    |r| dnd}| j                   j                  d| | d|        y )Nr   r   z

r+  r'  )r   rf   rj   r  cudametadata_comments         r4   define_kernelzWrapperCodeGen.define_kernelH  s9     /7hZr?BT"2!3D6VHEFr6   c                 &   |j                   }t        |j                        g}|j                         D ]l  }t	        |t
        j                  t
        j                  f      r |j                  |j                                Mt        |      dkD  s\|j                  |       n t        |      }|| j                  v r| j                  |   S | dt        | j                         }|| j                  |<   t               j                  d|d       j                  dd       j!                          dd	lm}m}	 g }
i }|j)                         D ]  \  }}|j*                  j-                  |      }||j.                  v r|||<   5t	        |t
        j                  t
        j                  f      rF|
j                   |	||j1                         |j                         t	        |t                            |
j                   |||              d
}d|i}t3        |
|      t4        j6                  j8                  j:                  j,                  t4        j6                  j8                  j:                  j<                  |t?        |
      gd}|D cg c]&  }|j@                  |jB                  |jD                  d( }}j                  d|d|d|d       j                  |jF                  d       ddl$m% |hfd |       j                  d       tM        jN                  |j                        \  }}tM        jP                  |j                        }d| d| }| jS                  |jU                         |       |S c c}w )Nr   _zasync_compile.triton(z, '''z
            import triton
            import triton.language as tl
            from torch._inductor.utils import instance_descriptor
            from torch._inductor.triton_heuristics import user_autotune
            Tr  r#   )SizeArg	TensorArgztl.int32rh  )
size_dtype)	signaturer  device_typer  r}   )r{   	num_warps
num_stagesz5
            @user_autotune(
                configs=z ,
                inductor_meta=z,
                triton_meta=zV,
                filename=__file__
            )
            @triton.jit
            )JITFunctionc                 ,   | j                   j                  j                  D ]  }|v r|| j                   j                  v s!| j                   j                  |   }t	        |      rXj                          j                  d       j                  |j                  d       j                  |        |       t	        |t        t        t        f      sj                          j                  | d|       j                  |        y )Nz@triton.jitTr  r+  )rw  __code__co_names__globals__r   newlinery   r%  r  addr;   r   r   )
cur_kernelsymbol_namesymbolr  compile_wrappersymbols_includedtraverses      r4   r  zBWrapperCodeGen.define_user_defined_triton_kernel.<locals>.traverse  s    )}}55>>"22*--";";;']]66{CF!&+6'//1'11-@'..vzz.F(,,[9 (#FS#t,<='//1'11[MVJ2OP(,,[9  ?r6   z''')z# Original path: :)+r   idrw  valuesr   r   r   r   r   r.   rP   tupler  r&   ry   r%  r  commonr  r   rM   rm   rJ  
constexprsr0  r)   r"   r/   rn  ro  r   r(   r{   r  r  r  r@  r  inspectgetsourcelinesgetsourcefiler  r|   )r   rj   r}   r{   original_name	cache_keyargrf   r  r   r  r  r   idxindex_dtypeinductor_metatriton_metar   r  linenosrcfiler  r  r  r  r  s                         @@@@r4   !define_user_defined_triton_kernelz0WrapperCodeGen.define_user_defined_triton_kernelN  sa    		]O	==?C#		2+=+=>?  1W!  % # )$	66611)<<#d&D&D"E!FG48&&y1(*!!$9-9J%"PQ  	 	
 	!.57		HC""((-Cf'''!$	##		2+=+=>?  --/&sO<<   c!23! '" !4
 +9Mgg''66<<77,,;;@@"!),-
 "
 "	 !--#--$//
 " 	 
 	  $,/ 0(O ,	
	
 	vzz6 	')?	:" 	!!&)**6995	6''		2&wiq9$$&	

 m
s   +Nrh  c                    | d|j                    d}|| j                  vrc| j                  j                  |       | j                  | j                   | d| j                  |j                         | j                          n;| j                  | d| j                  |j                         | j                          t        ||j                        S )Nr  numelr+  )	r   r  r  ry   r  r  r'  r  r   )r   rh  treer  s       r4   generate_numel_exprz"WrapperCodeGen.generate_numel_expr  s    a}E2t,,,!!%%d+NN<<.c$*;*;DJJ*G)HV NNdV3t'8'8'D&Edkk]ST tTZZ00r6   c                 H    | ddj                  |       d| j                   S )Nr`   r^   ra   )rh   r  )r   rf   	call_argss      r4   wrap_kernel_callzWrapperCodeGen.wrap_kernel_call  s'    q9-.a}==r6   c                     | j                   j                  d       | j                   j                  dt        j                  j                   d       |j                  | j                   j                                y )Nz*from torch.profiler import record_functionzwith record_function('graph_z_inductor_wrapper_call'):)r  ry   r"   r/   graph_idr   rz   r   r  s     r4   r  z2WrapperCodeGen.generate_profiler_mark_wrapper_call  sb    ##$PQ##*177+;+;*<<UV	
 	D--4467r6   c                 :    | j                   j                  d       y )Nzstart_graph()r  ry   r   s    r4   r  z#WrapperCodeGen.generate_start_graph  s    ##O4r6   c                 :    | j                   j                  d       y )Nzend_graph()r1  r   s    r4   r  z!WrapperCodeGen.generate_end_graph  s    ##M2r6   	grid_argsc                     |S r,   r   )r   rf   r3  s      r4   generate_default_gridz$WrapperCodeGen.generate_default_grid  s    r6   c           
         |rdj                  d |D              }| j                  t        j                  j                  j
                  j                        }|r6dj                  d |D              }	| j                  | d| d|	 d| d       yd| d}
| j                  | d	| d
| d|
 d       y| j                  | j                  ||             y)7  
        Generates kernel call code.

        cuda: Defines whether the backend is GPU. Otherwise the backend is CPU.

        triton: Defines whether the GPU backend uses Triton for codegen.
                Otherwise it uses the CUDA language for codegen.
                Only valid when cuda == True.
        r^   c              3   2   K   | ]  }t        |        y wr,   r  .0r   s     r4   	<genexpr>z6WrapperCodeGen.generate_kernel_call.<locals>.<genexpr>   s     %HideDki   c              3   2   K   | ]  }t        |        y wr,   r9  r:  s     r4   r<  z6WrapperCodeGen.generate_kernel_call.<locals>.<genexpr>  s     $BTTU4[Tr=  rm  z, grid=grid(z
), stream=ra   z	c_void_p(r  r`   N)	rh   r  r"   r/   rn  ro  rJ  ry   r,  )r   rf   r+  r   device_indexr  r@  call_args_strrp  grid_str
stream_ptrs              r4   generate_kernel_callz#WrapperCodeGen.generate_kernel_call  s    $  II%Hi%HHM33!!0066K 99$BT$BBfE-XJjQ\P]]^_  )Q7
$qabANONN400yABr6   c                 :    | j                   j                  |       y r,   )r  r   )r   r   s     r4   ry   zWrapperCodeGen.writeline  s    

$r6   c                 L    | j                   j                  t        |             y r,   )r  r   r   )r   ctxs     r4   r   zWrapperCodeGen.enter_context  s    

+c*+r6   c                     t               r,   )NotImplementedError)r   type_r   is_legacy_abis       r4   val_to_cpp_arg_strz!WrapperCodeGen.val_to_cpp_arg_str  s    !##r6   c                 6    t        |t              r't        t        j                  t        |                  S t        |t        j                        rt        |      S t        |t        t        f      rAt        j                   G d d             t         t        |       fd|D                    S t        |t        j                  j                        rt        |      S t        |t         t"        t$        f      r|j'                         S t        |      S )Nc                       e Zd ZU eed<   d Zy)+WrapperCodeGen.val_to_arg_str.<locals>.Shimrefc                     | j                   S r,   )rO  r   s    r4   __repr__z4WrapperCodeGen.val_to_arg_str.<locals>.Shim.__repr__#  s    88Or6   N)r   r   r   r   r   rQ  r   r6   r4   ShimrN    s    $r6   rR  c              3   L   K   | ]  } j                  |              y wr,   val_to_arg_str)r;  arR  r   s     r4   r<  z0WrapperCodeGen.val_to_arg_str.<locals>.<genexpr>&  s"     HaT%8%8%; <as   !$)r   r   r  r   expandrd   r   r  r   r   	dataclassr   torch_ops
OpOverloadr   r   r   r   r0  )r   r7   rR  s   ` @r4   rU  zWrapperCodeGen.val_to_arg_str  s    a"d1g.//5::&8OE4=)""$ $ #$ QHaHHII5::001&q))NKIJ&&((7Nr6   c                     |j                         }|j                         }t        |j                               }t        |j	                               }| j                  |j                         ||||      S r,   )r-   r.   r  r8  r:  make_allocationr   )r   bufferr  r  r  r  s         r4   r   z%WrapperCodeGen.make_buffer_allocation/  sb    ""$  "foo'(v((*+##FOO$5vueVTTr6   c           
      ,   	 t        t        j                  |            }||k(  r'| d| j	                  |       d|j
                   d| dS | d| j	                  |       d| j	                  |       d|j
                   d| d
S # t        $ r d }Y rw xY w)Nz	 = empty(r  r  ra   z = empty_strided(r^   )r  r   make_contiguous_strides_for	Exceptionr9  r   )r   rf   r  r  r  r  expecteds          r4   r]  zWrapperCodeGen.make_allocation6  s    	R;;EBCH X&	++E23 4!;;-yq: &)++E232++F34 5!;;-yq:  	H	s   B BBc           	      `    | j                    | d| | j                   d| j                   d| 	S )Nr+    r_   r  r  r
  )r   new_nameold_namer
  s       r4   make_tensor_aliasz WrapperCodeGen.make_tensor_aliasI  s6    ,,zXJt{{m2dll^STU\T]^^r6   c                 (    d|j                          S )Ndel )r   r   r^  s     r4   r   zWrapperCodeGen.make_buffer_freeL  s    foo'())r6   names_to_delc                 8    ddj                  d |D               S )Nrj  r^   c              3       K   | ]  }|  y wr,   r   r;  rf   s     r4   r<  z4WrapperCodeGen.make_free_by_names.<locals>.<genexpr>P  s     >   rh   r   rl  s     r4   make_free_by_namesz!WrapperCodeGen.make_free_by_namesO  s    dii>>>?@@r6   rg  rf  del_linec           	      `    | j                    | d| | | j                   d| j                   d	S )Nr+  rd   reusere  )r   rg  rf  rt  s       r4   codegen_exact_buffer_reusez)WrapperCodeGen.codegen_exact_buffer_reuseR  s7    ,,zXJxjRPTP\P\~]cddr6   r   c                    |j                         |j                         k(  sJ |j                         }|j                         }d}|t        j                  j	                         vr|rd| j                  |       }|j                         |j                         k(  r]|j                         |j                         k(  r<|| j                  v r| j                  j                  |       | j                  |||      S | j                  ||j                         |j                         d| j                        }|| j                  v r| j                  j                  |       | j                   | d| | d| j                   dS )N;z; r   r+  rd  rv  )r.   r   r"   r/   r  r   r8  r:  r  r  rw  r  r  r  r
  )r   oldnewr   rg  rf  rt  reinterpret_views           r4   r   z WrapperCodeGen.make_buffer_reuseU  sE   }}#--/111<<><<>1773355*D11#678H<<>S\\^+0@CNNDT0T4444))--h7228XxPP88!11d6G6G
 t888%%))(3,,z-=,>xj4<<.X^__r6   c                     | j                  t        || j                   | d|j                  j	                          | j
                   d| j                   d             y )Nr+  rd  z alias)ry   r%   r  viewr0  r  r
  )r   rf   r2   s      r4   codegen_deferred_allocationz*WrapperCodeGen.codegen_deferred_allocationi  sW    <<.c&++*G*G*I)J4;;-WYZ^ZfZfYggmn	
r6   c                    |j                         dk(  sJ d       |j                         }|t        j                  j                  v s|| j
                  v ry | j
                  j                  |       t        |t        j                  t        j                  f      ry |j                         }t        |t        j                        ry t        |t        j                        rt        |j                  t        j                        s*J dt!        |j                         d|j                          | j#                  |j                  j$                         | j'                  ||       y | j)                  t+        | |             y )Nr   )Only support zero workspace size for now!zunexpected z: )get_workspace_sizer   r"   r/   r   r  r  r   r   ExternKernelAllocMultiOutput
get_layoutMutationLayoutr   r~  r   r   codegen_allocationr  r  ry   r   )r   r^  rf   r2   s       r4   r  z!WrapperCodeGen.codegen_allocationq  s<   %%'1,	76	7,  177***ddnn.D4 !!2>>2
 ""$fb//0fb../R// @T&++./r&++?@  ##FKK$4$45,,T6:|D&12r6   c                 ^   |j                         dk(  sJ d       |j                         }t        |t        j                        r!| j                  | j                  |             y | j                  |      sy | j                  j                  |       | j                  t        | |             y )Nr   r  )r  r   r   r   r   ry   r   	can_reuser  r  r   )r   r^  rf   s      r4   codegen_freezWrapperCodeGen.codegen_free  s    %%'1,	76	7,   fbnn-NN40089~~f%

t*489r6   c                 "   |j                         }|t        j                  j                  v sb|t        j                  j                  v sF|t        j                  j
                  v s*|t        j                  j                  v s|| j                  v ryyrA   )r   r"   r/   r   r7  r  never_reuse_buffersr  )r   input_bufferoutput_bufferrf   s       r4   r  zWrapperCodeGen.can_reuse  sj    $$&AGG+++qww+++qww(((qww222tzz!r6   c                     |j                         | j                  v xr. | j                  |j                            |j                         k(  S r,   )r   r  )r   r^  reused_buffers      r4   	did_reusezWrapperCodeGen.did_reuse  sC     OO, KFOO-.-2H2H2JJ	
r6   c                    t        |      t        |      k(  sJ | j                  |       | j                  j                  |j	                                | j
                  j                  |j	                                |j	                         | j                  |j	                         <   | j                  t        | ||             y r,   )	r5   r  r  r  r   r  r  ry   r   )r   r  r  s      r4   codegen_inplace_reusez$WrapperCodeGen.codegen_inplace_reuse  s    -1A-1PPPP-

|,,./=11340<0E0E0GM**,-y|]CDr6   c                     t        |      }|| j                  v r|S | j                  j                  |       | j                  |z   S r,   )r   r  r  r  )r   r  rf   s      r4   codegen_unbacked_symbol_declz+WrapperCodeGen.codegen_unbacked_symbol_decl  sC    6{4---K &&**40<<$&&r6   r   NNNr   NNTTr   r,   )Vr   r   r   __doc__r   r  r  r   r(  r-  r2  r4  r<  r  r  rM  rO  rQ  rU  rX  r\  rZ  rk  rq  rz  r  r  r   r  r  r  r&   r  r  r   r   r   	TensorBoxrC  r  r  r   r  r  r  r
   r  r9  r  r  r  r  r  r  r   r  r%  r)  r,  r  r  r  r   r   r5  rC  ry   r   rK  rU  r   r]  rh  r   rs  rw  r   r   r  r  r  r  r  r  r  r   r   s   @r4   r   r   D  s   0d<
8 
 
! W W
S2.+7;5A,7	

& "$H 7, 7,r:
C$W W
. 

2"226sBLL7H2I2h3 3 3. .# .&S & &C &C &'dCi0@ 'S '6tSy)9 6c 6
S 
Tc T.G3Yj& LPGG!$G08Gxt1s 1$>853# $s)   CD ,$s $.U&_*AtCy Ae3 e# eQT e`d `(
3::$
E'r6   r   c                   r    e Zd ZdZ fdZ	 	 	 	 dL fd	Zd Zd Zd Zd Z	d	e
d
ede
fdZd Zdef fdZdef fdZd Zd Z fdZd Z	 dMde
de
dee
   fdZd Zd Zd Zd Z fdZd Z fdZd Zd  Zd! Z  fd"Z!d#e"d$e
fd%Z#d&e
de
d'e
d$e
fd(Z$d)e%e"d*f   d$e
fd+Z&d, Z'd- Z(d. Z)d/ Z*d0 Z+d1e,e
   fd2Z-d3e
d4e
d5e
f fd6Z.d7 Z/d8 Z0d9 Z1d: Z2d; Z3d< Z4d= Z5 e6jn                  d      dNd>e
fd?       Z8d@ Z9	 dOdAZ:d$e
fdBZ;d$e
fdCZ<dD Z= fdEZ>dF Z?	 	 	 	 dPdGZ@	 dQdHZAdI ZBd$e
fdJZCd$e
fdKZD xZES )RCppWrapperCodeGenzH
    Generates cpp wrapper for running on CPU and calls cpp kernels
    c                    t         |           d| _        d| _        d| _        d| _        d| _        d| _        d| _        t               | _
        d| _        d	| _        d
| _        d| _        d| _        t               | _        t#               | _        t#               | _        t               | _        t#               | _        t#               | _        t               | _        ddlm}m} || _         G d d|      } |       j8                  | _        y )Nauto ry  {}z//rI   zat::Tensor()zsizes()z	strides()inductor_entry_cppFr#   )cexpr
CppPrinterc                       e Zd Zd Zy)6CppWrapperCodeGen.__init__.<locals>.GridExprCppPrinterc                     |j                   \  }}| j                  | j                  |            }| j                  | j                  |            }|j                  sJ d       d| d| dS )Nz"Expect integers in GridExprPrinterr`   /ra   )r   parendoprint
is_integer)r   r  rk   divs       r4   _print_FloorDivzFCppWrapperCodeGen.__init__.<locals>.GridExprCppPrinter._print_FloorDiv  sb    3JJt||A/jjc!23L(LL1#Qse1~%r6   N)r   r   r   r  r   r6   r4   GridExprCppPrinterr    s    &r6   r  )r   r   r  r  r  r	  r
  r  r  r  extern_call_opsr  r  call_func_namer  r  outputs_need_copyr   kernel_callsite_idint_array_iddeclared_int_array_varstmp_tensor_id
arg_var_idused_cached_dtypesrL   r  r  r  r  grid_expr_printer)r   r  r  r  r   s       r4   r   zCppWrapperCodeGen.__init__  s    !&"u	!2	+0(!$"''!G'*u$"W'"%%*!	& 	& "4!5!=!=r6   Nc                 V   |rt         |   ||||||      S t        j                  j                  rt
        j                  j                  rddlm	} g }|D ]  }	dt        | j                         }
| j                  d|
 | j                          | j                  d|	 d|
 d       t        j                  j                  |	      }||   }|j                  d| d	|
 d
        | j                  | j!                  ||             y| j                  | j!                  ||             y)r7  r#   DTYPE_TO_CPPvar_zvoid *4AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_data_ptr(, &));r`   z*)(ra   N)r   rC  r"   r/   r   r   r   r   rL   r  rL  r  ry   r  r.   r   r,  )r   rf   r+  r   r?  r  r@  r  new_argsr  var_namer  	cpp_dtyper   s                r4   rC  z&CppWrapperCodeGen.generate_kernel_call  s   $ 7/i|T6  wwF$7$7$F$F-$C!%d4??&;%<=HNNVH:dkk]#CDNNNseSVW_V``cd GG--c2E ,U 3IOOa	{#hZq$AB % t44T8DEt44T9EFr6   c                 F    | j                   j                  d| d|        y )Nz// r_   r   r#  s      r4   r  z CppWrapperCodeGen.write_constant  s!    D66(34r6   c                    t         j                  j                  r~t        t        j
                  j                  t        j
                  j                  t              dd            5 }| j                  j                  |j                                d d d        n| j                  j                  d       t        j                  j                  r| j                  j                  d       n| j                  j                  d       | j                  j                  d       ddlm} | j                  j                  d	| d
| d       y # 1 sw Y   xY w)Naoti_runtimezinterface.cppz
                import torch
                from torch._inductor.codecache import CppWrapperCodeCache

                cpp_wrapper_src = (
                '''
                z2#include <torch/csrc/inductor/aoti_torch/c/shim.h>a  
                #include <ATen/ATen.h>
                #include <ATen/core/dispatch/Dispatcher.h>
                #include <ATen/native/BinaryOps.h>
                #include <torch/csrc/inductor/aoti_torch/tensor_converter.h>
                #include <torch/csrc/inductor/inductor_ops.h>
                #define reinterpret_tensor torch::inductor::_reinterpret_tensor
                #define alloc_from_pool torch::inductor::_alloc_from_pool
                z"#include <c10/util/generic_math.h>r#   )ALIGN_BYTESzd
            [[maybe_unused]] static int64_t align(int64_t nbytes) {
              return (nbytes + z	 - 1) & -z;
            }
            )r"   r/   r   openospathrh   dirname__file__r   r%  readr   r   r   ra  r  )r   fr  s      r4   r  zCppWrapperCodeGen.write_header"  s   77RWW__X6X""1668, 
 KK --KKSTKK
 	?@0 	  +}Ik] C	
I s   &*E		Ec                     ddl m} t               }t        t        j
                  j                        D ]  \  }}t        ||      rd||<   d||<    || _        y )Nr   )ShapeAsConstantBufferFT)	r   r  r  r  r"   r/   r/  r   output_is_tensor)r   r  r  r  rk   s        r4   r4  z"CppWrapperCodeGen.mark_output_typeP  sU    .6 5 56FC!23(- %(, %	 7 !1r6   c                     t         j                  j                  r7| j                  j	                  d       | j                  j	                  d       y y )Nznamespace torch {znamespace aot_inductor {)r"   r/   r   r   ry   r   s    r4   r  zCppWrapperCodeGen.write_prefix]  s:    77KK!!"56KK!!"<= r6   	info_kindr  rf   c                 L    | j                   j                  | d| d| d       y )Nr   
].name = "";)r   ry   )r   r  r  rf   s       r4   write_input_output_infoz)CppWrapperCodeGen.write_input_output_infob  s)     	9+Qse:dV2HIr6   c                    t        t        j                  j                  j	                               }t        j                  j
                  r| j                  j                  d       n)| j                  j                  d| j                   d       | j                  j                         5  t        j                  j
                  rRt        j                  j                  r| j                  j                  d       n7| j                  j                  d       n| j                  j                  d       |dk7  rt        t        j                  j                  j	                               D ]  \  }}t        t        j                  j                  |   t        j                         rdd	lm} d
dlm}  |t        j                  j                  |         }|J d       ||   }t        j                  j                  rJ d       | j                  j)                  | d| d| d| d       | j                  j)                  d| d| d        t+        d t-        t        j                  j.                  j1                               D              sJ d       t        t        j                  j.                  j	                               D ]  \  }}t        j                  j
                  rdt        j                  j                  r#| j                  j)                  d| d| d       ]| j                  j)                  d| dd| dz          ||z   }	| j                  j)                  d| d|	 d        | j3                  | j                  t        j                  j                         t        j                  j
                  r6| j                  j)                  d       | j                  j)                  d       d d d        y # 1 sw Y   y xY w)Na  
                void AOTInductorModel::run_impl(
                    AtenTensorHandle*
                        input_handles, // array of input AtenTensorHandle; handles
                                        // are stolen; the array itself is borrowed
                    AtenTensorHandle*
                        output_handles, // array for writing output AtenTensorHandle; handles
                                        // will be stolen by the caller; the array itself is
                                        // borrowed
                    DeviceStreamType stream,
                    AOTIProxyExecutorHandle proxy_executor
                ) {
                zstd::vector<at::Tensor> z)(const std::vector<at::Tensor>& inputs) {z
                            auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, num_inputs());
                        z
                            auto inputs = alloc_tensors_by_stealing_from_handles(input_handles, num_inputs());
                        zM
                        py::gil_scoped_release release;
                    r   r   )may_get_constant_buffer_dtyper#   r  z(Fails to get the dtype of the sympy.Exprz@Need to add .item support for abi_compatible AOTInductor codegenr_   z
 = inputs[z].item<>();r  z = std::move(inputs[]);c              3   P   K   | ]  }t        |t        j                           y wr,   r   rY  rH   r;  vs     r4   r<  z7CppWrapperCodeGen.write_wrapper_decl.<locals>.<genexpr>         5U
1ell+5U   $&!Expect all constants to be Tensorz = constants_.at(r   z$ = *tensor_handle_to_tensor_pointer(zconstants_.at(r  z];zinputs.clear();zNauto& kernels = *dynamic_cast<AOTInductorModelKernels*>(this->kernels_.get());)rP   r"   r/   r7  rB  r   r   r%  r  rz   r   r   r   r  r   r   r   r  rL   r  ry   allr   r  r  rC  )
r   
inputs_lenr  	input_keyr  r  r  r  constants_keyconstants_idxs
             r4   write_wrapper_declz$CppWrapperCodeGen.write_wrapper_declj  s^   --2245
77KK  KK,T-@-@,AAkn [[!ww&&55KK&& KK&& "" Q&/0D0D0I0I0K&LNC!!''"6"6y"A5::NI5 =GG00;! "-FEF-$0$7	 & 3 3 B B^]^B--(k9+ZuGI;VZ[ --#I;.B3%sK) 'M0  59!'':K:K:R:R:T5U  323  '00A0A0F0F0H&I"]77## **99-- %m_4EcU"O --#M?2VW"0S;< %/$4MKK))jrJ! 'J( QWW-A-ABww%%&78%%dW "!!s   +MPPr   c                     t         j                  j                  r.|j                  d| d       |j                  d| d| d       y t        |   ||       y )N	int64_t* z_size;z1AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_sizes(r  z_size));)r   r   r   ry   r   r  r   r   rf   r   s      r4   r  z-CppWrapperCodeGen.codegen_input_size_var_decl  sW    --NNYtfF34NNCD6TFRZ[ G/d;r6   c                     t         j                  j                  r.|j                  d| d       |j                  d| d| d       y t        |   ||       y )Nr  z_stride;z3AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_strides(r  z
_stride));)r   r   r   ry   r   r  r  s      r4   r  z/CppWrapperCodeGen.codegen_input_stride_var_decl  sW    --NNYtfH56NNEdV3tfT^_ G1$=r6   c                    | j                   j                  d       | j                   j                  d       | j                   j                  d       t        | j                  j	                         | j
                  j	                               D ]!  }| j                   j                  d| d       # | j                   j                  d       | j                   j                  d       y )Nznamespace {zDclass AOTInductorModelKernels : public AOTInductorModelKernelsBase {z	  public:z    CUfunction z
{nullptr};};z}  // namespace)r   ry   r   r  r  r  )r   rj   s     r4   codegen_model_kernelsz'CppWrapperCodeGen.codegen_model_kernels  s    m,R	
 	k*%%')G)G)N)N)P
F KK!!OF8<"HI
 	d#/0r6   c           
         t        t        j                  j                        }t        t        j                  j                        }t        t        j                  j
                        }| j                  j                  d| d| d| d       | j                  j                         5  t        t        j                  j                  j                               D ]@  \  }\  }}t        |t        j                        rJ d|d       | j                  d||       B t        t        j                  j
                  j                               D ]  \  }\  }}t        |t        j                         sJ | j                  j#                  d| d| d	       | j                  j#                  d| d
| j%                  |j&                         d       | j                  j#                  d| d|j)                          d       | j                  j#                  d| d|j+                         j-                          d       dj/                  |j1                         D cg c]  }t3        |       c}      }	| j                  j#                  d| d|	 d       dj/                  |j5                         D cg c]  }t3        |       c}      }
| j                  j#                  d| d|
 d        | j                  j#                  d       d }| j                  j#                  d |t6        j8                  j:                         d	       | j                  j#                  d |t6        j8                  j<                         d	       t        t        j                  j                        D ]B  \  }}t        |t        j                        rJ dd       d| }| j                  d||       D | j                  j#                  d       ddd       | j                  j#                  d       yc c}w c c}w # 1 sw Y   /xY w)ao  
        // Generated code example
        AOTInductorModel::AOTInductorModel()
            : AOTInductorModelBase(4, 1) {
        inputs_info_[0].name = "input0";
        inputs_info_[0].dtype = "torch.float16";
        ...
        constants_info_[0].name = "L__self___weight";
        constants_info_[0].dtype = at::kFloat;
        constants_info_[0].offset = 0;
        constants_info_[0].data_size = 8192;
        constants_info_[0].shape = {64, 32};
        constants_info_[0].stride = {32, 1};
        ...
        outputs_info_[0].name = "output0";
        outputs_info_[0].dtype = "torch.float16";
        }
        z
            AOTInductorModel::AOTInductorModel(std::shared_ptr<ConstantMap> constants_map, std::optional<std::string> cubin_dir)
                : AOTInductorModelBase(r^   z, cubin_dir) {
            zinput name=z cannot be symbolicinputs_info_zconstants_info_[r  r  z].dtype = static_cast<int32_t>(r   z].offset = ry  z].data_size = z].shape = {r  z].stride = {z/update_constants_map(std::move(constants_map));c                     | j                  dd      j                  dd      j                  dd      j                  dd      S )	N\z\\"z\"r   z\n	z\t)replacer  s    r4   escape_stringzBCppWrapperCodeGen.codegen_model_constructor.<locals>.escape_string*  s9    IIdF+WS%(WT5)WT5)	r6   zin_spec_ = "zout_spec_ = "zoutput name=r   outputs_info_z=this->kernels_ = std::make_unique<AOTInductorModelKernels>();Nr  )rP   r"   r/   r7  r/  r  r   r%  rz   r  rM   r   r   r   r  rY  rH   ry   codegen_dtyper  storage_offsetuntyped_storagenbytesrh   r  r   r  r   r   serialized_in_specserialized_out_spec)r   
num_inputsnum_outputsnum_constantsr  rf   inptensorr7   size_str
stride_strr  r   s                r4   codegen_model_constructorz+CppWrapperCodeGen.codegen_model_constructor  s   ( --.
!''//0AGG--.((2|2k]"]O T	
 [[!$-agg.B.B.H.H.J$K [dC% 7!D7"567  ,,^S$G	 %L (11B1B1H1H1J'K#^dF!&%,,777%%*:3%z$r&TU%%&se+J4K]K]^d^j^jKkJllno %%&se;v7L7L7N6OqQ %%&se>&:P:P:R:Y:Y:[9\\]^  99fkkm%Dmc!fm%DE%%(8\(SV&WX!YY'H1A'HI
%%&se=CH# (L* KK!!"ST KK!!}V-@-@-S-STUUWX KK!!f.A.A.U.U VWWYZ  ))>)>?V%EJJ 8"TG#678   u~,,_c4H  @ KK!!Oi "p 	c"I &E (I/ "!s-   )F4P<P2/AP<5P7EP<2
P<<Qc                     t         j                  j                  r | j                          | j	                          | j                          t        |   |      S r,   )r"   r/   r   r  r  r  r   r  )r   r  r   s     r4   r  zCppWrapperCodeGen.generateF  sD    77&&(**,!w--r6   c                     t               }t        j                  j                  r&| j                  D ]  }|j                  d| d        |j                  | j                         || _        y )NzCACHE_TORCH_DTYPE(r   )r&   r   r   r   r  ry   r%  r   )r   cached_dtypes_bufferr  s      r4   r  z!CppWrapperCodeGen.finalize_prefixM  s[    -/--00$..1CE7"/MN 1##DKK0*r6   rj   r  c                 B    | j                   j                  d| d       y )Nr   r'  )r   rf   rj   r  r  s        r4   r  zCppWrapperCodeGen.define_kernelU  s     	Rxr?+r6   c                 H   t         j                  j                  rYt         j                  j                  j	                         }t        |      D ]!  \  }}||v rdt        j                  j                  r#| j                  j                  d| d| d       H| j                  j                  d| dd| dz          ot        j                  j                  rs|| j                  v rB| j                  j                  d| d       | j                  j                  d	| d
| d       | j                  j                  d| d| d       | j                  j                  d| dd| dz          $ y | j                  j                  ddj                  |       d       y )Nzaoti_torch_clone(z, &output_handles[r  zoutput_handles[z'] = reinterpret_cast<AtenTensorHandle>(znew at::Tensor(std::move(z.clone())));z4aoti_torch_new_uninitialized_tensor(&output_handles[zaoti_torch_assign_tensors(z, output_handles[z] = z.release();znew at::Tensor(r  zreturn {r^   z};
})r"   r/   r   r  rB  r  r   r   r   r  ry   r  rh   )r   rT  	cst_namesr  r   s        r4   rU  z!CppWrapperCodeGen.generate_returnZ  s   77))..0I(5VY& **99))33/x7I#cR ))33-cU2YZ 9&NO
 **99!T%>%>> --77"VWZV[[^ _ !--77"<VHDUVYUZZ] ^ !--77"1#d6(+ N
 ))33-cU2YZ /xs;<C  6L '')DIIk4J3K7(STr6   c           	      >   t         j                  j                  r4|j                  d       |j                  d       |j                  d       y |j                  d       t	        j
                  |j                               }|j                  d| j                   d| d| j                   d       t        d	 | j                  j                         D              rd
}nmt        t        t         j                  j                              D cg c]  }| j                  |   rd| dnd| d }}ddj!                  |       d}d| d}d}t         j                  j"                  rt        d t%        t         j                  j"                  j                               D              sJ d       ddj!                  t         j                  j"                  j'                                d}|d| dz  }|j                  d| d| d| j                   d       y c c}w )Nz} // AOTInductorModel::run_implz} // namespace aot_inductorz} // namespace torchz'''
)zA
            module = CppWrapperCodeCache.load(cpp_wrapper_src, 'z', 'z', z)
            c              3       K   | ]  }|  y wr,   r   )r;  rk   s     r4   r<  z1CppWrapperCodeGen.generate_end.<locals>.<genexpr>  s     98Qq8rp  zreturn f(args_tensor)zoutputs[r   z].item()r   r^   zI
                    outputs = f(args_tensor)
                    return z
            z[args_tensor = [arg if isinstance(arg, torch.Tensor) else torch.tensor(arg) for arg in args]c              3   P   K   | ]  }t        |t        j                           y wr,   r  r  s     r4   r<  z1CppWrapperCodeGen.generate_end.<locals>.<genexpr>  r  r  r  z(
                    constants_tensor = zF
                    args_tensor.extend(constants_tensor)
            zQ
            def _wrap_func(f):
                def g(args):
                    z
                    z?
                return g
            call = _wrap_func(module.)r"   r/   r   ry   r   	code_hashr|   r%  r  r  r  r  r  r  rP   r/  rh   r  r   rB  )	r   rW  wrapper_call_hash
return_strr  r  outputs_strargs_strconstants_strs	            r4   rX  zCppWrapperCodeGen.generate_end  s8   77>?:;34"%//0ABAAEATAT@UUYZkYlloptpypyoz {	
 9$//668990J s177#8#89::A $(#8#8#;(1#Q8A3hAWW:   dii013K'= )J
 q77
  59!'':K:K:R:R:T5U  323   		!''*;*;*@*@*B CDAFM ((5 7 H 	 J L !&&*&9&9%: ;		
5s   "Hc                     |j                  d      }|d   }|dk(  r|d   }d| }| j                  d| ddj                  |       d	       y )
Nz::r9   callaoti_torch_zAOTI_TORCH_ERROR_CODE_CHECK(r`   r^   r  )splitry   rh   )r   rj   r   kernel_tokenskernel_suffixshim_fns         r4   "generate_c_shim_extern_kernel_callz4CppWrapperCodeGen.generate_c_shim_extern_kernel_call  s^    T*%b)F")"-M/5gYa		$?PPSTUr6   c                     |j                   }| d}| j                  d| d       d| }| j                  |j                         ||gz          | j                  d| d| d       y )N_handleAtenTensorHandle ry  &RAIIAtenTensorHandle r`   r   )rf   ry   r  rc  )r   re  r   rf   output_handle_name
output_args         r4   #generate_c_shim_extern_kernel_allocz5CppWrapperCodeGen.generate_c_shim_extern_kernel_alloc  s    !! $vW-*+=*>a@A+,-
//--/1D	
 	.tfA6H5ILMr6   c                     t         j                  j                  r-t        j                  j
                  r| j                  ||       y t        | !  ||       y r,   )	r"   r/   r   r   r   r   r$  r   rZ  )r   re  r   r   s      r4   rZ  z.CppWrapperCodeGen.generate_extern_kernel_alloc  s=    77 3 3 B B44]DIG0Er6   c                 F   g }g }|j                         }t        |j                        D ]  \  }}t        |t        j
                        r|j                          }| d}	|j                  r5|j                  d   d   |k(  s J d|j                  d   d   d|d|       | j                  d|	 d       |j                  d	|	        |j                  d
| d|	 d       t        |t              r4| d| }
| j                  d|
 d| d       |j                  d	|
        ||j                  d       t        d       ||z   }|j                  J d|j                         | j                  |j                  |       |D ]  }| j                  |        y )Nr  r   r#   zexpected output.indices[0][1]=z == idx=z for output_name_base=r  ry  r   r!  r`   r   r  int64_t r+  nullptrzunsupported type of {output=}z9abi_compatible_kernel is None for fallback_kernel.kernel=)r   r  r  r   r   r  indicesry   r   r;   rH  abi_compatible_kernelrj   r  )r   r[  r   output_argsoutput_raii_handlesoutput_name_baser  r   rf   r"  rf  raii_handles               r4   generate_c_shim_fallback_kernelz1CppWrapperCodeGen.generate_c_shim_fallback_kernel  s    *335$_%<%<=KC&"..1 //+,(,vW%5">>q)!,3]86>>!#4Q#7"9cVCZIYH[\]3!23E2FaHI""Q'9&:#;<#**+D63E2FbI FC(!1 2!C59+c&CD""Q{m#45""9-)*IJJ) >* k!11=	JG0F0F/HI	J=//114	
 /KNN;' /r6   c                     t         j                  j                  r-t        j                  j
                  r| j                  ||       y t        | !  ||       y r,   )	r"   r/   r   r   r   r   r/  r   r\  )r   r[  r   r   s      r4   r\  z*CppWrapperCodeGen.generate_fallback_kernel  s=    77 3 3 B B00$GG,_dCr6   c                    |rO|j                          }|j                          d}| j                  d| d| d       |j                  d|       n|j                  d|        t        j
                  j                  r-t        j                  j                  r| j                  ||       y | j                  | j                  ||             y )N_as_stridedr  r+  ry  r   )r0  r   ry   insertr"   r/   r   r   r   r   r  r,  )r   rj  r0  r   rj   output_as_stridedrf  s          r4   rk  z,CppWrapperCodeGen.generate_extern_kernel_out  s    #.#@#@#B"C(1134K@KNNU;-s3D2EQGHKK;'KK/0277 3 3 B B33FDANN400>?r6   c                    t        |      dk7  sJ t        |      dk(  r|d   }net        j                  |      J d }t        |      D ]8  \  }}t	        fd|j
                  j                         D              s3||   } n |J | j                  |||t        j                  j                  j                  j                  dd       y )Nr   r#   c              3   :   K   | ]  \  }}|d    |   k(    yw)r*  Nr   )r;  r   r  r*  s      r4   r<  zHCppWrapperCodeGen.generate_user_defined_triton_kernel.<locals>.<genexpr>  s&     Q@PHCsd6l3//@Ps   T)r   r?  r  r@  )rP   r   rY   r  r  r{   rM   rC  r"   r/   rn  ro  rJ  )	r   rh  r   r}   r   grid_decisionr  r   r*  s	           @r4   rq  z5CppWrapperCodeGen.generate_user_defined_triton_kernel  s    4yA~~t9> GM'++K8D### M!'*1Q@PQQ$(GM + !,,,!!**99?? 	" 	
r6   c           
         t         j                  j                  r,t        j                  j
                  r|j                  dd      }| d| ddj                  t        t        |             }|dk(  r=|r2|rP|dt         j                  j                  j                  |       z  }n |J d       |ddj                  |       z  }|d| j                   z  }| j                  |       y )	NrI   r  r`   r^   r?  rs  z:Expect reduce to be None for aten.scatter_ with scalar srcra   )r"   r/   r   r   r   r   r  rh   rt  r   wrapper_coderU  r  ry   ru  s	            r4   rz  z+CppWrapperCodeGen.generate_scatter_fallback(  s     77 3 3 B B^^FM:F6("SXXc#v.>%?$@A b!5!5!D!DV!L MNND NPOP" b&)*++D!DKK=!!tr6   c                 Z    t         j                  j                  ry t        |   |       y r,   )r"   r/   r   r   r  )r   r   r   s     r4   r  z'CppWrapperCodeGen.add_benchmark_harness=  s     77%f-r6   rk   r   c                 r    | j                  t        j                  j                  j	                  |            S r,   )r  r"   r/   r0   r1   r1  s     r4   r  z!CppWrapperCodeGen.codegen_sizevarB  s(      !1!1!:!:1!=>>r6   r  rJ  c                     t         j                  j                  rt        j                  j
                  r|S d| d| dS )Nz	std::get<z>(ra   )r"   r/   r   r   r   r   r  s       r4   r  z&CppWrapperCodeGen.codegen_tuple_accessE  s7    77 3 3 B BKugRz33r6   r  .c                     t        t        | j                  |            }t        |      dk(  ryt        |      dk(  r	d|d    dS ddj	                  |       dS )Nr   z{}r#   r  z, }r^   r  )r   rt  r  rP   rh   r  s      r4   r9  z%CppWrapperCodeGen.codegen_shape_tupleL  s^    S--u56u:?u:?az&&DIIe$%R((r6   c                     	 t         j                  j                  j                  |      }t	        |       y# t
        $ r Y yw xY w)NTF)r"   r/   
_shape_env_maybe_evaluate_staticr;   ra  )r   rk   r   s      r4   is_statically_known_intz)CppWrapperCodeGen.is_statically_known_intT  s=    	''$$;;A>CF 		s   47 	AAc                 ,     t         fd|D              S )Nc              3   \   K   | ]#  }t        j                  |      t               % y wr,   )r   rA  r;   r;  rk   r   s     r4   r<  zECppWrapperCodeGen.is_statically_known_list_of_ints.<locals>.<genexpr>]  s$     QS:d::1=sCSs   ),)r  )r   lsts   ` r4    is_statically_known_list_of_intsz2CppWrapperCodeGen.is_statically_known_list_of_ints\  s    QSQQQr6   c                 @    | j                  |j                               S r,   )rF  r8  rk  s     r4   !can_prove_buffer_has_static_shapez3CppWrapperCodeGen.can_prove_buffer_has_static_shape_  s    44V__5FGGr6   c                 f    | j                    xr# t        j                  xr | j                  |      S r,   )r  r   r   rH  rk  s     r4    can_cache_buffer_in_thread_localz2CppWrapperCodeGen.can_cache_buffer_in_thread_localb  s3     		M ?))?66v>	
r6   c                     t        |j                         t        j                        s+t        j
                  j                  r| j                  |      rdS |j                          dS )Nr   	.reset();)	r   r  r   r   r"   r/   r   rJ  r   rk  s     r4   r   z"CppWrapperCodeGen.make_buffer_freek  sZ     &++-r/C/CD  T%J%J6%R 	
 OO%&i0		
r6   rl  c                 2    dj                  d |D              S )Nr_   c              3   &   K   | ]	  }| d   yw)rL  Nr   ro  s     r4   r<  z7CppWrapperCodeGen.make_free_by_names.<locals>.<genexpr>t  s     D|t4&	*|s   rq  rr  s     r4   rs  z$CppWrapperCodeGen.make_free_by_namess  s    xxD|DDDr6   rg  rf  rt  c                 l    t         j                  j                  r	d| d| dS t        |   |||      S )Nr  z = std::move(z);  // reuse)r   r   r   r   rw  )r   rg  rf  rt  r   s       r4   rw  z,CppWrapperCodeGen.codegen_exact_buffer_reusev  s;    --8*M(<HH75h(SSr6   c                 :    | j                   j                  d       y )NzGRECORD_FUNCTION("inductor_wrapper_call", c10::ArrayRef<c10::IValue>());r1  r/  s     r4   r  z5CppWrapperCodeGen.generate_profiler_mark_wrapper_call|  s    ##U	
r6   c                      y r,   r   r   s    r4   r(  z*CppWrapperCodeGen.write_triton_header_once  r  r6   c                      y r,   r   r   s    r4   r  z&CppWrapperCodeGen.generate_start_graph  r  r6   c                      y r,   r   r   s    r4   r  z$CppWrapperCodeGen.generate_end_graph  r  r6   c                 X    |j                         D ]  }| j                  d| d        y )Nz3AOTI_TORCH_ERROR_CODE_CHECK(aoti_check_inf_and_nan(r  )	get_namesry   )r   nodesr;  s      r4   r  z.CppWrapperCodeGen.generate_inf_and_nan_checker  s+    ??$CNNEcU#N %r6   c                    t         j                  j                  r,d|j                   d|j                  r|j                   S d S ddlm} |j                   d||j                      d|j                   dS ||j                      S )	Ncached_torch_device_type_r?  r   r#   )DEVICE_TO_ATENc10::Device(r^   ra   )r   r   r   r   rJ  rL   rY  )r   r  rY  s      r4   codegen_devicez CppWrapperCodeGen.codegen_device  s    --.v{{m1V\\V\\<abb_`<abb+ <<+ ~fkk:;2fll^1M 'v{{34r6   c                     t         j                  j                  r=t        |      j	                  d      d   }| j
                  j                  |       d| S ddlm} ||   S )Nr  r9   cached_torch_dtype_r#   )DTYPE_TO_ATEN)	r   r   r   r   r  r  r  rL   r^  )r   r  	dtype_strr^  s       r4   r  zCppWrapperCodeGen.codegen_dtype  sY    --E
((-b1I##''	2(44* ''r6   	int_arrayc                     || }dt        | j                         }|| j                  vr3| j                  j                  |       |j	                  d| d| d       |S )N
int_array_r'  z[] = ry  )rL  r  r  r  ry   )r   r`  r  r,  s       r4   codegen_int_array_varz'CppWrapperCodeGen.codegen_int_array_var  sj     >F4 1 1234d222((,,S1xuE)A>?
r6   c           
          | j                  |j                         |j                         |j                         |j	                         |j                         | j                  |            S r,   )r]  r   r-   r.   r8  r:  rJ  rk  s     r4   r   z(CppWrapperCodeGen.make_buffer_allocation  s[    ##OOOO11&9
 	
r6   c                    | j                  |      }| j                  |      }| j                  |      }| j                  |      }t        j                  j
                  rY|j                  d      \  }}	t        t        |            | j                  || j                        | j                  || j                        ||t        j                  j                  rdn|	d| dg}
d }|r| j                  j                  |       | j                  j!                  d| d       | j                  j#                         5   || j                  ||
       | j                  j!                  d| d	       d d d        | j                  j!                  d
       d| d| dS  || j                  ||
       d| d| dS t        j                  j                  r)|j%                  d      r|j                  d      d    d}n|}| j&                   | d| j(                   d| d| d| d| d| j*                   S # 1 sw Y   xY w)Nr?  zthis->device_idx_r   r  c                 v    | j                  d| d       | j                  ddj                  |       d       y )Nr  _handle;z5AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(r^   r  r|  )r  rf   r   s      r4   	gen_allocz4CppWrapperCodeGen.make_allocation.<locals>.gen_alloc  s?    &&):4&'IJ&&KDIIVZOK\\_`r6   "thread_local RAIIAtenTensorHandle _handle = ([&] {rx   rg  })();r  r`   _handle.get());r!  	_handle);rZ  r   z, this->device_idx_)r+  zempty_strided(r^   z, at::TensorOptions(z).dtype(z)))r[  r  r9  r   r   r   r  r   rP   rc  r  r"   r/   r   r  r  ry   rz   
startswithr  r  r  )r   rf   r  r  r  r  rJ  r  r  	device_idr   rh  tensor_devices                r4   r]  z!CppWrapperCodeGen.make_allocation  s3    $$V,""5)''.))&1--%+\\#%6"KCJ**41B1BC**643D3DE'(ww'7'7#YD6!D 0))--d3!!++8>OP &&--/d//t<%%//'$x0HI 0 !!++G4*4&$GG$++T48.tfAdV9EE77 1 1. A%||C0344HIM"M ||nTF#dnn%5^fBvh2=/%PRSWS^S^R_a	
 0/s   <4H44H=c                    t         j                  j                  r| j                  |      }| j                  |      }dt	        | j
                         }|t        |      | j                  |      t        t        |            | j                  || j                        | j                  || j                        d| g}| j                  j                  d| d       | j                  j                  ddj                  |       d       d| d	S d
j                  dj                  |t        |      | j                  |      | j                  |      | j                  |      g            S )Ntmp_tensor_handle_r   r  ry  z8AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch__alloc_from_pool(r^   r  RAIIAtenTensorHandle(ra   r  )r   r   r   r9  rL  r  r  r  r   rP   rc  r  ry   rh   r  )	r   rf   r  r  r  r  r  tmp_namer   s	            r4   r  z)CppWrapperCodeGen.codegen_alloc_from_pool  s\   --++E2D--f5F+D1C1C,D+EFHf""5)CJ**41B1BC**643D3DEH:D ''*;H:Q(GH''J499UY?J[[^_ +8*A66$++II&M&&u-,,U3,,V4

 
	
r6   c                    t        t        |            }| j                  |      }| j                  |      }| j                  |      }t        j
                  j                  rRdt        | j                         || }|j                          || j                  ||      | j                  ||      |d g}	fd}
| j                  |      r| j                  |      r| j                  |      r| j                  j                         |j                  d d       t!        |d      r|j#                         }nt%        j&                         }|5   |
||	       |j                  d d       d d d        |j                  d	       |j                  d
 d d       S  |
||	       d dS |j                         |||g}	ddj)                  |	       dS # 1 sw Y   lxY w)Nrr  r   c                 x    | j                  d d       | j                  ddj                  |       d       y )Nr  ry  z;AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch__reinterpret_tensor(r^   r  r|  )r  r   rt  s     r4   gen_reinterpret_callzHCppWrapperCodeGen.codegen_reinterpret_view.<locals>.gen_reinterpret_call+  sD      #4XJa!@A  QRVR[R[\`RaQbbefr6   ri  rj  rz   rx   ry  rk  r  r`   rl  rs  ra   r  r^   )r   rP   r9  r  r   r   r   rL  r  r   rc  rJ  rF  r  r  ry   hasattrrz   r   nullcontextrh   )r   r  	size_liststride_listr  r  r  r  r  r   rw  rz   rt  s               @r4   r  z*CppWrapperCodeGen.codegen_reinterpret_view  s    #i.!''	2))+6%%f---+D1C1C,D+EFH ~ ==?#**48**66:H:D 55d;99)D99+F))--h7  8
BST 68,#]]_F'335F(6$$wxj%:;    )  'z8*OL   .> +8*A66MMOT66:D(4(9;;W Vs   GG(c                     t         j                  j                  r| j                  d| d| d       y | j                  | d| d       y )Nz4AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_tensor_copy_(r^   r  r  r   )r   r   r   ry   r  s      r4   r  z%CppWrapperCodeGen.codegen_device_copyk  sK    --NNFse2cURUV NNcU'#b12r6   c                 \    t         j                  j                  st        |   ||       y y r,   )r   r   r   r   r  )r   rf   r  r   s      r4   r  z&CppWrapperCodeGen.codegen_multi_outputs  s(     ""11G(u5 2r6   c                      |j                   j                  D cg c]  }|j                   }}|j                   j                  D cg c]  }|j                   }}g g fd}t        ||      D ]C  \  }}	|	t        |	t        j                        r |||	j                                ; |||	       E  fd}
|D ]  }t        |t        j                        rt        |t        j                        r+t        |j                         t        j                        raJ t        |t        j                        r+t        |j                         t        j                        rJ t        d| d       |D ]s  }|J d       t        |t        t        f      r-|D ]'  } |
|t        j                  j                                ) O |
|t        j                  j                                u fS c c}w c c}w )Nc                    t         j                  t         j                  t         j                  t         j                  t         j
                  f}t        j                  t        j                  f}t        |t         j                        r@t        | |      sJ dt        |               j                  | j                                 y t        |t         j                        rj                  t        |              y t        |t         j                         rj                  t        |              y t        |t         j"                        rIt        | t$        t&        t(        f      sJ t        | t$              rj                  t        |              y y t        |t         j*                        rt        | t,        t.        f      sJ t        |j1                         t         j                        r/j3                  | D cg c]  }|j                           c}       y t        |j1                         t         j4                        rht        |j1                         j1                         t         j                        r2j3                  | D cg c]  }||j                           c}       y t        |j1                         t         j                  t         j                   f      r)j3                  | D cg c]  }t        |       c}       y t        |j1                         t         j"                        rc| D cg c]  }t        |t$               }}t7        |      rit9        |      sJ d       j3                  | D cg c]  }t        |       c}       y t        |j1                         |      sJ dt        |              y y t        ||      sJ dt        |              y c c}w c c}w c c}w c c}w c c}w )Nzgot z6AOTInductor only supports int scalars of the same typez<Fall through arguments must be one of static_arg_types, got )rY  	FloatTypeBoolType
StringTypeTypeDeviceObjTyper   r   r   r   
TensorTyper   r   r0  IntTyper   
SymIntType
NumberTyper;   rB   r   ListTyper   r  getElementTypeextendOptionalTypeanyr  )r  rr   static_arg_typesinductor_tensor_buffersrV  is_int_typenew_int_argsnew_tensor_argss         r4   	fill_argszOCppWrapperCodeGen.generate_extern_kernel_args_decl_if_needed.<locals>.fill_args  s\     

##  		""'#
 (E$4$45!#'>?S4S	{ASS?&&#*?*?*A)BDHemm4##CH-He&6&67##CH-He&6&67!#UD'9:::c3' ''C1 (Henn5!#e}555 h5579I9IJ#**PS+TPS1q/B/B/D.E,FPS+TU++-u/A/A ++-<<>@P@P $**=@RSAMA//123SR  ++-u?O?O/P !''(=AQ(=> 7 7 95;K;KL?B"Cs!:a#5sK"C;'"'  TST   %++S,ASSVS,AB% //13C gUVZ[cVdUefg  ( ". cQRVW_R`Qabc ; ,U S )> #D
 -Bs$    O+O0O07O5=O:O?c                    t        |t        j                        rUj                  d|  d       j                  d|  d       j                  d|  d|  d       j	                  |         y t        |t        j
                        rt        d      t        |t        j                        r3t        |j                         t        j
                        rt        d	      t        d
|       )Nr  z_handle;  // output bufferzAAOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_new_uninitialized_tensor(&z
_handle));r!  r`   rm  z#NYI support for return type: SymIntz)NYI support for return type: List[SymInt]zUnsupported return type found: )
r   rY  r  ry   r   r  rH  r  r  rQ   )r  return_typer  r   s     r4   fill_output_argzUCppWrapperCodeGen.generate_extern_kernel_args_decl_if_needed.<locals>.fill_output_arg  s    +u'7'78!23%7QRSWX[W\\fg !6se1SEKL&&#0K)9)9:)*OPPK8Z**,e.>.>> **UVV$'F{m%TUUr6   zreturn type z is not yet supported.z+Optional return types are not yet supported)rb   rc   re   rg   r   ri   r   rY  r  r  r  r  rH  r   r  rY   )r   r  r  r+  rk   rl   return_typesr  r  rr   r  r  r#  outr  r  s   `             @@r4   *generate_extern_kernel_args_decl_if_neededz<CppWrapperCodeGen.generate_extern_kernel_args_decl_if_neededy  s    +6*=*=*G*GH*GQQ[[*G	H(3(;(;(C(CD(C1(CD@	cD !95MCh(:(:;c8#:#:#<=c8, 6	V$ (K+(8(8:K););<!+"<"<">@P@PQQQK8!+"<"<">@P@PQQQ)";-/EF  ( &J)X+XX)*tUm4%C#C)9)9)=)=)?@ &  
E,<,<,@,@,BC & ,,k IDs   G6	G;c
                     t        j                         r!|J |J |	J | j                  |||||	      S | j                  ||||||      S r,   )r   	is_fbcode=generate_extern_kernel_alloc_and_find_schema_if_needed_fbcode:generate_extern_kernel_alloc_and_find_schema_if_needed_ossr}  s
             r4   r  zHCppWrapperCodeGen.generate_extern_kernel_alloc_and_find_schema_if_needed  s}     ***'''&&&UU  RR( r6   c           
      .   || j                   vr]| j                  d| d       | j                  d| d| d       | j                  d| d       | j                   j                  |       | j                  d| d	| d
dj                  |       d       y )Nzstatic auto op_z = c10::Dispatcher::singleton()z	.findSchemaOrThrow("", "z")z	.typed<r  r  z = op_z.call(r^   r   )r  ry   r  rh   )r   rf   rj   r~  r  r  r  s          r4   r  zLCppWrapperCodeGen.generate_extern_kernel_alloc_and_find_schema_if_needed_oss	  s     !5!55NN!.!11PQ NN(5M4NbQ NNY}oT:;  $$^4D6/vdii6M5NbQ	
r6   c                    fd |      }t        |t              r|g}| j                  |||      \  }}dj                  |      }	dj                  |      }
t	        t
        j                  j                        dz
  }| j                  d| dt	        |       d|
 dt	        |       d|	 d       | j                  j                  |       y )	Nc                     | J d       t        | t        j                        r| j                         S t        | t        t
        f      r t        |       fd| D              S t        dt        |              )Nz+None, i.e. optional output is not supportedc              3   .   K   | ]  } |        y wr,   r   )r;  oextract_output_names     r4   r<  zCppWrapperCodeGen.generate_extern_kernel_alloc_and_find_schema_if_needed_fbcode.<locals>.extract_output_name.<locals>.<genexpr>9	  s      EA!4Q!7s   zUnexpected output: )r   r   r  r   r   r  r   rQ   )r  r  s    r4   r  zlCppWrapperCodeGen.generate_extern_kernel_alloc_and_find_schema_if_needed_fbcode.<locals>.extract_output_name4	  si    ?Q$QQ?#r~~.||~%C$/ tCy E EEE$':49+%FGGr6   r^   r#   z8aoti_torch_proxy_executor_call_function(proxy_executor, z, std::vector<int64_t>{z
}.data(), z , std::vector<AtenTensorHandle>{z
}.data());)r   r   r  rh   rP   r"   r/   extern_kernel_nodesry   r  r  )r   rf   r  r  r  r  r+  tensor_call_argsint_call_argstensor_call_args_strint_call_args_strextern_kernel_node_indexr  s               @r4   r  zOCppWrapperCodeGen.generate_extern_kernel_alloc_and_find_schema_if_needed_fbcode,	  s    	H *'2k3'&-K
 ;;;
	

  $yy)9: IIm4#&qww'B'B#Ca#G F'(=!" #%%6$7{#$% &..B-C;P	
 	  0r6   c                    t         j                  j                  r|st        |t        j
                        r|yt        |t        t        t        t        f      rDdt        | j                         }| j                  d| d| j                  |       d       d| S t        |j                         t        j                        sd| j                  |       S | j                  |      S )N0r  r  r+  ry  r   )r   r   r   r   rY  r  r   r;   r   rB   rL  r  ry   rU  r  r  )r   rI  r   rJ  r  s        r4   rK  z$CppWrapperCodeGen.val_to_cpp_arg_strY	  s    ..!5%"4"45{#c367!$t"7!89xjD4G4G4L3MQOP8*~%e224e6F6FG4..s3455""3''r6   c                 h    |t         j                  j                  ryyt        |t              r&t         j                  j                  r|rdS dS |rdS dS t        |t
              r| dS t        |t              rd| dS t        |t        t        t        f      r|j                         S t        |t        j                        r j                  |      S t        |t        j                        r j                  |      S t        |t               r(|t!        d      t!        d	      fv r|t!        d      k(  ry
yt        |t"        t$        f      rYddj'                   fd|D               d}t         j                  j                  r j)                  |       dt+        |       S |S t-        |      S )Nr  c10::nullopt1truefalser:   r  infz-infz&std::numeric_limits<float>::infinity()z'-std::numeric_limits<float>::infinity()r  r^   c              3   @   K   | ]  }j                  |        y wr,   rT  rD  s     r4   r<  z3CppWrapperCodeGen.val_to_arg_str.<locals>.<genexpr>	  s     #HCqD$7$7$:Cs   r  )r   r   r   r   r   r;   r   r   r   r   r0  rY  r  r[  r  r  rB   r   r  rh   rc  rP   rd   )r   r   rW  s   `  r4   rU  z CppWrapperCodeGen.val_to_arg_strj	  sw   ;""11!T"""11!s*s*!$v1'1S!U!9S!se1:nk?KL((**U\\*&&s++U[[)%%c**U#eeFm/L(LeEl"?@dE]+$))#HC#HHILF""1144V<=RCzJJ9r6   r  )NFr,   )Fr  r  )Fr   r   r   r  r   rC  r  r  r4  r  r   r;   r  r  r&   r  r  r  r  r  r  r   r  rU  rX  r  r$  rZ  r/  r\  rk  rq  rz  r  r   r  r  r
   r9  rA  rF  rH  rJ  r   r   rs  rw  r  r(  r  r  r  r[  r  r  r  rc  r   r]  r  r  r  r  r  r  r  r  rK  rU  r   r   s   @r4   r  r    s]   (>\ &GP5,
\1>
JJ J 	JcJ< <>. >1V#p.+ LQ,,!$,08,
)UV8
tV	NF!(FD@
.*.
? ?# ?4S 4 4C 4C 4)tSy)9 )c )RH

EtCy ET3 T# TQT T


( Ys   
 TY/
b
S 
@V<	V<p36x-B "$ R "$
.+1Z(s ("#S #r6   r  c            	            e Zd ZdZ fdZ fdZd Z	 ddededee   f fd	Z	 fd
Z
 ej                  d      dedededefd       Zd Zddedee   defdZ	 d fd	Z xZS )CudaWrapperCodeGenzI
    Generates cpp wrapper for running on GPU and calls CUDA kernels
    c                 N    t         |           t               | _        d| _        y r   )r   r   r   grid_idr  r   s    r4   r   zCudaWrapperCodeGen.__init__	  s    w	r6   c                     t         |           | j                  j                  d       t        j
                  j                  s| j                  j                  d       | j                  j                  d       y )Nz#include <filesystem>zr
                #include <c10/cuda/CUDAGuard.h>
                #include <c10/cuda/CUDAStream.h>
                a
  
            #define CUDA_DRIVER_CHECK(EXPR)                    \
            do {                                               \
                CUresult code = EXPR;                          \
                const char *msg;                               \
                cuGetErrorString(code, &msg);                  \
                if (code != CUDA_SUCCESS) {                    \
                    throw std::runtime_error(                  \
                        std::string("CUDA driver error: ") +   \
                        std::string(msg));                     \
                }                                              \
            } while (0);

            namespace {

            struct Grid {
                Grid(uint32_t x, uint32_t y, uint32_t z)
                  : grid_x(x), grid_y(y), grid_z(z) {}
                uint32_t grid_x;
                uint32_t grid_y;
                uint32_t grid_z;

                bool is_non_zero() {
                    return grid_x > 0 && grid_y > 0 && grid_z > 0;
                }
            };

            }  // anonymous namespace

            static inline CUfunction loadKernel(
                    std::string filePath,
                    const std::string &funcName,
                    uint32_t sharedMemBytes,
                    const std::optional<std::string> &cubinDir = std::nullopt) {
                if (cubinDir) {
                    std::filesystem::path p1{*cubinDir};
                    std::filesystem::path p2{filePath};
                    filePath = (p1 / p2.filename()).string();
                }

                CUmodule mod;
                CUfunction func;
                CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str()));
                CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str()));
                if (sharedMemBytes > 0) {
                    CUDA_DRIVER_CHECK(cuFuncSetAttribute(
                        func,
                        CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
                        sharedMemBytes
                    ))
                }
                return func;
            }

            static inline void launchKernel(
                    CUfunction func,
                    uint32_t gridX,
                    uint32_t gridY,
                    uint32_t gridZ,
                    uint32_t numWarps,
                    uint32_t sharedMemBytes,
                    void* args[],
                    cudaStream_t stream) {
                CUDA_DRIVER_CHECK(cuLaunchKernel(
                    func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr
                ));
            }
            )r   r  r   r%  r   r   r   r   s    r4   r  zCudaWrapperCodeGen.write_header	  sb    23""11KK 	CE	
r6   c                 @    d| }| j                  d| d| d       |S )NrH  zcudaStream_t z" = at::cuda::getCurrentCUDAStream(r   r  rI  s      r4   r  z'CudaWrapperCodeGen.write_get_raw_stream	  s4    wD6!CE7"M	
 r6   Nrf   rj   r  c                 .    |st         |   ||||      S y r,   )r   r  )r   rf   rj   r  r  r   s        r4   r  z CudaWrapperCodeGen.define_kernel	  s#     7(vxFF r6   c                    | j                   j                  d       t        j                  j                  s{t        | j                  j                         | j                  j                               D ]!  }| j                   j                  d| d       # | j                   j                  d       t        | )  |      S )Nr   zstatic CUfunction  = nullptr;)r   ry   r"   r/   r   r   r  r  r  r   r  )r   r  rj   r   s      r4   r  zCudaWrapperCodeGen.generate	  s    d#ww""))+T-K-K-R-R-T %%(:6(+&NO KK!!$'w--r6   mangled_name
cubin_path
shared_memc                 J   t         j                  j                  rE| j                  d| d       | j                  d| d| d| d| d	       | j                  d       y | j                  d	| d       | j                  d
| d| d| d| d	       | j                  d       y )Nzif (kernels.z == nullptr) {z    kernels.z = loadKernel("r  z", z, this->cubin_dir_);r  if (z    r   )r"   r/   r   ry   )r   rf   r  r  r  s        r4   generate_load_kernel_oncez,CudaWrapperCodeGen.generate_load_kernel_once
  s     77NN\$?@NN oj\l^SVWaVbbvy NN3NNT$78NNoj\l^3zlZ\_ NN3r6   c           	         t         j                  j                  j                         }g }|D ]  dt	        | j
                         }t        t        j                  t        j                  t        f      r| j                  d| d d       nNt        t        j                        r)| j                  d| d| j                         d       nt              r| j                  d| d d       nt              r| j                  d| d d       nt!        fd|D              r| j                  d| d d       ndk(  r| j                  d| d	       n{d
k(  r| j                  d| d       n`t"        j$                  j&                  r.| j                  d| d       | j                  d d| d       n| j                  d| d d       |j)                  d|         dj+                  |      S )Nr  r  r+  ry  zint zfloat c              3   N   K   | ]  }t              |j                  k(    y wr,   )r   rf   )r;  r7   r  s     r4   r<  z8CudaWrapperCodeGen.generate_args_decl.<locals>.<genexpr>#
  s     AASX's   "%r(  r  r  z = c10::nullopt;zCUdeviceptr r  z, reinterpret_cast<void**>(&z)));z! = reinterpret_cast<CUdeviceptr>(z.data_ptr());r   r^   )r"   r/   r0   r  rL  r  r   r   Integerr  r   ry   r   r  r?   rC   r  r   r   r   r   rh   )r   r+  dynamic_symbolsr  r  r  s        @r4   generate_args_declz%CudaWrapperCodeGen.generate_args_decl
  s   ''**779Cd4??345H#u||_MNxjC5:;C,xjD4E4Ec4J3K1MNhZs3%q9:#zSE;<AAAxjC5:;	!xj<=&xj0@AB&&55NN\(1#=>NNNseSopxoyy}~ NN&xj0QRUQVVcd OOazN+5 8 yy""r6   r   r  c                 "   |s|S t        |t              sJ d|d       |D cg c]   }t        |t              r|j                  n|" }}t	        | }t        j                  |      }|J d| d       |d   |d   |d   d} ||      S c c}w )	z{
        Generate grid configs for launching a CUDA kernel using the grid
        function from triton_heuristics.
        zexpected grid=z to be a listcuda kernel parameters for $ should already exist at this momentx_blocky_blockz_block)XBLOCKYBLOCKZBLOCK)r   r   r   r   default_gridr   rY   )r   rf   r   r  egrid_fnparams	block_cfgs           r4   r5  z(CudaWrapperCodeGen.generate_default_grid7
  s    
 K$%G4''GG%OSTt!
1o >AEtT%%))$/	T(.RS	T Y'Y'Y'
	
 y!! Us   %Bc                    |st         |   ||||||      S t        j                  |      }|J d| d       |j                  dd       }|J d       |j                  t	               d       }	|	t
        j                  j                  |	      s
J d|	        |j                  dd      }
| j                  |||	|
       | j                  |      }dt        | j                         }| j                  d	| d
| d       t        j                  j                  rdn| j!                  |      }| dt        | j"                         }t%        |t&        t(        f      s
J d|       |D cg c]+  }t        j                  j*                  j-                  |      - }}t/        d |D              }|D cg c]  }| j1                  |       }}dj3                  |      }| j                  d| d| d       |r| j                  d| d       t        j                  j                  rd| n|}| j                  dj5                  || d| d| d|d   |d   ||             |r| j                  d       y y c c}w c c}w )Nr  r  r  zmissing mangled_namez0cubin file should already exist at this moment: r  r   kernel_args_var_zvoid* z[] = {r  rH  _grid_z2expected grid to be a list or tuple but got: grid=c              3   2   K   | ]  }t        |        y wr,   )r   r:  s     r4   r<  z:CudaWrapperCodeGen.generate_kernel_call.<locals>.<genexpr>o
  s     'UPT(=d(CPTr=  r^   zGrid z = Grid(r   r  z.is_non_zero()) {zkernels.z-launchKernel({}, {}, {}, {}, {}, {}, {}, {});z.grid_xz.grid_yz.grid_zr  r  )r   rC  r   rY   r   r  r  existsr  r  rL  r  ry   r"   r/   r   r  r  r   r   r  r0   r1   r  r  rh   r  )r   rf   r+  r   r?  r  r@  r  r  r  r  kernel_args_varrH  	grid_namer   grid_has_unbacked_symbolsr3  grid_args_strkernel_var_namer   s                      r4   rC  z'CudaWrapperCodeGen.generate_kernel_callL
  s    7/i|T6  &))$/	T(.RS	Tzz.$7'?)??'ZZ ? A4H
%"''..+
 	K=j\J	K 
 ZZa0
&&t\:zR++I6	,T$2I2I-J,KL0	{#FG((Hd.G.G.U 	 fF4#5"67	4-
 	C@4'B	C 
 =AADD  ))$/DA$''UPT'U$U!>BCddT++D1d	C		),yk-CD$NNT),>?@/0ww/?/?HTF+T;BB+W%+W%+W%{#|$		
 %NN3 %+ BCs   ,0I:4I?r   )Tr  )r   r   r   r  r   r  r  r   r   r  r  r  r  r;   r  r  r   r   r   r5  rC  r   r   s   @r4   r  r  	  s    
Q
f LPGG!$G08G. Y  '* 8; IL     #D"# "T#Y "d ", PT8  8 r6   r  )Tr   r   r   r  r  r  rN   	itertoolsr   r   typingr   r   r   r   r	   r
   r   r   r   rY  torch._dynamo.utilsr   r   torch._inductor.codecacher   %torch.fx.experimental.symbolic_shapesr   r   torch.fx.noder    torch.utils._sympy.singleton_intr   r   r   r   r   r   r   r   r   triton_heuristicsr   r  utilsr   r   r   r    r!   virtualizedr"   r  r$   r%   r&   r'   triton_utilsr(   r)   r  r  r   r5   r   r?   rC   rW   r\   ru   r   rX  r   r   r   r   r   r   r   r   r   r   r  r  r   r6   r4   <module>r     s        	 	 " ? ? ?    6 E Q - 9 $ $ , = = 4   H H 6 	299 c  B# B0	S 	<,&"   * *( ' ' 'T$ $ ; ; ;4 %  , E, E E( 
" 
 
(	! 	@'W @'FF FR&t * t r6   