
    PhF              	         d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ d dlm%Z% d dl&Z&d dl'm(Z(m)Z) d dl*Z*d dl+m,Z, d dl-m.Z. d d	l/m0Z0 d d
l1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7  e	jp                  e9      Z: e"d      Z;ee&jx                  e&jx                  f   Z=dndodZ>d Z? ej                  d      dpd       ZAd ZBdqdZCd ZDd ZEdrdZF	 	 	 	 	 	 dsdZGdtdZH	 	 	 	 dudZI	 	 	 	 dvdZJdwdZKd ZLd ZMdxdydZN	 dz	 	 	 	 	 	 	 d{dZO	 d|	 dydZPd}d ZQd~d!ZRdd"ZSd# ZTd$ ZU e)d%      ZV e"d&d'(      ZW G d) d*eeVeWf   e       ZXdd+ZYd, ZZd- Z[d. Z\	 d	 	 	 dd/Z]d0 Z^dd1Z_dd2Z`dd3Zadd4Zbdd5Zcd6 Zd ej                  d7g d8 ef        ef        ef        ef       g9      Zg ej                  d      dd:       Zhej                  dd;       Zjdd<Zk ej                  d=      d>        Zl G d? d@e      Zm G dA dB      Zn G dC dD      Zo ej                  d      dE        ZpdpdFZqddGZrddHZsdIdJdKZtdL ZudM Zv G dN dO      ZwdP ZxdQ Zyej                  dR        ZzddSZ{dT Z|d dUddVZ}ddWZ~dX ZdY ZdZ Zd[ Zdd\Zej                  d]        Zd^ Zd'Z	 d dlZd_ Zd` Zda Zdb Zdc Z ej                  d      dd        Z ej                  d      de        Zdf Zdg ZdpdhZddiZdj Z G dk dlej*                        ZddmZy# e$ r dIZY tw xY w)    )annotationsN)StringIO)AnyCallableDictGenericIterableList
NamedTupleOptionalProtocolSetTypeVarUnion
ValuesView)mock)Concatenate	ParamSpec)get_interface_for_device)
DeviceType)	EventList)CeilDivCleanDivFloorDivModularIndexing   )config_Tc                    |         t         j                  j                          t        j                  t	        d      t         j                  d      }t         j                  j                  d      }t         j                  j                  d      }|j                          t        d      D ]  }|j                           |          |j                          t         j                  j                          |j                  |      dz  }t        dt	        ||z              }t        dt	        ||z              }	t        |      D ]	  } |          t         j                  j                  t         j                  j                  j                  g      5 }
t        |	      D ]  }|j                           |          t         j                  j                          d	d	d	       t        j!                  d
       t        j!                  
j#                         j%                  dd             t'        |
j)                         D cg c]0  }|j*                  t,        j                  k(  r|j.                  dk7  r|2 c}      }t1        |      |	z  dk7  rt3        dt1        |      |	      t1        |      |	z  }t'        t5        |      D cg c]  \  }}||z  dk7  r| c}}      }|j7                          |j#                         }t        j!                  d       t        j!                  |j%                  d             t9        d |D              dz  |	z  }t        j!                  d|       |S # 1 sw Y   xY wc c}w c c}}w )aR  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
    g    Acuda)dtypedeviceT)enable_timing   r   )
activitiesNz
raw eventsself_cuda_time_total)sort_by	row_limitzContext Syncr   zYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %szprofiling time breakdown)r)   c              3  4   K   | ]  }|j                     y wN)cuda_time_total).0events     `C:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/_inductor/utils.py	<genexpr>z+do_bench_using_profiling.<locals>.<genexpr>   s     ?e##s   g     @@zprofiling results: %s ms)torchr    synchronizeemptyintEventrecordrangezero_elapsed_timemaxprofilerprofileProfilerActivityCUDAlogdebugkey_averagestabler   eventsdevice_typer   namelenRuntimeError	enumerate_build_treesum)fnwarmuprepcachestart_event	end_event_estimate_msn_warmupn_repeatpir.   filtered_eventsnum_event_per_groupactual_eventsress                    r/   do_bench_using_profilingr[   9   s    D	JJKKJuyyHE **"""6K

  t 4I1X
  	JJ**959K 1c&;./0H1c#+,-H 8_
  
		NN++00
 
  
 
xAKKMD	 ! 	

 
 IIlIIann$$-Cr$RS 	
#  JOO3

n8T #	
O ?h&!+- 	
 	
 o.9 &o6	
65&&!+ 6	
M !..0MII()IIm!!B!/0
??
?&
H8
SCII(#.J_
 
$	
	
s   AM05M=
N
0M:c                 t    t        j                  d       d        } |       \  }}||vrd||<    || i |d   S )Nc                     	 ddl m}  | t	        j
                  |       j                  j                  d      	 dfS dfS # t        $ r}t        d      |d }~ww xY w)Nr   )do_benchzrequires Triton	quantilespercentiles)triton.testingr^   ImportErrorNotImplementedErrorinspect	signature
parametersget)triton_do_benchexcs     r/   load_tritonzdo_bench.<locals>.load_triton   sz    	B C   1<<@@M 
 	
 	
 	
  	B%&78cA	Bs   > 	AAA)g      ?g?g?r   )	functools	lru_cache)argskwargsrj   rh   quantile_field_names        r/   r^   r^      sV    
 
, ,7=(O(&(&5"#D+F+A..    c                     	 ddl m}  | d uxr% t        t        t        j
                  dd       d      S # t        $ r Y yw xY w)Nr   	roi_aligntorchvisionrs   F)torchvision.opsrs   hasattrgetattrr1   opsrb   rr   s    r/   has_torchvision_roi_alignry      sI    -$ 
EII}d3[*
 	
  s   03 	??c                 v    t        j                  t        j                  | D cg c]  }|s|	 c}      S c c}w r+   )rk   reduceoperatormul)rm   xs     r/   conditional_productr      s-    HLLd*@da1d*@AA*@s   6
6
c                d   | t        j                  d      j                  S t        | t              rt        j                  |       } | j
                  dk7  rZ| j                  Nt        | j
                        }t        j                  | j
                  |j                  j                               S | S )Ng        cpu)index)
r1   tensorr"   
isinstancestrtyper   r   Workercurrent_devicer"   device_interfaces     r/   decode_devicer      s    ~||C '''&#f%{{e 43FKK@||FKK/?/F/F/U/U/WXXMrp   c                r    t        j                  t        j                  | t	        j
                  d            S )Nr   )rk   r{   r|   r}   sympyInteger)its    r/   sympy_productr      s$    HLL"emmA.>??rp   c           	         t        |       t        |      k(  sJ t        j                  t        d t	        | |      D                    S )Nc              3  ,   K   | ]  \  }}||z    y wr+    )r-   abs      r/   r0   zsympy_dot.<locals>.<genexpr>   s     >odaAEos   )rF   r   expandrJ   zip)seq1seq2s     r/   	sympy_dotr      s8    t9D	!!!<<>c$o>>??rp   c                \    | D ci c]  }t        |      | c}j                         S c c}w r+   )idvalues)r   r~   s     r/   uniquer      s+     !bBqE1Hb!((**!s   )c           
        t        | t        j                        st        |t        j                        rt        | |      S t        | t              rt        |t              s$J |  dt        |        d| dt        |              | | z   S )Nz: , )r   r   Exprr   r4   r   )numerdenoms     r/   ceildivr      s     %$
5%**(Eue$$ eS!js' 9
4;-r%4;-89  uf_rp   c                    | dk  sJ d       | dz  } | | dz	  z  } | | dz	  z  } | | dz	  z  } | | dz	  z  } | | dz	  z  } | dz  } | S )z9Return the smallest power of 2 greater than or equal to nl        z32-bit onlyr               r   )ns    r/   next_power_of_2r      sk    :$}$:FAaKAaKAaKAaKAbLAFAHrp   c                    | D cg c]G  }t        |t        j                        r|j                  j                  nt        j                  |      I c}S c c}w )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    )r   r1   SymIntnodeexprr   r   )lstrV   s     r/   convert_shape_to_inductorr      sK     SVRUQz!U\\2a8HHRU  s   AAc                   ddl m} | D cg c]j  }t        |t              r|nUt        |t        j
                        rt        |      n0|j                  j                  j                  j                  |d      l c}S c c}w )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    r   )VN)hint)
virtualizedr   r   r4   r   r   graphsizevars	shape_envcreate_symintnode)r   r   rV   s      r/   convert_shape_to_symintr      s       A	 a 	
 a' VWW''99!$9G		H
   s   A/A=c                    t        | t        j                  j                        sJ t	        d | j
                  j                  D              S )z-
    Does this op overload have aliasing
    c              3  8   K   | ]  }|j                   d u  y wr+   )
alias_info)r-   r   s     r/   r0   zis_view.<locals>.<genexpr>  s     F1EAq||4'1Es   )r   r1   _ops
OpOverloadany_schema	arguments)ops    r/   is_viewr     s9     b%**//000F1E1EFFFrp   c                   | j                   dk(  syt        | j                  t        j                  j
                        s| j                  t        j                  u sy| j                  t        j                  u st        | j                        rt        d | j                  D              S t        j                  j                  | j                  j                  v S )Ncall_functionFc              3  2   K   | ]  }t        |        y wr+   )is_pointwise_use)r-   us     r/   r0   z#is_pointwise_use.<locals>.<genexpr>  s     :	1#A&	s   )r   r   targetr1   r   r   r|   getitemr   allusersTag	pointwisetags)uses    r/   r   r     s    66_$ 	3::uzz445xGWGW9W
zzX%%%)<:		:::99#**//11rp   c                   t         j                  j                         }g }g }t        |      D ]e  \  }}t	        |t         j
                        r5|j                  |j                  d|              |j                  |       U|j                  |       g t        d |j                         D              sJ |j                  | t        |      |      }t        | j                  j                        dk(  r2t        | j                  j                  d   j                         dk(  r|f}|j#                  |       t         j                  j%                  i |      }	|	|fS )Nargc              3  R   K   | ]  }t        |t        j                          ! y wr+   )r   r1   Tensorr-   r~   s     r/   r0   z$gen_gm_and_inputs.<locals>.<genexpr>.  s     H1:a..s   %'r   r   r   )r1   fxGraphrH   r   r   appendplaceholderr   r   r   tuplerF   r   returnsr   r   outputGraphModule)
r   rm   rn   gg_argsa_argsr   r   r   gms
             r/   gen_gm_and_inputsr   $  s   AFFD/3c5<<(MM!--#aS	23MM#MM# " HHHHH??65=&9DFNN""#q(&&q)../8;wHHTN			b!	$Bv:rp   c                h    | dk(  ry t        |       }|j                         r|j                          y y )Nr   )r   is_availabler2   r   s     r/   r2   r2   ;  s4    /7$$&$$& 'rp   c                    t        |       t        j                  d       t        j                         }t        |      D ]  } | | }t        |        t        j                         }J ||z
  S )Ni9  )r2   r1   manual_seedtimeperf_counterr7   )modelexample_inputstimesr"   t0rQ   resultt1s           r/   timedr   C  sm     	d				B5\'F  
			B7Nrp   c                    t        j                  t        |      D cg c]  }t        | |||       c}      }t        j                  |      |z  }t        ||z  d       |S c c}w )Nz.6f)r1   r   r7   r   medianprint)	rK   rm   r   repeatbaseliner"   rQ   timingstooks	            r/   print_performancer   R  s_     llE&MRMqE"dE6:MRSG<< 5(D	T(]3!K Ss   A#c                H     t        | |             t        | |fd       y)zKReplace obj.method() with a new method that returns a precomputed constant.c                      S r+   r   )r   s   r/   <lambda>z#precompute_method.<locals>.<lambda>^  s    rp   N)rw   setattr)objmethodr   s     @r/   precompute_methodr   [  s     !WS&!#FC(rp   c                *    |D ]  }t        | |        y)zFReplace methods with new methods that returns a precomputed constants.N)r   )r   methodsr   s      r/   precompute_methodsr  a  s    #v& rp   c                <    t        | |kD        t        | |k        z
  S r+   )r4   )r   r   s     r/   cmpr  g  s    q1u:AE
""rp   c                R    t        |       dk(  r t        |       | d   g      |z  S | S )Nr   r   )rF   r   )r~   sizes     r/   pad_listliker  k  s-    
1v{tAw!v%%rp   c                B    t        |       dk(  rg S d }t        | |      S )Nr   c                F    t        | t              r| S | j                         S r+   )r   r   get_name)elems    r/   	sort_funcztuple_sorted.<locals>.sort_funcw  s    dC K ==?"rp   key)rF   sorted)r~   r  s     r/   tuple_sortedr  s  s&    
1v{	# !##rp   PRVT)	covariantc                  &    e Zd Zedd       ZddZy)CachedMethodc                     y r+   r   selfs    r/   clear_cachezCachedMethod.clear_cache  s    rp   c                     y r+   r   )r  rm   rn   s      r/   __call__zCachedMethod.__call__  s    rp   N)returnNone)rm   zP.argsrn   zP.kwargsr  r  )__name__
__module____qualname__staticmethodr  r  r   rp   r/   r  r    s     rp   r  c                ~     d j                    dt        j                          fd       }fd}||_        |S )N___cachec                Z    t        |       st        |  |              t        |       S r+   )rv   r   rw   )r  rK   r  s    r/   wrapperzcache_on_self.<locals>.wrapper  s*    tS!D#r$x(tS!!rp   c                8    t        |       rt        |        y y r+   )rv   delattr)r  r  s    r/   r  z"cache_on_self.<locals>.clear_cache  s    4D# rp   )r  rk   wrapsr  )rK   r&  r  r  s   `  @r/   cache_on_selfr*    sD    r{{m6
"C__R" "
 &GNrp   c           
     ^   ddl m} t        | t              rgt	        j
                  t        j                  | D cg c]0  }t        |d      r"|j                  r|j                  j                  2 c}t                     S t        | |j                        r| j                  S t               S c c}w )Nr   irr   ) r-  r   listrk   r{   r|   or_rv   r   originssetExternKernel)node_scheduler-  r   s      r/   aggregate_originsr5    s    -&LL *)D4(TYY 		!!)
 E
 	
 
M2??	3$$$us   5B*
c                   t        |       }|dk(  rb|D cg c]B  }|j                  dk(  r1d|j                  v r#|j                  d   j                  j                  D }}t        t        |            }n|dk(  rg }|D ]y  }|j                  dk(  sd|j                  v s"|j                  d   d   }t        |d   t              r|j                  |d          \|j                  |d   j                         { t        t        |            }n5|dk(  r*|D cg c]  }|j                  dk(  s|j                    }}nt        |}dj                  d	g|z         S c c}w c c}w )
Noriginal_atenr   r1   source_fn_stackr'   r   inductor_noderQ   fused)r5  r   meta_overloadpacketr  r  r2  r   r   r   rE   rc   join)r4  descriptive_namesall_originsoriginsources	source_fns         r/   get_fused_kernel_namerC    sY   #M2KO+ &
%yyO+6;;0N KK(88AA% 	 

 W&	g	%!FyyO+0AV[[0P"KK(9:2>	ilC0NN9Q<0NN9Q<#8#89 " W&	o	-&1
&1FVYY/5QFKKk 	 
 "!G88WI'((1
$
s   AEE+Ec                   t        |       }|D cg c]  }|j                  dk(  s| }}t        j                  t              }t        j                  t              }|D ]  }d|j
                  v r@t        |j
                  d   j                        }||   j                  |j                         d|j
                  v s`|j
                  d   d   d   }||   j                  |j                          |j                   ddj                  t        |j                                      ddj                  t        |j                                      d}	g }
t        |j                               D ]@  \  }}|
j                  |j                   d	| d
dj                  t        |                    B |	dj                  |
      fS c c}w )Nr   r7  	from_noder   z Source Nodes: [r   z], Original ATen: [] z => 
)r5  r   collectionsdefaultdictr/  r;  r   r<  r   rE   commentr=  r  keysitems)r4  r&  r?  r@  inductor_nodesfrom_node_dictoriginal_aten_dictr   r  metadatadetailed_metadataoriginal_nodenodess                r/   get_kernel_metadatarU    s   #M2K+6W;&)):Vf;NW ,,T2N$006dii'dii0@@ACs#**4995$))#))K(+A.C3&&tyy1  ??
+DIIf^=P=P=R6S,T+U V99V,>,C,C,E%FGH	K 
  &~';';'= >u  qtDIIfUm4L3MN	
 !? TYY0111+ Xs
   GGc                    t        |       } t        |       }| rV| j                         }|j                  D ]4  }|r	 ||      r||vs|j	                  |       | j                  |       6 | rV|S )zJReturns the set of nodes whose values depend on those within initial_queue)r/  r2  popr   addr   )initial_queueskip_filterdominated_setr   users        r/   dominated_nodesr]    su     'M&M
  "JJD{40=(!!$'$$T*   rp   c                   dd l }ddlm fd|j                         D cg c]  } |      s|j                   }}| D cg c]  } |      s|j                   }}t         |j                  g ||       S c c}w c c}w )Nr   r   r,  c                    t        | j                        r | j                        S t        | j                        r | j                        S t        | j                        xr t        | j
                        S r+   )r   	TensorBoxdata
StorageBoxIRNode	Pointwise)r   r-  is_unrealized_nodes    r/   re  z*gather_origins.<locals>.is_unrealized_node  s^    a&%aff--a'%aff--!RYY'GJq",,,GGrp   )	itertoolsr.  r-  r   r1  r2  chain)	rm   rn   rf  valkwarg_originsr   arg_originsr-  re  s	          @@r/   gather_originsrk     s    H -3MMOWOS?QRU?VS[[OMW*.J$32DS2I3;;$KJy<<m<== XJs   BBBBc                ,   t        | t        j                        r| j                  S t        | t        j                        r)dj                  t        t        | j                              S t        | t        j                        r)dj                  t        t        | j                              S t        | t        t        t        f      rC| j                  j                   ddj                  t        t        | j                               dS t        |       S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    z + z * (r   ))r   r   SymbolrE   Addr=  map	sympy_strrm   Mulr   r   r   funcr  r   )r   s    r/   rr  rr    s     $%yy$		"zz#i344$		"zz#i344$(H=>))$$%QtyyY		1J'K&LANNt9rp   c                F    | d   dk7  sJ t        j                  | dd      S )Nr   sT)integernonnegative)r   ro  rE   s    r/   sympy_symbolrz  #  s)     7c>> <<d==rp   c           
         d }t        j                  |       j                  |j                         D ci c]  \  }} ||       ||       c}}      S c c}}w )z=
    xreplace is faster than subs, but is way more picky
    c                <    t        | t              rt        |       S | S r+   )r   r   rz  r  s    r/   promote_stringsz#sympy_subs.<locals>.promote_strings1  s    c3$$
rp   )r   sympifyxreplacerM  )r   replacementsr}  kvs        r/   
sympy_subsr  ,  sX    

 ==''<H<N<N<PQ<PDAq	_Q/	/<PQ Qs   A
c                @    t        fd| j                  D              S )Nc              3  T   K   | ]  }|j                   j                         ! y wr+   )rE   
startswith)r-   r  prefixs     r/   r0   z)free_symbol_startswith.<locals>.<genexpr><  s#     E2DQqvv  (2Ds   %(r   free_symbols)r   r  s    `r/   free_symbol_startswithr  ;  s    E%2D2DEEErp   c                @    t        fd| j                  D              S )Nc              3  :   K   | ]  }|j                   v   y wr+   ry  )r-   r  patterns     r/   r0   z"free_symbol_has.<locals>.<genexpr>@  s     =*<Qw!&& *<s   r  )r   r  s    `r/   free_symbol_hasr  ?  s    =%*<*<===rp   c                    h d}t        j                         r|j                  h d       | j                  j                  D ]  }t        |j                        |v s y y)N>   aten._local_scalar_denseaten.multinomial.defaultfbgemm.dense_to_jagged.default%fbgemm.jagged_to_padded_dense.default,aten._fused_moving_avg_obs_fq_helper.default7aten._fused_moving_avg_obs_fq_helper_functional.defaultrun_with_rng_staterun_and_save_rng_state>   aten.scatter.srcaten.scatter_add_aten.scatter.reduceaten.index_put.defaultaten.index_put_.defaultaten.scatter_reduce.twoaten.scatter_add.defaultaten.scatter_reduce_.twoaten.scatter.value_reduceaten.scatter_reduce.two_outaten._unsafe_index_put.defaultTF)r1   $are_deterministic_algorithms_enabledupdater   rT  r   r   )r   forbidden_setr   s      r/   has_incompatible_cudagraph_opsr  C  sW    	M 113	
 t{{},  rp   instance_descriptor)divisible_by_16
equal_to_1ids_of_folded_argsdivisible_by_8)defaultsc                 2   t         j                  j                  d      } | ^t        j                  ddt        j                               }t         j                  j                  t        j                         d|z         } t        j                  | d       | S )NTORCHINDUCTOR_CACHE_DIRz[\\/:*?"<>|]rQ   torchinductor_T)exist_ok)osenvironrg   resubgetpassgetuserpathr=  tempfile
gettempdirmakedirs)	cache_dirsanitized_usernames     r/   r  r  k  sr    

89IVVOS'//:KLGGLL!11
	 KK	D)rp   c              #  4  K   t        j                         5 }t        j                  j	                  t
        j                  d|i      5  t
        j                  j                  |d      }t        j                  j	                  t
        j                  d|i      5  d t        | t              rt        |       dk(  sJ d       t
        j                  j                  |      rtt        j                  |      }| j                  |D ci c]D  }d|vr>|t
        j                  j                  t
        j                  j                  ||            F c}       ddd       ddd       ddd       yc c}w # 1 sw Y   xY w# 1 sw Y   #xY w# 1 sw Y   yxY ww)z
    Contextmanager that provides a clean tmp cachedir for inductor.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    r  tritonTRITON_CACHE_DIRNr   z!expected empty cache_entries dictz.lock)r  TemporaryDirectoryr   patchdictr  r  r  r=  r   rF   existslistdirr  getsize)cache_entriesinductor_cache_dirtriton_cache_dirfilesfs        r/   fresh_inductor_cacher  x  s=     
	$	$	&*<ZZ__JJ24FG
  "ww||,>I.@BR-STmT2}-2W4WW2ww~~&67 "

+; <%,, */).A#*!#3 !"277??277<<@PRS3T#U U). U	
 
'	& UT	
 
 
'	&sa   F1FAF A-E4A	E/E4F F&	F/E44E=9F  F		FFFc           	         | j                   }t        t        |             }t        t	        t        ||d                  S )NT)r  reverse)__getitem__r7   rF   r/  reversedr  )seqgettera_rs      r/   argsortr    s1    __F
C/C>?@@rp   r   c                L    t        j                  d|       j                         S )Nr   r!   )r1   r3   element_sizer  s    r/   get_dtype_sizer    s    ;;r'4466rp   c                      e Zd ZU ded<   y)LineContextr   contextN)r  r  r   __annotations__r   rp   r/   r  r    s    Lrp   r  c                  d    e Zd ZdZddZddZddZddZd Zd Z	d Z
d	 Zd
 Zd ZddZddZy)IndentedBufferr   c                     g | _         || _        y r+   )_lines_indent)r  initial_indents     r/   __init__zIndentedBuffer.__init__  s    %rp   c                   t               }d}g }| j                  D ]  }t        |t              r
 |       }|/t        |t              r|j                  ||j                  f       Kt        |t              sJ |j                  |       |j                  d       |d|j                  d      z   z  } |j                         |fS )Nr   rH  )r   r  r   DeferredLineBaser  r   r  r   writecountgetvalue)r  bufrU   linemaplines        r/   getvaluewithlinemapz"IndentedBuffer.getvaluewithlinemap  s    jKKD$ 01v<D+.4<<01dC(((IIdOIIdOTZZ%%%A   ||~w&&rp   c                ,    | j                         \  }}|S r+   )r  )r  r  rQ   s      r/   r  zIndentedBuffer.getvalue  s    '')1rp   c                b   t               }| j                  D ]  }t        |t              r
 |       }|t        |t              r.t        |t
              sJ |j                  d      r|j                  |d d        f|j                  |       |j                  d        |j                         S )N\r'   rH  )	r   r  r   r  r  r   endswithr  r  )r  r  r  s      r/   getrawvaluezIndentedBuffer.getrawvalue  s    jKKD$ 01v<D+.dC(((}}T"		$s)$		$		$   ||~rp   c                8    | j                   j                          y r+   )r  clearr  s    r/   r  zIndentedBuffer.clear  s    rp   c                ,    t        | j                        S r+   )boolr  r  s    r/   __bool__zIndentedBuffer.__bool__  s    DKK  rp   c                :    d| j                   | j                  z  z  S )NrG  )r  tabwidthr  s    r/   r  zIndentedBuffer.prefix  s    dllT]]233rp   c                &    | j                  d       y )NrH  	writeliner  s    r/   newlinezIndentedBuffer.newline  s    trp   c                   t        |t              r| j                  j                  |       y t        |t              r9| j                  j                  |j                  | j                                      y |j                         r.| j                  j                  | j                          |        y | j                  j                  d       y Nr.  )r   r  r  r   r  with_prefixr  stripr  r  s     r/   r  zIndentedBuffer.writeline  s    dK(KKt$./KKt//>?ZZ\KK$++-78KKr"rp   c                4    |D ]  }| j                  |        y r+   r  )r  linesr  s      r/   
writelineszIndentedBuffer.writelines  s    DNN4  rp   c                F     t         j                   fd       } |       S )Nc               3     K   xj                    z  c_         	 d  xj                    z  c_         y # xj                    z  c_         w xY wwr+   )r  )offsetr  s   r/   ctxz"IndentedBuffer.indent.<locals>.ctx  s9     LLF"L'&&s   A4 AAA)
contextlibcontextmanager)r  r  r  s   `` r/   indentzIndentedBuffer.indent  s$    		"	"	' 
#	' urp   c           	        t        |t              rt        d      }|j                  D ]E  }t        |t              r|st        |t        |      t        |j                               z
        }G t        j                  |      rd}|j                  D ]P  }t        |t              r| j                  j                  |       /t        j                  | |t        |      d         R y t        j                  |      }|r|j                         }|sy |j                         }|j!                  d      D ]  }| j                  |        y )Ninfr   rH  )r   r  floatr  r  minrF   lstripmathisinfr   r  r4   textwrapdedentrstripsplit)r  
other_coder  r  r  s        r/   splicezIndentedBuffer.splice  s   j.15\F"))!$4 TS5G)GHF * zz&!"))dK0KK&&t,",,T4F3FG	 * "4J'..0
#**,J"((.t$ /rp   N)r   )r  z)tuple[str, list[tuple[int, LineContext]]]r  r   )r   )F)r  r  r   r  r  r  r  r  r  r  r  r  r  r  r
  r  r   rp   r/   r  r    sC    H&'$$!4#!	%rp   r  c                  D    e Zd ZdZd ZddZddZd Zd Zd Z	d Z
d	 Zy
)r  z.A line that can be 'unwritten' at a later timec                6    |j                         sd}|| _        y r  )r  r  r   s     r/   r  zDeferredLineBase.__init__  s    zz|D	rp   c                    t               )zJReturns either self.line or None to indicate the line has been 'unwritten'rc   r  s    r/   r  zDeferredLineBase.__call__      !##rp   c                    t               )z3Returns a new deferred line with the same conditionr  r   s     r/   	_new_linezDeferredLineBase._new_line  r  rp   c                @    | j                  | | j                         S r+   r  r  )r  r  s     r/   r  zDeferredLineBase.with_prefix   s    ~~455rp   c                T    | j                  | j                  j                               S r+   )r  r  r  r  s    r/   r  zDeferredLineBase.lstrip#  s    ~~dii..011rp   c                >    | j                  | j                  |         S r+   r!  )r  r   s     r/   r  zDeferredLineBase.__getitem__&  s    ~~dii.//rp   c                ,    t        | j                        S r+   )r  r  r  s    r/   r  zDeferredLineBase.__bool__)  s    DIIrp   c                ,    t        | j                        S r+   )rF   r  r  s    r/   __len__zDeferredLineBase.__len__,  s    499~rp   N)r  zOptional[str])r  r   r  r  )r  r  r   __doc__r  r  r  r  r  r  r  r&  r   rp   r/   r  r    s-    8
$$620rp   r  c                    t         j                  j                  |       j                  }|dk  rt        j                  d       yy)NP   z,not enough SMs to use max_autotune_gemm modeFT)r1   r    get_device_propertiesmulti_processor_countr?   warning)r   smss     r/   
is_big_gpur.  0  s6    
**
*
*5
1
G
GC
RxBCrp   c                 j    t         j                  xs" t         j                  xs t         j                  S r+   )r   max_autotunemax_autotune_gemmsearch_autotune_cacher   rp   r/   use_max_autotuner3  9  s&    Wv77W6;W;Wrp   c                    t               xrN | j                  j                  dk(  xr3 | j                  |v xr# t	        | j                  j
                  xs d      S )Nr    r   )r3  r"   r   r!   r.  r   )layoutallowed_layout_dtypess     r/   _use_template_for_cudar7  ?  sT     	1MM&(	1LL11	1 v}}**/a0	rp   c                    | j                         t        j                  j                         j                  d      D cg c]  }|j	                          c}v S c c}w )N,)upperr   max_autotune_gemm_backendsr  r  )backendr~   s     r/   _use_autotune_backendr=  H  sR    ==?!<<BBDJJ3OOa	O   s   AF)enable_int32c                  t         j                  t         j                  t         j                  g}|r>t         j                  t         j                  t         j                  t         j                  g}t        | |      xr t        d      S )NTRITON)r1   float16bfloat16float32int32r7  r=  )r5  r>  layout_dtypess      r/   use_triton_templaterF  N  sZ    ]]ENNEMMBMu{{S!&-8 =R> rp   c                   ddl m} t        j                  j                  ryt        j
                  t        j                  t        j                  g}t        | |      xr t        d      }|r |       st        j                  d       y|S )Nr   )try_import_cutlassFCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir is set correctly. Skipping CUTLASS backend for now.)codegen.cuda.cutlass_utilsrH  r1   versionhiprA  rB  rC  r7  r=  r?   r,  )r5  rH  rE  rZ   s       r/   use_cutlass_templaterM  W  sq    > }}]]ENNEMMBM
 
7 <Q=C !#KK4
 Jrp   c                 2    t                xs t        d      S )NATEN)r3  r=  r   rp   r/   use_aten_gemm_kernelsrP  n  s    !!B%:6%BBrp   c                  N    e Zd ZU  ej                  d      Zded<   d Zd Zd Z	y)DebugDirManagerr   r   prev_debug_namec                @    t        t        j                        | _        y r+   )nextrR  counterr   r  s    r/   r  zDebugDirManager.__init__v  s    ../rp   c                    t         j                  j                  j                  | _        | j                   d| j
                   | _        | j                  t         j                  j                  _        y )N_tmp_)r1   _dynamor   debug_dir_rootrS  r   new_namer  s    r/   	__enter__zDebugDirManager.__enter__y  sM    $}}33BB//0dggY?.2mm+rp   c                    t        j                  | j                         | j                  t        j
                  j                  _        y r+   )shutilrmtreer[  rS  r1   rY  r   rZ  )r  rm   s     r/   __exit__zDebugDirManager.__exit__~  s*    dmm$.2.B.B+rp   N)
r  r  r   rf  r  rV  r  r  r\  r`  r   rp   r/   rR  rR  r  s(    iooa G0<
Crp   rR  c                    ddl m} |j                  g fd}t        j                  j                  |d|      5  t        j                  j                           | |i |}d d d        |fS # 1 sw Y   fS xY w)Nr   )GraphLoweringc                     |       }t        |j                        5 }j                  |j                                d d d        |S # 1 sw Y   |S xY wr+   )open__file__r   read)r  modr  compile_to_modulesource_codess      r/   patched_compile_to_modulez3run_and_get_code.<locals>.patched_compile_to_module  sD    %#,,1)  
  
s    A		Arh  )	r   rb  rh  r   r  objectr1   rY  reset)rK   rm   rn   rb  rj  r   rh  ri  s         @@r/   run_and_get_coderm    s    $%77L 
		*,E
 	T$V$	

 <

 <s   'A11A=c                    t        | g|i |\  }}dt        |      cxk  rdk  sn J dt        |              |d   S )Nr   r   z%expected one or two code outputs got r   )rm  rF   )rK   rm   rn   rQ   ri  s        r/   run_and_get_triton_codero    sW    &r;D;F;OA| 	
S#!#C	.s</@.ABC#?rp   c              #     K   ddl m} |j                  |    }	 t        j                  ||      |j                  | <   d ||j                  | <   y# ||j                  | <   w xY ww)z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)torch._inductorrq  	loweringsrk   partial)aten_opoverride_fnrq  orig_fns       r/   override_loweringrx    s`      )  )G.&/&7&7W&M7#&-7#g7#s   A$'A  A$A!!A$c                     ddl m} |j                   fd}t        j                  j
                  j                  |d|      S )zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerc                B     | |        | |      }r	 | |       |S r+   r   )	schedulerrT  outrw  post_fnpre_fns      r/   r&  z(add_scheduler_init_hook.<locals>.wrapper  s+    y% i'Iu%
rp   r  )torch._inductor.schedulerrz  r  unittestr   r  rk  )r  r~  rz  r&  rw  s   ``  @r/   add_scheduler_init_hookr    s9    
 4  G ==%%iWEErp   c                z    t         j                  rt        j                  |        yt        j	                  |        y)z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)r   developer_warningsr?   r,  infomsgs    r/   developer_warningr    s$       Crp   )num_in_out_argsc                >     t         fdt        |      D              S )z
    Return the total number of bytes the arguments of tensor type takes.

    For in/out args, tensor sizes are counted twice: once for reading and
    once for writing.

    The first num_in_out_args arguments are in out tensors.
    c              3     K   | ]T  \  }}t        |t        j                        r5|j                         |j	                         z  d t        |k        z   z   V ywr   N)r   r1   r   numelr  r4   )r-   rV   r   r  s      r/   r0   z get_num_bytes.<locals>.<genexpr>  sS      %FAsc5<<( 			c&&((AA4G0H,HI%s   AA)rJ   rH   )r  rm   s   ` r/   get_num_bytesr    s#      o  rp   c                    | | dd|dd|dd| }	 dd l }| dkD  r5|dk  r0|j                  j                  |z   |j                  j                  z   }|S # t        $ r t
        j                  d	       Y |S w xY w)
Nz.3fzms    	z GB 	 z7.2fzGB/sr   g~jt?i  z@Colorama is not installed. Install it if you want colored output)coloramaForeREDRESETrb   r?   r,  )msnum_gbgb_per_sr  suffixinfo_strr  s          r/   create_bandwidth_info_strr    s    "S&WXdO4PVxXHX:(S.}}((83hmm6I6IIH O  XVWOXs   >A A65A6c                    	 t         j                  j                  d      } | dz   t        t         j                        k  rTt        t         j                  | dz            dkD  r2t         j                  | dz      d   dk7  rt         j                  | dz      S t         j                  D ]#  }|j                  d      s|t        d      d c S  y# t        $ r Y Bw xY w)a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr   r   -z--only=N)sysargvr   rF   
ValueErrorr  )idxr   s     r/   get_benchmark_namer    s    	hhnnX&!Gc#((m#CHHS1W%&*q!!$+88C!G$$ xx>>)$s9~'((   s   BC 	CCc                &    t        d | D              S )Nc              3  &   K   | ]	  }|d k(    ywr  r   r   s     r/   r0   zis_ones.<locals>.<genexpr>       %u!qAvu   r   rM  s    r/   is_onesr        %u%%%rp   c                &    t        d | D              S )Nc              3  &   K   | ]	  }|d k(    yw)r   Nr   r   s     r/   r0   zis_zeros.<locals>.<genexpr>  r  r  r  r  s    r/   is_zerosr    r  rp   c                &    t        d | D              S )Nc              3     K   | ]@  }t        |t        j                        r$|j                  t        j                  d       k(   B yw)r   N)r   r1   r   r"   )r-   items     r/   r0   z is_cpu_device.<locals>.<genexpr>  s8      DdELL) 	u||E**s   AAr  )inputss    r/   is_cpu_devicer    s       rp   c                    t        | t        j                        sJ d       | j                  rt        j
                  S t        j                  S )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)r   r   r   
is_integerr1   int64float64)rh  s    r/   get_sympy_Expr_dtyper    sD    UZZ BAB  ~~{{}}rp   c              /     K   | r-t        j                  j                  |i |5 }| d d d        y d  y # 1 sw Y   y xY wwr+   )r1   r;   r<   )should_profilerm   rn   rU   s       r/   maybe_profiler  %  s<     ^^##T4V4G 54 	 54s   "A7AA Ac                    t        | j                  j                               }|j                  d| j                  f       |j                  d| j
                  f       t        |      S )z~
    Convert triton config to a tuple that can uniquely identify it. We can use
    the return value as a dictionary key.
    	num_warps
num_stages)r  rn   rM  r   r  r  r   )cfgrM  s     r/   triton_config_to_hashabler  .  sO    
 3::##%&E	LL+s}}-.	LL,/0<rp   c                    t         s| S t        t        j                  |j	                               | z   t        j                  j
                  z   S r+   )HAS_COLORAMArw   r  r  r:  r  )r  colors     r/   _color_textr  @  s5    
8==%++-0369L9LLLrp   c                    t        | d      S )Ngreenr  r  s    r/   
green_textr  G  s    sG$$rp   c                    t        | d      S )Nyellowr  r  s    r/   yellow_textr  K  s    sH%%rp   c                    t        | d      S )Nredr  r  s    r/   red_textr  O  s    sE""rp   c                    t        | d      S )Nbluer  r  s    r/   	blue_textr  S  s    sF##rp   c                   ddl m}m} | t        j                  t        j
                  t        j                  fv sJ t        j                  j                  r| t        j                  t        j
                  fv r ||       S t        j                  j                  j                  j                  r |t        j                        S  |t        j                        S ddl m}  |dg      d   }| t        j                  t        j
                  fv r	 || |      S t        j                  j                  j                  j                  r |t        j                  |      S  |t        j                  |      S )Nr   )get_max_simd_tflopsget_max_tensorcore_tflops)nvsmizclocks.current.sm)ra   r  r  r1   rA  rB  rC  rK  rL  backendsr    matmul
allow_tf32r  )r!   r  r  r  cur_sm_clocks        r/   get_device_tflopsr  W  s    MU]]ENNEMMBBBB}}U]]ENN33,U33>>%%00,U]];;&u}}55$-./2L//(==~~!!,,(EE"5==,??rp   c                     ddl m}   |        S )Nr   get_dram_gbps)ra   r  r  s    r/   get_gpu_dram_gbpsr  q  s    ,?rp   c                $    | j                  d      S )Nwelford)r  reduction_types    r/   is_welford_reductionr  x  s    $$Y//rp   c                     t        |       rdS dS )N   r   )r  r  s    r/   reduction_num_outputsr  |  s    $^41;!;rp   c                 0    t        j                         dk(  S )NLinux)platformsystemr   rp   r/   is_linuxr    s    ??''rp   c                &    t        d | D              S )Nc              3  n   K   | ]-  }t        |t        j                        xr |j                    / y wr+   )r   r   r   	is_numberr   s     r/   r0   z#has_free_symbols.<locals>.<genexpr>  s*     Jcz!UZZ(<_<cs   35)r   )itrs    r/   has_free_symbolsr    s    JcJJJrp   c                 `   ddl m} | D ]"  }t        ||j                        r`t	        |j
                  j                               s;t        |j
                  d      sTt	        |j
                  j                               sx yt        ||j                  |j                  |j                  f      rOt        |d      rt        |d      sJ t	        |j                               st	        |j                               s yt        ||j                        st        dt        |              y)Nr   r,  
get_strideTget_sizezunexpected type for is_dynamic F)r.  r-  r   r`  r  ra  r  rv   r  rb  BaseViewComputedBufferrc  	TypeErrorr   )rm   r-  ts      r/   
is_dynamicr    s    a& 12-2B166CTCTCV2WBMM2;;8I8IJK1j)ga.FFF

-1A!,,.1QAryy)=d1gYGHH  rp   c                      e Zd ZdZdZy)PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)r  r  r   r  r  r   rp   r/   r  r    s      K *rp   r  c                J    |dk(  rd|  dS |dk(  rd|  dS t        d|       )Nr    z
            #include <torch/csrc/inductor/aoti_model_container_runner_cuda.h>

            torch::inductor::AOTIModelContainerRunnerCuda runner("a  ");

            std::vector<at::Tensor> run(std::vector<at::Tensor>& input_tensors) {
                return runner.run(input_tensors);
            }

            std::vector<const char*> get_call_spec() {
                return runner.get_call_spec();
            }
        r   z
            #include <torch/csrc/inductor/aoti_model_container_runner.h>

            torch::inductor::AOTIModelContainerRunnerCpu runner("zUnsupported device: )rG   )so_pathr"   s     r/   aot_inductor_launcherr    sf    C DK) 	L	 	 
5B CJ 	K	 	 1&:;;rp   )   d   )rK   zCallable[[], Any]r  r  )r  r  )r"   z"Union[Optional[torch.device], str]r  ztorch.device)r   zIterable[_T]r  zValuesView[_T])r   Union[int, sympy.Expr]r   r  r  r  )r   r4   r  r4   )r   z"Iterable[Union[int, torch.SymInt]]r  zList[sympy.Expr])r   z Iterable[Union[int, sympy.Expr]]r  zList[Union[int, torch.SymInt]])r   ztorch._ops.OpOverload)r    )r"   r   )r   r    )r   zCallable[..., Any]r   r4   r"   r   r  r  )r   
   r   g      ?r    )r   r   r   r   )r   r   r  z	List[str])r  r4   )rK   z!Callable[Concatenate[Any, P], RV]r  zCachedMethod[P, RV]r+   )rY  zIterable[torch.fx.Node]r  zSet[torch.fx.Node])r   
sympy.Exprr  r   )rE   r   r  zsympy.Symbol)r   r  r  zDict[Any, Any]r  r  )r   r  r  r   )r   r  r  r   r  )r  z	List[int])r6  zList[torch.dtype]r  r  )r<  r   r  r  )rm   ztorch.Tensorr  r4   r  r4   )r.  r.  )rh  r  r  ztorch.dtype)r  zIterable[Any])r  r   r"   r   )
__future__r   rI  r  enumrk   r  rd   rf  loggingr  r|   r  r  r  r^  r  r  r  r   r  ior   typingr   r   r   r   r	   r
   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r1   torch._dynamo.device_interfacer   torch.autogradr   torch.autograd.profiler_utilr   torch.utils._sympy.functionsr   r   r   r   r.  r   	getLoggerr  r?   r   r   	VarRangesr[   r^   rl   ry   r   r   r   r   r   r   r   r   r   r   r   r   r2   r   r   r   r  r  r  r  r  r  r  r*  r5  rC  rU  r]  rk  rr  rz  r  r  r  r  
namedtupler   r  r  r	  r  r  r  r  r  r  r.  r3  r7  r=  rF  rM  rP  rR  rm  ro  rx  r  r  r  r  r  r  r  r  r  r  r  r  r  rb   r  r  r  r  r  r  r  r  r  r  r  r  Enumr  r  r   rp   r/   <module>r     s]   "           	  	  
           4  C % 2 U U g!T]UZZ'(	Od/> T B@@
+!*@

	+

	)#&G2.' NT69GJ
  CI<?)'#$ cNTT"71b5>8 "&)>26 9=*&>"$>F>B -k,,Mguw1  T	 	  6A Q7 7* i% i%X @ T  16 .CC C" ( . .F&	 ?@  
)>&&   
M%&#$ T@ @2 T 0<(K,*$)) *<[  Ls    M MM