
    Ph^                       d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
mZmZmZmZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$ dd	l%m&Z&m'Z' dd
l(m)Z) ddl"m*Z*m+Z+ ddl#m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z: ddl;m<Z<  ejz                  e>      Z?ej                  j                  e>d      ZB G d d      ZCd ZD G d d      ZEd+dZFej                  j                  j                  ej                  j                  j                  ej                  j                  j                  ej                  j                  j                  dZM G d d      ZN G d deN      ZO G d deN      ZP G d deN      ZQ G d  d!eN      ZR G d" d#eR      ZSd,d$ZTej                   G d% d&             ZV ej                         ZX G d' d(      ZY G d) d*      ZZy)-    N)
AnyCounterDefaultDictDictListOptionalSequenceSetTupleUnion)dynamo_timed)get_metric_tableis_metric_table_enabled)free_unbacked_symbols)
has_triton   )commsconfigdependenciesirmetrics)get_scheduling_for_deviceKernel) estimate_nccl_collective_runtime)StarDepWeakDep)ComputedBufferMultiOutputMultiOutputLayout)SimplifyIndexing)	cache_on_selfcmpfree_symbol_hasget_device_tflopsget_dtype_sizeget_gpu_dram_gbps
green_textred_textsympy_product)Vfusionc                   H    e Zd ZU g dZeed<   eedf   ed<   d	dZd Z	d Z
y)
	WhyNoFuse)node1node2reasonargsr0   .r1   c                      || _         || _        y Nr.   r/   selfr.   r/   s      dC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/_inductor/scheduler.py__init__zWhyNoFuse.__init__>   s    

    c                 J    || _         || _        t        j                  |        y r3   )r0   r1   
fusion_logdebug)r6   r0   r1   s      r7   __call__zWhyNoFuse.__call__B   s    	r9   c                     d| j                   j                          d| j                  j                          d| j                  | j                  z  z   S )Nzcannot fuse z with : )r.   get_namer/   r0   r1   r6   s    r7   __str__zWhyNoFuse.__str__G   sK    djj1134F4::;N;N;P:QQSTKK$))#
 	
r9   Nr.   BaseSchedulerNoder/   rD   )__name__
__module____qualname__	__slots__str__annotations__r   r   r8   r=   rB    r9   r7   r-   r-   7   s+     5IK
S/

r9   r-   c                     t        | t              rt        | t              } t	        j
                  | d      }d|v rdt        j                  |d       S |S )Nkey   )indent
    )
isinstancesetsortedrI   pprintpformattextwraprP   )objresults     r7   rW   rW   M   sM    #sSc"^^C*Fv~HOOFE2344Mr9   c                   (    e Zd Zd Zd Zd Zd ZeZy)
OutputNodec                 "    |h| _         g | _        y r3   )unmet_dependenciesinverse_usersr6   deps     r7   r8   zOutputNode.__init__X   s    #&%r9   c                      yNFrK   rA   s    r7   is_reductionzOutputNode.is_reduction\       r9   c                      y)NrK   rK   rA   s    r7   get_alias_nameszOutputNode.get_alias_names_       r9   c                      y)NOUTPUTrK   rA   s    r7   r@   zOutputNode.get_nameb   s    r9   N)rE   rF   rG   r8   rd   rg   r@   __repr__rK   r9   r7   r\   r\   W   s      Hr9   r\   rD   c                     | j                         s|j                         rt        j                  | |      S t        j                  | |      S r3   )
is_foreachForeachKernelSchedulerNodefuseFusedSchedulerNoder4   s     r7   ro   ro   h   s?    U--/)..ue<<!&&ue44r9   )zextern_kernels.convolutionzextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmc                      e Zd Zdddej                  fdZd ZdefdZdefdZ	d	 Z
d
eeef   fdZd Zd Zded   fdZdee   deeef   fdZd Zd Zd Zdej.                  fdZd Zdee   fdZdee   fdZd Zd Zd ZdefdZdefd Z dee   fd!Z!de"d    fd"Z#d# Z$d$ Z%d% Z&d& Z'd' Z(d(ejR                  fd)Z*d* Z+d+ Z,d, Z-d- Z.d2d.Z/de0fd/Z1de2fd0Z3y1)3rD   	scheduler	Schedulernodec                     || _         || _        g | _        g | _        g | _        | j                  |j                                t               | _        |  |  t               | _	        d| _
        y rc   )rr   rt   usersr_   
node_usersset_read_writesget_read_writesrT   	ancestors
last_usagewritten)r6   rr   rt   s      r7   r8   zBaseSchedulerNode.__init__y   sc    $-#	%'
6835T1134#&5 E 	 r9   c                 T    t        |       j                   d| j                         dS )Nz(name=)typerE   r@   rA   s    r7   rk   zBaseSchedulerNode.__repr__   s'    t*%%&fT]]_,?qAAr9   returnc                 H   | j                         }| dt        |       j                   dt        t        | dd            j                   d| dt	        | j
                  j                         | dt	        | j                         | dt	        | j
                  j                  | j                  z
         | d	| j                   g}	 || j                         gz  }dj                  |      j                         S # t        $ r t        j                  d
d       Y Aw xY w)z#Longer form printout for trace logsr?   (rt   Nr~   z
.writes = z.unmet_dependencies = z.met_dependencies = z	.users = zIgnoring error in debug_str()T)exc_inforQ   )r@   r   rE   getattrrW   read_writeswritesr^   readsrv   debug_str_extra	Exceptionlogwarningjoinrstripr6   nameliness      r7   	debug_strzBaseSchedulerNode.debug_str   s#   }}fBtDz**+1T'$2M-N-W-W,XXYZfJwt'7'7'>'>?@Af*743J3J+K*LMf(1A1A1G1G$JaJa1a)b(cdfIdjj\*
	H$$& E yy&&((  	HKK7$KG	Hs   C> > D! D!c                      y)N rK   rA   s    r7   r   z!BaseSchedulerNode.debug_str_extra   rh   r9   c                 p    t         j                  d| | j                  | j                  j                         y )Nz(%s: unmet_dependencies = %s, writes = %s)r   infor^   r   r   rA   s    r7   log_detailszBaseSchedulerNode.log_details   s,    6####		
r9   renamesc                 X    | j                  | j                  j                  |             y r3   )rx   r   renamer6   r   s     r7   update_mutated_namesz&BaseSchedulerNode.update_mutated_names   s!    T--44W=>r9   c                 X    | j                  | j                  j                  |             y r3   rx   r   	with_readr`   s     r7   add_mutation_depz"BaseSchedulerNode.add_mutation_dep   !    T--77<=r9   c                 X    | j                  | j                  j                  |             y r3   r   r`   s     r7   add_fake_depzBaseSchedulerNode.add_fake_dep   r   r9   rv   NodeUserc                 ,   i }|D ]o  }t        |j                        |v r>|j                  |t        |j                                 |t        |j                        <   X||t        |j                        <   q t        |j	                               | _        y r3   )idrt   mergelistvaluesrv   )r6   rv   rZ   uses       r7   	set_userszBaseSchedulerNode.set_users   so    &(C#((|v%'*yy3881E'Fr#((|$'*r#((|$	 
 &--/*
r9   future_used_buffersmutation_real_namec                     | j                         }|D ch c]  }|j                  ||       }}||z
  | _        y c c}w r3   )used_or_aliased_buffer_namesgetr{   )r6   r   r   used_buffersks        r7   set_last_usagez BaseSchedulerNode.set_last_usage   sG     88:>JKl*..q!4lK&)<< Ls   ;c                 6    | j                   j                         S r3   )rt   rg   rA   s    r7   get_aliaseszBaseSchedulerNode.get_aliases   s    yy((**r9   c                 6    | j                   j                         S r3   )rt   get_mutation_namesrA   s    r7   get_mutationszBaseSchedulerNode.get_mutations   s    yy++--r9   c                 X    t        | j                         xs | j                               S r3   )boolr   r   rA   s    r7   has_aliasing_or_mutationz*BaseSchedulerNode.has_aliasing_or_mutation   s$    D$$&>$*<*<*>??r9   rwc                 h    || _         | j                   j                  | _        | j                          y r3   )r   r   r^   
prune_deps)r6   r   s     r7   rx   z!BaseSchedulerNode.set_read_writes   s(    46"&"2"2"8"8r9   c                 .    | j                   j                  S r3   )r   	op_countsrA   s    r7   r   zBaseSchedulerNode.op_counts   s    )))r9   c                     t        j                  | j                  j                  | j                  j                        D ch c]  }|j
                   c}S c c}w r3   )	itertoolschainr   r   r   r   r`   s     r7   used_buffer_namesz#BaseSchedulerNode.used_buffer_names   sP     !t'7'7'='=t?O?O?V?VW
W HHW
 	
 
s   Ac                 D   t               }t        j                  | j                  j                  | j                  j
                        D ]  }|j                  |j                         t        j                  j                  j                  |j                        sRt        j                  j                  |j                     j                         }t        |t        j                        s|j                  |j                   j"                  j%                                 |S r3   )rT   r   r   r   r   r   addr   r*   graphname_to_bufferr   
get_layoutrS   r   AliasedLayoutviewdatar@   )r6   
used_namesra   layouts       r7   r   z.BaseSchedulerNode.used_or_aliased_buffer_names   s    U
??4#3#3#9#94;K;K;R;RSCNN388$ww%%))#((3//9DDF fb&6&67NN6;;#3#3#<#<#>? T r9   c                     | j                   D ch c]&  }|j                  | j                  j                  vr|( c}| _         y c c}w r3   )r^   r   rr   available_buffer_namesr`   s     r7   r   zBaseSchedulerNode.prune_deps   sE     ..#
.xxt~~DDD .#
 #
s   +Ac                     d }| j                   j                  D ch c]  } ||      s| }}| j                  | j                   j                  |             y c c}w )Nc                 r    t        | t              xr& | j                  t        j                  j
                  v S r3   )rS   r   r   r*   r   removed_buffers)ra   s    r7   should_prunez7BaseSchedulerNode.prune_weak_deps.<locals>.should_prune   s'    c7+SAGG<S<S0SSr9   )r   r   rx   remove_reads)r6   r   ra   	to_removes       r7   prune_weak_depsz!BaseSchedulerNode.prune_weak_deps   sX    	T %)$4$4$:$:P$:Sl3>OS$:	PT--::9EF Qs
   AAc                     t        j                          j                  D ];  }t        |t              r|j
                     j                         xx   dz  cc<   =  fd} j                  D ch c]  } ||      s| }}|r? j                  |z
   _         j                   j                  j                  |             yyc c}w )a  
        Prunes weakdeps intended for mutation ordering
        on an upstream fused node if after fusion there is another dependency
        on the fused upstream node, making the weakdep redundant

        In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
        be incrementally removed, enabling other fusions, ensuring they are fused in order.
        r   c                     t        | t              r;| j                     j                            dkD  }| j                     k(  }|xs |S y)Nr   F)rS   r   r   r@   )ra   is_redundantis_self_depname_to_dep_countname_to_fused_noder6   s      r7   r   z<BaseSchedulerNode.prune_redundant_deps.<locals>.should_prune  sV    #w'%&8&B&K&K&MNQRR  1:dB#2{2r9   N)
collectionsr   r^   rS   r   r   r@   rx   r   r   )r6   r   ra   r   deps_to_pruner   s   ``   @r7   prune_redundant_depsz&BaseSchedulerNode.prune_redundant_deps   s     +6*=*=*?**Cc7+!"4SXX">"G"G"IJaOJ +	 )-(?(?U(?<PSCT(?U&*&=&=&MD#  !1!1!>!>}!MN  Vs   7CCc                 6    | j                   j                         S r3   rt   r@   rA   s    r7   r@   zBaseSchedulerNode.get_name      yy!!##r9   c                 "    | j                         S r3   r@   rA   s    r7   get_first_namez BaseSchedulerNode.get_first_name  s    }}r9   c                 $    | j                         hS r3   r   rA   s    r7   	get_nameszBaseSchedulerNode.get_names  s      r9   c                     | gS r3   rK   rA   s    r7   	get_nodeszBaseSchedulerNode.get_nodes!  s	    vr9   c                 6    | j                   j                         S r3   )rt   
get_devicerA   s    r7   r   zBaseSchedulerNode.get_device$  s    yy##%%r9   c                      yrc   rK   rA   s    r7   rd   zBaseSchedulerNode.is_reduction'  re   r9   c                      yrc   rK   rA   s    r7   is_templatezBaseSchedulerNode.is_template*  re   r9   c                      yrc   rK   rA   s    r7   	is_externzBaseSchedulerNode.is_extern-  re   r9   c                      yrc   rK   rA   s    r7   rm   zBaseSchedulerNode.is_foreach0  re   r9   read_depc                      yrc   rK   r6   r   s     r7   can_inplacezBaseSchedulerNode.can_inplace3  re   r9   c                      yrc   rK   rA   s    r7   has_side_effectsz"BaseSchedulerNode.has_side_effects6  re   r9   c                 	   | j                   j                         syt        | t        f      r5| j                   j	                         s| j                   j                         ryt        | t        f      sFt        | t              rt        | j                   t        j                  t        j                  f      rt        j                  rt        t        j                  t        j                  j                   j"                  j$                        rt'        t        j                  dd      Tddlm} t-        | j.                  j0                  d       }|D ]%  }| j2                  j4                  j7                  |j8                        }|s6t        j:                  j<                  j?                  ||       sa|j@                  J |j@                  D cg c]4  }|j                   jC                         | j2                  jD                  vr|6 }}tG        |      dk(  s|d   jH                  s|d   j                   | u st        |j                   jK                         t        jL                  t        jN                  t        jP                  f      r<t        |j                   t        jR                        r(tG        |j                   j	                               dkD  r ||j                          || j                         k(  stU        t        j                  d      r\t        j                  jV                  jY                  |jC                         | jC                                t        t        j                  t        j                  j                   j"                  j$                        rnt        j                  jZ                  j]                  |jC                                t        j                  jZ                  j]                  | jC                                | j^                  ja                  |jC                                |jC                         t        j                  jb                  | jC                         <    y yyyyyc c}w )	z~
        Decide if there should be inplace updates for the node
        and record the decision in the active kernel.
        N	mutationsr   )buffer_reuse_keyc                     | j                   S r3   r   xs    r7   <lambda>z9BaseSchedulerNode.decide_inplace_update.<locals>.<lambda>W  s    r9   rM   r   r1   )2rt   should_allocaterS   SchedulerNoderg   r   ExternKernelSchedulerNoder   	AllReduceInPlaceHintr   inplace_buffersr*   kerneltorch	_inductorcodegentritonTritonKernelr   codegen.wrapperr   rU   r   r   rr   name_to_noder   r   r   wrapper_code	can_reuserv   r@   r   lenr   r   r   MutationLayoutr   FallbackKernelhasattrr1   make_inplacer   r   r{   discardinplace_update_buffers)r6   r   ordered_readsread
input_noder   remaining_usess          r7   decide_inplace_updatez'BaseSchedulerNode.decide_inplace_update9  sH   
 yy((*d],-II%%'499+G+G+I 4-!12 t%>?"499r||R^^.LM &&qxx)@)@)G)G)T)TU188[$7C :"4#3#3#9#9?OPM% NN//33DII>  !''"6"6"@"@T"R%++777 ",!1!1&!1A66??,#~~DDE !1 # & N+q0*1-99*1-22d: *&OO668 " 4 4 " 1 1 " 0 0! 'z8I8IJ #JOO$C$C$E F J,Z__=+DII67 #188V4 HHMM66 * 3 3 5t}}  * !%//*A*A*H*H*U*U  !" 2 2 6 6z7J7J7L M ! 2 2 6 6t}} G !OO33J4G4G4IJ !+ 3 3 5 HH;; $ i & D ' N @(&s   9Rc                    | j                   j                         sy t        | t        f      rh| j                   j	                         s| j                   j                         r4t        j                  j                  j                  | j                          y t        t        j                  d      r| j                         t        j                  j                  v rt        j                  j                  j                  | j                  j                   t        j                  j                  | j                               j                   | j                          y t        j                  j                  j                  | j                          y )Nr1   )rt   r  rS   r  rg   r   r*   r   r  codegen_allocationr  r  r@   r  codegen_inplace_reuserr   r  rA   s    r7   allocatezBaseSchedulerNode.allocate  s    yy((*d],-II%%'499+G+G+IGG  33DII> AHHf%188#B#BBGG  66++HH33DMMOD$			 GG  33DII>r9   c                 ^    | j                   D ]  }t        |j                  t              s y y)NFT)rv   rS   rt   r\   )r6   r   s     r7   can_freezBaseSchedulerNode.can_free  s&    ::C#((J/  r9   c                 
   t         j                  sy |r| j                  ry | j                  j                  }g }|D ]  }|j
                  dk(  r|j                  d       |j                  d       d|j
                   d|j                   }d|j                  v r|d|j                  d    z   }|j                  |       d|j                  v s|j                  d    }|j                  d	      d
   }|j                  d|j                  dd      j                  dd      j                  dd      z          |j                  d       |j                  d       ! t        |      dk(  ry |j                  |       d| _        y )Noutputr   z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|{z{{}z}}rQ   \z#pragma CMT END ORIGINr   T)r   comment_originr|   rt   originsopappendtargetmetasplitreplacer  
writelines)	r6   buffer	only_oncer.  	out_linesoop_info_strr'  stack_trace_last_lines	            r7   codegen_originating_infoz*BaseSchedulerNode.codegen_originating_info  sk   $$))##	AttxR 23(az:K166!)hqvvh7G6H,II[)&!"!6 7(3(9(9#(>r(B%  "+33C>WS$'WT4()   !9:  $- 0 y>Q 	)$r9   c                     t         t              ryt         t              rt         j                  t              ryt         t
              rat        j                  j                  j                  t         j                         d         t         j                         d         z        }nt        d      }t        j                  t              } j                   j"                   j                   j$                  z  D ]   }||j&                     j)                  |       "  j                   j"                  D ch c]  }|j&                   }} j                   j$                  D ch c]  }|j&                   }} fd}t         t*              r-|D ch c]  } || j,                        r| }}||z
  }||z
  }d}||z  D ](  }	t/        ||	   D cg c]  }| c}      }
|	t        j                  j0                  v rt        j                  j0                  |	   }n;|	t        j                  j2                  v rt        j                  j2                  |	   }nd t        |j4                  t6              rF j8                  j:                  |j=                            j>                  }t/        fd|D              }n |      }|tA        ||
      tC        |jE                               z  z  }+ |S c c}w c c}w c c}w c c}w )aM  
        Counting the number of bytes accessed for a kernel is
        surprisingly tricky. In particular, there is a differentiation
        between 'theoretical' memory accesses and practical memory
        accesses. For example, a layernorm kernel may actually access an
        input 3 times, but in theory, it only needs to access its input
        once (and may be optimized to do so through say, persistent
        reductions)

        Another example is that even though a buffer is passed in, we may
        not access the entire buffer. This may occur if we are accessing
        a slice of the buffer. Another tricky case is for indirect
        indexing, where the amount of bytes accessed depends on the
        values of the input.

        What this function aims to compute is the memory accesses for
        worst-case inputs, best-case optimization. What this means is
        that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

        1. Numel in ranges multiplied by number of deps the buffer has
        2. The buffer size
        r   r       eAc                     j                   j                  |    j                  }|D ch c]  }|j                   }}t	        |t        |      z
        dkD  S c c}w Nr   )rr   r  rv   rt   r  rT   )bufsnodesrv   userbuf_usesr6   s        r7   is_materializedzGBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.is_materialized  sS    NN//4::E.34ed		eH4x#f+-.22 5s   Ac                     t         j                  j                  j                  t	        | j                                     S r3   )r*   r   sizevars	size_hintr)   get_size)rA  s    r7   get_buf_elemszEBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.get_buf_elems  s(    ww''11-2OPPr9   c              3   V   K   | ]   } |j                   j                          " y wr3   )rt   ).0rC  rJ  s     r7   	<genexpr>zABaseSchedulerNode.get_read_write_buffers_sizes.<locals>.<genexpr>   s     P%$diinn =%s   &))#rS   NopKernelSchedulerNoder  rt   r   r  r*   r   rG  rH  r)   
get_rangesintr   defaultdictr   r   r   r   r   r0  rp   rB  sumr   graph_inputsr   r   rr   r  r@   rv   minr%   	get_dtype)r6   
node_numelbuf_accessesra   r   r   rE  r   
node_bytesbuf_namebuf_accessed_elemsrA  rv   	buf_elemsrJ  s   `             @r7   get_read_write_buffers_sizesz.BaseSchedulerNode.get_read_write_buffers_sizes  s   . d23d56:II{<
 dM*))33doo/23 1! 456J
 SJ"..t4##))D,<,<,C,CCC"))#. D &*%5%5%;%;<%;c%;<&*&6&6&=&=>&=s#((&=>	3
 d./%%_S$++-Nv   o-FO+E
H!$L<R%S<RSj<R%S!T177111gg,,X6QWW111gg**84Q
 #**&7833CLLNCIIP%PP	)#.	#i);<~@  J+ '2 S => &Ts   <K5)K:K?0K?	L
c           	         d}d}t        | d      r| j                  st        | t        t        f      sJ dt        |              | j                  sJ | j                  d   j                  sy| j                  d   j                  j                         }| j                  d   j                  j                         }n4| j                  j                         }| j                  j                         }d|j                  j
                  k7  ry	 t               }t        |      dz  }t        | t              rNt        | j                  t        j                        sJ dt        | j                               t         j#                  t%        | j                  dd	      d      }|dd
lm} ddlm}  |       5   |d      5 }ddlm}	 | j                  j0                  D 
cg c]  }
 |	|
d       }}
| j                  j2                  } |j4                  |g|i | j                  j6                   d}|j9                         }| j;                         }||z  |z  dz  }||z  }t=        ||      cddd       cddd       S t        | t              st        | j                  t>              r| j;                         |z  S t        | j                  t        j@                        rtC        |       S t        | j                  t        jD                        ryy# t        $ r Y yw xY wc c}
w # 1 sw Y   nxY wddd       ~# 1 sw Y   xY w)zB
        Returns estimated op runtime in nanoseconds (ns)
        Nrt   ztype(self)=r   cudal    J)type(self.node)=r  r   )FakeTensorMode)FlopCounterModeF)displayr   )ir_node_to_tensor)guard_shapeg      ?r>  )#r  rt   rS   rp   rn   r   rB  r   rU  devicer&   r$   r   r  r   ExternKernelkernel_name_to_opr   r   torch._subclasses.fake_tensorr`  torch.utils.flop_counterra  rc  inputs	__class__process_kernelkwargsget_total_flopsr\  maxr   CollectiveKernelr   Wait)r6   r   dtypegpu_memory_bandwidth	gpu_flopsr/  r`  ra  flop_counter_moderc  inputfake_inputsclsfactorcounted_flopscounted_bytescompute_timetransfer_times                     r7   get_estimated_runtimez'BaseSchedulerNode.get_estimated_runtime*  s    tV$DII)+EF  $t*   ;;;;;q>&&[[^((335FKKN''113EYY))+FII'')EV]]'''	#4#6 )%069I d56dii9P>Nd499o=O;PP9"&&wtyy(B'GNB ~HD#%!(&5 &*YY%5%5#%5E *%UC%5   # ))--C&C&&rLKL499;K;KL !F$5$E$E$GM$($E$E$GM$*]$:Y$F##ML$14H$HM |];)( (%%, 01ZII~6
 4469MMM dii!4!453D99		277+
 e  		"#( (%%%sI   L 7
L>L) L$1A?L)0	L>	L! L!$L))L2	.L>>MN)T)4rE   rF   rG   r   Bufferr8   rk   rI   r   r   r   r   r   r   r   r   r   r
   r   r   r   r   r   
ReadWritesrx   r   r   r   r   r   r   r@   r   r   r	   r   r   rd   r   r   rm   	MemoryDepr   r   r  r   r"  r<  rP  r\  floatr~  rK   r9   r7   rD   rD   x   s   + RYY B)3 )& 
?DcN ?>>+tJ/ +=#&s8=AEc3h=+.@,"9"9 
*
3s8 
c#h 
G OD$# $ !3s8 !8$78 &L$:$: Tl?0'RRc RhLu Lr9   c                   D    e Zd ZdefdZd Zd Zdej                  fdZ	y)r  r   c                 V    | j                          dt        | j                  dd        S )Nz.node.kernel = r  )r@   r   rt   rA   s    r7   r   z)ExternKernelSchedulerNode.debug_str_extraz  s(    --/"/'$))Xt2T1UVVr9   c                      yNTrK   rA   s    r7   r   z#ExternKernelSchedulerNode.is_extern}      r9   c                 f    t        | j                  d      xr | j                  j                         S )Nr   )r  rt   r   rA   s    r7   r   z*ExternKernelSchedulerNode.has_side_effects  s&    tyy"45V$)):T:T:VVr9   r   c                 t   | j                         s| j                         ry|j                  | j                  j                  vryt        | j                  t        j                  j                  j                  t        j                  j                  j                  f      syt        | j                  j                        dk(  rut        t!        | j                  j                              }|j#                         |j#                         z
  }t$        j&                  j(                  j+                  |      dk(  S y)NFr   r   )r   r   r   rr   r  rS   rt   r  r	  r   r  r  r  r   r   nextiter	get_numelr*   r   rG  simplify)r6   r   	write_dep
numel_diffs       r7   r   z%ExternKernelSchedulerNode.can_inplace  s    !1!1!3== ; ;; II**44eoo6H6H6T6TU
 t&&'1,T$"2"2"9"9:;I!++-	0C0C0EEJ77##,,Z8A==r9   N)
rE   rF   rG   rI   r   r   r   r   r  r   rK   r9   r7   r  r  y  s-    W WWL$:$: r9   r  c                       e Zd Zy)rN  N)rE   rF   rG   rK   r9   r7   rN  rN    s    r9   rN  c                        e Zd Zdddeej
                  ej                  f   f fdZdefdZ	d Z
d Zd	 Zd
 Zd Zd Zd Zd Zdej&                  fdZedee   fd       Zd Z xZS )r  rr   rs   rt   c                    t         |   ||       |j                         \  | _        | _        |j                          || j                        f| _        t        |t        j                        r | j                  |j                                y | j                  t        j                  | j                  g| j                  ddi       y )N	normalizeT)superr8   simplify_and_reorder_sizes_bodyr   grouprS   r   TemplateBufferrx   normalized_read_writesr   extract_read_writes)r6   rr   rt   group_fnrk  s       r7   r8   zSchedulerNode.__init__  s     	D) %%'	
KJ oo'$++)>?
dB--.  !<!<!>?  00JJ!%8<r9   r   c                    | j                         }| d| j                  d    | d| j                  d    | d| j                   g}| j                         r-|j	                  | dt        | j                                       | j                         r-|j	                  | dt        | j                                       t        | j                  t        j                        rR|j	                  d| d	       |j	                  t        j                  | j                  j                         d
             dj                  |      S )Nz.group.device = r   z.group.iteration = r   z	.sizes = z.aliases = z.mutations = zclass z_loop_body:rR   rQ   )r@   r  r  r   r0  rW   r   rS   r  r   LoopBodyrX   rP   r   r   r   s      r7   r   zSchedulerNode.debug_str_extra  s   }}f$TZZ]O4f'

17fIdkk]+

 LLD6WT5E5E5G-H,IJKLLD6wt7I7I7K/L.MNOdjj"++.LL6${34LL)=)=)?HIyyr9   c                     | j                   S r3   )r  rA   s    r7   rO  zSchedulerNode.get_ranges      {{r9   c                     t        | j                  t        j                  t        j                  f      sJ dt        | j                               t        | j                  j                               S Nr_  )rS   rt   r   r   r  r   r   get_reduction_typerA   s    r7   rd   zSchedulerNode.is_reduction  s`    II))2+<+<=
 	!d499o 	! 
 DII00233r9   c                 J    t        | j                  t        j                        S r3   )rS   rt   r   r  rA   s    r7   r   zSchedulerNode.is_template  s    $))R%6%677r9   c                 f    | j                          | j                          | j                  |       y r3   )r  mark_runr
  )r6   
index_varss     r7   runzSchedulerNode.run  s#    ""$Z r9   c                 $    | j                          y r3   )r   rA   s    r7   r  zSchedulerNode.mark_run  s    r9   c                 &   | j                   }t        t        t        |            t        t        t        |            k(  sJ t	        t        t        j                  j                  |      t        j                  j                  |                  }|S r3   )	r  rR  mapr  dictzipr   r   from_iterable)r6   r  sizes
var_rangess       r7   ranges_from_index_varsz$SchedulerNode.ranges_from_index_vars  sn    3sE?#s3sJ+?'@@@@--j9--e4

 r9   c                    | j                  |      }	 t        j                  t        t        j                         |            5  t        j
                  j                  |       5   | j                  |  d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w# t        $ r" t        j                  d| j                          w xY w)NzError in codegen for %s)r  r*   set_ops_handlerr    get_ops_handlerr  set_current_noder  r   r   fatalrt   )r6   r  r  s      r7   r
  zSchedulerNode.codegen  s    00<
	"" !2!2!4jAxx((.

J' / ..   	II/;	sA   1B  B$B4B<B B	
BBB B +Cc                 \      j                   \  } fd}t        j                  ||      S )zH
        Get the memory dependencies in the non-reduction axis.
        c           	      t    j                  | D cg c]  }t        j                  d       c}      S c c}w r@  )r  sympyInteger)index_reduction_sizesr6   s     r7   fnz/SchedulerNode.pointwise_read_writes.<locals>.fn  s/    ::e%P1emmA&6%PQQ%Ps   5
)r  r   r  )r6   r  r  r  s   `  @r7   pointwise_read_writesz#SchedulerNode.pointwise_read_writes  s.     "&	R //E::r9   r   c                    | j                         s| j                         ryt        | j                  j                        dk(  rt        |t        j                        rt        t        | j                  j                              }t        |t        j                        sJ dt        |             |j                  |j                  k(  xr |j                  |j                  k(  S y)NFr   ztype(write_dep)=)r   r   r  r   r   rS   r   r  r  r  r   r  size)r6   r   r  s      r7   r   zSchedulerNode.can_inplace  s    !1!1!3t&&'1,l,,2
 T$"2"2"9"9:;Ii)?)?@WEUT)_DVBWW@>>Y__4X)..9XXr9   c                 "   t               }t        | j                  t        j                        r| j                  j                         D ]  }|j                  dk(  s|j                  dk(  s#d|j                  v r|j                  d   dk(  s,t        |j                        dk(  s\|j                  d   dk(  so|j                  d|j                  v r|j                  d   n(t        |j                        dk\  r|j                  d	   nd
        |S )Ncall_methodstoremode
atomic_add   rO   r      r   r   )rT   rS   r  r   r  r   r/  r1  rm  r  r1   r   )r6   buffers_store_as_atomic_addrt   s      r7   _get_atomic_add_buffersz%SchedulerNode._get_atomic_add_buffers  s    &)e#djj"++.

,,.GG},w.4;;.4;;v3F,3V		Na/DIIaLL4P 033!T[[0 F+.1$))n.Adiilr / +*r9   c                 &    || j                         v S r3   )r  r6   	check_bufs     r7   has_atomic_addzSchedulerNode.has_atomic_add  s    D88:::r9   )rE   rF   rG   r   r   r   r  r8   rI   r   rO  rd   r   r  r  r  r
  r  r   r  r   r!   r
   r  r  __classcell__rk  s   @r7   r  r    s     B%%r'8'889.    48!
			;	L$:$: 	 +S + +&;r9   r  c                       e Zd ZdZededefd       Zdddee   fdZ	e
d	efd
       Zd	efdZe
d	ee   fd       Zd	efdZdee   deeef   f fdZe
d	ee   fd       Ze
d	ee   fd       Zd	ee   fdZd Ze
d        Ze
d        Ze
d        Zd Ze
d        Ze
d        Zd Zdeeef   fdZd Zded    fd!Z d" Z!d# Z"d$e#jH                  fd%Z%d& Z&d' Z' xZ(S )(rp   z
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be fused together. The way it does this is by maintaining
    its unmet dependencies as the union of its constituent nodes.
    r.   r/   c                    |j                   |j                   u sJ t        |t        t        f      rt        |t        t        f      sJ  | |j                   t	        |j                               t	        |j                               z         S r3   )rr   rS   r  rp   r   r   )rx  r.   r/   s      r7   ro   zFusedSchedulerNode.fuse$  su    %//111%-1C!DE*M#56K
 	
 
 5??D):$;d5??CT>U$UVVr9   rr   rs   rB  c                 z   || _         || _        d | _        g | _        g | _        g | _        t        |d       j                  | _        t        j                  |D cg c]  }|j                  |j                   c} | _
        | j                  t        j                  j                  |D cg c]  }|j                   c}             t        j                  |D cg c]  }|j                    c} D ch c]   }|j"                  | j%                         vr|" c}| j                  j&                  z
  | _        t)        | j                   D cg c]  }|j*                   c}      | _        t        | j                   D cg c]  }|j,                   c}      | _        y c c}w c c}w c c}w c c}w c c}w c c}w )Nc                 4    t        | j                               S r3   )rP  rd   r   s    r7   r   z-FusedSchedulerNode.__init__.<locals>.<lambda>4  s    s1>>3C/Dr9   rM   )rB  rr   rt   rv   r_   rw   ro  r  rT   unionrz   rx   r   r  
merge_listr   r^   r   r   r   rT  	min_order	max_order)r6   rr   rB  r   ra   s        r7   r8   zFusedSchedulerNode.__init__,  sy   "#	%'
%DEKK
#)E6aQ[[-Dakk6E
 	##..v/Nv!v/NO	
 yy"HA1#7#7"HI#
Ixxt~~// I#
 ##	#$
 4;;?;aakk;?@4;;?;aakk;?@ F 0O
 #I#

 @?s*   F-F,F$F)6%F.F3?F8r   c                 z    dj                  | j                  D cg c]  }|j                          c}      S c c}w )Nr  )r   rB  r@   r6   r   s     r7   r@   zFusedSchedulerNode.get_nameE  s-    xxt{{;{!{;<<;s   8c                 <    | j                   d   j                         S r@  )rB  r@   rA   s    r7   r   z!FusedSchedulerNode.get_first_nameI  s    {{1~&&((r9   c                 |    t        j                  | j                  D cg c]  }|j                          c} S c c}w r3   )rT   r  rB  r   r  s     r7   r   zFusedSchedulerNode.get_namesL  s-    yy$++>+Q1;;=+>??>   9c           
      
   t        | j                        D cg c]+  \  }}| j                          d| d|j                          - }}}t	        j
                  dj                  |      j                         d      S c c}}w )Nz.snodes[z] =
rQ   rR   )	enumeraterB  r@   r   rX   rP   r   r   )r6   irt   r   s       r7   r   z"FusedSchedulerNode.debug_str_extraP  s{     %T[[1
14 }}xs%0@/AB1 	 
 tyy/668&AA	
s   0A?r   r   c                     t         |   ||       t               }t        | j                        D ]/  }|j                  ||       |j                  |j                         1 y r3   )r  r   rT   reversedrB  updater{   )r6   r   r   rt   rk  s       r7   r   z!FusedSchedulerNode.set_last_usageW  sW    
 	24FG ),T[[)D 35GH&&t7 *r9   c                 |    t        j                  | j                  D cg c]  }|j                          c} S c c}w r3   )rT   r  rB  r   r  s     r7   r   z$FusedSchedulerNode.used_buffer_namesd  s0    yy$++F+Q1..0+FGGFr  c                 |    t        j                  | j                  D cg c]  }|j                          c} S c c}w r3   )rT   r  rB  r   r  s     r7   r   z/FusedSchedulerNode.used_or_aliased_buffer_namesh  s0    yyT[[Q[199;[QRRQr  c                     | j                   S r3   )rB  rA   s    r7   r   zFusedSchedulerNode.get_nodesl  r  r9   c                 T    t        |       j                   d| j                          dS )Nz(nodes=r~   r   rA   s    r7   rk   zFusedSchedulerNode.__repr__o  s'    t*%%&gdmmo->a@@r9   c                 :    t        d | j                  D              S )Nc              3   <   K   | ]  }|j                           y wr3   )rd   rL  r   s     r7   rM  z2FusedSchedulerNode.is_reduction.<locals>.<genexpr>t  s     9[1>>#[   anyrB  rA   s    r7   rd   zFusedSchedulerNode.is_reductionr  s    9T[[999r9   c                 :    t        d | j                  D              S )Nc              3   <   K   | ]  }|j                           y wr3   )r   r  s     r7   rM  z1FusedSchedulerNode.is_template.<locals>.<genexpr>x  s     8Kq1==?Kr  r  rA   s    r7   r   zFusedSchedulerNode.is_templatev  s    8DKK888r9   c                 N    | j                   D ]  }|j                         s|c S  y r3   )rB  r   r6   rt   s     r7   get_template_nodez$FusedSchedulerNode.get_template_nodez  s&    KKD!   r9   c                      | j                   d   S r@  )r  rA   s    r7   r   zFusedSchedulerNode.get_device  s    zz!}r9   c                 :    t        d | j                  D              S )Nc              3   <   K   | ]  }|j                           y wr3   )r   r  s     r7   rM  z>FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s     EA1--/r  r  rA   s    r7   r   z+FusedSchedulerNode.has_aliasing_or_mutation  s    EEEEr9   c                     t        j                         }| j                  D ]!  }|j                  |j	                                # |S r3   )r   r   rB  r  r   )r6   r   rt   s      r7   r   zFusedSchedulerNode.op_counts  s9    "-"5"5"7	KKDT^^-.  r9   c                 H    t        fd| j                         D              S )Nc              3   d   K   | ]'  }t        |t              xr |j                         ) y wr3   )rS   r  r  )rL  sub_schedule_node1r  s     r7   rM  z4FusedSchedulerNode.has_atomic_add.<locals>.<genexpr>  s@      

 '7" -}= A&55i@A '7s   -0)r  r   r  s    `r7   r  z!FusedSchedulerNode.has_atomic_add  s&     

 '+nn&6
 
 	
r9   r   c                     t         r3   NotImplementedErrorr   s     r7   r   z'FusedSchedulerNode.update_mutated_names      !!r9   c                     t         r3   r  r6   r   s     r7   r   z#FusedSchedulerNode.add_mutation_dep  r  r9   rv   r   c                     t         r3   r  )r6   rv   s     r7   r   zFusedSchedulerNode.set_users  r  r9   c                     t         r3   r  rA   s    r7   r   zFusedSchedulerNode.get_aliases  r  r9   c                     t         r3   r  rA   s    r7   r   z FusedSchedulerNode.get_mutations  r  r9   r   c                     t         r3   r  r   s     r7   r   zFusedSchedulerNode.can_inplace  r  r9   c                     t         r3   r  rA   s    r7   r   zFusedSchedulerNode.allocate  r  r9   c                     t         r3   r  rA   s    r7   r"  zFusedSchedulerNode.can_free  r  r9   ))rE   rF   rG   __doc__classmethodrD   ro   r   r  r8   r!   rI   r@   r   r
   r   r   r   r   r   r   r   rk   rd   r   r  r   r   r   r  r   r   r   r   r   r   r  r   r   r"  r  r  s   @r7   rp   rp     s    W* W3D W WA+ AtM7J A2 =# = =) ) @3s8 @ @B B8#&s88AEc3h8 H3s8 H H Sc#h S S4. A : : 9 9   F F  
"DcN """tJ/ """"L$:$: """r9   rp   c                        e Zd ZdZd Zd Zed        Zed        Z	 	 dddde	e
   f fd	Zd
 Zd Zd Zd Zd Zd Zd Zd Z xZS )rn   z{Scheduler node which consists of a list of scheduler nodes that each operate on a
    distinct tensor in a list of tensors.c                 v    |j                         | j                  v r| j                  |j                            S y r3   )r@   read_to_node)r6   producers     r7   get_consumer_subnode_forz3ForeachKernelSchedulerNode.get_consumer_subnode_for  s6    $"3"33$$X%6%6%899r9   c                     |j                   j                  D ]5  }|j                  | j                  v s| j                  |j                     c S  y r3   )r   r   r   r  )r6   consumerrds      r7   get_producer_subnode_forz3ForeachKernelSchedulerNode.get_producer_subnode_for  sD    &&,,Bww$+++((11 - r9   c                 h   t        |      }j                         r{|j                         rkt        j                        t        |j                        k(  }|s |d       |xr2 t	        fdt        j                  |j                        D              S |j                         r8|j                        }||j                  j                  |      S  |d       yj                         r8j                  |      }|j                  j                  ||      S  |d       yt        d      )Nzforeach do not have same lengthc              3   \   K   | ]#  \  }}j                   j                  ||       % y wr3   )rr   can_fuse)rL  lrr  s      r7   rM  z6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>  s0      )ADAq ""++Aq1As   ),z5candidate producer is not dep of any foreach consumerFz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r-   rm   r  rB  allr  r  rr   r  r  AssertionError)rx  r  r  whyforeach_matchconsumer_subnodeproducer_subnodes    `     r7   r  z#ForeachKernelSchedulerNode.can_fuse  s"   (+ X%8%8%:0C4HHM 56  S )A) &    "'@@J+))228=MNNGH  "'@@J+))223CXNNGHf
 	
r9   c                 0   |j                         s|j                         sJ d }d }|j                         rW|j                         rGt        |j                  |j                        D cg c]  \  }}t        j	                  ||       }}}n|j                         rh|j                  |      }g }|}d }|j                  D ]A  }	|	|u r*t        j	                  |	|      }
|
}|j                  |
       1|j                  |	       C nw|j                         rg|j                  |      }g }|}d }|j                  D ]A  }	|	|u r*t        j	                  ||	      }
|
}|j                  |
       1|j                  |	       C  | |j                  ||      S c c}}w r3   )	rm   r  rB  rp   ro   r  r0  r  rr   )rx  r  r  prev_node_1prev_node_2r  r  fused_nodesr  rt   new_noder  s               r7   ro   zForeachKernelSchedulerNode.fuse  s   ""$(;(;(=== X%8%8%:  AADAq #''1-A     "'@@JK"KK ++166tXFH"*K&&x0&&t, (   "'@@JK"KK ++166xFH"*K&&x0&&t, ( 8%%{KMM?s   * Frr   rs   nodesc                    i | _         i | _        ||qt        |   ||       |D ]Z  }|j                  j
                  D ]  }|| j                   |j                  <    |j                         D ]  }|| j                  |<    \ n|| _        || _	        d | _
        g | _        | j                  t        j                  j                  |j                  |j                  g             t         j#                  |j$                  |j$                        D ch c]   }|j                  | j                         vr|" c}| j                  j&                  z
  | _        t)        |j*                  |j*                  g      | _        t-        |j.                  |j.                  g      | _        |j1                         r|n|}	|j1                         r|n|}
|	j2                  | _        | j2                  j5                  |
j2                         |	j                  | _        |
j                         D ]  }|
| j                  |<    |d   j7                         df| _        t!               | _        y c c}w )Nr   foreach)r
  r  r  r8   r   r   r   r   rr   rB  rt   rv   rx   r   r  r  rT   r  r^   r   rT  r  ro  r  rm   rz   r  r   r  r.  )r6   rr   r!  r  r  rt   r  r   ra   foreach_node
other_noderk  s              r7   r8   z#ForeachKernelSchedulerNode.__init__
  s$    +"5GY. ,,22D37D%%dii0 3 !NN,D.2D%%d+ -	  'DNDK#'DI)+DJ  ''22 ,,k.E.EF 9922K4R4R'C 884>>#33	 '   '''(D# !+"7"79N9N!OPDN +"7"79N9N!OPDN*5*@*@*B;L(3(>(>(@kJ)33DNNN!!*"6"67 , 9 9D",,.*4!!$' / Ah))+Y7
+.5/'s   %Ic                     t         r3   r  rA   s    r7   r  z#ForeachKernelSchedulerNode.mark_runB  r  r9   c                     t        | j                  t        j                        sJ dt	        | j                                | j                  j                          | j                  j                                       y r  )rS   rt   r   r   r   get_store_functionmake_loaderrA   s    r7   r
  z"ForeachKernelSchedulerNode.codegenE  s\    $))R%6%67N<LDO;M9NN7&		$$&'>tyy'<'<'>'@Ar9   c                     t         S r3   r  rA   s    r7   r"  z#ForeachKernelSchedulerNode.can_freeI  s    ""r9   c                      yr  rK   rA   s    r7   rm   z%ForeachKernelSchedulerNode.is_foreachL  r  r9   c                 ,    t        | j                        S )zReturns a list of nodes which comprise the foreach kernel, operating on corresponding elements of our input lists.
        These nodes may be vertically fused.)r   rB  rA   s    r7   get_subkernel_nodesz.ForeachKernelSchedulerNode.get_subkernel_nodesO  s     DKK  r9   c           	          t        t        j                  | j                  D cg c]  }|j	                          c}       S c c}w )ziReturns all nodes contained in this kernel, unpacking fused nodes into their constituent scheduler nodes.)r   r   r   rB  r   r  s     r7   r   z$ForeachKernelSchedulerNode.get_nodesT  s2    IOOT[[%I[akkm[%IJKK%Is   Ac                 <    | j                   d   j                         S r@  )rB  r   rA   s    r7   r   z)ForeachKernelSchedulerNode.get_first_nameX  s    {{1~,,..r9   c                 H    | j                   D ]  }|j                  |        y r3   )rB  r   )r6   r   rt   s      r7   r   z/ForeachKernelSchedulerNode.prune_redundant_deps[  s    KKD%%&89  r9   )NN)rE   rF   rG   r  r  r  r  r  ro   r   r  r8   r  r
  r"  rm   r-  r   r   r   r  r  s   @r7   rn   rn     s    - 
 
< $N $NT 6161 M"61p"B#!
L/:r9   rn   c           
          t         j                   fd       }t        t        t	        t         d                           }t        |      dkD  r|D cg c]  } |   	 c} t        j                  r|j                  |       |S c c}w )z
    A heuristic to decide loop iteration orders.  This has not been well
    tuned and may be something we should autotune.
    c                 P   |    dk(  s|   dk(  rt        |    dk(  |   dk(        S D cg c]  }||    	 }}D cg c]  }||   	 }}t        d t        ||      D              }t        d t        ||      D              }||kD  ry||kD  ryt        ||       S c c}w c c}w )Nr   c              3   :   K   | ]  \  }}|d k(  xs ||k    ywr   NrK   rL  sl_asl_bs      r7   rM  z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>q  )      
7VtDAI$$7V   c              3   :   K   | ]  \  }}|d k(  xs ||k    ywr4  rK   r5  s      r7   rM  z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>t  r8  r9  r)  )r"   rR  r  )	abslstride_len_astride_len_ba_firstb_firstr  stride_lengthss	          r7   	index_cmpz"pick_loop_order.<locals>.index_cmpf  s    8q=E!HMuQx1}eAh!m44(67"17(67"17  
7:<7V
 
  
7:<7V
 
 WW 1ay# 87s   B B#r   rM   )		functools
cmp_to_keyr   r  ranger  r   pick_loop_orderssort)rB  r  priority_idxrC  orderpis   ``    r7   pick_loop_orderrL  `  s      0 %N1$5 6789E
<17CD|.,|D

y
!L Es   B
c                   B    e Zd ZU eed<   dZeed<   dZeed<   d ZddZ	y)	r   rt   Fr   is_weakc                 6    | j                   j                         S r3   r   rA   s    r7   r@   zNodeUser.get_name  r   r9   c                     | j                   |j                   u sJ t        | j                   | j                  xr |j                  | j                  xr |j                        S r3   )rt   r   r   rN  )r6   others     r7   r   zNodeUser.merge  sP    yyEJJ&&&II2!2!2LL*U]]
 	
r9   N)rQ  r   r   r   )
rE   rF   rG   rD   rJ   r   r   rN  r@   r   rK   r9   r7   r   r     s(    
K GT$
r9   r   c                   V    e Zd Ze fd       Zd Zd Zd Zd Zd Z	d Z
d Zd	 Zd
 Zd Zd Zd Zd Zd Zd Zd ZdedefdZdedefdZd ZdedefdZd Zd Zd Zd Zd Zd Zd Z d Z!d e"fd!Z#d"e$jJ                  fd#Z&d"e$jJ                  fd$Z'd% Z(ed&        Z)d' Z* xZ+S )(rs   c                 Z    t                    i  _        i  _        t	        t
               _        g  _        h t        j                  j                  j                         t        j                  j                  j                          _        |D cg c]  } j                  |       c} _         j                  j                  t        j                  j                  j                                 j                  D ]  }|j!                            j                  D ci c]  }|j#                         | c} _        t'                _        i  _        i  _         j/                           j1                           j3                          t4        j6                  rt9        j:                   j                          j=                          t>        xj@                  tC         j                        z  c_         t        jD                  jG                   j                         tC         j                         _$         j                  D ci c]  }|j#                         | c} _         jK                           j1                          tM                _'         jQ                          t4        j6                  r4 jS                          t9        jT                   j                         _         jW                          t        jD                  jY                   j                         t        jD                  j[                   j                          j]                          d  _/        tM                _0        i  _1        te        d      jg                   fd       y c c}w c c}w c c}w )Ngraph_statsc                  ^     j                    j                  t         j                        dS )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesr  r!  rA   s   r7   r   z$Scheduler.__init__.<locals>.<lambda>  s%     33+/+>+>*-djj/r9   )4r  r8   backends
fuse_cacher  _post_grad_graph_counterrY  r!  r*   r   rS  keys	constantsr   create_scheduler_noder  r   r@   r  r  r   r   mutation_renamescompute_dependenciestopological_sort_scheduledead_node_eliminationr    reorder_for_compute_comm_overlapr   decide_global_ordering_of_commscompute_ancestorsr   ir_nodes_pre_fusionr  r<   ir_pre_fusionrZ  create_foreach_nodesrT   logged_slow_fusion
fuse_nodescompute_node_users$reorder_compute_and_comm_for_overlapcompute_last_usageir_post_fusiongraph_diagramdebug_draw_graphcurrent_devicebuffer_names_to_freeorigin_to_indexr   add_row)r6   r!  nrt   rk  s   `   r7   r8   zScheduler.__init__  s   "&'?"@
'
WW!!&&('
WW##%'
#
 >CCUd003UC
 	##**177+<+<+A+A+CDJJDOO  &*ZZ;
%/AJJL!OZ;

 F 	 #% !#!!#&&(""$2211$**= ##s4::6#	djj)!$**o<@JJ"GJq1::<?J"G!!#&&("%%22##%CCDJJODJ!	tzz*	djj) -1$'E!  "'//	
 D;
D #Hs   N%N#N(c                     t         j                  j                  dd      dk(  rddlm}  || j
                  d       yy)z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)print_graph)osenvironr   r<   r{  r!  )r6   r{  s     r7   rr  zScheduler.debug_draw_graph  s1    ::>>:DASH+6 Ir9   c                     t         j                  t        j                        r8t         j	                  d|       | j
                  D ]  }|j                           y y )Nz%s:)r   isEnabledForloggingINFOr   r!  r   )r6   labelrt   s      r7   debug_print_nodeszScheduler.debug_print_nodes  sA    GLL)HHUE"

  " # *r9   c                    |j                   J d       |j                         rt        | |      S t        |t        j
                  t        j                  f      r6| j                  |j                               j                  }t        | ||      S t        |t        j                        rt        | |      S t        |      )Nz2All nodes passed to scheduling must have an origin)r.  is_no_oprN  rS   r   r   r  get_backendr   r  r  rf  r  r  )r6   rt   r  s      r7   r`  zScheduler.create_scheduler_node  s    LL$	@?	@$==?)$55r00"2C2CDE''(9:CCH tX66boo.,T488%d++r9   c                 X   t               }g }| j                  j                         }t        j                  j
                  j                         D ]  }|D cg c]%  }||v rt        | j                  |   t              s|' }}|s6|j                  |       |D cg c]  }| j                  |    }}t        | |      }|j                  |       |D ]  }|| j                  |<     | j                  D cg c]  }|j                         |vs| c}|z   | _        y c c}w c c}w c c}w r3   )rT   r   r^  r*   r   listsr   rS   r  rN  r  rn   r0  r!  r@   )	r6   removed_node_namesfe_nodeskept_node_namesnamesr   rB  fe_nodert   s	            r7   rj  zScheduler.create_foreach_nodes  s,    U11668WW]]))+E "!D?*"4#4#4T#:<RS !   %%e,:?@%$d''-%F@0v>GOOG$07''- % ,, "ZZ
'T4==?BT+TDZ

) A
s   *DD"5D'D'c           
          t        j                  t               j                  D ]}  }|j	                         }|j                         D ]X  }|v r=|v r9|   }|   }||z   }j                         D ]  }|   |u s|   |u s||<    D|v r	|   |<   Q|   |<   Z   fd fddfd	}i }	 j                  D ]v  }
t        j                  d|
j                         |
j                  j                         D ](  }t        |t        j                        sJ ||	vs$|
|	|<   * |
j                  j                         D ]=  }||	v sJ | d|	        |
j                  t!        |	|   j	                                      ? t#        |
j%                               dk  sJ |
j%                         D ]  } |      } |||
       |
j'                  t!        |             |   D ]X  } |j	                               } |
j	                               }||vs4|
j'                  t)        |              |||
d       Z  |
j*                  j,                  D ]6  }t        |t(              } ||j.                  |
|
j1                  |      |       8 |
j3                   j4                         |
j%                         D ]y  }|
j	                          j4                   |      <   |
j	                          j4                  |<    j6                  j9                  ||       j6                  |
j	                         <   { y t:        j<                  j?                         D ]3  }t        j                  d	|        ||tA        t!        |                   5 t:        j<                  jB                  D ]  }
t        |
tD        jF                        stI        |
jJ                        D ]k  }||	v sJ | d|	j                                 |	|   j                  j.                  }t        j                  d
||        ||tA        t!        |                   m   j4                  D ]c  }|t:        j<                  jL                  v s  ||tA        t!        |                   t:        j<                  jN                  jQ                  |       e tS        t:        j<                  jL                  j                               D ci c]  \  }}||
 }}}t:        j<                  jN                  D cg c]  }||   	 c}t:        j<                  _*         j                  D ]$  }
|
jW                  |
j	                                   &  j                  D ]8  }
|
jX                  D ]'  }|j                  jZ                  j]                  |
       ) : yc c}}w c c}w )zi
        Create dependency edges between nodes, handling aliasing and
        mutation properly.
        c                 N    | j                   v r j                   |          S | S r3   )ra  )rw  r   r6   s    r7   r   z.Scheduler.compute_dependencies.<locals>.renameH  s,    D)))d33A677Hr9   c                    | h}j                   |    }t        t        |j                  j                              }|j                  j
                  D ]  }|j                  j                   v st        |t        j                        s7t        |t        j                        sR|j                  |j                  k(  sl|j                  |j                  k(  s|j                   |j                                |S r3   )r  r  r  r   r   r   r   rS   r   r  r  r  r  )	node_namereachable_namesrt   r  r   dep_closurer6   s        r7   r  z3Scheduler.compute_dependencies.<locals>.dep_closureM  s    (kO$$Y/DT$"2"2"9"9:;I ,,22MMT%6%66"8\-C-CD"9l.D.DE )//9 7#**;x}}+EF 3 #"r9   c                 P     |          j                  t        |||             y r3   )r0  r   )used_by_name	user_noder   rN  name_to_usersr   s       r7   add_userz0Scheduler.compute_dependencies.<locals>.add_user\  s'    &./66K9r9   zscheduling %sz not in r   T)rN  zscheduling output %sz+scheduling output %s for unbacked symint %sN)FF)/r   rQ  r   r!  r@   r   r^  r   r<   rt   get_unbacked_symbol_defsrS   r  Symbolget_unbacked_symbol_usesr   r   r  r   r   r   r   r   r   r   r   ra  r   r   r*   r   get_output_namesr\   graph_outputsr   ShapeAsConstantBufferr   shaperS  mutated_inputsr   r  mutated_input_idxsr   rv   r_   r0  )r6   r.   
node1_name
node2_namelist1list2combinedrN   r  unbacked_symbol_to_origin_nodert   salt_namer%  
other_nameknown_dep_node_namesr  rN  r  r   r  	inp_namesrC  r  r  r   s   `                      @@@r7   rb  zScheduler.compute_dependencies.  s:   
 ;F:Q:QRV:W
 ZZE)J#//1
.:3N)*5E)*5E$u}H,113(-6-:LPU:U19M#.  4  =00=j0IM*-0=j0IM*- 2   	
	#	
 *,&JJDIIotyy1 YY779!!U\\222 ::8<215 : YY77977BS!? @AB7!!'*H*K*T*T*V"WX	 : t))+,111 ..0!(+4(%%gh&78"/"9J!'
(;(;(=!>J+6t}}+G(!)== --gj.AB T4@ #: 1  ((..$T73D$*:*:4*@'J / %%d&;&;< !..0:>--/%%fX&6726--/%%h/;?;R;R;V;Vh<''8 1[ j 113III,i8Y
79+= >? 4
 GG))D$ 8 89.tzz:A;;MH%C%H%H%J$KLM; >q A F F K KIIIEyRS Y
793E(FG ; * ))Dqww+++z'$-89&&**40 * ,5QWW5I5I5N5N5P+Q
+QKE4D%K+Q 	 
 )*(>(>&
(>IdO(>&
"
 JJDNN=9:  JJD

		''..t4 # 
&
s   <W(Wc                 t   i }| j                   D ]I  }t        |t              r$|j                  D ]  }|||j	                         <    |||j	                         <   K | j                   D ]  }g |_        g |_         | j                   D ]L  }g }|j                  D ]2  }|j                  |v sJ ||j                     }|j                  |       4 ||_        N i }| j                   D ]4  }|j                  D ]#  }|j                  |g       j                  |       % 6 |j                         D ]  \  }}	|	|_         y r3   )r!  rS   rp   rB  r@   rw   r_   r^   r   r0  
setdefaultitems)
r6   buf_to_snodert   r   r_   ra   dep_nodenode_to_usersinverse_userrv   s
             r7   rm  zScheduler.compute_node_users  s,   JJD$ 23A15L. %,0L)	  JJD DO!#D 
 JJDM..xx<///'1$$X. / "/D  KMJJD $ 2 2((r:AA$G !3  )..0KD%#DO 1r9   c                     d}|rg }| j                   D ]  }dt        fd|j                          xr t        fd|j                  D              }|s|j                  |       Rt        j                  d|j                                t        j                  j                  j                  |j                                 t        | j                         t        |      kD  }|| _         |r| j                   D ]  }|j                           y)z0
        Remove any nodes without users
        TrC  c                 r    | j                   xs* | j                         t        j                  j                  v S r3   )rN  r@   r*   r   r   )rC  s    r7   can_eliminate_userz;Scheduler.dead_node_elimination.<locals>.can_eliminate_user  s&    <<U4==?agg>U>U+UUr9   c              3   .   K   | ]  } |        y wr3   rK   )rL  ur  s     r7   rM  z2Scheduler.dead_node_elimination.<locals>.<genexpr>  s      D3=a&q):s   zremoved dead node: %sN)r!  r   r   r  rv   r0  r   r<   r@   r*   r   r   r   r  r   )r6   againupdated_nodesrt   can_eliminater  s        @r7   rd  zScheduler.dead_node_elimination  s     M

VX V %)$9$9$; ; ! D37::D A %!((. II5t}}GGG++//@ #  

Oc-&88E&DJ' , JJD  " r9   c                     t               t               g fd| j                  D ]  }|j                         D ]  }||<   	  | j                  D ]
  } |        | _        y)zD
        Ensure self.nodes is in topologically sorted order
        c                     | vrUj                  |        t        | j                  d       D ]  } |j                             j	                  |        y y )Nc                     | j                   S r3   r   )ds    r7   r   zDScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>	  s    affr9   rM   )r   rU   r^   r   r0  )rw  ra   r  rZ   seenvisits     r7   r  z2Scheduler.topological_sort_schedule.<locals>.visit  sP    }!!"6"6<LMC,sxx01 Na 	 r9   N)rT   r  r!  r   )r6   rt   r   r  rZ   r  r  s      @@@@r7   rc  z#Scheduler.topological_sort_schedule  sf      #u-1V"$	! JJD(%)T" )  JJD$K 
r9   c                 D   i }| j                   D ]d  }t               }|j                  D ]/  }|j                  |j                         |||j                     z  }1 |||j                         <   ||_        f t        | j                         D ]  \  }}||_        ||_	         y)z.
        Populate each node.ancestors
        N)
r!  rT   r^   r   r   r@   rz   r  r  r  )r6   name_to_ancestorsrt   rz   ra   rJ  s         r7   rg  zScheduler.compute_ancestors  s    
 24JJDI..chh'.sxx88	 / 2;dmmo.&DN  %TZZ0KE4"DN"DN 1r9   c                 L   t        d      D ]  }t        | j                        }t        j	                  d|dz   |       | j                          t        | j                        }t        j	                  d|dz   ||       ||k(  s|dk(  s}t        j	                  d|dz           y y)zO
        Mutates self.nodes to combine nodes into FusedSchedulerNodes.
        
   z/===== attempting fusion (%d/10): %d nodes =====r   z=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)rF  r  r!  r;   r<   fuse_nodes_once)r6   r  old_lennew_lens       r7   rl  zScheduler.fuse_nodes&  s     rA$**oGA1q5'   "$**oGPA	 '!W\  !NPQTUPUV r9   c                     t        |      dkD  sJ |d   j                         }| t        j                  _        || _        | j                  |      }|j                  |      S )
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        r   )r  r   r*   r   rr   rs  r  benchmark_fused_nodes)r6   r!  re  backends       r7   r  zScheduler.benchmark_fused_nodes;  s[    
 5zA~~q$$& $""6*,,U33r9   c           
        	
 t         j                  sy|j                         ry|j                         }|d   j	                         }|j
                  dk(  ry|j                         }||z   }t        d |D              ryddlm} 	 | j                  |      \  	t        j                  	      rt        j                  d       y| j                  |      \  
t        j                  
      rt        j                  d       y| j                  |      \  t        j                        rt        j                  d	       y	 t        j                  t         j"                        r	
z   k  rFt        j                  d|j%                         |j%                         t'        	
z   z  d             nEt        j                  d|j%                         |j%                         t)        	
z   z  d             t+        d      rW	
z   k\  rOf| j,                  vr?| j,                  j/                  f       t1        d      j3                  	
fd       	
z   k  S # |$ r}d
t        |      v rY d}~y d}~ww xY w)z
        If config.benchmark_fusion is False, always return True.
        Otherwise, return True if fusion can brings speedup.
        Tr   cpuc              3      K   | ]a  }t        |j                  d       xrE t        |j                  j                  d      xr# |j                  j                  j                  dk(   c yw)r   scatter_moder  N)r  rt   r   r  )rL  rw  s     r7   rM  z.Scheduler.speedup_by_fusion.<locals>.<genexpr>`  s^      
 % AFFF# 9^49((L89 %s   A'A))CompilationErrorz>cannot fuse (benchmark): register spilling of the first kernelFz?cannot fuse (benchmark): register spilling of the second kernelz>cannot fuse (benchmark): register spilling of the fused kernelzLoop-carried variableNz9can fuse (benchmark): fusing %s with %s cause %sx speedupz.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdownslow_fusionc            	      $      z   z  dS )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratiorK   )ms1ms2ms_fusedpath1path2
path_fuseds   r7   r   z-Scheduler.speedup_by_fusion.<locals>.<lambda>  s&    $)'*$)'*)3,4'/39'=r9   )r   benchmark_fusionr   r   r   r   r  triton.compiler.errorsr  r  mathisinfr;   r<   rI   r  r  DEBUGr   r'   r(   r   rk  r   r   rv  )r6   r.   r/   node_list_1re  node_list_2node_list_fusedr  er  r  r  r  r  r  s            @@@@@@r7   speedup_by_fusionzScheduler.speedup_by_fusionG  s\   
 &&oo'Q**, ;;%oo'%3
  
 %	
 
 ;	33K@JCzz#  T 33K@JCzz#  U #'#=#=o#N Hjzz(#  T 	 $ ""7==1#)#  OOO%OO%39"8!=?	   SOO%OO%C#I 6s;=	 $M2C#I%d&=&==##''7]+33 
 #)##Q   	&#a&0	s*   >I) 
>I) 	>I) )J.J JJc                    t        | j                        }| j                         D ]  \  }}| j                  |j	                            }| j                  |j	                            }| j                  ||      sS| j                  ||      rf| j                  ||      syt        ||      }|j                  |       |j                  |       |j                  |       | j                  j                  |j                         D ci c]  }|j                         | c}        t        |d       | _        | j                          | j!                          yc c}w )a  
        Mutates self.nodes to combine nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuses(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        c                     | j                   S r3   )r  r   s    r7   r   z+Scheduler.fuse_nodes_once.<locals>.<lambda>  s    q{{r9   rM   N)rT   r!  get_possible_fusionsr   r   r  will_fusion_create_cycler  ro   remover   r  r   r@   rU   rc  r   )r6   r  r.   r/   node3rw  s         r7   r  zScheduler.fuse_nodes_once  s,    $**o 557LE5++E,@,@,BCE++E,@,@,BCE}}UE*43P3Pu4 --eU;UE*""5)""5)&''..27//2CD2CQQZZ\5(2CD 8 K-BC
&&(!!#	 Es   E
c                 \    | j                   D ]  }|j                  | j                          y r3   )r!  r   r   r  s     r7   r   zScheduler.prune_redundant_deps  s$    JJD%%d&=&=> r9   c                 *   	
 g 	t               
	
 fd}t        j                  t              } j                  D ]+  }|j                         D ]  }||   j                  |        - |j                         D ]
  } ||        t        j                  rkt        j                  t              } j                  D ]&  }t        |dd      }|s||   j                  |       ( |j                         D ]
  } ||        	j                   j                  d       t        j                  t        j                         rQt        j#                  dt%        	             	D ]  }t        j#                  d|        t        j#                  d       	S )	z^
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        c                 P   t        |       D ]  \  }}| |dz   d  D ]  }||f}|v rj                  |       j                  ||      rj                  |       A|j	                         s|j                         sbj                  ||      suj                  ||f         y )Nr   )r  r   r  r0  r   rm   )r!  node1_indexr.   r/   rN   possible_fusionsr  r6   s        r7   check_all_pairsz7Scheduler.get_possible_fusions.<locals>.check_all_pairs  s    &/&6"U";?#45E %.Cd{ HHSM}}UE2(//4++-1A1A1CuJ )//? 6 '7r9   r  NT)rN   reversez
found %d possible fusions:z%sr   )rT   r   rQ  r   r!  r   r0  r   r   aggressive_fusionr   rH  score_fusion_keyr;   r  r  r  r<   r  )r6   r  buffer_names_groupingrt   rA  node_groupinggroup_groupingr  r+   r  r  s   `        @@r7   r  zScheduler.get_possible_fusions  sX    u	@  !, 7 7 =JJD--/%c*11$7 0  399;MM* < ##(44T:N

gt4"5)006 # "0!6!6!8. "9 	$"7"7F""7==1;SAQ=RS*  v. +R r9   c                      fdt               |j                         |j                         z  |j                  |j                  z  z
  t         fdD              }|r t	        ||      d       |S )z~
        Finds whether there's a path from node1 to node2 (or vice-versa)
        caused indirectly by other fusions.
        c                    t        | t              rq| vrmj                  |        | j                         j	                        ryt        | j                  z        xs" t        fd| j                  z
  D              S y)NFc              3   H   K   | ]  } j                   |           y wr3   r   rL  rw  
found_pathr6   s     r7   rM  zIScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>  s*      H!DA #4#:#:1#=>!D   ")rS   rp   r   r   issubsetr   rz   r  )rt   combined_ancestorscombined_namesr  r6   visiteds    r7   r  z6Scheduler.will_fusion_create_cycle.<locals>.found_path  s    $ 23G8KD!>>#,,-?@ !   ?@ C H!%2D!DH E  r9   c              3   H   K   | ]  } j                   |           y wr3   r  r  s     r7   rM  z5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>  s$     WDVqJt66q9:DVr  zwill create cycle)rT   r   rz   r  r-   )r6   r.   r/   cycler  r  r  r  s   `   @@@@r7   r  z"Scheduler.will_fusion_create_cycle  sq    	 	. %*U__->>#oo?>QWDVWW#IeU#$78r9   r.   r/   c                     t        t        |j                  |j                  z
        t        |j                  |j                  z
              }|dkD  S )aB  
        This function prevents fusion for nodes that can increase memory
        footprint. This problem is more common in horizontal fusion, where nodes
        that are far apart in the original order get fused, lengthening the live
        intervals of tensors. This is very evident in models with activation
        checkpointing, where the recomputed nodes from different checkpointed
        regions get fused and significantly increase the memory footprint.

        The current attempt is a quick, possibly hacky, heuristic to prevent the
        fusion of nodes that are far away in the original order.

        A better but difficult to implement heurisitic would be to use live
        intervals of the buffers, find region of peak pressure in the original
        program and prevent fusion that crosses that peak region. We might need
        special care or good approximation in this implementation, as fusion of
        node changes live intervals, and re-computing live intervals and peak
        memory after each fusion can introduce large compilation overhead.
        @   )ro  absr  r  )r6   r.   r/   proximity_scores       r7   can_fusion_increase_peak_memoryz)Scheduler.can_fusion_increase_peak_memory  sE    * %//12%//12
 ##r9   c                 Z    |u ryt        |      }t        t        t        f      rj	                         s	 |d       yt        |t        t        f      r|j	                         s	 |d       yj                         s|j                         rt        j                  |      S |j                         j                  z  r	 |d       yt        t        t        f      rlt        |t              r\t        |j                  t        j                        r8t         fd|j                  j                   j#                         D              ry|j	                         r	 |d       yj	                         r9|j%                         s |j'                         st(        j*                  s	 |d       yj-                         }|j-                         }||k7  r |d||       y~ j/                  |      d	k(  }|r9t(        j0                  r j'                         s|j'                         r	 |d
       yj                         s]|j                         sMt3        j5                               t3        |j5                               z   t(        j6                  kD  r	 |d       yj                         |j                  z  r4 j9                  |      sy j;                  |      j9                  |      S  j=                  |      r	 |d       y j;                  |      j?                  |      S )zj
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        Fznode1 is extern or nopznode2 is extern or nopznode1 must go before node2c              3   z   K   | ]2  }|j                   v xr j                  j                   |          4 y wr3   )ra  r  )rL  node2_used_bufr.   r6   s     r7   rM  z%Scheduler.can_fuse.<locals>.<genexpr>_  sO      
 'IN #d&;&;; T,,T-B-B>-RST 'Is   8;z!templates can only fuse epiloguesztemplate epilogue not satisfiedzdevice mismatch (%s vs %s)r   zno shared datazexceeds max fusionzwill increase peak memory) r-   rS   r  rN  r   rm   rn   r  r   rz   rp   r  r  r   r  r  reads_name2exprr^  r   rd   r   epilogue_fusionr   score_fusion_memoryr  r  r   max_fusion_sizecan_fuse_verticalr  r  can_fuse_horizontal)r6   r.   r/   r  re  device2no_shared_datas   ``     r7   r  zScheduler.can_fuse6  s    E>u% u8:PQR%%'()u8:PQR%%'()!1!1!3-66ueDD??u.,- u1=AB5-05;;4  
 ',kk&A&A&F&F&H  34**,!!#))12!!#""$W,fg>11%?1D((E,>,>,@EDVDVDX !   "$$&EOO%&U__->)??&BXBXX$%??u.))%7##F+==eUKK33E5A/0##F+??uMMr9   c                 F   |j                         }t               }t        ||      }|j                  D ]  }|j                  j
                  D ]  }|j                  |j                  k(  st        |      t        |      k(  s5t        |j                  d      rLt        |j                  d      rc|j                  |j                  k(  s}t        |j                        t        |j                        k\  s|j                  dt        |j                         |j                  k(  s|j                  |         |j                  |z
  D ch c]  }|j                   }	}|	|z  r	 |d       y|	D ](  }
|| j                  |
   j                  z  s  |d        y yc c}w )a  
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.
        tmpNzmemory deps did not matchFz(intermediate nodes between node1 & node2T)r   rT   r-   r^   r   r   r   r   r#   r  r  r  r   r   rz   )r6   r.   r/   node1_namescomputed_depsr  r  cdra   remaining_depsr   s              r7   r  zScheduler.can_fuse_vertical  sS    oo'u%**B''.. GGrww&RDH,+BHHe<+BHHe<BHH,BGGBGG4#bgg,/277:!%%b) / +" /4.F.F.VW.Vs#((.VWK'
 +,"DT44T:DDD>? #  Xs   Fc                 d   | j                  ||      }t        t        |j                  |j                  z
        t        |j                  |j                  z
               }|j                         t        j                  k(  xr |dkD  |j                         |j                         k(  xr |dkD  ||fS )a\  
        Assign a score (higher comes first) to the fusion of node1
        and node2.  When different fusions conflict with each other,
        this is the way we decide what order to run them in.

        Our current score is based on:
        - Estimate of the saved memory operations
        - Fusions closer together in original order
        r   )	r  ro  r  r  r  r   r   epilogue_fusion_firstrd   )r6   r.   r/   memory_scorer  s        r7   score_fusionzScheduler.score_fusion  s     //u=%//12%//12
 

 6#?#??TLSTDT E$6$6$88M\A=M	
 	
r9   c                 &   |j                   j                  |j                   j                  z  |j                   j                  |j                   j                  z  z  }|D ch c]  }|j                         r| }}t	        d |D              S c c}w )zf
        The first term in our fusion score that estimates number of saved memory operations.
        c              3   <   K   | ]  }|j                           y wr3   )numbytes_hint)rL  ra   s     r7   rM  z0Scheduler.score_fusion_memory.<locals>.<genexpr>  s     E2D33$$&2Dr  )r   r   r   has_unbacked_symbolsrR  )r6   r.   r/   common_memory_depsra   s        r7   r  zScheduler.score_fusion_memory  s     $//558I8I8P8PP##e&7&7&>&>>
 .
-CS5M5M5OC- 	 
 E2DEEE
s    B6Bc                 0    |\  }}| j                  ||      S )z-
        Shim for list.sort(key=...)
        )r$  )r6   r!  r.   r/   s       r7   r  zScheduler.score_fusion_key  s      u  ..r9   c                 "   t               }t        j                  j                         D ]  }|j	                  |        t        | j                        D ]9  }|j                  || j                         |j                  |j                         ; y)zg
        Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
        N)rT   r*   r   r  r   r  r!  r   r   r  r{   )r6   r   r  rt   s       r7   ro  zScheduler.compute_last_usage  so    
 "e113I##I. 4 TZZ(D 3T5L5LM&&t7 )r9   c                    t        | j                  t        j                  j                  z
  t        j                  j
                  j                  z
        D ]  }|| j                  v rT| j                  |   }|j                         s2t        j                  j
                  j                  |j                         f|t        j                  j                  v st        j                  j                  |   j                  }t        |t        j                        r|j!                         sJ t        j                  j
                  j                  |j                          | j                  j#                          y)z*Free any buffers that are no longer neededN)rU   rt  r*   r   r   r  freedr  r"  codegen_freert   rS  r   rS   r   
StorageBoxis_input_bufferclear)r6   r   rt   storages       r7   free_bufferszScheduler.free_buffers  s   %%gg%%&gg""(()
D
 t(((((.==?GG((55dii@---''..t499!'2==9g>U>U>WWW$$11',,?
 	!!'')r9   c                 T   	 t         j                  j                  }g 	t         j                  j                  D ]l  } j                  |   j                  }|J |D ch c]  }|j
                  r|j                         ! }}|j                  |      s\	j                  |       n  fd}t        t        |	            		D ]  }|t         j                  j                  j                  v rt         j                  j                  j                  |   }t        |t              r|j                  d      rrt!        	fd|j"                  D              }|r j%                  |       t         j                  j&                  j)                  |       ͉ j+                  |        yc c}w )zr
        Any buffers that are both created and have a last use in the
        same kernel can be removed.
        Nc                     | t         j                  j                  vxrF | t         j                  j                  j                  vxr | j
                  vxr | j                  vS r3   )r*   r  must_keep_buffersr1   input_buffersra  r   )rw  r6   s    r7   remove_filterz<Scheduler.remove_kernel_local_buffers.<locals>.remove_filter  s]    333 5QXX]]8885T2225 T444	r9   REMOVEDc              3   &   K   | ]  }|v  
 y wr3   rK   )rL  rw  names_to_removes     r7   rM  z8Scheduler.remove_kernel_local_buffers.<locals>.<genexpr>'  s     K?aQ/1?s   )r*   r  store_buffer_namesr  rv   rN  r@   r  r0  r   filterr1   r  rS   rI   
startswithr  other_namesremove_inplace_bufferinplaced_to_remover   remove_buffer)
r6   fused_node_namesout_bufrv   rC  r8  r   rA  r  r;  s
   `        @r7   remove_kernel_local_buffersz%Scheduler.remove_kernel_local_buffers  sD    8866xx22G%%g.44E$$$16KdllT]]_EK~~./&&w/ 3	 vm_EF#Dqxx}}444hhmm33D9c3'CNN9,EK3??KK..t4++//5""4( $ Ls   F%/F%c                     t         j                  d|       dt        j                  j                  j
                  |<   t        j                  j                  j                  |       y )Nzremove_buffer(%r)r9  )r   r<   r*   r  r1   output_buffersr   r   r  s     r7   rB  zScheduler.remove_buffer.  sC     			%t,-6$$T*	  $$T*r9   c                 R   t         j                  d|       t        j                  j                  j
                  |   j                  }|j                  dd      t        j                  j                  j
                  |<   t        j                  j                  j                  |       y )Nzremoving_inplace_buffer(%r)
in_out_ptrr9  )
r   r<   r*   r  r1   r  
inner_namer4  r   r   )r6   r   rJ  s      r7   r@  zScheduler.remove_inplace_buffer6  sq    		/6XX]]2248CC
.8.@.@)/
%%d+ 	
  $$T*r9   c                     | j                   j                         D ]  }|j                           | j                          y r3   )r[  r   flushr3  )r6   r  s     r7   rL  zScheduler.flush>  s.    }}++-GMMO .r9   scheduler_nodec                    t        |t              sJ t        j                  t	        d            5  |j                          |j                          d d d        |j                  }t        |t        j                        sJ dt        |             |j                  t        j                  j                         | j                          y # 1 sw Y   |xY w)NF)increase_kernel_countztype(node)=)rS   r  r*   set_kernel_handlerr   r  r   rt   r   rf  r   r
  r   r  r3  )r6   rM  rt   s      r7   codegen_extern_callzScheduler.codegen_extern_callC  s    .*CDDD
 !!&u"EF002##% G ""$0B[T$ZM2BB0QWW))* GFs   !CCre  c                     |j                   dk7  s|j                  
J | d       t        j                  j	                  |       t        |j                         }|t        d|j                          |j                   dk(  rut               skt        j                  j                  |      }|j                  dk  r2t        d|j                   d|j                   d|j                         t        d       ||       S )	Nr^  z( should have been normalized in loweringzUnsupported device type:    zFound z which is too old to be supported by the triton GPU compiler, which is used as the backend. Triton only supports devices of CUDA Capability >= 7.0, but your device is of CUDA capability .zCannot find a working triton installation. More information on installing Triton can be found at https://github.com/openai/triton)r   r  r*   r   add_device_infor   RuntimeErrorr   r  r^  get_device_propertiesmajorr   minor)r6   re  device_schedulingdevice_propss       r7   create_backendzScheduler.create_backendQ  s'   KK6!V\\%=	?X=>	?=	'5fkkB$!:6;;-HII;;&  ::;;FCL!!A%"\../  0j  kw  k}  k}  j~  ~  @L  @R  @R  S  T  # X  !&&r9   c                 x    || j                   vr| j                  |      | j                   |<   | j                   |   S r3   )r[  r\  )r6   re  s     r7   r  zScheduler.get_backendh  s6    &$($7$7$?DMM&!}}V$$r9   c                      fd}|j                         D cg c]'  }|j                  j                  D ]  } ||      |f ) }}}|r8t        |      \  }}t        j
                  j                  j                  |       y y c c}}w )Nc                     | j                   vrLj                   j                  t        | j                  j                        D  ci c]  \  }} | |
 c} }       j                       S c c} }w r3   )ru  r  r  r   r!  )rw  r  r6   s     r7   	get_orderz*Scheduler.enter_context.<locals>.get_ordern  sb    ,,,$$++i>V,W>VdaQT>V,WX''** -Xs   A+
)r   rt   r.  ro  r*   r   r  enter_context)r6   rt   r`  rw  r  r.  r  lasts   `       r7   ra  zScheduler.enter_contextm  st    	+
 /3nn.>W.>1IaL!$$.>W'lGAtGG  ..t4  Xs   ,Bc                 8   | j                   D ]  }	 t        j                  d|j                         |j	                                | j                  |       t        |t              sU|j                         }|| j                  k7  s |j                         s|j                         r| j                          || j                  k7  r|j                  dk(  r| j                  rA| j                  j                  dk(  r(t        j                   j"                  j%                          |j&                  J d       t        j                   j"                  j)                  |j&                         nM| j                  rA| j                  j                  dk(  r(t        j                   j"                  j%                          || _
        | j*                  j-                  |j.                         |j                         r4|j1                         ^}}| j3                        j5                  ||       n|j                         r| j7                  |       n|j9                         r!| j3                        j;                  |       ngt        |t<        t>        f      r/| j3                        jA                  |j1                                n"t        |t              sJ |jC                          tD        jF                  r)t        j                   j"                  jI                  |       tD        jJ                  jL                  r| j3                        jO                          | jP                  j-                  |jS                                t        |t              r|j                         }| j3                  |      jU                         s| j                           | j                          y # t
        $ r/}t        j                  d|j                                Y d }~d }~ww xY w)Nz5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0r^  zdevice should have an index)+r!  r   r<   r@   r~  r   ra  rS   rN  r   rs  r   r   rL  r   r*   r   r  codegen_device_guard_exitr  codegen_device_guard_enterrt  r  r{   r   r  codegen_templaterQ  rm   codegen_foreachrp   r  codegen_nodesr   r   debug_check_inf_and_nangenerate_inf_and_nan_checkerr  debug_sync_kernelcodegen_syncr   r   ready_to_flush)r6   rt   r  re  epilogues        r7   r
  zScheduler.codegenx  s   JJD
		KMMO..0 t$d$:;*d111~~''')JJLT000{{f,..43F3F3K3Kv3UGG00JJL%||7V9VV7,,GGU,,1D1D1I1IV1S,,FFH*0D'%%,,T__=!"&.."2x  (99$I!((."  (88>D#5}"EF  (66t~~7GH!$(>???--$$AA$G}}..  (557''..t~~/?@d$:;*##F+::<JJLw z 	

m  		LMMO s   3O!!	P*$PPc                    |t         j                  j                  v s|t         j                  j                  v ry| j                  |   }|j
                  j                         }t        |t        j                        r|j                          S yrc   )r*   r   rS  r_  r  rt   r   rS   r   r   maybe_guard_aligned)r6   rY  rt   r   s       r7   is_unaligned_bufferzScheduler.is_unaligned_buffer  so    qww+++x177;L;L/L  *%%'fb../11333r9   ),rE   rF   rG   r   r8   rr  r  r`  rj  rb  rm  rd  rc  rg  rl  r  r  r  r   r  r  rD   r  r  r  r$  r  r  ro  r3  rE  rB  r@  rL  r  rQ  r  re  r\  r  ra  r
  rq  r  r  s   @r7   rs   rs     s1   Q
 Q
f7#,<O5b$B#<,#$*
4^$@$8?- ^#J$&$/@$6\N/ \N8I \N|)V
"3 
<M 
,
F/8*$%)N++
2K 'U\\ '.%%,, %
	5 > >@	r9   rs   c                   x    e Zd ZdedefdZdedefdZd Zdedee   fdZ	d	ee   fd
Z
d ZdefdZd Zd Zy)BaseSchedulingr.   r/   c                     t               )zO
        Check whether node1 and node2 can be vertically fused or not.
        r  r5   s      r7   r  z BaseScheduling.can_fuse_vertical       "##r9   c                     t               )zQ
        Check whether node1 and node2 can be horizontally fused or not.
        r  r5   s      r7   r  z"BaseScheduling.can_fuse_horizontal  ru  r9   c                     t               )z[
        Process the iteration sizes in case a transformation needs to be applied.
        r  )r6   r  s     r7   r  zBaseScheduling.group_fn  ru  r9   template_nodeepilogue_nodesc                     t               )z
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        r  )r6   rx  ry  s      r7   rf  zBaseScheduling.codegen_template  s     "##r9   r!  c                     t               )zD
        Generate a kernel given a list of pre-fused nodes.
        r  r6   r!  s     r7   rh  zBaseScheduling.codegen_nodes  ru  r9   c                     t               )zt
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        r  rA   s    r7   rl  zBaseScheduling.codegen_sync  ru  r9   r   c                      y)z
        Check whether the backend is requesting the scheduler to flush the generated kernel.
        If not supported, please return False.
        FrK   rA   s    r7   rm  zBaseScheduling.ready_to_flush  s    
 r9   c                     t               )z]
        Flush the generated kernel and python wrapper code to the source code file.
        r  rA   s    r7   rL  zBaseScheduling.flush  ru  r9   c                     t               )r  r  r|  s     r7   r  z$BaseScheduling.benchmark_fused_nodes  s    
 "##r9   N)rE   rF   rG   rD   r  r  r  r  r   rf  rh  rl  r   rm  rL  r  rK   r9   r7   rs  rs    sx    $'8 $AR $$): $CT $$	$*	$<@<O	$$4(9#: $$ $$r9   rs  rC   )rK   )[r   dataclassesrD  r   r  r  r}  rV   rX   typingr   r   r   r   r   r   r	   r
   r   r   r  r  torch._dynamo.utilsr   torch._inductor.metricsr   r   %torch.fx.experimental.symbolic_shapesr   torch.utils._tritonr   r   r   r   r   r   r   codegen.commonr   r   comm_analysisr   r   r   r   r   r   rG  r    utilsr!   r"   r#   r$   r%   r&   r'   r(   r)   virtualizedr*   	getLoggerrE   r   _logginggetArtifactLoggerr;   r-   rW   r\   ro   opsatenconvolutionmmbmmaddmmrg  rD   r  rN  r  rp   rn   rL  	dataclassr   countr]  rs   rs  rK   r9   r7   <module>r     s         	       , M G * 6 6 = ; * > > &
 
 
  g!^^--hA

 
, "5 #()).."<"<**))..,,!IINN00	 ~ ~B 1 @	. 	};% };@S"* S"lj:!3 j:Z%P 
 
 
( +9??, b bJ!<$ <$r9   