
    Ph:                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZm	Z	 d dl
mZmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZ d	d
l m!Z!m"Z"m#Z#m$Z$ d	dl%m&Z& d	dl'm(Z( d	dl)m*Z*m+Z+ d	dl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2 d	dl3m4Z4m5Z5 ddl6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZC ej                  j                  eFd      ZGej                  dej                  dej                  dej                  dej                  dej                  dej                  dej                  dej                  dej                  dej                  diZSej                  dej                  dej                  d ej                  d!ej                  d"ej                  d#ej                  d$ej                  d%ej                  d&ej                  d'ej                  d(ej                  d)ej                  d*iZVd+d,d-ZWdZXh d.ZYd/d0d1d2d3d4d5d6d7d7d8
ZZh d9Z[d:dddd;d<d=d>d?d@dA
Z\dBdCdDZ]ej                  ej                  gZ^dE Z_dF Z`dG ZadH ZbdI ZcdJ ZddK ZedL ZfdagdM ZhdN Ziej                  dOej                  dPej                  fdQ       Zm G dR dSe>      Zn en       j                  ZpdT Zq G dU dV      ZrdWej                  j                  dXeCfdYZudXeCfdZZv G d[ d\e:      Zw G d] d^eB      Zx G d_ d`ex      Zy G da dbe@      Zz G dc ddez      Z{ G de dfe{      Z| G dg dhe{      Z} G di djez      Z~ G dk dle*      Z G dm dn      Z G do dpe      Z G dq dr      Zej                   G ds dt             Zej                   G du dv             Zy)w    N)copydeepcopy)DictListOptionalSetTupleUnion)dependencies)
StorageBox	TensorBox)is_float_dtype)FloorDiv)bound_sympyValueRanges   )	codecacheconfigirmetrics)WrapperCodeGen)range_expressable_in_32_bits)BaseSchedulingSchedulerNode)cache_on_selfget_fused_kernel_nameis_welford_reductionsympy_product
sympy_subssympy_symbol)opsV   )BracesBufferCppWrapperKernelArgsCSECSEVariableDataTypePropagationDeferredLineDTYPE_TO_COMPUTATION_DTYPEExprPrinterIndentedBufferKernel
KernelArgsOpOverridesOptimizationContextschedulefloatdoublehalflongintshortzsigned charzunsigned charboolbfloat16	complex64z
at::kFloatzat::kDoublez	at::kHalfz	at::kLongzat::kIntz
at::kShortz	at::kCharz	at::kBytez	at::kBoolzat::kBFloat16zat::kComplexFloatzat::kFloat8_e4m3fnzat::kFloat8_e5m2zat::kCPUz	at::kCUDA)cpucuda>   *+^maxmin||r>   r=   r?   rA   r@   argminargmaxrB   welford)
sumprodxor_sumrA   r@   rC   rD   anywelford_reducewelford_combine>   r@   rA   rF   rG   rH   rJ   rK   z
at::Tensorzstd::stringzc10::ScalarTypezat::MemoryFormatz
at::Layoutz
at::Devicez
at::Scalar)
Tensorr6   r2   r8   str
ScalarTypeMemoryFormatLayoutDevicenumberzstd::vectorzc10::optional)r   r   c                    | t        d      k(  rd| dS | t        d      k(  rd| dS t        | t              r d| dt        |       j	                          dS t        j                  |       rd| d	S d| dt        |        dS )
Nz-inf-std::numeric_limits<>::infinity()infstd::numeric_limits<static_cast<>()z>::quiet_NaN())r2   
isinstancer8   rM   lowermathisnanrepr)valuecpp_types     fC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/_inductor/codegen/cpp.pyvalue_to_cpprc      s    f&xj>>	%,	%hZ}==	E4	 hZr#e**:*:*<)=Q??	E	%hZ~>>hZr$u+a88    c                 2   |t         v rt        j                  }| dv ry| dk(  ry| dv r%t        |      rdt        |    dS dt        |    d	S | d
v r%t        |      rdt        |    dS dt        |    dS t        |       rdt        |    dS t        |       )N)rH   rF   rI   r   rG   r#   >   r@   rD   rT   rU   rW   z>::min()>   rA   rC   z>::max()Welford<>())DTYPE_LOWP_FPtorchfloat32r   DTYPE_TO_CPPr   AssertionError)reduction_typedtypes     rb   reduction_initro      s     22** e$ $L$7#8F	
 (U(;'<HE	

 ** e$ #<#6"7}E	
 (U(;'<HE	

 N+,u-.c22

((rd   c                 x    t         t        |      }d| d}t        |       rd| dS t        | |      }| d| dS )Nat::vec::Vectorized<>rf   rg   (rZ   )rk   r*   r   ro   )rm   rn   scalar_typevec_typescalar_inits        rb   reduction_init_vecrw      sV    9%@AK%k]!4HN+(3'' 7KZqQ''rd   c                 T    | dvsJ t         t        |      }t        |       rd| dS |S )N>   rD   rC   rf   rr   rk   r*   r   )rm   rn   rt   s      rb   reduction_acc_typerz      s>    !55559%@AKN++a((rd   c                 `    | dvsJ t         t        |      }d| d}t        |       rd| dS |S )N>   rD   rC   rq   rr   rf   ry   )rm   rn   rt   ru   s       rb   reduction_acc_type_vecr|      sK    !55559%@AK%k]!4HN+(1%%Ord   c           	      (   | dk(  r| d| S | dk(  r| d| S | dk(  r| d| S | dk(  r| d| S | d	v r|  d
| d| dS | dk(  r	d| d| dS | dk(  r6t        |t              r|\  }}}nt        | |      \  }}}d| d| d| d| d	S t        |       )NrF    + rG    * rH    ^ rI    || )rA   r@   z_propagate_nan(, rZ   rJ   welford_combine(rK   , {}))r[   tuplereduction_projectrl   rm   var
next_valuemeanm2weights         rb   reduction_combiner      s   c*&&c*&&"c*&&d:,''' !R
|1EE))!#bA66**j%()D"f0LD"f!#d4&2$bDD

((rd   c           	      (   | dk(  r	d| d| dS | dk(  r	d| d| dS | dk(  r| d| S | d	k(  r| d
| S | dk(  r| d| S | dk(  r	d| d| dS | dk(  r6t        |t              r|\  }}}nt        | |      \  }}}d| d| d| d| d	S t               )Nr@   at::vec::maximum(r   rZ   rA   at::vec::minimum(rF   r~   rG   r   rH   r   rJ   r   rK   r   r   )r[   r   r   NotImplementedErrorr   s         rb   reduction_combine_vecr      s   "3%r*Q77	5	 "3%r*Q77	5	 c*&&	6	!c*&&	9	$c*&&	+	+!#bA66	,	,j%()D"f  1LD"f!#d4&2$bDD!##rd   c                 J    t        |       r| d| d| dfS | dv r| dS |S )Nz.meanz.m2z.weight>   rD   rC   z.index)r   )rm   accs     rb   r   r      sC    N+e}SkcU'?::	/	/f~Jrd   c                 0   dt          }t         dz  a d| dt        |    d| d| dt        | |       dg}| d	k(  r+|j                  d
d| ddddt        | |       ddg       |S | dk(  r)|j                  d
d| ddddt        | |       ddg       |S )NIndexValue_r#   zstruct z {size_t index; z	 value;}; z{0, z};rD   z4#if !defined(__clang_major__) || __clang_major__ > 9z'#pragma omp declare reduction(argmax : z :\zQ    omp_out.value = omp_in.value < omp_out.value ? omp_out.value : omp_in.value,\zQ    omp_out.index = omp_in.value < omp_out.value ? omp_out.index : omp_in.index)\z	initializer(omp_priv = {0, r   z#endifrC   z'#pragma omp declare reduction(argmin : zQ    omp_out.value = omp_in.value > omp_out.value ? omp_out.value : omp_in.value,\zQ    omp_out.index = omp_in.value > omp_out.value ? omp_out.index : omp_in.index)\)index_value_name_counterrk   ro   extend)rm   	src_dtypetmpvarstruct_nameprefixs        rb   argmax_argmin_prefixr   	  s     89:K! +/Y0G/H
S-qn^Y&O%PPSTF !F9+dKdd0PY1Z0[[^_		
* M 
8	#F9+dKdd0PY1Z0[[^_		
 Mrd   c                  l    t         j                  j                  } | dk  rt        j                         } | S Nr#   )r   cppthreadsri   get_num_threads)r   s    rb   parallel_num_threadsr   ,  s+    jj  G{'')Nrd   r   indexc                 X    | | dz   i}t        ||      }t        j                  ||z
        S r   )r   sympysimplify)r   r   replacement	new_indexs       rb   	stride_atr   3  s/    a.K5+.I>>)e+,,rd   c                   N    e Zd Zd Zd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Zy)
CppPrinterc                     t        |       dS )NL)r6   selfexprs     rb   _print_IntegerzCppPrinter._print_Integer;  s    d)Ard   c                 $   | j                  | j                  |j                  d               }| j                  | j                  |j                  d               }| j                  | j                  |j                  d               }| d| d| S )Nr   r#   r    ?  : )parendoprintargs)r   r   cpqs        rb   _print_WherezCppPrinter._print_Where>  sv    JJt||DIIaL12JJt||DIIaL12JJt||DIIaL12Cs#aS!!rd   c           	      X   |j                   \  }}}| j                  | j                  |            }|dk7  r?| j                  | j                  |            }|j                  r
d| d| d}n	d| d| d}| j                  | j                  |            }dt         d	| d
t         d	| d	S )Nr#   c10::div_floor_integer(r   rZ   ,c10::div_floor_floating(static_cast<double>(), static_cast<double>())rX   rY   z) % static_cast<)r   r   r   
is_integer
INDEX_TYPE)r   r   xdivmods        rb   _print_ModularIndexingz!CppPrinter._print_ModularIndexingD  s    ii3JJt||A'!8**T\\#./C-aS3%q9B1#E\]`\aacdjjc*+j\A3.>zl"SEQRSSrd   c                     |j                   \  }}| j                  | j                  |            }| j                  | j                  |            }|j                  r	d| d| dS d| d| dS )Nr   r   rZ   r   r   r   )r   r   r   r   )r   r   r   r   s       rb   _print_FloorDivzCppPrinter._print_FloorDivP  sq    3JJt||A'jjc*+??,QCr#a88=aS@WX[W\\^__rd   c                     t        |j                        dk(  sJ d| j                  |j                  d          d}|j                  rdt         d| dS |S )Nr#   std::floor(r   rZ   rX   rY   lenr   _printr   r   r   r   rs      rb   _print_floorzCppPrinter._print_floorX  sX    499~"""$++diil34A648OOj\A3a0JJrd   c           	         |j                   \  }}| j                  |      }|dk(  s|dk(  r|dk(  rd| dS d| dS |j                  sJ t        |      }|dkD  r%dj	                  | j                  |      g|z        }nG|dk  r@d| j                  dj	                  | j                  |      gt        |      z              z   }nd	}|j                  rd
t         d| dS |S )Ng      ?g      
std::sqrt(rZ   z1.0/std::sqrt(r   r=   z1.0/z1.0rX   rY   )r   r   r   r6   joinr   absr   )r   r   baseexpr   s        rb   
_print_PowzCppPrinter._print_Pow]  s    II	c{{4 #:+.#:ZvQ'S^D6QR;SS~~~#h7$**T*+c12A1WCHHdjj.>-?#c(-J$KLLAA48OOj\A3a0JJrd   c                     |j                   dk(  r|j                   }n|j                   d|j                    d}|j                  rdt         d| dS |S )Nr#   z.0/z.0rX   rY   rZ   )r   r   r   r   r   s      rb   _print_RationalzCppPrinter._print_Rationalo  sR    66Q;66(A66(#dffXR(A48OOj\A3a0JJrd   c                     t        |j                        dk(  sJ d| j                  |j                  d          d}|j                  rdt         d| dS |S )Nr#   
std::ceil(r   rZ   rX   rY   r   r   s      rb   _print_ceilingzCppPrinter._print_ceilingw  sX    499~"""TYYq\231548OOj\A3a0JJrd   c                     |j                   D cg c]  }| j                  |       }}t        |      dk(  rd|d    d|d    dS ddj                  |      z   dz   }d| dS c c}w )	Nr   z	std::min(r   r   r#   rZ   {}r   r   r   r   r   r   ar   ils        rb   
_print_MinzCppPrinter._print_Min|  w    (,		2	1A	2t9>tAwir$q'!44 tyy&,Brd!$$ 3   A#c                     |j                   D cg c]  }| j                  |       }}t        |      dk(  rd|d    d|d    dS ddj                  |      z   dz   }d| dS c c}w )	Nr   z	std::max(r   r   r#   rZ   r   r   r   r   s        rb   
_print_MaxzCppPrinter._print_Max  r   r   c                 z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr#   	std::abs(r   rZ   )r   r   r   r   s     rb   
_print_AbszCppPrinter._print_Abs  s9    499~"""4;;tyy|45Q77rd   N)__name__
__module____qualname__r   r   r   r   r   r   r   r   r   r   r    rd   rb   r   r   :  s?    "
T`K
K$KK
%%8rd   r   c                 .    dt          dt        |        dS )NrX   rY   rZ   )r   cexpr)r   s    rb   cexpr_indexr     s    *Re~Q77rd   c                   2    e Zd ZddefdZd Zd Zd Zd Zy)	RecordOptimizationContext	func_namec                 .    || _         d | _        d | _        y N)r   current_nodeopt_ctx)r   r   s     rb   __init__z"RecordOptimizationContext.__init__  s    "596:rd   c                    t         j                  sJ t         j                  j                  sJ t         j                  j                  | _        | j                  J t        j                  | j                  j
                  v r-| j                  j
                  t        j                     | _        nt               | _        | j                  J | j                  | j                  _        | S r   )	r"   interpreterr   r0   keymetar   r   ops_namer   s    rb   	__enter__z#RecordOptimizationContext.__enter__  s    }}}}}))))MM66  ,,,""d&7&7&<&<<,,112E2I2IJDL.0DL||''' $rd   c                     | j                   sJ | j                  sJ | j                  | j                   j                  t        j                  <   y r   )r   r   r  r0   r   r   exc_typeexc_valexc_tbs       rb   __exit__z"RecordOptimizationContext.__exit__  s>        |||:>,,2667rd   c                     | j                   S r   )r   r  s    rb   get_opt_ctxz%RecordOptimizationContext.get_opt_ctx  s    ||rd   c                 6    | j                   sJ | j                   S r   )r   r  s    rb   get_fx_nodez%RecordOptimizationContext.get_fx_node  s           rd   N) )	r   r   r   rM   r   r  r
  r  r  r   rd   rb   r   r     s#    ;# ;
G
!rd   r   nodereturnc                 V    | j                   j                  t        j                  d       S r   )r  getr0   r   )r  s    rb   r  r    s    99==,00$77rd   c                      t         j                  j                  sJ t        t         j                  j                        S r   )r"   r   r   r  r   rd   rb   get_current_node_opt_ctxr    s*    ==%%%%q}}1122rd   c                   j     e Zd Zdef fdZd Zdej                  fdZdej                  fdZ
 xZS )CppCSEVariableboundsc                 `    t         |   ||       d| _        d | _        t	               | _        y NF)superr   is_vecrn   setdependent_itervars)r   namer  	__class__s      rb   r   zCppCSEVariable.__init__  s*    v&,0
58Urd   c           	         |dk(  r| j                  |d          ns | j                  j                  |D cg c]  }t        |t              r|j                    c}  |dk(  r| j                  |d          t        d |D              rd| _        t        t        j                  d      r%t               t               j                  | _        y y y c c}w )Nloadr#   
index_exprr   c              3   V   K   | ]!  }t        |t              s|j                   # y wr   r[   r  r  .0args     rb   	<genexpr>z0CppCSEVariable.update_on_args.<locals>.<genexpr>  s     Q#C1P3::   ))Tr   )_set_dependent_itervarsr  updater[   r  rI   r  hasattrr"   r   r  rn   )r   r  r   kwargsr(  s        rb   update_on_argszCppCSEVariable.update_on_args  s    6>((a1 +D##**  $#!#~6 **# |#,,T!W5QQQ"AMM>2(*61399DJ 7 3s   #Cr   c                    |j                   D ]  }|t        j                  j                  v r| j                  j                  |       ;|j                  t        j                  j                  j                  v sl| j                  j                  t        j                  j                  j                  |j                     j                          y)z
        Set the relevant itervars for this variable based on the `index` expression.
        This includes the itervars directly used in the `index` as well as relevant itervars
        of other cse variables used in the `index`.
        N)
free_symbolsr"   kernelitervarsr  addr  csevarname_mapr,  )r   r   ss      rb   r+  z&CppCSEVariable._set_dependent_itervars  s     ##AAHH%%%''++A.188<<333''..HHLL,,QVV4GG	 $rd   itervarc                     || j                   v S r   )r  )r   r8  s     rb   
depends_onzCppCSEVariable.depends_on  s    $1111rd   )r   r   r   r   r   r/  r   Exprr+  Symbolr:  __classcell__r   s   @rb   r  r    s4    ;[ ;:.UZZ 2%,, 2rd   r  c                      e Zd ZdZed        Zed        Zed        ZedJd       Zed        Z	ed        Z
ed	        Zed
        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Z ed        Z!ed         Z"ed!        Z#ed"        Z$ed#        Z%ed$        Z&ed%        Z'ed&        Z(ed'        Z)ed(        Z*ed)        Z+ed*        Z,ed+        Z-ed,        Z.ed-        Z/ed.        Z0ed/        Z1ed0        Z2ed1        Z3ed2        Z4ed3        Z5ed4        Z6ed5        Z7ed6        Z8ed7        Z9ed8        Z:ed9        Z;ed:        Z<ed;        Z=ed<        Z>ed=        Z?ed>        Z@ed?        ZAed@        ZBedA        ZCedB        ZDedCeEj                  dDeEj                  fdE       ZGedCeEj                  dDeEj                  fdF       ZHedCeEj                  dDeEj                  fdG       ZIedH        ZJedI        ZKy)KCppOverrideszMap element-wise ops to C++c                     d|  d|  d| dS )N	decltype()(r~   rZ   r   r   bs     rb   r4  zCppOverrides.add      1#Rs#aS**rd   c                     d|  d|  d| dS )NrB  rC   - rZ   r   rD  s     rb   subzCppOverrides.sub  rF  rd   c                     d|  d|  d| dS )NrB  rC  r   rZ   r   rD  s     rb   mulzCppOverrides.mul   rF  rd   Nc                 T    |t         v sJ | dt         d       dt         |    d|  dS )N missing from .DTYPE_TO_CPPzc10::convert<rY   rZ   rk   r   )r   rn   r   s      rb   to_dtypezCppOverrides.to_dtype  s=    $U~hZ}&UU$|E232aS::rd   c                 T    |t         v sJ | dt         d       dt         |    d|  dS )NrM  rN  zc10::bit_cast<rY   rZ   rO  )r   rn   s     rb   to_dtype_bitcastzCppOverrides.to_dtype_bitcast	  s=    $U~hZ}&UU$U 34Bqc;;rd   c                     d|  dS )Nr   rZ   r   r   s    rb   r   zCppOverrides.abs      1#Qrd   c                     d|  dS )Nz	std::sin(rZ   r   rT  s    rb   sinzCppOverrides.sin  rU  rd   c                     d|  dS )Nz	std::cos(rZ   r   rT  s    rb   coszCppOverrides.cos  rU  rd   c                     d|  d|  dS )NrB  z)(-rZ   r   rT  s    rb   negzCppOverrides.neg      1#S1%%rd   c                     d|  dS )Nz	std::exp(rZ   r   rT  s    rb   r   zCppOverrides.exp  s     1#Qrd   c                     d|  dS )Nz
std::exp2(rZ   r   rT  s    rb   exp2zCppOverrides.exp2#      A3a  rd   c                     d|  dS )Nzstd::expm1(rZ   r   rT  s    rb   expm1zCppOverrides.expm1'      QCq!!rd   c                     d|  dS )Nz	std::erf(rZ   r   rT  s    rb   erfzCppOverrides.erf+  rU  rd   c                     d|  dS )Nz
std::erfc(rZ   r   rT  s    rb   erfczCppOverrides.erfc/  r`  rd   c                     d|  dS )Nzcalc_erfinv(rZ   r   rT  s    rb   erfinvzCppOverrides.erfinv3      aS""rd   c                     d|  dS )Nr   rZ   r   rT  s    rb   sqrtzCppOverrides.sqrt7  r`  rd   c                     d|  dS )Nz1 / std::sqrt(rZ   r   rT  s    rb   rsqrtzCppOverrides.rsqrt;  s    s!$$rd   c                 |    t         j                  j                  }|dk(  r|  d|  dS |d|  dS t        d|      )Naccuracy + decltype()(1)zstd::log1p(rZ   8unrecognized config cpp.inject_log1p_bug_TESTING_ONLY = r   r   inject_log1p_bug_TESTING_ONLYrl   r   bugs     rb   log1pzCppOverrides.log1p?  sW    jj66*SQCt,,[ 1%% J3'R rd   c                     d|  dS )Nz	std::tan(rZ   r   rT  s    rb   tanzCppOverrides.tanK  rU  rd   c                     d|  dS )Nz
std::tanh(rZ   r   rT  s    rb   tanhzCppOverrides.tanhO  r`  rd   c                     d|  dS )Nzstd::signbit(rZ   r   rT  s    rb   signbitzCppOverrides.signbitS  s    qc##rd   c                     d|  d| dS )Nz	std::pow(r   rZ   r   rD  s     rb   powzCppOverrides.powW  s    1#Rs!$$rd   c                     d|  dS )Nz	std::log(rZ   r   rT  s    rb   logzCppOverrides.log[  rU  rd   c                     d|  dS )Nzstd::nearbyint(rZ   r   rT  s    rb   roundzCppOverrides.round_  s     1%%rd   c                     d|  dS )Nr   rZ   r   rT  s    rb   floorzCppOverrides.floorc  rc  rd   c                 H    |  d| }|  d| }d|  d| d| d| d| d| d	S )
N /  % ((z
 < 0) != (z	 < 0) ? (z != 0 ? z - 1 : ) : rZ   r   )r   rE  quotrems       rb   floordivzCppOverrides.floordivg  sR     Cs|3qclA3j9SE$wtfDQUPVVWXXrd   c                     d|  dS )Nr   rZ   r   rT  s    rb   ceilzCppOverrides.ceiln  r`  rd   c                     d|  dS )Nzstd::trunc(rZ   r   rT  s    rb   trunczCppOverrides.truncr  rc  rd   c                     |  d| S Nr  r   rD  s     rb   truncdivzCppOverrides.truncdivv       Cs|rd   c                     d|  d| dS )Nz
std::fmod(r   rZ   r   rD  s     rb   fmodzCppOverrides.fmod{  s    A3b1%%rd   c                     d|  dS )Nzstd::isinf(rZ   r   rT  s    rb   isinfzCppOverrides.isinf  rc  rd   c                     d|  dS )Nzstd::isnan(rZ   r   rT  s    rb   r^   zCppOverrides.isnan  rc  rd   c                     d|  dS )Nzstd::lgamma(rZ   r   rT  s    rb   lgammazCppOverrides.lgamma  rj  rd   c                     d|  dS )Nz
std::acos(rZ   r   rT  s    rb   acoszCppOverrides.acos  r`  rd   c                     d|  dS )Nzstd::acosh(rZ   r   rT  s    rb   acoshzCppOverrides.acosh  rc  rd   c                     d|  dS )Nz
std::cosh(rZ   r   rT  s    rb   coshzCppOverrides.cosh  r`  rd   c                     d|  dS )Nz
std::sinh(rZ   r   rT  s    rb   sinhzCppOverrides.sinh  r`  rd   c                     d|  dS )Nz
std::asin(rZ   r   rT  s    rb   asinzCppOverrides.asin  r`  rd   c                     d|  dS )Nzstd::asinh(rZ   r   rT  s    rb   asinhzCppOverrides.asinh  rc  rd   c                     d|  d| dS )Nzstd::atan2(r   rZ   r   r   ys     rb   atan2zCppOverrides.atan2      QCr!A&&rd   c                     d|  dS )Nz
std::atan(rZ   r   rT  s    rb   atanzCppOverrides.atan  r`  rd   c                     d|  dS )Nzstd::atanh(rZ   r   rT  s    rb   atanhzCppOverrides.atanh  rc  rd   c                     d|  d| dS )Nzstd::copysign(r   rZ   r   r  s     rb   copysignzCppOverrides.copysign  s    s"QCq))rd   c                     d|  d| dS )Nzstd::hypot(r   rZ   r   r  s     rb   hypotzCppOverrides.hypot  r  rd   c                     d|  dS )Nzstd::log10(rZ   r   rT  s    rb   log10zCppOverrides.log10  rc  rd   c                     d|  d| dS )Nzstd::nextafter(r   rZ   r   r  s     rb   	nextafterzCppOverrides.nextafter  s     2aS**rd   c                     t         j                  j                  }|dk(  ry|dk(  r|  dS |dk(  r|  d|  dS ||  d|  d	S t        d
|      )Ncompile_errorcompile error!runtime_error	; throw 1rp  rq  rr  z * (z>0)7unrecognized config cpp.inject_relu_bug_TESTING_ONLY = r   r   inject_relu_bug_TESTING_ONLYrl   rv  s     rb   reluzCppOverrides.relu  sz    jj55/!#O#S	?"JSQCt,,[SQCs## I#Q rd   c                     d|  d| dS )Nzmin_propagate_nan(r   rZ   r   rD  s     rb   minimumzCppOverrides.minimum      #A3b1--rd   c                     d|  d| dS )Nzmax_propagate_nan(r   rZ   r   rD  s     rb   maximumzCppOverrides.maximum  r  rd   c                     |  d| d| S )Nr   r   r   r   rE  r   s      rb   wherezCppOverrides.where  s    Cs#aS!!rd   c                     d|  d| dS )Nzmod(r   rZ   r   rD  s     rb   r   zCppOverrides.mod  s    aS1#Qrd   c                     t               }|r|j                  J |j                  }|t        v rt        j                  }t        | t        |         S r   )r  rn   rh   ri   rj   rc   rk   )valrn   r   s      rb   constantzCppOverrides.constant  sJ    '?'A7==444M! MMECe!455rd   c                     t               }|r|j                  J |j                  }t        j                  t	        t
        j                  j                  |             |      S r   )r  rn   r!   rP  r   r"   r2  rename_indexing)r   rn   r   s      rb   r#  zCppOverrides.index_expr  sJ    '?'A7==444||E!((":":4"@A5IIrd   c                 "   t               }t        j                  j                  j	                         }|j                  d| d       t        j                  j                  |      5  |j                         5   |       }|j                  d| d       d d d        d d d        |j                  d       t        j                  j                  j                  |       t        |d| d      }|  d| d| S # 1 sw Y   exY w# 1 sw Y   ixY w)	Nauto  = [&]return ;rB  ())r   () : )r$   r"   r2  r5  newvar	writelineswap_buffersindentcomputesplicerc   )maskbodyothercodebody_varresult
other_codes          rb   maskedzCppOverrides.masked  s    ~ 88<<&&(xj/0XX""4($++-VFNNWVHA./ +8( 	s	% "%9XJc)BC
s8*E*66 +8-((s$   'D8C9D9D	>DDc                     |  d| S )Nz && r   rD  s     rb   logical_andzCppOverrides.logical_and      D}rd   c                     d|  S )N!r   r   s    rb   logical_notzCppOverrides.logical_not  s    1#wrd   c                     |  d| S )Nr   r   rD  s     rb   
logical_orzCppOverrides.logical_or
  r  rd   c                     |  d| S )N != r   rD  s     rb   logical_xorzCppOverrides.logical_xor  r  rd   c                     d|  d|  d| dS )NrB  rC   & rZ   r   rD  s     rb   bitwise_andzCppOverrides.bitwise_and  rF  rd   c                     d|  d|  dS )NrB  z)(~rZ   r   r  s    rb   bitwise_notzCppOverrides.bitwise_not  r\  rd   c                     d|  d|  d| dS )NrB  rC  z | rZ   r   rD  s     rb   
bitwise_orzCppOverrides.bitwise_or  rF  rd   c                     d|  d|  d| dS )NrB  rC  r   rZ   r   rD  s     rb   bitwise_xorzCppOverrides.bitwise_xor  rF  rd   c                     d|  d|  d| dS )NrB  rC  z << rZ   r   rD  s     rb   bitwise_left_shiftzCppOverrides.bitwise_left_shift"      1#Rs$qc++rd   c                     d|  d|  d| dS )NrB  rC  z >> rZ   r   rD  s     rb   bitwise_right_shiftz CppOverrides.bitwise_right_shift&  r  rd   seedoffsetc                     d|  d| dS )Nznormalized_rand_cpu(r   rZ   r   r  r  s     rb   randzCppOverrides.rand*  s    %dV2fXQ77rd   c                     d|  d| dS )Nz
randn_cpu(r   rZ   r   r  s     rb   randnzCppOverrides.randn.  s    D6F81--rd   c           	           d|  d| d| d| d	S )Nzrandint64_cpu(r   rZ   r   )r  r  lowhighs       rb   	randint64zCppOverrides.randint642  s#    vRxr#ba@@rd   c                     d|  d|  d|  dS )NrB  z)(1) / (decltype(z)(1) + std::exp(-r   r   rT  s    rb   sigmoidzCppOverrides.sigmoid6  s    1#.qc1B1#RHHrd   c                 "   t               }t        j                  j                  j	                         }t        j                  j                  j	                         }t        j                  j                  j	                         }d|  d}d|  d}|j                  d| d|  d| d| d	       |j                  d| d|  d	| d| d	       |j                  d| d| d
| d       t        j                  j                  j                  |       |S )NrB  )(0)rr  r   = z > 0 ? r   r  z < 0 ? rH  r$   r"   r2  r5  r  r  r  r  )r   r  leftrightr  scalar_zero
scalar_ones          rb   signzCppOverrides.sign:  s    ~xx||""$##%$$&!!D) 4(
tfCs'*SQOPugS7:,c+aPQvhc$s5';<	%rd   r   )Lr   r   r   __doc__staticmethodr4  rI  rK  rP  rR  r   rW  rY  r[  r   r_  rb  re  rg  ri  rl  rn  rx  rz  r|  r~  r  r  r  r  r  r  r  r  r  r  r^   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r#  r  r  r  r  r  r  r  r  r  r  r   r   r;  r  r  r  r  r  r   rd   rb   r@  r@    s   %+ + + + + + ; ; < <             & &     ! ! " "     ! ! # # ! ! % % 	 	     ! ! $ $ % %     & & " " Y Y ! ! " "   & & " " " " # # ! ! " " ! ! ! ! ! ! " " ' ' ! ! " " * * ' ' " " + +   . . . . " "     6 6 J J 7 7          + + & & + + + + , , , , 85:: 8uzz 8 8 .EJJ .

 . . A

 AEJJ A A I I  rd   r@  c                       e Zd ZdZ fdZed        Zed        Zed        Zed        Z	ed        Z
ed        Zed	        Zed
        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Z ed        Z!ed        Z"ed         Z#ed!        Z$ed"        Z%ed#        Z&ed$        Z'ed%        Z(ed&        Z)ed'        Z*ed(        Z+ed)        Z,ed*        Z-ed+        Z.ed,        Z/ed-        Z0ed.        Z1ed/        Z2ed0        Z3ed1        Z4ed2        Z5ed3        Z6ed4        Z7ed5        Z8ed6        Z9ed7        Z:ed8        Z;ed9        Z<ed:        Z=ed;        Z>ed<        Z?ed=        Z@ed>        ZAedBd?       ZBed@        ZCedA        ZD xZES )CCppVecOverridesz.Map element-wise ops to aten vectorization C++c                     t         |   |       fd}t        |       j                         D ]=  \  }}t	        |dd       t
        k(  s|dk7  s!t        | ||j                               ? S )Nc                       fd}|S )Nc                  $   t        d | D              }t        d | D              }t        |       }|r|rg }| D ]  }t        |t              r]|j                  sQt        t
        j                  t              sJ t
        j                  j                  |      }|j                  |       p|j                  |        |r 
|i |S t        t              }t        |
j                  |j                  
j                              }|J  || i |S )Nc              3   X   K   | ]"  }t        |t              s|j                    $ y wr   r%  r&  s     rb   r)  zICppVecOverrides.__new__.<locals>.wrap.<locals>.wrapper.<locals>.<genexpr>Z  s#      !.2sjn6U

Nds   **c              3   V   K   | ]!  }t        |t              s|j                   # y wr   r%  r&  s     rb   r)  zICppVecOverrides.__new__.<locals>.wrap.<locals>.wrapper.<locals>.<genexpr>]  s!      !*.3*S.2QCJJ$r*  )rI   listr[   r  r  r"   r2  CppVecKernel	broadcastappendr  r  getattrr   __getattr__)r   r.  
has_scalar
has_vectornew_argsr(  new_arg
scalar_opsscalar_funcr   funcr   s            rb   wrapperz6CppVecOverrides.__new__.<locals>.wrap.<locals>.wrapperY  s     !.2! 
 ! !*.! 
  :*!H#%c>:3::#-ahh#EE#E&'hh&8&8&=G$OOG4$OOC0  $ 4V44 "'!=J")"DMM:3I3I$--3X#K '222&777rd   r   )r,  r-  r   r   s   ` rb   wrapz%CppVecOverrides.__new__.<locals>.wrapP  s    8: Nrd   r   r  )r  __new__varsitemsr$  r  setattr__func__)clsr   kargsr.  r  methodr   r   s         @rb   r/  zCppVecOverrides.__new__M  sf    ws#&	P !IOO-LD&v{D1\AdhFVdD$9: . rd   c                     |  d| S )Nr~   r   rD  s     rb   r4  zCppVecOverrides.add}      Cs|rd   c                     |  d| S )NrH  r   rD  s     rb   rI  zCppVecOverrides.sub  r8  rd   c                     |  d| S Nr   r   rD  s     rb   rK  zCppVecOverrides.mul  r8  rd   c                     |  d| S r  r   rD  s     rb   truedivzCppVecOverrides.truediv  r8  rd   c                     |  dS )Nz.abs()r   rT  s    rb   r   zCppVecOverrides.abs      F|rd   c                     |  dS )Nz.sin()r   rT  s    rb   rW  zCppVecOverrides.sin  r?  rd   c                     |  dS )Nz.cos()r   rT  s    rb   rY  zCppVecOverrides.cos  r?  rd   c                     |  dS )Nz.exp()r   rT  s    rb   r   zCppVecOverrides.exp  r?  rd   c                     |  dS )Nz.exp2()r   rT  s    rb   r_  zCppVecOverrides.exp2      G}rd   c                     d|  d}|  d| S )NrB  rr  z	.exp() - r   r   vec_ones     rb   rb  zCppVecOverrides.expm1  s#     aS%IgY''rd   c                     |  dS )Nz.erf()r   rT  s    rb   re  zCppVecOverrides.erf  r?  rd   c                     |  dS )Nz.erfc()r   rT  s    rb   rg  zCppVecOverrides.erfc  rD  rd   c                     |  dS )Nz	.erfinv()r   rT  s    rb   ri  zCppVecOverrides.erfinv      Ird   c                     |  dS )Nz.sqrt()r   rT  s    rb   rl  zCppVecOverrides.sqrt  rD  rd   c                     d|  d| dS )Nto_float_mask( == rZ   r   r  s     rb   eqzCppVecOverrides.eq      s$qc++rd   c                     d|  d| dS )NrN  r  rZ   r   r  s     rb   nezCppVecOverrides.ne  rQ  rd   c                     d|  d| dS )NrN   < rZ   r   r  s     rb   ltzCppVecOverrides.lt      s#aS**rd   c                     d|  d| dS )NrN  z > rZ   r   r  s     rb   gtzCppVecOverrides.gt  rW  rd   c                     d|  d| dS )NrN  z <= rZ   r   r  s     rb   lezCppVecOverrides.le  rQ  rd   c                     d|  d| dS )NrN  z >= rZ   r   r  s     rb   gezCppVecOverrides.ge  rQ  rd   c                     |  d| S )Nr  r   r  s     rb   and_zCppVecOverrides.and_  r8  rd   c                     |  dS )Nz.rsqrt()r   rT  s    rb   rn  zCppVecOverrides.rsqrt      H~rd   c                     |  d| dS )Nz.pow(rZ   r   rD  s     rb   r  zCppVecOverrides.pow  s    E!Ard   c                     |  dS )Nz.log()r   rT  s    rb   r  zCppVecOverrides.log  r?  rd   c                     |  dS )Nz.round()r   rT  s    rb   r  zCppVecOverrides.round  ra  rd   c                     |  dS )Nz.floor()r   rT  s    rb   r  zCppVecOverrides.floor  ra  rd   c                     |  dS )Nz.ceil()r   rT  s    rb   r  zCppVecOverrides.ceil  rD  rd   c                     |  dS )Nz.trunc()r   rT  s    rb   r  zCppVecOverrides.trunc  ra  rd   c                     |  d| dS )Nz.fmod(rZ   r   rD  s     rb   r  zCppVecOverrides.fmod  s    F1#Qrd   c                     |  dS )Nz	.lgamma()r   rT  s    rb   r  zCppVecOverrides.lgamma  rK  rd   c                     d|  d| dS )Nrs   z
 != 0) & ( != 0)r   rD  s     rb   r  zCppVecOverrides.logical_and      1#Zs&))rd   c                     |  dS )Nz == 0r   r  s    rb   r  zCppVecOverrides.logical_not  s    E{rd   c                     d|  d| dS )Nrs   z
 != 0) | (rk  r   rD  s     rb   r  zCppVecOverrides.logical_or  rl  rd   c                     d|  d| dS )Nrs   z
 != 0) ^ (rk  r   rD  s     rb   r  zCppVecOverrides.logical_xor  rl  rd   c                     |  dS )Nz.tan()r   r  s    rb   rz  zCppVecOverrides.tan  r?  rd   c           	      F    d|  d}d|  d}d|  d}| d| d| d|  d| 	S )	NrB  rr  z)(2)z)(-2)z / ( + (r   z).exp()) - r   )r   rG  vec_twovec_minus_twos       rb   r|  zCppVecOverrides.tanh  sO    aS%aS%#A3e,$witM?#aSG9UUrd   c                     |  dS )Nz.reciprocal()r   r  s    rb   
reciprocalzCppVecOverrides.reciprocal  s    M""rd   c                     |  dS )Nz.atan()r   rT  s    rb   r  zCppVecOverrides.atan  rD  rd   c                     |  dS )Nz.acos()r   rT  s    rb   r  zCppVecOverrides.acos  rD  rd   c                     |  dS )Nz.asin()r   rT  s    rb   r  zCppVecOverrides.asin  rD  rd   c                     |  dS )Nz.cosh()r   rT  s    rb   r  zCppVecOverrides.cosh"  rD  rd   c                     |  dS )Nz.sinh()r   rT  s    rb   r  zCppVecOverrides.sinh&  rD  rd   c                     |  dS )Nz.log10()r   rT  s    rb   r  zCppVecOverrides.log10*  ra  rd   c                     |  dS )Nz.nextafter()r   rT  s    rb   r  zCppVecOverrides.nextafter.  s    L!!rd   c                     |  d| dS )Nz
.copysign(rZ   r   rD  s     rb   r  zCppVecOverrides.copysign2  s    Jqc##rd   c                     |  d| dS )Nz.atan2(rZ   r   rD  s     rb   r  zCppVecOverrides.atan26      GA3a  rd   c                     |  d| dS )Nz.hypot(rZ   r   rD  s     rb   r  zCppVecOverrides.hypot:  r  rd   c           
      <    d|  d}d|  d}| d| d|  d| d|  d
S )	NrB  rr  z)(0.5)z * ((r~   z)/(rH  z)).log()r   )r   rG  vec_one_halfs      rb   r  zCppVecOverrides.atanh>  sE     aS%"1#V,uWIS3wis1#XNNrd   c           	      ,    d|  d}d|  d| d|  d|  d	S )NrB  rr  rs   rr  r~   r=   ).sqrt()).log()r   rF  s     rb   r  zCppVecOverrides.asinhE  s3     aS%1#T'#aS!O<<rd   c           	      ,    d|  d}d|  d|  d|  d| d	S )NrB  rr  rs   rr  r=   rH  r  r   rF  s     rb   r  zCppVecOverrides.acoshK  s3     aS%1#T!AaSG9O<<rd   c                     t         j                  j                  }|dk(  ry|dk(  r|  dS |dk(  r|  d|  dS |	d|  d	|  d
S t        d|      )Nr  r  r  r  rp  rq  rr  zat::vec::clamp_min(z, decltype(z)(0))r  r  rv  s     rb   r  zCppVecOverrides.reluQ  s|    jj55/!#O#S	?"JSQCt,,[(;qc?? I#Q rd   c                     d|  d|  d|  dS )NrB  z)(1)/(decltype(z)(1) + z.neg().exp())r   rT  s    rb   r  zCppVecOverrides.sigmoidb  s    1#_QCwqcGGrd   c                     |  dS )Nz.neg()r   rT  s    rb   r[  zCppVecOverrides.negf  r?  rd   c                 l    d|  d}|  d| }|  d| }d|  d| d| d| d| d	| d
| d| d| d| dS )NrB  rZ   r  r  r  rU  z	(0)) != (z(0)) ? (r  z(0) ? rH  z(1) : r  r   )r   rE  _tr  r  s        rb   r  zCppVecOverrides.floordivj  s     1Cs|3qclA3c"YqcRDT"VD6QTUWTXX^_c^ddhimhnnopprd   c                     |  d| S r  r   rD  s     rb   r  zCppVecOverrides.truncdivr  r  rd   c                     d|  d| dS )Nr   r   rZ   r   rD  s     rb   r  zCppVecOverrides.minimumw      "1#Rs!,,rd   c                     d|  d| dS )Nr   r   rZ   r   rD  s     rb   r  zCppVecOverrides.maximum{  r  rd   c                     |  d|  S r;  r   r  s    rb   squarezCppVecOverrides.square  r8  rd   c           	           d| d| d| d|  d	S )NrB  
)::blendv(r   rZ   r   r  s      rb   r  zCppVecOverrides.where  s#    1#Zs"QCr!A66rd   c                 R   t               }d|  d}d|  d}d|  d| d| d| d|  d}t        j                  j                  j	                         }|j                  d| d	| d
       d|  d| d| d|  d| d}t        j                  j                  j	                         }|j                  d| d	| d
       t        j                  j                  j	                         }|j                  d| d	| d| d
       t        j                  j                  j                  |       |S )NrB  r  rr  r  r   rU  rZ   r  r  r  rH  r  )r   r  vec_zerorG  blendvr  r  r  s           rb   r  zCppVecOverrides.sign  s+   ~qc&aS%QCz(2gYb
#aSPQRxx||""$tfCxq12 QCz(2gYb3xjPQR##%ugS23$$&vhc$s5';<	%rd   c                    |t         j                  t         j                  t         j                  t         j                  t         j
                  fv sJ t         d|        t        j                  j                  }|r$t        |t         j                  j                        sJ t        |j                  d         }|sJ |j                  t         j                  t         j                   fv r|t         j                  k(  rd|  dS |j                  t         j                  k(  r(|t         j                  t         j                   fv rd|  dS |j                  t         j                  t         j                   fv r|t"        v rdt$        |    d|  dS |j                  t"        v r<|t         j                  t         j                   fv rdt$        |j                      d|  dS |j                  t         j
                  k(  r(|t         j                  t         j                   fv rd	|  dS |j                  t         j                  t         j                   fv r|t         j
                  k(  rd
|  dS d|  dS )Nz does not support r#   zvec_convert_to_mask(rZ   zmask_convert_to_float(zcvt_fp32_to_lowp_fp<rY   zcvt_lowp_fp_to_fp32<z at::vec::convert_uint8_to_float(z at::vec::convert_float_to_uint8(rs   )ri   r8   r2   r9   float16uint8r   r"   r   r   r[   fxNoder  r   rn   rj   rh   rk   )r   rn   r   r  	opt_ctx_xs        rb   rP  zCppVecOverrides.to_dtype  s   JJKKNNMMKK
 
 	2 Z)%1	2 
  mm88
4777		!-	y??u{{EMM::u

?R)!A..??ejj(Uu{{EMM6R-R+A3a00??u{{EMM::u?U),u*=)>b1EE??m+%++u}}9U0U),y*G)H1#QOO??ekk)eU]]7S.S5aS::??u{{EMM::u?S
 6aS:: 1#Qxrd   c                 z    t         j                  j                  }|dk(  r|  d|  dS ||  dS t        d|      )Nrp  rq  rr  z.log1p()rs  rt  rv  s     rb   rx  zCppVecOverrides.log1p  sT    jj66*SQCt,,[S>! J3'R rd   c                    t               }t        j                  j                  j	                         }t        j                  j                  |       5 }|j                  d| d       t        j                  j                  |      5  |j                         5   |       }|j                  d| d       d d d        d d d        d d d        |j                  d       t        j                  j                  j                  |       t        |d      }d| d}j                  r[d| d	}	d
 d}
t        j                  j                  j                  t        j                  j                  |	 d| d| d|
 d      }nJt        j                  j                  j                  t        j                  j                  |  d| d|       }|j                  d| |||fi        |S # 1 sw Y   0xY w# 1 sw Y   5xY w# 1 sw Y   :xY w)Nr  r  r  r  r2   zat::vec::Vectorized<float>(rZ   rB  r  rN  z	::blendv(r   z(), r   r  r  )r$   r"   r2  r5  r  r  r  r  r  r  r  rc   r  generater/  )r  r  r  r  r   new_maskr  r  other_code_vectype
float_maskcsevars               rb   r  zCppVecOverrides.masked  s   ~hhll!!#XX__T"hNNU3%v./&&t,dkkm23 /<, #
 	s	%!%1
6zl!D==se3'D)(15JXX\\**  &	.!1C5ZLJF
 XX\\**  TF#cU%
|"DF
 	htUF(CRH/ /<m,, #"s<   5G=G0G#5G0=G=#G-(G00G:	5G==Hr   )Fr   r   r   r  r/  r  r4  rI  rK  r=  r   rW  rY  r   r_  rb  re  rg  ri  rl  rP  rS  rV  rY  r[  r]  r_  rn  r  r  r  r  r  r  r  r  r  r  r  r  rz  r|  rv  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r[  r  r  r  r  r  r  r  rP  rx  r  r=  r>  s   @rb   r  r  J  s   8.`                   ( (
         , , , , + + + + , , , ,                       * *   * * * *   V V # #             " " $ $ ! ! ! ! O O = =
 = =
    H H   q q   - - - -   7 7  $  B 	 	  rd   r  c                   0    e Zd ZeZeZdZdZ fdZ	e
j                  d        Z	 ddej                  fdZdej                  defdZd	edej                  fd
ZddZd Zd Zd Zd Zd Zd Zedefd       Zd Ze
j                  d        Zd Z xZS )	CppKernelr  r  c                 L   t         |   |       d | _        g | _        g | _        d | _        t               | _        t               | _        i | _	        t        | j                  | j                  d      | _        t               | _        t               | _        || _        i | _        y )Ntmp_acc)name_prefix)r  r   call_rangesrangesr3  reduction_depthr,   reduction_prefixreduction_suffixreduction_var_mapr&   newvar_prefixsuffixreduction_csepreloads
poststoresnum_threadsreduction_omp_dec)r   r   r  r   s      rb   r   zCppKernel.__init__  s    =A(*,.# . 0 . 0!# !3!3T[[iX&((*&=?rd   c              #      K   | j                   }|r+| j                  j                  | j                  | d|       }|| _         	 | || _         y# || _         w xY ww)z>Context manager to add an additional mask to loads and stores.r  N)
_load_maskr5  r  r  )r   r  priors      rb   r  zCppKernel.masked  sX      88$$T\\dV3ug3FGD	$J#DOeDOs   AAA A	AAr   c                 P    | j                   |   }|||z  |z   i}t        ||      }|S r   )r3  r   )r   r   scaleitervar_idxr  r   r   r   s           rb   scale_index_with_offsetz!CppKernel.scale_index_with_offset
  s7     mmK(C%K&01uk2	rd   r  c                 6    t        | j                  |            S )z
        Convert an index expr to a string that can be used in cpp code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the cpp kernel.
        )r   r  r   r   s     rb   index_to_strzCppKernel.index_to_str  s    
 T))%011rd   r  c                 b   | j                   j                  |      }| j                  |      }| dt        |       d}t        j
                  j                  |      t        j                  fv rd| d}| j                  j                  | j                  |      }|j                  d||fi        |S )N[]zstatic_cast<float>(rZ   r"  )r   inputr  r   r"   graph	get_dtyperi   r  r5  r  loadsr/  )r   r  r   r   liner  s         rb   r"  zCppKernel.load  s    iiood#$$U+aE*+1-77T"u}}o5(a0D""4::t4ftUmR8rd   c                    d|v sJ | j                   j                  |      }| j                  |      }|| dt        |       d| d}ng|dk(  rTt        j
                  j                  s$| j                  dk(  r| dt        |       d| d}n$d| dt        |       d	| d
}nt        d|       | j                  j                  t        ||             y )Nbufr  ] = r  
atomic_addr#   z] += zatomic_add(&z], );zstore mode=)r   outputr  r   r   r   dynamic_threadsr  r   storesr  r)   )r   r  r   r`   moder   r  s          rb   storezCppKernel.store#  s    }}iit$$$U+<U!K./tE7!<D\!::--$2B2Ba2GaE 235qA%cU!K,>+?s5'L%D6&:;;l467rd   c                    |dv }|||f}|| j                   j                  v r| j                   j                  |   S | j                   j                  | j                  d| d      }|| j                  |<   |r| j
                  j                  t        |||             |dk(  rdnd}| j                  J | j                  | j                     }	t        | j                  dz   t        | j                              D ]$  }
|	| j                  |
   z  | j                  |
   z   }	& | j                  j                  d	| d
| d| dd| dt        |	       d| d| d	dg       nt        ||      }||f| j                   vrkt"        |   t$        vrD| j
                  j'                  dt"        |    d| dt)        |dd       dt+        ||       d	       t"        |   | j                   ||f<   | j
                  j-                  | d| dt+        ||       d       | j                  j-                  | dt)        |||       d       t/        ||      }|| j                   j                  |<   |S )N>   rD   rC   
reduction FwriterD   <rr   r#   zif (z.value r   z) {z    z	.index = ; z	.value = r  r   z&    #pragma omp declare reduction(    :z:    omp_out = omp_outomp_inz)     initializer(omp_priv={z})
                r  )r  reduction_cacher  r  r  r  
writelinesr   r  r3  ranger   r  r  r   rz   r  RTYPE_TO_CPPNATIVE_OMP_RTYPESr  r   ro   r  r   )r   rn   r   rm   r`   argmax_or_argminreduction_keyr   
compare_opr   iacc_typer  s                rb   	reductionzCppKernel.reduction2  s   )-AA!>58D..>>>%%55mDD  ))JJ*]O4E * 
 '5s#!!,,$^YD !/( :J''333MM$"6"67E4//!3S5GHA.q1AA IKK""3%wzl!E7$?3%yU);(<Bse9UGSTU *.%@H)1G1GG/7HH))00.!"!H: . HEF G*>5AB C	 DP"D&&~x'?@ !!++*AcU#n^U&K%LAN KK!!%s,^S%HIK #>37<B**=9rd   c                     | j                  |      }| j                  j                  |      }| j                  j	                  t        || dt        |       d| d             y )Nr  r  r  )r  r   r  r  r  r)   r   )r   r  r   r`   r   s        rb   store_reductionzCppKernel.store_reductionj  s]    $$U+iit$''#aE(:';4waHI	
rd   c                    | j                   ri| j                   t        |      t        |      z   k(  s+J | j                    dt        |       dt        |              | j                  t        |      k(  sJ t        |      t        |      z   | _         | j                   D cg c]  }| j	                  |       c}| _        t        t        | j
                              D cg c]  }t        d|        c}| _        t        |      | _        | j                  d | j                   | j                  | j                  d  fS c c}w c c}w )NrO  r~   r   )	r  r   r  r   r  r  r  r    r3  )r   lengthsreduction_lengthsr   ns        rb   
set_rangeszCppKernel.set_rangesq  s:   ##uW~!9 (  V""#4g'7s5AR;S:TUV  ''3w<777$W~6G0HHD<@<L<LM<Lq4//2<LMDK<A#dkkBR<ST<Sq\AaS'2<STDM#&w<D MM0D001MM$..01
 	
 NTs    D? Ec                 ~    t         j                  j                  j                  t	        | j
                        d      S )N    fallback)r"   r  sizevars	size_hintr   r  r  s    rb   r  zCppKernel.size_hint  s4    ww))$**+d * 
 	
rd   c           	        	
 t               
| j                  J | j                  | j                  d j                          
      }t	        j
                         5 }|rDj                         rj                          nj                  
       j                  |       n4
dkD  r/j                         r|j                  j                                fddd	d	dt        t           f	
fdd	dt        ffd|j                  j                                j                  r j                         n j                          d d d        y # 1 sw Y   y xY w)
Nr#   c                     t        j                         5 }| sJ t        | d      rKj                  | j                         | j                         |j                  j                                j                  | j                         j                  | j                         j                  | j                         d d d        t        | d      rj                  | j                         y y # 1 sw Y   2xY w)Ncodegen_inner_loops)
contextlib	ExitStackr-  r  r  r  enter_contextr  r  r  r  r  )r2  stackr  s     rb   
gen_kernelz0CppKernel.codegen_loops_impl.<locals>.gen_kernel  s    ))+u!M6v'<=FOO42248++DKKM:KK-KK/KK. , 6#89KK 1 12 : ,+s   B-C44C=c                 ~    | D ]8  }|j                         D ]#  }|r|j                  c c S |j                  c c S  : y r   )get_kernelsr  r  )loops	is_suffixloopr2  s       rb   get_reduction_code_bufferz?CppKernel.codegen_loops_impl.<locals>.get_reduction_code_buffer  s?    !D"&"2"2"4$#)#:#::#)#:#::	 #5 " rd   r  c                 8   t        j                         5 }| r| d   }|j                         r>|s< | d      }|r|j                  j	                                j                  |       j                         r|j                  r
j                  	       | D ]  } ||        | r\| d   }j                         r|j                  r
j                          |j                         r|sj                   | d             d d d        y # 1 sw Y   y xY w)Nr   F)r  T)	r  r  is_reductionr  r  r  is_reduction_onlyparallelclose)r  in_reductionstack_outerr   r  r  gen_loopr  	loop_nestr   worksharings        rb   	gen_loopsz/CppKernel.codegen_loops_impl.<locals>.gen_loops  s    ))+{$Qx,,.|/H %0,  0 + 9 9$++- H KK(89$668T]]'009 % |4 !& $Qx$668T]]'--/,,.| KK 9%4 P+ ,++s   C1DDr   c                    t        j                         5 }| j                         }|
	 d d d        y j                  |       |j	                  j                                | j                  r" | j                  | j                                n+| j                         }t        |      dk(  sJ  |d          d d d        y # 1 sw Y   y xY w)Nr#   r   )
r  r  linesr  r  r  innerr  r  r   )r   r  r  
loop_lineskernelsr  r  r  s        rb   r	  z.CppKernel.codegen_loops_impl.<locals>.gen_loop  s    ))+u!%J!) ,+ OOJ/''6zz!$**d.?.?.AB"&"2"2"4"7|q000"71:. ,++s   CB	CCT)F)r   r  decide_parallel_depthmax_parallel_depthr  r  r  r  r  mark_parallelsingler  r  r   	LoopLevelrootr2  )r   r
  r  r  	par_depthr  r  r	  r  r  r   s    ```  @@@@@rb   codegen_loops_implzCppKernel.codegen_loops_impl  s&   &(+++..=y;;=>
	 !!#u..0%%'((1''	21%%'''63i  4/y / .~~)..)9++,_ $##s   "C<E''E0c                 T    t         j                  |       }| j                  |||       y r   )LoopNestWithSplitbuildr  )r   r  r  r
  s       rb   codegen_loopszCppKernel.codegen_loops  s$    %++D1		4=rd   c                      y)NTORCH_CHECKr   r  s    rb   assert_functionzCppKernel.assert_function  s    rd   c                 p   | j                         }d}d}|D ]m  }t        j                  j                  j                  |d      }|d|z  k\  s||k(  r n3||z  t        j
                  j                  k  r n|dz  }||z  }||z  }o t        j
                  j                  r|dk(  rt        |      dkD  rd}|S )Nr#   r   r  r  r   )	r  r"   r  r  r   r   min_chunk_sizer  r   )r   r  r   seqpardepthr   hints           rb   r  zCppKernel.decide_parallel_depth  s    nnD77##--dT-BDa'k!SG^g~

 9 99QJE4KC4KC  ::%%%1*VqErd   c              #     K   | j                   | j                  | j                  | j                  f}t	               | _         t	               | _        t	               | _        | j                  j                         | _        d  | j                  j                  | j                          | j                  j                  | j                         | j                  j                  | j                         |\  | _         | _        | _        | _        y wr   )r  r  r  r5  r,   cloner  r  )r   r  s     rb   write_to_suffixzCppKernel.write_to_suffix  s     T\\4;;A#%
%'$&88>>#$$TZZ0$$T\\2$$T[[1<A9T\4;s   D
Dc                     t        |i |S r   )r  )r   r   r.  s      rb   create_cse_varzCppKernel.create_cse_var  s    t.v..rd   )r#   r   r   ) r   r   r   r@  	overridesr   sexprr  r  r   r  contextmanagerr  r   r;  r  rM   r  r"  r  r  r  r  r  r  r  propertyr!  r  r*  r,  r=  r>  s   @rb   r  r    s    IEMF@ 
$ 
$ BCZZ2%** 2 2 UZZ 86p

 

U-n>   * 
B 
B/rd   r  c                        e Zd ZeZddej                  f fd	Zdede	j                  f fdZd ZddZd	 Zd
 ZdefdZ xZS )r!  r   r-  c                     t         |   ||       t        j                         sJ |dk(  r$t        j                         j	                  |      }|| _        || _        t        xj                  dz  c_        y )Nr   rn   r#   )	r  r   r   pick_vec_isa	nelementstiling_factor
tiling_idxr   generated_cpp_vec_kernel_count)r   r   r  r7  r8  tiling_dtyper   s         rb   r   zCppVecKernel.__init__  sj     	{+%%'''A%224>>\>RM*$..!3.rd   r  r   c           	          t               } j                  j                  |      } j                  |      }t        j
                  j                  |      } j                   j                     |j                         }|t        j                  t        j                  fv xr |j                   } j                  rd j                   dnd }| xr t        |      dk7  xs t!         fd|j"                  D              }	|r| dt%        |       dn| dt%        |       }
|	rdn|
}|rt&         Q  ||      }||_        |S |t        j                  fv r|j                  r|r	d	| d
| dnd| d}nK|rd| d}nB|t,        v r)|r	d	| d
| dndt.        |    d| d
 j0                   d}n|r	d	| d
| dnd| d}|	r|rdn	t.        |    } j0                   }|t,        v r|dz  }d| d| d}t3         d      } j5                  | j                  |      }d| d| d j0                   d| d	}| dt%        |       d}|rd| d}|d| d| dz  }d | d!| d"| d#} j6                  j9                   j:                  |      }|j=                  d$||fi        t?        |t@              sJ d%|_!        |S )&NrN  rZ   r#   c              3      K   | ]Q  }|j                   j                  d       r4j                  j                  |j                      j	                         S yw)tmpN)r  
startswithr5  r6  r:  )r'  r7  r   
tiling_vars     rb   r)  z$CppVecKernel.load.<locals>.<genexpr>,  sI      +A66$$U+ $$QVV,77
C+s   AAr  r  r~   tmpbufzmasked_load(r   /at::vec::Vectorized<uint8_t>::loadu_one_fourth(zflag_to_float_vec(rq   	>::loadu("at::vec::Vectorized<float>::loadu(r2   z * 2z__at_align__  tmpbuf[z];_innerr  r  
for (long  = 0; rU  r  ++) zflag_to_float_scalar(ztmpbuf[r  r  z	([&]() { r   z return z; })()r"  T)"r  r   r  r  r"   r  r  r3  r8  hasri   r8   r  is_load_uint8_as_floatr  r   rI   r1  r   r  r"  rn   rh   rk   r7  r    r  r5  r  r  r/  r[   r  r  )r   r  r   r   r   rn   is_broadcastis_mask	load_masknon_contiguousvar_exprloadbufr  r  
tmpbuftype
tmpbufsizetmpbufdeclarer  r   tmpbufdefinerhsr?  r   s   `                    @rb   r"  zCppVecKernel.load  sg   '?'Aiiood#$$U+!!$']]4??3
 99Z00ejj%++..Uw7U7U3U 	 <@??nT__$5Q7PT	 2*e,1 ++  	  e1['(*5K./0 	
 -((W\$.F FLMu{{m#(F(F  wir)A6FwiqQ 
 'y2Dm#  wir)A6+L,?+@	'RTUYUgUgThhij   wir)A69'!D  $+L4G3HJ ../J%f$
+J<x
|2NM J<v!67E444??5 5 I UG6%D4F4F3Gr%PTU  E;y12!4C-cU!4geWDQ77La~XdV7SD""4::t4ftUmR8&.111rd   c                 ~   t        |t              s#t        |t              r|j                  sJ |       | j                  | j
                     }|j                  |      sJ | dt        |       }t        ||      dk7  xs d| v }|rd}|t        j                  k(  r	| d| d}n| d| d| j                   d}|rt        | d      }	| j                  || j
                  |		      }
| j                   d
t        |    d}dt        |    d| d| d|	 d|	 d| j                   d|	 d| dt        |
       d|	 d}|S )z
        Get a store line str that stores `value` into `var` at `index` of `dtype`.
        :param value: Vectorized type templaterized on `dtype`.
        :param var: buffer to store into.
        :index: index into the `var`.
        r~   r#   r=  r@  .store(r  r   rE  rF  z*sizeof(float)/sizeof(rZ   z{ __at_align__ rD  z]; z for (long rH  rU  r  rI  r  z] = tmpbuf[z]; })r[   rM   r  r  r3  r8  rJ  r   r   ri   r2   r7  r    r  rk   )r   r`   r   r   rn   r?  rP  rO  r  r  r   tmp_bufsizes               rb   get_vec_store_linezCppVecKernel.get_vec_store_linek  s    %%un-%,,		 
 ]]4??3
yy$$$U#k%012":u5:Qe%>QHEKKWGH:R0DWGH:R0B0B/C2FD J<v!67E444??5 5 I %%&&<\%=P<QQRS  #<#6"7x}CPTv V"G6%D4F4F3Gr%PT%qY/0E7%I 
 rd   c                    d|v sJ |J t        |t              sJ |       |j                  s| j                  |      }t	               }| j
                  j                  |      }| j                  |      }| j                  j                  t        || j                  |||t        j                  j                  |                         y )Nr  )r[   r  r  r"  r  r   r  r  r  r  r)   rZ  r"   r  r  )r   r  r   r`   r  r   r   s          rb   r  zCppVecKernel.store  s    }}||%07%70||NN5)E'?'Aiit$$$U+''sE177;L;LT;RS	
rd   c                 .   |dv sJ |t         j                  k(  sJ |t         j                  k(  sJ t        |t              r|j                  sJ |       d}| dt
        |    d}t        ||      }t        ||      }||f| j                  vrkt        |   t        vrD| j                  j                  dt        |    d| dt        |dd	       d
t        ||       d	       t        |   | j                  ||f<   ||f| j                  vr\| j                  j                  dt        |    d| dt        |dd	       d
t!        ||       d	       t        |   | j                  ||f<   |||f}	|	| j"                  j$                  v r| j"                  j$                  |	   S | j"                  j'                  | j(                  d|	 d      }
|
 d}|| j*                  |<   | j                  j-                  | d|
 dt        ||       d       | j                  j-                  | d| dt!        ||       d       | j.                  j-                  | dt        |||       d       | j0                  | j2                  k\  rtt5        |      rd| d}n3dt        |dd      z   dz   }| dt
        |    d}| d| d| d| d| d
}| j6                  j-                  |
 dt        ||
|       d       |
}n|}t9        ||      }|| j"                  j$                  |	<   |S )N>   r@   rA   rF   rG   rH   rJ   rK   zat::vecz::Vectorized<rr   z#pragma omp declare reduction(r  z:omp_out = r  r  z) initializer(omp_priv={z})
            r  Fr  _vecr   r  r  zwelford_vec_reduce_all(rZ   z	{ return r   r  z; }z::vec_reduce_all<z([](z& x, z& y) r   )ri   r2   r[   r  r  rk   rz   r|   r  r  r  r  r  r   ro   r   rw   r  r  r  r  r  r  r  r8  r  r   r  r   )r   rn   r   rm   r`   vec_nsvecr  acc_type_vecr  r   acc_vecr   reduce_all_bodyvec_reduce_all_funcr   r  s                    rb   r  zCppVecKernel.reduction  s    "
 
 	
 
 ###EKK'''%0U\\H5HAl5&9%:!<%ne<-neDH%T-C-CCN+3DD%%,,n az *^YA
B C&~u=> ?	 @L@D"">8#;< L)1G1GG!!((n a~ . HE
F G*>5AB C	 DPDD""><#?@ ">58D..>>>%%55mDD  ))JJ*]O4E * 
 E,*8w'''j#c."G!HJ	
 	''nAgYc*<^U*S)TTUV	
 	is0%PQQRS	

 ??d222#N36wiqA
  +NCEF  
 *00A,uBUAVVW&X# 34DU3%u_L]]_`g_hhij
!!++%s,^S*MNaP FF">6:<B**=9rd   c                    | j                  |      }| j                  j                  |      }t        j                  j                  |      }t        j                  }| j                  | j                  k\  rB| j                  j                  t        || dt        |       dt        |    d| d             y g }||k7  r[|t        v rB|t        j                  k(  r/t        |    d| }t        |d| dt        |    d| d      g}|}nt!        d| d	|       |t        || j#                  ||||            gz  }| j                  j%                  |       y )
Nr  z] = static_cast<rY   r  _r  z = cvt_fp32_to_lowp_fp<z Unsupported reduction type from z to )r  r   r  r"   r  r  ri   r2   r8  r  r  r  r)   r   rk   rh   rl   rZ  r  )	r   r  r   r`   r   	out_dtypern   store_lines_lowp_fp_tmpvar_vecs	            rb   r  zCppVecKernel.store_reduction  s}   $$U+iit$GG%%d+	??d222!!++e1[/00@iAX@YY[\a[bbde KE!-%5;;2F-9)-D,EQug*N'$ #$7#88OP\]fPgOhhjkpjqqst#K 0E(:5'i[Q  ++E3yI K !!,,[9rd   
scalar_varc                 2   |j                   s%| j                  | j                     |j                  vsJ |j                  t
        j                  k(  r5| j                  j                  | j                  d|j                   d      }nV|j                  J | j                  j                  | j                  dt        |j                      d|j                   d      }t        |t              sJ |j                  |_        |j                  |_        d|_         |S )NrN  rZ   rq   rY   T)r  r3  r8  r  rn   ri   r8   r5  r  r  r  rk   r[   r  )r   ri  vec_vars      rb   r"  zCppVecKernel.broadcast$  s    !!doo.j6S6SS	
T uzz)hh''z.?qAG ##///hh''&|J4D4D'E&FbHYYZ[G '>222"((%/%B%B"rd   r   )r   r   r   r  r.  ri   r2   r   rM   r   r;  r"  rZ  r  r  r  r  r"  r=  r>  s   @rb   r!  r!    s[    I [[4 K KUZZ KZ#J
"Zx%:NN rd   r!  c                   x     e Zd ZdZ fdZd Zd Zd Zdede	j                  f fdZd fd		Zd
 Z fdZ xZS )CppTile2DKernelan  
    A vector kernel that handles the 2d tiles with the tile size defined in `tiling_factor` on
    the inner-most loop level and one of the outer loop level (`outer_tiling_idx`). When the data
    tile is accessed in a contiguous way from the outer loop axis, a transposition is applied on the
    tile to make the access contiguous from the inner-most loop axis. Then, the same vectorization
    logic from its parent `CppVecKernel` is leveraged for load/store/compute. The transposed tile load
    and store are generated into kernel.preloads and kernel.poststores buffers.

    The loop structure looks like below:
    for ...
      for i_outer ...
        for ...
          for inner_most ...
            // generated by CppTile2DKernel
            float tmp0[16*16]; at::vec::transpose_mxn<...>(tmp0, in_ptr0 + ..., ...); // into kernel.preloads
            float tmp1[16*16]; // into kernel.preloads
            for i_inner ... { // the kernel inner loop
              vectorized loads/compute/stores (e.g., load tmp0, store tmp1) // into kernel.loads/compute/stores
            }
            at::vec::transpose_mxn(out_ptr0 + ..., tmp1, ...) // into kernel.poststores
          for inner_most ... (tail)
            // generated by CppVecKernel
            ...
      for i_outer ... (tail)
        for ...
          for ...
            // generated by CppKernel
            ...
    c                 @    t         |   ||||d   |       || _        y r   )r  r   tiling_indices)r   r   r  r7  ro  r:  r   s         rb   r   zCppTile2DKernel.__init__Y  s*    +}nQ.?	
 -rd   c                 L    t        | j                  | j                      d      S )NrE  )r    r3  	outer_idxr  s    rb   inner_itervarzCppTile2DKernel.inner_itervar_  s"    t}}T^^<=VDEErd   c                    t        | j                  | j                     |      dk(  xr |j                  | j                  | j                           xr t        | j                  | j                     |      j                  | j                  | j                            xrJ t        | j                  | j                     |      j                  | j                  | j                            S r   )r   r3  rq  rJ  r8  r  s     rb   need_vec_transposez"CppTile2DKernel.need_vec_transposeb  s    dmmDNN3U;q@ 		$--89dmmDOO<eDHHdoo. 
 dmmDOO<eDHHdnn- 		
rd   c                 P   t         j                  j                  |      }| j                  }| dt	        |       }d}t	        t        | j                  | j                     |             }	| }
|r||}}|
|	}
}	d}dt        |    d| d| d| d|	 d| d|
 d}|r| j                  j                         }n\|| j                  j                  vr)| j                  j                  | j                  |d	
      }nd	}| j                  j                  |   }|r3t        |    d| d| d| d| d
}| j                  j                  |       |j                  dt!        |            }|r'| j"                  j                  t%        ||             |S | j                  j                  |       |S )Nr~   __place_holder__Tzat::vec::transpose_mxn<,rY   r   r  Fr  r   r  r=   z] __attribute__ ((aligned (z)));)r"   r  r  r7  r   r   r3  r8  rk   r5  r  cacher  r  r  replacerM   r  r)   )r   r  r   r   is_storern   factorsrcdstld_srcld_dstneed_defineload_or_storetile_vardefine_lines                  rb   gen_transposed_tile_load_storez.CppTile2DKernel.gen_transposed_tile_load_storen  s   !!$'##SU+,- 	$--*H% PQR8CC#VFF1,u2E1FaxqQWPXXZ[^Z__abhaiiklokpprsyrzz|}xx(H$((..0xx((U(SHKxx~~m4H)%018*AfXQvhNijpiqquvKMM##K0%--.@#h-POO%%l4&GH  MM##M2rd   r  r   c                 <   t               }| j                  j                  |      }| j                  |      }| j	                         }| j                  |      r| j                  |||d      }| dt        || j                  z         }t        j                  j                  |      }|t        v rdt        |    d| d| j                   d}	nIt        j                  j                  |      t        j                  fv r|j                   rd| d}	nd	| d}	| j"                  j%                  | j&                  |	      }
|
j)                  d
||fi        t+        |
t,              sJ d|
_        |
S | j1                  || j2                  |      }t4        | m  ||      S )NFrz  r~   rq   rB  r   rZ   rA  rC  r"  TrF  )r  r   r  r  rr  rt  r  r   r7  r"   r  r  rh   rk   ri   r  rK  r5  r  r  r/  r[   r  r  r  rq  r  r"  )r   r  r   r   r   r  r  rQ  rn   r  r  r   r   s               rb   r"  zCppTile2DKernel.load  s   '?'Aiiood#$$U+""$""5)::c55 ; H "
#k%$:L:L2L&M%NOGGG%%d+E%-l5.A-B)G9TVW[WiWiVjjkl!!$'EKK=822H	QRS;G9AFXX&&tzz48F!!&4-<fn555 FMM44 NN 5 I
 7<i00rd   c                    d|v sJ t               }| j                  j                  |      }| j                         }| j	                  |      }|J | j                  |      r| j                  |||d      }| dt        || j                  z         }	t        j                  j                  |      t        v r| d|	 d| j                   d}
nNt        j                  j                  |      t        j                  fv r| d|	 d| j                   d}
n| d|	 d}
| j                  j!                  t#        ||
             y | j%                  || j&                  |      }t(        | U  ||||       y )	Nr  Tr  r~   rX  r   r  rF  )r  r   r  rr  r  rt  r  r   r7  r"   r  r  rh   ri   r  r  r  r)   r  rq  r  r  )r   r  r   r`   r  r   r   r  r  storebufr  r   r   s               rb   r  zCppTile2DKernel.store  su   }}'?'Aiit$""$$$U+||""5)::c54 ; H #3{54;M;M3M'N&OPHww  &-7zD4F4F3GrJ""4(U[[M9zD4F4F3GrJz4KK!!,tT":;44 NN 5 I
 GM$	5$7rd   c                 t    | j                         }|j                  d| d| d| j                   d| d	       y )NrG  rH  rU  r  z++))rr  r  r7  )r   r  r  s      rb   r  z#CppTile2DKernel.codegen_inner_loops  sA    ""$veWC0B0B/C2eWCP	
rd   c                     t         |   ||      }| j                  d   | j                  k  r| j                  nt	        | j                        \  | _        | _        |S r   )r  r  ro  r  reversedrq  r8  )r   groupreduction_groupr0  r   s       rb   r  zCppTile2DKernel.set_ranges  s\    w!%9 ""1%(<(<< $--. 	(
 rd   r   )r   r   r   r  r   rr  rt  r  rM   r   r;  r"  r  r  r  r=  r>  s   @rb   rm  rm  :  sJ    <-F

 D!1 !1UZZ !1F8:
 rd   rm  c                       e Zd Zd fd	ZddZdedeej                  j                  df   fdZ
dedeej                  j                  df   fdZded	ej                  j                  fd
Zdedej                  fdZdedej                  fdZddZd Zd Zdej                  j                  fdZd Zd Z xZS )CppVecKernelCheckerc                    t         |   ||||       t        xj                  dz  c_        t        xj                  dz  c_        d | _        d| _        g | _        t        j                  j                         D ]1  \  }}t        |t              s| j                  j                  |       3 t        j                         | _        t"        j$                  t"        j&                  t"        j(                  t"        j*                  t"        j,                  g| _        t"        j$                  t"        j&                  t"        j(                  t"        j,                  g| _        g | _        t"        j4                  | _        y )Nr#   T)r  r   r   generated_kernel_countr9  _orig_wrapper_codesimd_vecfast_vec_listr  __dict__r1  r[   r  r#  r  r  
exit_stackri   r2   r9   r  r8   r  load_supported_dtypesstore_supported_dtypesstore_dtypesrj   	vec_dtype)r   r   r  r7  r8  kvr   s          rb   r   zCppVecKernelChecker.__init__  s   {M:F 	&&!+&..!3. #'#,,224DAq!\*""))!, 5 %..0 KKNNMMJJKK9
" KKNNMMKK	:
# 02&+mmrd   Nc                     t         j                  t        j                        rt         j	                  d|       d| _        y )NzDisabled vectorization: %sF)schedule_logisEnabledForloggingDEBUGdebugr  )r   msgs     rb   disable_veczCppVecKernelChecker.disable_vec
  s,    $$W]]3;SArd   r  usersc                    t         j                  j                  |      }|t        j                  k(  r t        d |j                         D              S |t        j                  k(  ru	 t        d |j                         D              sy|j                         D ]?  }|j                  dk(  sJ t        d |j                  j                         D              r? y yy)Nc              3   8   K   | ]  }|j                   d v   yw)r  r  Ntargetr'  users     rb   r)  z.CppVecKernelChecker.is_mask.<locals>.<genexpr>  s     Sldt{{&99l   c              3   ~   K   | ]5  }|j                   d k(  xr  |j                  d   t        j                  k(   7 yw)rP  r-  N)r  r   ri   r8   r  s     rb   r)  z.CppVecKernelChecker.is_mask.<locals>.<genexpr>  s:      (D z)Idiimuzz.II(s   ;=FrP  c              3   8   K   | ]  }|j                   d v   ywr  r  r  s     rb   r)  z.CppVecKernelChecker.is_mask.<locals>.<genexpr>   s"       : KK#66 :r  T)
r"   r  r  ri   r8   allkeysr  r  r  )r   r  r  	load_typeto_dtype_nodes        rb   rM  zCppVecKernelChecker.is_mask  s    GG%%d+	

"SejjlSSS%++%  !JJL  !&$++z999  - 3 3 8 8 :  ! ". rd   c                    t         j                  j                  |      }|t        j                  uryt        |      dk(  rEt        t        |            }|j                  dk(  r!|j                  d   t        j                  k(  ryyy)z
        Check:
        1. load_type is torch.uint8
        2. has 1 user node of target to_dtype
        3. dtype of to_dtype is torch.float
        Fr#   rP  r-  T)r"   r  r  ri   r  r   nextiterr  r   r2   )r   r  r  r  r  s        rb   rK  z*CppVecKernelChecker.is_load_uint8_as_float)  sh     GG%%d+	EKK'u:?U$Dz)		"0Lrd   	store_var
value_nodec                     t         j                  j                  |      }|t        j                  fvry|j
                  dk(  r!|j                  d   t        j                  k(  ryy)z
        Check:
        1. store_type is torch.uint8
        2. value_node is of target to_dtype
        3. dtype of to_dtype node is torch.uint8
        FrP  r-  T)r"   r  r  ri   r  r  r   )r   r  r  
store_types       rb   can_store_fp32_as_uint8z+CppVecKernelChecker.can_store_fp32_as_uint8:  sQ     WW&&y1
ekk]*
*zr/Bekk/Qrd   r   c                    t         j                  j                  |      }t         j                  j                  |      }|t        j
                  t        j                  fv xra t        |t              xrO t        |j                  t              xr3 t        |j                  j                  j                        dk(  xr |dk(  S Nr   )r"   r  r  
get_bufferri   int32int64r[   r   datar   r   layoutsize)r   r  r   
load_dtypebuffers        rb   is_load_integer_scalar_tensorz1CppVecKernelChecker.is_load_integer_scalar_tensorI  s    WW&&t,
##D)5;;44 69-6;;
3 V[['',,-2 !	
rd   c                    t        t              5 }t        j                  j	                  |      }|j                         }|sJ ||_        | j                  ||j                         j                        |_
        | j                  ||j                         j                        |_        | j                  j                         }t        | j                        dk(  r| j!                  d       |cd d d        S |t"        j$                  t"        j&                  fv rd|j                  sX|j                  sL|j                  s| j!                  | d       n |j                  s| j!                  | d       |cd d d        S || j(                  vrY| j+                  ||      sG|j-                  | j                  | j.                           r| j!                  | d       |cd d d        S |cd d d        S # 1 sw Y   y xY w)Nr   
not a loopz not loaded as maskz not loaded as floatz not supported by load)r   r   r"   r  r  r  rn   rM  r  r  is_load_as_maskrK  r5  r  r   r3  r  ri   r8   r  r  r  rJ  r8  )r   r  r   node_ctxr  r   r   s          rb   r"  zCppVecKernelChecker.loadT  s   &x0H**40J+3+?+?+AGN7&GM&*ll49M9M9O9U9U&VG#-1-H-Hh**,22.G* ((//#C4==!Q&  . 10  ejj%++66''7+I+I..$$
|3F%GH 77$$
|3G%HI/ 104 4#=#==::4GIIdmmDOO<=  J</E!FG? 10B C 100s    CG/7A<G/=AG/$G//G8c                    t        t              5 }t        | j                        dk(  r&| j	                  d       | j
                  cd d d        S t        j                  j                  |      }|j                         }|sJ ||_
        |t        j                  k(  rt        j                  n|}| j                  j                  |       || j                   vr)| j	                  | d       | j
                  cd d d        S |t        j"                  fv rU|j%                         j&                  d   }| j)                  ||      s&| j	                  d       | j
                  cd d d        S d|v sJ | j+                  |      }|r)| j	                  d|        | j
                  cd d d        S |j,                  r| j	                  d|        | j
                  cd d d        S # 1 sw Y   y xY w)	Nr   r  z not supported by storer-  z"not support store float32 as uint8r  zstore mode: zconstant store index: )r   r   r   r3  r  r  r"   r  r  r  rn   ri   rj   r2   r  r#  r  r  r  all_input_nodesr  r  	is_number)	r   r  r   r`   r  r  store_dtyper   r  s	            rb   r  zCppVecKernelChecker.storex  s   &x0H4==!Q&  .}} 10
 ''++D1K+3+?+?+AGN7'GM)4)E%++;K$$[1$"="==  K=0G!HI}} 10" u{{m+%113CCBG
33D*E$$%IJ==+ 10. D= =((/E  <v!67}}9 10<   #9%!AB==A 100s%   5GB'G AG(8G*+GG(c                     |t         j                  k(  r|t         j                  k(  r	|t        v rn| j                  d| d| d|        t	        |      rt        | j                  gdz        S | j                  S )Nzreduction: dtype z, src_dtype z, reduction_type    )ri   r2   VECTORIZABLE_RTYPESr  r   r   r  )r   rn   r   rm   r`   s        rb   r  zCppVecKernelChecker.reduction  sw    U[[ U[[("55#E7,ykARSaRbc  /$--1,--}}rd   c                     | j                   S r   )r  )r   r  r   r`   s       rb   r  z#CppVecKernelChecker.store_reduction  s    }}rd   r  c                 <   d dt         j                  j                  ffd}t        |j                        dk\  sJ t        |j                  d         t        t        fv ryt        |j                  d         t        t        fv ry ||      \  }}||y||k(  S )	Nc                     t        |       t        j                  j                  k(  rt	               }|r|j
                  S d S y r   )r  ri   r  r  r  rn   )r  r   s     rb   get_node_dtypez<CppVecKernelChecker.is_supported_cmp.<locals>.get_node_dtype  s2    DzUXX]]*/G/I(/w}}9T9rd   r  c                 X     | j                   d          | j                   d         fS )Nr-  r   )r  r  s    rb   get_cmp_dtypesz<CppVecKernelChecker.is_supported_cmp.<locals>.get_cmp_dtypes  s(    !$))B-0.22OOOrd   r   r-  Tr  F)ri   r  r  r   r   r  r6   r2   )r   r  r  
left_dtyperight_dtyper  s        @rb   is_supported_cmpz$CppVecKernelChecker.is_supported_cmp  s    		P 	P 499~"""		"3,.		"3,."0"6
K!4,,rd   c                     | j                   J | j                   t        j                  _        | j                  j                  |||       y r   )r  r"   r  wrapper_coder  r
  r  s       rb   r
  zCppVecKernelChecker.__exit__  s<    &&222#66  7F;rd   c                 ^    t         j                  j                   _        t	               t         j                  _         G  fdd       j
                  j                  t        j                                        j
                  j                  t        j                                 S )Nc                      e Zd Zg dZefd       Ze fd       Zededej                  ffd       Z
edfd	       Zefd	       Zefd
       Ze fd       Ze fd       Zedd       Zefd       Zedfd	       Zy)6CppVecKernelChecker.__enter__.<locals>.VecCheckerProxy)rP  rS  r[  r]  rV  rY  c                     t         j                  j                  }j                  |      sj	                  d|        j
                  S )Nzbinary comparison op: )r"   r   r   r  r  r  )r   r  r   r   s      rb   _bin_cmp_opzBCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy._bin_cmp_op  s@    ./mm.H.H,,\:$$'=l^%LM}}$rd   c                       fd}|S )Nc                      j                   v rj                  | |      S j                  vrj                  d        j                  S )Nzop: )bin_cmp_opsr  r  r  r  )r   r.  VecCheckerProxyr  r   s     rb   r  zQCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.__getattr__.<locals>.inner  sP    :::.::4HH4#5#55((4v7==(rd   r   )r  r  r  r   s   ` rb   r%  zBCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.__getattr__  s    ) rd   r  r   c                 (    j                  | |      S r   r"  )r  r   r   s     rb   r"  z;CppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.load  s    yyu--rd   Nc                 .    j                  | |||      S )N)r  r  )r  r   r`   r  r   s       rb   r  z<CppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.store  s    zz$u4z@@rd   c                 ,    j                  | |||      S r   )r  )rn   r   rm   r`   r   s       rb   r  z@CppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.reduction  s    ~~eYNNrd   c                 *    j                  | ||      S r   )r  )r  r   r`   r   s      rb   r  zFCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.store_reduction  s    ++D%??rd   c                    t        t              5 }|j                         }|sJ t        j                  t        j
                        }|t        j                  k(  r3| |j                  k  r$| |j                  k\  rt        j
                  |_	        t        j                  t        j                        }|t        j                  k(  rZ| |j                  k  r| |j                  k\  s'| t        j                  k(  s| t        j                   k(  rt        j                  |_	        t        j                  t        j
                  t        j                  t        j                  g}|j                  |vsE|j                  t        j
                  k(  rFt!        fd|j"                  j$                  D              sj'                  d|j                          | cd d d        S # 1 sw Y   y xY w)Nc              3   N   K   | ]  }|j                   j                  v   y wr   r  r  r'  r  r  s     rb   r)  zRCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.constant.<locals>.<genexpr>	  s(      $(C !KK?+F+FF(C   "%zconstant dtype: )r   r   r  ri   iinfor  r  r@   rA   rn   finforj   r3   rV   r9   r  r  r   r  r  )	r  rn   r  r   	i32_iinfo	f32_iinfosupported_dtypesr  r   s	          rb   r  z?CppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.constant  sQ   .x8H3;3G3G3IG"N7 !&EKK 8I,9==09==0(- %EMM :I, IMM1cY]]6J #uyy 0 #		z 1,1MMGM 	($ }},<<4 # $(0(=(=(C(C$ !
 ((+;GMM?)KLM 988s   F>GG"c                     t        	j                        t        	j                        k(  sJ t        	j                        rt        d 	j                  D              s1	j	                  d  d|        	j
                  j                         S  	fd}t        t              5 }t        	j                        t        	j                        k(  sJ |j                         }|sJ |t        j                  k(  rE |       r>t        fd|j                  j                  D              rt        j                  |_        n||_        	j	                  d  d|        	j                  	j                      } j#                  |       }|s	j	                  d  d|        ||_        	j
                  j                         }|cd d d        S # 1 sw Y   y xY w)Nc              3      K   | ]@  }t        |t        j                         xs t        j                  |      j                   B y wr   )r[   r   r;  r   r  )r'  r  s     rb   r)  zTCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.index_expr.<locals>.<genexpr>*	  s=      3!, #5%**55X9N9X9XX!,s   AAzindex_expr: z, dtype c            
      <   t        j                        } t        j                  j                        D ci c]  \  }}|| v r|| }}}t        d |j                         D              ry|j                         D ci c]  \  }}|t        d|dz
         }}}|rt        |      t        |       k7  rQt        j                  t        j                        }j                  xr  |j                  k  xr |j                  k\  S t!        |      }t#        j$                  |j&                        st#        j$                  |j(                        ryt+        t        t-        |j&                        t-        |j(                        dz               S c c}}w c c}}w )Nc              3   &   K   | ]	  }|d k(    yw)r   Nr   )r'  r  s     rb   r)  zkCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.index_expr.<locals>.can_use_int32.<locals>.<genexpr>:	  s     :>a16>s   Tr   r#   F)r   r1  zipr3  r  rI   valuesr1  r   r   ri   r  r  r  r@   rA   r   r]   r  r\   upperr   r6   )	r1  r  r  sizesvars_rangesr  expr_rangesr   r   s	          rb   can_use_int32zXCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.index_expr.<locals>.can_use_int322	  se   #'(9(9#:L %(t{{$C$CDAq, 1$C   :5<<>::#HM"V11k!QU&;#;K"V&#k*:c,>O*O$)KK$<	 NN 6 $	 56 $	 5
 #.dK"@Kzz+"3"34

;CTCT8U$ 8# 1 12C8I8I4JQ4N / #Ws   FFc              3   N   K   | ]  }|j                   j                  v   y wr   r  r  s     rb   r)  zTCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.index_expr.<locals>.<genexpr>X	  s(       (C !KK?+F+FF(Cr  z"index_expr (tiling var relevant): )r   r  r3  r  r  r5  r  r   r   r  ri   r  r   r  r  rn   r8  rJ  is_most_inner_loop_irrevelant)
r   rn   r  r  r   r?  tiling_var_irrelevanttmp_varr  r   s
   `       rb   r#  zACppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.index_expr'	  s   4;;'3t}}+====4;;'s 3!%3 0
 $$|D6%%IJ88??,,> /x8Ht{{+s4==/AAAA3;3G3G3IG"N7,)O  (0(=(=(C(C  
 ).(-((<vXeW)MN!%t!?J040D,D)0((@hugV =RG9"hhoo/G"3 988s   'DGGc                 *    t        t        |             S r   )r    rM   )	index_varr  checks      rb   indirect_indexingzHCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.indirect_indexingl	  s    #C	N33rd   c                 F     |        j                   j                         S r   )r5  r  )r  r  r  r   s      rb   r  z=CppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.maskedp	  s    xx((rd   c                 4   t        t              5 }|j                         }|sJ |_        |j	                         }|j
                  d   }t        j                  k(  r{|j                  dv r|j                  dk(  r,t        j                  j                  |j                  d         n|j                  d   t        j                  t        j                  t        j                  t        j                  fv rnt        j                   t        j"                  fv r|j                  dk(  rt        j                  j%                  |j                  d         }t'        |t(              rGt'        |j*                  t,              r-t/        |j*                  j0                  j2                        dk(  sNj5                  d        n8j5                  d        n"t6        v rt9        d |j:                  D              sj5                  d       | cd d d        S |j:                  D cg c]  }|j                  d    }	}t9        fd	|	D              sj5                  d
       | cd d d        S t        j<                  k(  rnxt        j                  k(  rQt9        d |j:                  D              }
t9        d |j:                  D              }|
s+|s)j5                  d        nj5                  d        | cd d d        S c c}w # 1 sw Y   y xY w)Nr#   r  r"  r-  r   zto_dtype: dtype c              3   :   K   | ]  }|j                   d k(    ywr  Nr  r'  usrs     rb   r)  zRCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.to_dtype.<locals>.<genexpr>	  s     "SNS3::#8N   z9to_dtype: bfloat16/float16 expecting users are all storesc              3   b   K   | ]&  }t         j                  j                  |      fv  ( y wr   )r"   r  r  )r'  r  rn   s     rb   r)  zRCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.to_dtype.<locals>.<genexpr>	  s+      #KV4AGG--d3w>;s   ,/z7to_dtype: expecting all stores into bfloat16 or float16c              3   8   K   | ]  }|j                   d v   yw)r  Nr  r  s     rb   r)  zRCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.to_dtype.<locals>.<genexpr>	  s      4?MCJJ)3~r  c              3   |   K   | ]4  }|j                   d v xr  |j                  d   t        j                  k(   6 yw))rP  r   N)r  r   ri   rj   r  s     rb   r)  zRCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.to_dtype.<locals>.<genexpr>	  sE      7
 (6 !$

l : !A$'HHQK5==$@!A (6s   :<)r   r   r  rn   r  r  ri   r2   r  r"   r  r  r   r  r9   r  r  r  r  r[   r   r  r   r   r  r  r  rh   r  r  r8   )r   rn   r   r  r   cur_nodeinput_valuer  r  store_namesis_to_uint8_and_storeis_to_uint8_and_to_floatr   s    `          rb   rP  z?CppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.to_dtypeu	  s   .x8H3;3G3G3IG"N7$)GM'335H191I1I!1LK+&-- 2  $/#5#5#? !" 1 1+2B2B12E F%0%5%5b%9 "
  % % % % %	)   !% %%++u{{)C C$/$6$6&$@)*););K<L<LQ<O)P %/vy$A(26;;
(K(+FKK,>,>,C,C(D(I$($4$47Gw5O$P $ 0 03CE71K L-/""SHNN"SS ,, [ $%[ 98^ ?Gnn&Mnssxx{n&M" #KV#   !,, Y $%m 98n %**,%++- 14 4?G~~4 1-
 47 7
 (0~~7 40 !69Q ,,/?w-GH((+;E7)CD[ 98^ 'N_ 98s+   H LL*L	 (L2BL	LLr   r  )r   r   r   r  r  r  r%  rM   r   r;  r"  r  r  r  r  r#  r
  r  rP  )r  r   s   rb   r  r    s    >K% % 	 	 .3 .uzz . . A A O O @ @ ' 'R B# B#H 4 4 ) ) N Nrd   r  )	r"   r  r  r  r   r  r  set_ops_handlerset_kernel_handler)r   r  s   `@rb   r  zCppVecKernelChecker.__enter__  sw     #$''"6"6-/m	 m	^ 	%%a&7&78I&JK%%a&:&:4&@Ard   )r-  r   )r   r   r   r   r  rM   r   ri   r  r  rM  rK  r  r   r;  r  r"  r  r  r  r  r
  r  r=  r>  s   @rb   r  r    s    &4P
C UXX]]D-@(A 43 tEHHMM4<O7P " %((-- 	
# 	
ejj 	
" "UZZ "H!!F-UXX]] -4<{rd   r  c                   B     e Zd Z fdZd ZdefdZd Zd Zd Z	 xZ
S )CppKernelProxyc                     t         |   |j                  |j                  j                         || _        d | _        d | _        t        j                         | _
        y r   )r  r   r   wsr  kernel_groupr
  r  r   r5  picked_vec_isa)r   r   r   s     rb   r   zCppKernelProxy.__init__	  sJ    **LOO,G,GH(090F0F0Hrd   c                 `    |D ])  }t        |t              sJ t        j                  |       + y r   )r[   r   r(   propagate_scheduler_node)r   nodes_nodes      rb   data_type_propagationz$CppKernelProxy.data_type_propagation	  s*    Ee]33388? rd   scheduler_nodec                    t        |j                  t        j                        syd }t	        j
                  |       |j                  j                  gt        |j                  j                  j                               z   }|D ]  }|j                  j                  D ]  }|j                  dk(  s|j                  dv r!|j                  dvr  yt        |d      r|j                  rt         j"                  |j                  v sJ |j                  t         j"                     }|j$                  r|j$                  t&        vr  y|r||j$                  k(  rJ d       |j$                  }  y  ||_        y)NTplaceholder)	get_indexr#  )r"  r  r   r[  r  Fr  z+scheduler node do not support bf16/fp16 mix)r[   _bodyr   LoopBodyr(   r#  
root_blockr   	subblocksr  r  r$  opr  r-  r  r0   r   rn   rh   _lowp_fp_type)r   r'  r0  
sub_blocks	sub_blockr%  r   s          rb   is_lowp_fp_schedulerz#CppKernelProxy.is_lowp_fp_scheduler	  sQ   ...</3 	44^D$**556  **113:
 

 $I".. 88}, A 1  << (  !5&)ejj.22ejj@@@38::>Q>U>U3VG"==GMM,N$$)W]]:IHI: )0 ? / $D (5$rd   c                 $    dt         j                  j                  fddt        j                  ffd}t         fd|D              r|D ]  }|j                  j                  gt        |j                  j                  j                               z   }|D ]  }|j                  j                  D ]n  }|j                  dv s|j                  sJ t        j                   |j                  v sJ |j                  t        j                      }|j"                  t$        v rnJ    y |D ]f  }t'        |t(              sJ t'        |j                  t        j                        sJ |}dt(        fd}	 |	|       }
|
sS|j                  } ||       h y )	N	sub_graphc                 &   dt         j                  j                  fd}dt         j                  j                  fd}t        | j                        }g |D ]u  } ||      rt        d |j                  D              r)|j                  d   }| j                  |      5  | j                  d||t         j                  f      }|j                  }|j                  |       ||_        t        xj                  dz  c_        d d d         ||      r|j                  \  }}}	}
}	|
j                  d	k(  rt        d
 |
j                  D              rt        j                   j#                  |      }| j%                  |      5  | j                  d||
|f      }|j'                  |
|       t        xj                  dz  c_        d d d        v|j                  dk(  r|j                  \  }}}}}|t(        v s|t         j                  t         j*                  t         j,                  t         j.                  fv sJ ||t(        v rt         j                  n|t         j                  ||f|_        |j                  dk(  rP|j                  d   t(        v r;|j                  \  }}}	j1                  |       ||t         j                  f|_        vx dt         j                  j2                  ffd} ||        y # 1 sw Y   xY w# 1 sw Y   xY w)Nr  c                     | j                   dvryt        | j                        dk(  sJ t        j                  j                  | j                  d         }|t        v S )Nr  Fr  r#   )r  r   r   r"   r  r  rh   )r  r  s     rb   is_lowp_fp_loadzTCppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.is_lowp_fp_load
  sN    ;;h. 499~***WW..tyy|<
!]22rd   c                     | j                   dk7  ry| j                  \  }}}}}t        j                  j	                  |      }|t
        v S )Nr  F)r  r   r"   r  r  rh   )r  re  r  r  s       rb   is_lowp_fp_storezUCppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.is_lowp_fp_store
  sD    ;;') (,		%9aAgg//	:"m33rd   c              3   :   K   | ]  }|j                   d k(    ywr  r  r  s     rb   r)  zNCppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.<genexpr>
  s     Jkd4;;'1kr  r   rP  r  r#   r"  c              3   :   K   | ]  }|j                   d k(    ywr  r  r  s     rb   r)  zNCppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.<genexpr>-
  s      :;J4w.?r  r  r-  r5  c                 V    dt         j                  j                  ffd} ||        y )Nr5  c                 B   dt         j                  j                  fd}| j                  D cg c]  }|j                  dk(  s| }}|D cg c]  } ||      s||j
                  i }}|D ]  }|j                         D ]q  \  }| j                  v st        fd|D              sv s.t        d |D              sAj                  d   }j                  |       | j                         s  | j                  | j                          y y c c}w c c}w )Nto_nodec                 :    t        d | j                  D              S )Nc              3   :   K   | ]  }|j                   d k(    yw)rP  Nr  r  s     rb   r)  zCppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_to.<locals>.<genexpr>o
  s     "U}3::#;}r  )r  r  )r?  s    rb   _used_by_tozCppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_ton
  s    ""Uw}}"UUUrd   rP  c              3   \   K   | ]#  }|j                   d    j                   d    k(   % ywr-  Nr  )r'  r  r  s     rb   r)  zCppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<genexpr>z
  s'     #SUcCHHRLDIIbM$AUs   ),c              3   F   K   | ]  }|j                   d    t        v   ywrD  )r   rh   r  s     rb   r)  zCppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<genexpr>}
  s"      ,&QV#(EQVs   !r-  )ri   r  r  r$  r  r  r1  r  r  replace_all_uses_with
erase_nodeowning_modulelint)	r5  rB  r  all_to_nodesall_to_nodes_and_users
node_usersr  val_nodeto_lowp_fp_legalized_nodess	     `     rb   _eliminate_duplicate_to_nodez}CppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_nodeg
  s&   VUXX]] V *3$)8DKK:<U ! $ 8D.7Ct{SWGXtzz*| + . '=
+5+;+;+=KD%#y6 ##SU#S S$(,F$F(+ ,&QV,& )&
 ,0+?+?+C $ : :8 D ) 4 4T : ,> '=, !..6!( 79$.s   DDDD)ri   r  Graph)r5  rO  rN  s     rb   eliminate_to_dtypezWCppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.eliminate_to_dtypef
  s"    ')EHHNN ')R -Y7rd   )ri   r  r  r   r$  r  r  r   inserting_aftercall_methodr2   rF  r   cpp_to_dtype_countr  r"   r  r  inserting_beforereplace_input_withrh   r9   r  r  r#  rP  )r5  r8  r:  sub_graph_nodesr%  r!   to_type_nodeto_type_node_argsr  re  	value_varrn   r   rm   r`   r   rQ  rN  s                    @rb   add_to_dtypez;CppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype
  s   3ehhmm 34uxx}} 4 #9??3O)+&("5)JekkJJ **Q-C"2259'0'<'<&c5%++-F (= ( -9,=,=)33LA,=)22a72 :9 &e,16.Cq)Q ''61c :;D??: 7 !GG--d3E"33E:'0'<'<&c9e-D (= ( 00LI22a72 ;: \\[0 

!& M1  %!KK!NN!MM!KK	)       +0M+AEKKu!KK*!&
 \\Z/EJJrNm4S"'**KS!Q /55e<"%q%++!6EJQ )T*8ehhnn *8X y)a :9  ;:s    A"K9AL9L	L		loop_bodyc                     | j                   gt        | j                  j                               z   }|D ]  } |j                          y r   )r-  r   r.  r  r  )r\  r1  r2  r[  s      rb   _legalize_lowp_fpz@CppKernelProxy.legalize_lowp_fp_dtype.<locals>._legalize_lowp_fp
  s@    #../$y7J7J7Q7Q7S2TTJ'	Y__- (rd   c              3   d   K   | ]'  }t        |t              xr j                  |       ) y wr   )r[   r   r3  )r'  r%  r   s     rb   r)  z8CppKernelProxy.legalize_lowp_fp_dtype.<locals>.<genexpr>
  s3      
 um,Q1J1J51QQs   -0)r"  r  r  c                 b    | j                   j                  }t        |      dk(  xr
 d|v xr d|v S )Nr   r"  r  )read_writes	op_countsr   )r  rb  s     rb   is_memory_copy_scheduler_nodezLCppKernelProxy.legalize_lowp_fp_dtype.<locals>.is_memory_copy_scheduler_node
  s8     ,,66		Na'XFi,?XGyDXrd   )ri   r  rP  r   r,  r  r+  r-  r   r.  r  r  r$  r  r  r0   r   rn   rh   r[   r   )r   r$  r^  r%  r1  r2  fx_noder   r  rc  should_legalizer  r[  s   `           @rb   legalize_lowp_fp_dtypez%CppKernelProxy.legalize_lowp_fp_dtype

  si   G	*EHHNN G	*R	. 	.
  

 

 #kk445KK))0029 
 ",I#,??#8#8">>->>#*<</<#6#:#:gll#JJ#J;B<< 3 7 7<G $+==M#AA#A $9 ",	  Ee]333ekk2;;777"'DM  #@"EEO$(JJ!$' rd   c           	      h     j                          j                         t              dk\  sJ d   t        fdD              rj                  nt
        j                  } j                  t        d       j                  \  }\   j                         fd}fd |t              }t        j                  xj                  |j                  z  c_        t        j                  xj                  |j                  z  c_        t         j#                  |       _         j&                  sy  fdt
        j                  fd	t
        j(                  f fd
}t
        j*                  j,                  j/                  d      5   ||      \  }}t        |      t        |      k(  sJ t        |      dk(  rw j$                  j1                  |d   |d         \  }	}
|	j3                   |t4        |d   |d   |             |
j3                  |       d|	_        d|
_        |d   dz  |
_        nt        |      dk(  r|d   t         j<                        dz
  k(  r|d   |d   k(  sJ  j$                  j1                  |d   |d         \  }}|j3                  |       |j1                  |d   |d   z
  |d         \  }}|j3                   |t>        |d   ||             |j3                   |t4        |d   |d   |             d d d        y # 1 sw Y   y xY w)Nr#   r   c              3   l   K   | ]+  }t        |d       xr |j                  j                  k(   - yw)r0  N)r-  r0  )r'  r%  
first_nodes     rb   r)  z/CppKernelProxy.codegen_nodes.<locals>.<genexpr>
  sB       #E / D'':+C+CCD"s   14c                 4    t        | j                               S r   )r6   r  rT  s    rb   <lambda>z.CppKernelProxy.codegen_nodes.<locals>.<lambda>
  s    Q^^%5!6rd   r   c                      j                   | g| 5 } |       t        xj                  dz  c_        |cd d d        S # 1 sw Y   y xY wr   )
new_kernelr   r  )r4  r   r2  r   runs      rb   codegen_kernelz4CppKernelProxy.codegen_nodes.<locals>.codegen_kernel
  sC    (((4t4F ..!3. 544s   #AAc                 x   | j                        \  }}d}D ]  }|j                  d   fz   dffv r|rJ |j                  ||       4d}|j                  d   dfk(  sJ d|j                  d    d d        | j                         5  |j                  |d       d d d         y # 1 sw Y   xY w)NFr#   r   Tzunexpected group: r  r   )r  r  ro  r*  )r2  r0  reduction_vars	in_suffixr  r  r$  r  s        rb   ro  z)CppKernelProxy.codegen_nodes.<locals>.run
  s    #)#4#4UO#L D.I::a=O,_,b1%   )(=HHT>2 $I::a=-  Z ,DJJqM?$ugRGXYZ 
  //1r* 21  21s   B00B9	c            	         g } D ]q  }t        j                  |j                  g|j                   }| t	        j
                  |j                  |j                        D cg c]  }|j                   c}z  } s t               }g }t               }t               }| D ]   }|j                  D ]  }	t        j                  d|	j                        s$t        |	|      }
|
dk(  rO|j                  t!        |	j                  dd               |j#                  t!        |	j                  dd               t%        d |
j                  D              r(|j                  t!        |	j                  dd               |j                  t!        |	j                  dd                 ||z
  |z
  }t'        |      dk(  rt'        j(                        dz
  gS |rt+        |      dd  S ||z  |z
  }t+        |      }t'        |      dk(  r'|d   |v r |d   t'        j(                        dz
  k(  r|S t+        ||j,                        dd  S c c}w )Nz^d\d+$r#   c              3   R   K   | ]  }|j                   j                  d        ! yw)r7  N)r  r>  r'  r7  s     rb   r)  zNCppKernelProxy.codegen_nodes.<locals>.select_tiling_indices.<locals>.<genexpr>  s"     Q=PQVV..s3=Ps   %'r   r-  r   rl  )r   extract_read_writesr+  _sizes	itertoolschainreadswritesr   r  r1  researchr  r   r4  r6   r#  r  r   r3  sortedcount)	all_indexr  rwdepcontig_varscontig_vars_listnon_contig_stride_constnon_contig_stride_otherr   r   stridecontig_onlycontig_and_const_stridecontig_vars_sortedr$  r   s                 rb   select_tiling_indicesz;CppKernelProxy.codegen_nodes.<locals>.select_tiling_indices
  s   I!55djjO4;;O9??288RYY3WX3WCcii3WXX	  %K!&)e#&)e#" --C99Y9 &sE2F{#CHHQRL(9:(//CHHQRL0ABQV=P=PQQ/33C4EF/33C4EF . # 558OO  ;1$DMM*Q.//k*23//55''(# "(!4&'1,&r*.EE&r*c$--.@1.DD)),2B2H2HI"#NNG Ys    Irn   c                    j                   j                  |       }        }|rd}|D ]d  }t        t        j                  j
                        t               ||      5 } |       |xr |j                  }|s	 d d d         n	 d d d        f |r't        |      dk(  r|g|fS t        |      dk(  r||g|fS g g fS # 1 sw Y   xY w)Nr4  Tr#   r   )	r!  r6  r  r   r   r   r   r  r   )	rn   r7  ro  	could_vectiling_indicevec_checkerro  r  r   s	         rb   select_tilingz3CppKernelProxy.codegen_nodes.<locals>.select_tiling&  s     //999FM24N 	%3M, !2!2!7!78,.%%	
 %K($-$F+2F2F	(!   )  &4 >*a/ ->>>*a/ -}=~MMr6M s   !B??C	F)inplace_buffers)r{  Tr   ) rf  r&  r   r  r0  ri   r2   r   r@   r  r  r  r"   r  removed_buffersinplaced_to_remover  r  r
  r!  rn   	_inductorr   patchsplit_with_tiling
set_kernelr!  r  simd_ompsimd_nelementsr3  rm  )r   r$  r  re  rp  scalar_kernelr  tiling_factorsro  	main_loop	tail_loopouter_main_loopouter_tail_loopinner_main_loopinner_tail_loopri  r  r   r  ro  r  s   ``             @@@@@@rb   codegen_nodeszCppKernelProxy.codegen_nodes
  s(   ##E*""5)5zQ1X
   #  $$  	 ((&)6'

% 	$#E? 	/		+( 'y1	=#@#@@	""m&F&FF"*00?""'	OR 05{{ 	 	8 __##))%)@-:9-E*NN~&#n*====>"a''+~~'G'G"1%nQ.? (H ($	9 $$"$nQ&79JI
 $$]3%)	"%)	" ,:!+<+A	(^$)"1%T]]);a)??&q)^A->>? 48>>3S3S"1%nQ.? 4T 40  **=93B3T3T"1%q(99.QRBS 4U 40  **"'):NI
  **"$nQ&79JIM A@@s   FL((L1c                 >    | j                  | j                  ||       y r   )r  r
  )r   r  r  s      rb   r  zCppKernelProxy.codegen_loopsn  s    kBrd   )r   r   r   r   r&  r   r3  rf  r  r  r=  r>  s   @rb   r  r  	  s0    I@/= /bs(jm^Crd   r  c                   ^    e Zd ZdZd ZdefdZd Zd Zd Z	d Z
d	 Zd
 Zd Zd Zd Zd Zy)CppSchedulingi  c                 @    || _         | j                          d| _        y r  )	schedulerget_kernel_group_ready_to_flush)r   r  s     rb   r   zCppScheduling.__init__x  s    "$rd   statusc                     || _         y r   r  )r   r  s     rb   _set_flush_statuszCppScheduling._set_flush_status}  s
    %rd   c                 &    t        d |D              S )Nc              3      K   | ];  }t        t        t        j                  j                  j
                  |             = y wr   )r   mapr"   r  r  r   rv  s     rb   r)  z)CppScheduling.group_fn.<locals>.<genexpr>  s-     Mu!U3qww//88!<=us   AA)r   )r   r  s     rb   group_fnzCppScheduling.group_fn  s    MuMMMrd   c                     ddl m} |  t        t        j                  j
                  |      rt               | _        y t               | _        y )Nr#   )CppWrapperCodeGen)	r-  r  r[   r"   r  r  CppWrapperKernelGroupr   KernelGroup)r   r  s     rb   r  zCppScheduling.get_kernel_group  s2    .agg**,=> 5 7D +Drd   c                 ~    |j                   \  }\  }}|j                   \  }\  }}||k(  r||k(  ry|dk(  r	|||z   k(  ryy)NTr   F)r  )r   node1node2re  vars1reduce1vars2reduce2s           rb   _can_fuse_horizontal_implz'CppScheduling._can_fuse_horizontal_impl  sQ    #kkE7#kkE7E>g0b=Uego5rd   c                     t        |j                               t        |j                               z   t        j                  j                  kD  ry| j                  ||      S r  )r   	get_nodesr   r   max_horizontal_fusion_sizer  r   r  r  s      rb   can_fuse_horizontalz!CppScheduling.can_fuse_horizontal  sL    !"S):%;;jj334 --eU;;rd   c                 L    | j                  ||      xr |j                          S r   )r  r  r  s      rb   can_fuse_verticalzCppScheduling.can_fuse_vertical  s&    --eU;XEDVDVDX@XXrd   c                     | j                   }t        |      }|j                  |       |j                  ||       | j	                         }|t
        j                  kD  r| j                  d       yy)zC
        Turn an set of pre-fused nodes into a C++ kernel.
        TN)r   r  r  finalize_kernel_get_scheduled_num_argsr  MAX_FUSED_KERNEL_ARGS_NUMr  )r   r$  r   cpp_kernel_proxyargs_nums        rb   r  zCppScheduling.codegen_nodes  sh     ((),7&&u-$$%5u=//1m===""4( >rd   c                 6    | j                   j                         S r   )r   get_num_argsr  s    rb   r  z%CppScheduling._get_scheduled_num_args  s      --//rd   c                     | j                   S r   r  r  s    rb   ready_to_flushzCppScheduling.ready_to_flush  s    ###rd   c                      y r   r   r  s    rb   codegen_synczCppScheduling.codegen_sync  s    rd   c                     | j                   j                  t        j                  j                         | j                          | j                  d       y r  )r   codegen_define_and_callr"   r  r  r  r  r  s    rb   flushzCppScheduling.flush  s:    11!''2F2FGu%rd   N)r   r   r   r  r   r8   r  r  r  r  r  r  r  r  r  r  r  r   rd   rb   r  r  r  sO     !$%
& &N.<Y)0$&rd   r  c                   6     e Zd Z fdZd Zd Zd Zd Z xZS )r  c                    t         |           t               | _        t	               | _        t        | j
                        | _        t        j                         | _
        | j                  j                  | j                         g | _        y r   )r  r   r.   r   r$   
loops_codeWorkSharingr  r  r  r  r  scheduled_nodesr   r   s    rb   r   zKernelGroup.__init__  s^    L	&.doo.))+


  )!rd   c                 :     || j                   t               g| S r   )r   r   )r   r4  r   s      rb   rn  zKernelGroup.new_kernel  s    49924<t<<rd   c                     | xj                   |z  c_         | j                  }| j                  }|j                  ||       y r   )r  r  r  r  )r   rn  r$  r  r  s        rb   r  zKernelGroup.finalize_kernel  s5    %WW  r*rd   c                 X    | j                   j                         \  }}}t        |      }|S r   )r   cpp_argdefsr   )r   arg_defs	call_args	arg_typesr  s        rb   r  zKernelGroup.get_num_args  s)    )-)>)>)@&)Yx=rd   c           	         | j                   j                          | j                  sy t        j                  j
                  r.t        | j                  t        j                  j
                        nd}dj                  d||j                         g      }| j                  j                         \  }}}dj                  d      j                  |      }dj                  |      }t               }t        j                  j                  xr t        j                  dk(  }|r|j!                  dg       t"        j$                  j&                  r|nd	}	|j)                  t+        j,                                |j)                  d
|	 d| d       |j/                         5  |rHt"        j$                  j0                  }
|
dt3        |
      z   dz   nd}|j!                  d||z    dg       | j                  j5                         D ]  \  }}|j)                  d| d| d        |j7                  | j8                         d d d        t;               }t"        j$                  j&                  s|j)                  d       |j7                  |       t"        j$                  j&                  s|j)                  d       |j=                         }|j?                  dd      }|jA                  ||d       |jC                  ||d       y # 1 sw Y   xY w)Nr  re  r   z,
   rw  linuxz!#include <ATen/record_function.h>r2  zextern "C" void rs   rZ   graph_zRECORD_FUNCTION("z#", c10::ArrayRef<c10::IValue>({}));r  r  r  zasync_compile.cpp('''z''')z#pragma CMTz//F)r<   )"r  r  r  r   r   descriptive_namesr   r   next_kernel_suffixr   r  ljustr$   enable_kernel_profilesysplatformr  r"   r  cpp_wrapperr  r   
cpp_prefixr  graph_idrM   aliasesr  r  r,   getvaluery  define_kernelgenerate_kernel_call)r   r-  
fused_namekernel_namer  r  r  r  r  kernel_decl_namer  r   oldnewcodecache_defcodecache_strs                   rb   r  z#KernelGroup.codegen_define_and_call  s   

## zz++ "$"6"6

8T8TU 	
 hhz73M3M3OPQ)-)>)>)@&)Y;;r?''1HHY'	~ JJ,,H1H 	 !OO@AB*+''*=*=;8y++-.)*:);1XJaHI[[]$77++;C;OCM1C7UW+F[,@+AAfg
 !II--/Sse3se156 0KK(  '(ww""##$;<T"ww""##F+%..0 &--mTBk=uE$$[)%$H5 ]s   B K==L)	r   r   r   r   rn  r  r  r  r=  r>  s   @rb   r  r    s    "=+
3Ird   r  c                        e Zd Z fdZ xZS )r  c                 @    t         |           t               | _        y r   )r  r   r%   r   r  s    rb   r   zCppWrapperKernelGroup.__init__  s    (*	rd   )r   r   r   r   r=  r>  s   @rb   r  r    s    + +rd   r  c                   0    e Zd Zd Zd Zd Zd Zd Zd Zy)r  c                 `    || _         d| _        d | _        t        j                         | _        y r  )r  in_parallelr  r  r  r  )r   r  s     rb   r   zWorkSharing.__init__  s)    	 ))+
rd   c                    | j                   r|| j                  k7  r| j                          | j                   s|| _        d| _         t        j                  j
                  r| j                  j                  d       n| j                  j                  d| d       | j                  j                  | j                  j                                y y )NTz#pragma omp parallelz!#pragma omp parallel num_threads(rZ   )r  r  r  r   r   r  r  r  r  r  r  )r   r   s     rb   r  zWorkSharing.parallel  s    4+;+; ;JJL&D#Dzz))		##$:;		##&GyPQ$RSJJ$$TYY%5%5%78  rd   c                 h    | j                   r| j                  j                  d       | j                   S )Nz#pragma omp single)r  r  r  r  s    rb   r  zWorkSharing.single(  s*    II 45rd   c                 F    | j                   j                          d| _        y r  )r  r  r  r  s    rb   r  zWorkSharing.close-  s    

 rd   c                 :    | j                   j                          | S r   )r  r  r  s    rb   r  zWorkSharing.__enter__1  s    

rd   c                 >    | j                   j                  |||       y r   )r  r
  r  s       rb   r
  zWorkSharing.__exit__5  s    

Hgv6rd   N)	r   r   r   r   r  r  r  r  r
  r   rd   rb   r  r    s     ,9 
!7rd   r  c                      e Zd ZU dZeej                     ed<   dZeej                     ed<    ej                  d      Z
ej                  ed<    ej                  d      Zej                  ed<   dZeed<   d	Zeed
<   d	Zeed<   d	Zeed<   dZeeeef      ed<   dZed    ed<    ej.                  e      Zed    ed<   dZee   ed<   d Zdee   fdZdefdZded    fdZ d Z!d Z"d Z#d Z$y)r  Nr   r  r   r  r#   stepsr  Fr  r  	collapsedr  parent)default_factoryr  r2  c                 j    t        j                         }|r|j                         | _        y d| _        y r  )r   r5  r6  r  )r   r!  s     rb   __post_init__zLoopLevel.__post_init__K  s-     ,5+A+A+CAO>#;#;#=UVrd   r  c                     | j                   r| j                   gS g }| j                  D ]  }||j                         z  } |S )z,Get all kernel objects under this loop level)r2  r  r  )r   r  r   s      rb   r  zLoopLevel.get_kernelsW  s@    ;;KK= JJDt''))G rd   c                    | j                   s|| _        | }|J |j                         r|j                  j	                         |_        |j
                  }|b|j                         rR|j                  J |j                  j                  |j                         |j
                  }||j                         rRyt        | j                         dk(  sJ | j                   d   j                  |       y)zj
        Set the kernel under this loop level. No split is allowed under
        this loop level.
        Nr#   r   )	r  r2  r  r  r   r  r,  r   r  )r   r2  r   s      rb   r  zLoopLevel.set_kernel`  s    
 zz DK(,D###  ")/)A)A)F)F)H&{{&4+<+<+>11===**11&2J2JK;;D &4+<+<+> 4::!###

1  (rd   c                 j    |dk(  r| gS g }| j                   D ]  }||j                  |dz
        z  } |S )Nr   r#   )r  get_loops_atr   r&  r  r   s       rb   r  zLoopLevel.get_loops_att  sA    A:6ME

**51955 #Lrd   c                 ,    t        | j                        S r   )r8   r  r  s    rb   r  zLoopLevel.is_reduction}  s    D**++rd   c                      fd fd}|dk(  r3 |       \  }} j                   }|r||g|_        ||_         ||_         ||fS t         j                        dk(  sJ  j                  d   j                  |dz
        S )Nc                      g } j                   r0j                   D ]!  }| j                  |j                                # | S r   )r  r#  r)  )r  r   r   s     rb   clone_innerz0LoopLevel.split_with_tiling.<locals>.clone_inner  s4    Ezz JJDLL. 'Lrd   c                  T   t        j                        } t        j                  |       | z  }t	        j
                  |      }| |_        j                  |_        d|_        j                  |_	                |_
        |j                  r|j                  D ]	  }||_         t	        j
                  j                        }||_        j                  |_        d|_        j                  |_	                |_
        |j                  r|j                  D ]	  }||_         ||fS r  )r   Integerr   r  r  r   r  r  r  r  r  r  r  )sympy_factorr  r  r   r  r  r{  r   s        rb   do_split_with_tilingz9LoopLevel.split_with_tiling.<locals>.do_split_with_tiling  s     ==0Ldii6EF!$((F3I*IO!%I"'I*.*@*@I')mIO%OOD"+DK , "$((DII6I%I!%I"'I*.*@*@I')mIO%OOD"+DK , i''rd   r   r#   )r  r  r   r  )r   r&  r{  r  r  r  r  r  s   ` `    @rb   r  zLoopLevel.split_with_tiling  s    		(4 A:#7#9 Iy[[F )95#)	 #)	 i''tzz?a'''::a=22519fEErd   c                     t        |       }g |_        | j                  rC| j                  D ]4  }|j                         }||_        |j                  j	                  |       6 t        | j                        |_        |S r   )r   r  r)  r  r#  r   r2  )r   r   
inner_loopinner_loop_clones       rb   r)  zLoopLevel.clone  si    Dz
::"jj
#-#3#3#5 *. '

!!"23 ) t{{+rd   c                    t        | j                        }t        | j                        }t        j                  j
                  r||k(  ry | j                  r4ddj                  d | j                  j                         D              z   }nd}| j                  r| j                  dkD  rd| j                   dnd}| j                  rJd| d}| j                  dkD  r|d| j                   d	z  }| j                  r^|j                  d
d
|       }nH| j                  rd}n9| j                  rd| | }n%| j                  st        j                         rd}nd}t          d| j"                   d| }| j"                   d| }| j"                   dt        | j$                         }d| d| d| d	}	| j&                  s|s|	gS ||	gS )Nr   c              3   B   K   | ]  \  }}d t         |    d| d  yw)z
reduction(r  rZ   N)r  )r'  r   rtypes      rb   r)  z"LoopLevel.lines.<locals>.<genexpr>  s1      '"@JC \%013%q9"@s   r  r#   zsimd simdlen(z) z#pragma omp forz
 collapse(rZ   z for z#pragma omp z#pragma GCC ivdep=r  z+=zfor(r  )r   r  r  r   r   no_redundant_loopsr  r   r1  r  r  r  ry  r  r   is_gccr   r   r  r  )
r   offset_expr	size_exprr  simdline1
offset_strsize_str	steps_strline2s
             rb   r  zLoopLevel.lines  s   !$++.		*	::(([I-E!!chh '"&"8"8">">"@'  I
 I }}!4!4q!8 D//03 	
 ==%i[2E}}q :dmm_A66}}gtf~>]]E]]"4&4E''I,<,<,>'EE"|1TXXJa}=
hhZq,xxj;tzz#:";<	zl"XJb1=>>7Nu~rd   )%r   r   r   r   r   r   r;  __annotations__r  r  r  r  r  r6   r  r8   r  r  r  r   rM   r  dataclassesfieldr   r  r   r2  r  r
  r  r  r  r  r  r)  r  r   rd   rb   r  r  9  s    $C%**	$!%D(5::
%&q)FEJJ)%a(E5::(HcHdHdIt26xS#X/6$(FH[!(  1{00FE4F"&FHY&
WT)_ ) )(T+%6 ,,F\	&rd   r  c                       e Zd ZU dZdZeee      ed<   dZ	ee
   ed<   ede
fd       Zd Zdee   fdZed	        Zd
 Zd Zd Zy)r  a  
    A loop-nest like structure but with some loop level split along
    the loop range into the main tiling loop and the tail. It is built
    with the `build` method as a loop nest and then split with
    `split_with_tiling` at some depth.

    A typical case is for vectorization where we typically split at the inner-most
    loop level. A more complicated case is 2D tiling where we split at
    both inner-most and outer levels.
    Nr  r2  c                 z   | j                   }| j                  }| j                  }|J g }|}d}t        t	        ||            D ]W  \  }\  }}	t        ||	|      }||k\  r| j                  j                         |_        |j                  |       |j                  }Y t        |      }
|r	| |_        |
S | |
_        |
S )z4Build a LoopNest with the given `kernel` as the leafN)r  )r3  r  r  	enumerater  r  r  r   r#  r  r  r2  )r2  r3  r  r  r  levelsr   loop_idxr   r  r
  s              rb   r  zLoopNestWithSplit.build  s     ?? 00*** ""&$(%.s8V/D%E!HksDS$t4D?*)/)A)A)F)F)H&MM$ZZF &F &d+	 DK   &Ird   c                 ,    t        | j                        S r   )r8   r  r  s    rb   __bool__zLoopNestWithSplit.__bool__
  s    DIIrd   r  c                 p    g }| j                   J | j                   D ]  }||j                  |      z  } |S )zJGet all the loop levels at the given `depth` (most outer loop has depth 0))r  r  r  s       rb   r  zLoopNestWithSplit.get_loops_at  s?    !#yy$$$IIDT&&u--E rd   c                 D   d}| j                   J | j                   }t        |      dkD  ry|r|d   j                         nd}t        |      dk(  rO|d   j                         |k(  r9|dz  }|d   j                  }t        |      dk(  r|d   j                         |k(  r9|S )z
        Maximal allowed depth for parallelism:
        1) Levels without splitting and
        2) All reduction or non-reduction levels
        When the loop is split at the top level, the max depth is 1.
        r   r#   F)r  r   r  r  )r   	max_depthr  r  s       rb   r  z$LoopNestWithSplit.max_parallel_depth  s     	yy$$$		u:>27uQx,,.U%jAo%("7"7"9\"INI!HNNE %jAo%("7"7"9\"I rd   c                     | j                   duxr7 t        | j                         dkD  xr | j                   d   j                         S )zr
        Whether all the loops are for reduction. Reduction loops
        are always the inner most ones.
        Nr   )r  r   r  r  s    rb   r  z#LoopNestWithSplit.is_reduction_only(  s=     IIT!Xc$))nq&8XTYYq\=V=V=X	
rd   c                     || j                         k  sJ d       | j                  J | j                  }|D ]	  }||_         t        d|      D ]  }|d   j                  }d|d   _         y )Nz?Parallel depth cannot exceed the maximal allowed parallel depthr#   r   T)r  r  r  r  r  r  )r   r  r  r   r  s        rb   r  zLoopNestWithSplit.mark_parallel1  sy    0022	ML	M2yy$$$		D%DM q)$A!HNNE!%E!H %rd   c                     | j                  |      }t        |      dk(  sJ |d   j                  d|      }|dk(  r|| _        |S )a  
        Split the loop into main and tail loops at given `depth` so that the range
        of the main loop has range `floor_div(range, factor) * factor` and
        the tail loop handles the remainder. The main loop is tiled
        according to the `factor`.
        r#   r   )r  r   r  r  )r   r&  r{  r  split_loopss        rb   r  z#LoopNestWithSplit.split_with_tiling=  sN     !!%(5zQAh00F;A:#DIrd   )r   r   r   r  r  r   r   r  r)  r2  r  r  r  r2  r  r   r  r  r  r  r   rd   rb   r  r    s|    	 '+D(4	?
#*"&FHY&i  .T)_   $

&rd   r  )r  r*  	functoolsry  r  r]   r}  r  r   r   typingr   r   r   r   r	   r
   r   ri   torch.fxtorch._inductorr   torch._inductor.irr   r   torch._prims_commonr   torch.utils._sympy.functionsr   torch.utils._sympy.value_rangesr   r   r  r   r   r   r   codegen.wrapperr   optimize_indexingr   r  r   r   utilsr   r   r   r   r   r    virtualizedr!   r"   commonr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   _logginggetArtifactLoggerr   r  rj   float64r  r  r  int16int8r  r8   r9   r:   rk   float8_e4m3fnfloat8_e5m2DTYPE_TO_ATENDEVICE_TO_ATENr   r  r  r  PYTHON_TO_CPPCONTAINER_PYTHON_TO_CPPrh   rc   ro   rw   rz   r|   r   r   r   r   r   r   	lru_cacher<  r;  r   r   r   r   r   r   r  r  r  r  r  r@  r  r  r!  rm  r  r  r  r  r  r  	dataclassr  r  r   rd   rb   <module>rT     s         	 
  : :    ( 4 . 1 D - - , < 5  !     ~~//*E 
MM7	MM8	MM6	KK	KK	KK	JJ	KK	JJ	NNJ	OO[ 
MM<	MM=	MM;	KK	KK	KK	JJ	KK	JJ	NNO	OO(	-	)" 
 
7    #&   
NN	MM
9)4(),$2   F -5<< -

 - -V8 V8t 	8! !B8ehhmm 8(; 83"5 3
-2[ -2`R; Rj
[l [|`/ `/F	l9 l^	dl dNg, gTdCY dCNK&N K&\KI KI\+K +"7 "7J e e eP f f frd   