
    Ph                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZmZ d dlZd dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d d	l m!Z!m"Z" d d
l#m$Z$m%Z%m&Z& d dl'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3 ddl4m5Z5m6Z6m7Z7m8Z8 ddl+m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z? ddl@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZH ddlImJZJ ddlKmLZLmMZMmNZN ddlOmPZP  ej                  eR      ZSej                  j                  eRd      ZVej                  j                  eRd      ZWd ZXd ZYd ZZ G d dej                  j                        Z]y)    N)defaultdict)contextmanager)AnyCallableDefaultDictDictListOptionalSetTuple)get_decompositions)defakedynamo_timed)
LazyString)
FakeTensormagic_methodsmethod_to_operator)has_free_symbolsShapeEnvSymTypes)no_dispatch   )configir)get_scheduling_for_deviceget_wrapper_codegen_for_deviceregister_backend_for_device)CppWrapperCodeGenCudaWrapperCodeGenWrapperCodeGen)CppWrapperCodeGenErrorLoweringExceptionMissingOperatorWithDecompMissingOperatorWithoutDecomp)ConstantFixedLayoutInputBuffer	Pointwise	Reduction
StorageBox	TensorBox)FALLBACK_ALLOW_LISTfallback_handler%fallback_node_due_to_unsupported_typelayout_constraints	loweringsmake_fallbackneeds_realized_inputsunsupported_output_tensor)SizeVarAllocator)convert_shape_to_inductorgather_originsget_sympy_Expr_dtype)V
perf_hintsoutput_codec           
         t         j                  t         j                  t         j                  t         j                  t         j
                  t         j                  t         j                  t         j                  t         j                  t         j                  h
}|r]|j                  t         j                         |j                  t         j                         |j                  t         j                         | |v S N)torchfloat32float64int64int32int16int8uint8boolbfloat16	complex64addfloat16float8_e4m3fnfloat8_e5m2)dtypecudasupported_dtypes      `C:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/_inductor/graph.pysupported_dtype_of_cpp_wrapperrQ   B   s    



O EMM*E//0E--.O##    c                    t        | t        j                  t        j                  t        j                  j
                  j                  f      sJ d       t        | t        j                  j
                  j                        rt        j                  S t        | t        j                        rt        |       S | j                  rt        j                  S | j                  rt        j                  S y )Nzgget_constant_buffer_dtype only supports input of sympy.Symbol, sympy.Expr or sympy.core.numbers.Integer)
isinstancesympySymbolExprcorenumbersIntegerr>   rA   r8   
is_integeris_floatr?   )constant_buffers    rP   may_get_constant_buffer_dtyper^   X   s    %,,

EJJ4F4F4N4NO qpq  /5::#5#5#=#=>{{/5::.#O44!!{{		!	!}}rR   c                 L    t         D ch c]  }t        |       }}| |v S c c}w r=   r   )opm	magic_opss      rP   is_magic_methodrc   j   s+    0=>1#A&I>? ?s   !c            
       "    e Zd ZU eej
                     ed<   dej                  fdZ	dej                  fdZ
d Zdddddd e       dddf
dej                  j                  d	eeej                        f fd
Zedefd       Zd Zd Zdej,                  fdZed        ZdefdZdefdZdefdZe fd       Zdej@                  fdZ!dee   fdZ"d Z#defdZ$d8dZ%dedeej,                     fd Z&d!ef fd"Z' fd#Z(ed$ej                  defd%       Z)d& Z*d' Z+d( Z, fd)Z-d* Z.e/d+ej                  j`                  fd,       Z1d-ej                  j`                  f fd.Z2d/ Z3d0 Z4d1 Z5d2 Z6d3 Z7ed4        Z8d5 Z9d6 Z:defd7Z; xZ<S )9GraphLoweringgraph_outputsexc                 2   | j                   r2t        |j                               t        |j                               fS ddlm}  |dt        | j                  j                               }| j                  j                  ||      \  }}}|D cg c]4  }t        |t        j                        r|j                  j                  n|6 }}|D cg c]4  }t        |t        j                        r|j                  j                  n|6 }}||fS c c}w c c}w )z
        Support dynamic shapes and dynamic strides by assigning variables
        to each dimension.  We duck-shape tensors, so if two tensors
        have the same size they get assigned the same symbolic variable.
        r   )ConstantSource__inductor_unknown_tensor_)reuse_shape_envr6   sizestridetorch._dynamo.sourceri   len
_shape_env
var_to_val,create_symbolic_sizes_strides_storage_offsetrT   r>   SymIntnodeexpr)selfrg   ri   sourcerl   rm   _is           rP   symbolic_sizes_stridesz$GraphLowering.symbolic_sizes_stridesr   s     ,RWWY79R		:   < $,S1K1K-L,MNF LL	 LPP4az!U\\:A4PMSTVAu||!<!&&++!CVTV| QTs   9D9Dc                     |j                         D cg c]  }t        j                  |       }}|j                         D cg c]  }t        j                  |       }}||fS c c}w c c}w )z+
        Primarily used to weights
        )rl   rU   rZ   rm   )rv   rg   ry   rl   rm   s        rP   static_sizes_stridesz"GraphLowering.static_sizes_strides   s\     +-'')4)Qa )4,.IIK8Kq%--"K8V| 58s   A%A*c                     t        d      ddlm} t        d|t               t        d      ddlm} t        d|t               y y )Ncpur   )CppSchedulingrN   )CUDACombinedScheduling)r   codegen.cppr   r   r!    codegen.cuda_combined_schedulingr   )rv   r   r   s      rP   init_backend_registrationz'GraphLowering.init_backend_registration   s>    $U+32'}nM$V,4P (0FW	 5rR   NFgmexample_inputsc                 ^   t         |   |       || _        |	|	n| j                  ||      | _        d| _        || _        d| _        |t               }d| _	        n|| _
        d| _	        || _
        t        |      | _        i | _        i | _        t               | _        t               | _        d| _        g | _        i | _        i | _        t               | _        t               | _        t               | _        t               | _        t               | _        d | _        g | _        |
| _        d | _        || _        i | _         t               | _!        g | _"        i | _#        tI        tJ              | _&        tO        jN                         | _(        d| _)        || _*        || _+        || _,        d | _-        | j                  r| j]                         n	t               | _/        dh| _0        || _1        d| _2        d| _3        g | _4        d| _5        d| _6        |jo                         | _8        | js                          y )N)is_inferencer   FTre   zaten.convolution_backward ):super__init__r   decide_layout_opt
layout_optnum_channels_last_convr   extra_tracebackr   rk   rp   r5   sizevarsgraph_inputsgraph_inputs_originalsetdevice_typesdevice_idxsrN   buffers	constantsconstant_reprsremoved_buffersremoved_inplace_buffersmutated_buffersnever_reuse_buffersinplaced_to_removewrapper_codeextern_kernel_nodesextern_node_serializercurrent_nodenum_static_inputslistsmutated_inputsmutated_input_idxsname_to_bufferr   listname_to_userstimecreation_timenamecpp_wrapperaot_modegraph_id	schedulerfind_nodes_prefer_channels_lastnodes_prefer_channels_last_warned_fallbackuser_visible_outputs	cache_key
cache_pathcache_linemapdisable_cudagraphsdisable_cudagraphs_reason__copy__orig_gmr   )rv   r   r   	shape_envr   r   r   r   r   r   r   r   	__class__s               rP   r   zGraphLowering.__init__   s    	, % '''F 	
 '(#($ 
I#(D 'DO#'D #(324=?"&)e%(U	(*24.0),14$),-0U ,/E,0>@  # 	# ,0!2+-
(+-/46@KD@Q!YY[#	&  @D6:ooD00235 	' "= =$8! !  	 #()+&-/[[]&&(rR   returnc                   t         j                  syt         j                  ry| j                  j                  D cg c]@  }|j
                  t        j                  j                  j                  j                  k(  s?|B }}t        |      }|dk(  ryt        j                  j                  rt        j                  j                         ryt!        d |D              rMt        j"                  j$                  j&                  r)t        j"                  j$                  j                         ryt        t)        | j                  j                              d|z  k\  rt*        j-                  d       yt/        d |D              rt*        j-                  d       yd	 d
 d |rBddlm} t5        t6              }|D ]  }t        j8                  j:                  j=                  |      \  }}	}
|r~ |d      5 }t>        j@                  5   |j
                  |	i |
 ddd       ddd       jC                         } |      rd}n |      rd}n |      rd}nd}||xx   |z  cc<   t*        j-                  d        d}d}d}d}tE        |jG                               }|d   |z  |d   |z  z   |d   |z  z   |d   |z  z   }||k  }|st*        j-                  d||       |S t/        fd|D              rt*        j-                  d       yt/        fd|D              rt*        j-                  d       yt!        fd|D              rt*        j-                  d       yyc c}w # 1 sw Y   RxY w# 1 sw Y   WxY w)zl
        Decide if we should enable layout optimization for this graph based on
        heuristics.
        FTr   c              3      K   | ]G  }d D ]@  }|j                   |   j                  d   j                  t        j                  d      k(   B I yw)r   r   valr~   N)argsmetadevicer>   .0nidxs      rP   	<genexpr>z2GraphLowering.decide_layout_opt.<locals>.<genexpr>  sM      #A!C s  '..%,,u2EE! F#s   AAi,  z*Skipped layout opt because only a few convc              3   t   K   | ]0  }d D ])  }t        |j                  |   j                  d          + 2 yw)r   r   N)r   r   r   r   s      rP   r   z2GraphLowering.decide_layout_opt.<locals>.<genexpr>(  s=      
 QVVC[--e45 6s   68zeSee perf regression with dynamic shape. Follow up in https://github.com/pytorch/pytorch/issues/102670c                     | j                   d   dkD  xr. | j                   d   j                  d   j                  d      dkD  S )Nr   r   r   r   rl   r   s    rP   
is_groupedz3GraphLowering.decide_layout_opt.<locals>.is_grouped2  s:    66":>GaffQinnU&;&@&@&Ca&GGrR   c                    | j                   d   j                  d   j                  d      dz  | j                   d   j                  d   j                  d      k  xr. | j                   d   j                  d   j                  d      dkD  S )Nr   r   r      r   r   s    rP   is_in_out_channelz:GraphLowering.decide_layout_opt.<locals>.is_in_out_channel5  sv    q	u%**1-1QVVAY^^E5J5O5OPQ5RR 6FF1INN5)..q1A5rR   c                     | j                   d   j                  d   j                  d      dk  xr. | j                   d   j                  d   j                  d      dk  S )Nr   r   r   @   r   r   s    rP   is_small_channelz9GraphLowering.decide_layout_opt.<locals>.is_small_channel;  sT    q	u%**1-3 8FF1INN5)..q1R7rR   )FlopCounterMode)displayNgroupedsmallin_outdefaultzConv inputs meta not foundg|?5^?gtV?g333333?guV?zhSkipped layout opt in inference because weighted flops indicate slowdown, default: %d, channels last: %dc              3   .   K   | ]  } |        y wr=    )r   r   r   s     rP   r   z2GraphLowering.decide_layout_opt.<locals>.<genexpr>  s     1jz!}j   zFSkip layout opt because found grouped convolution with >1 in_channels!c              3   .   K   | ]  } |        y wr=   r   )r   r   r   s     rP   r   z2GraphLowering.decide_layout_opt.<locals>.<genexpr>  s     8Z #Zr   zBSkip layout opt because some convolutions have smaller out_channelc              3   .   K   | ]  } |        y wr=   r   )r   r   r   s     rP   r   z2GraphLowering.decide_layout_opt.<locals>.<genexpr>  s     7Jq"Jr   z>Skip layout opt because all convolution channels are too small)$r   layout_optimizationforce_layout_optimizationgraphnodestargetr>   opsatenconvolutionr   ro   versionhiprN   is_availableallbackendsmkldnnenabledr   logdebuganytorch.utils.flop_counterr   r   float	_inductorfx_utilsget_fake_args_kwargsr9   	fake_modeget_total_flopssumvalues)r   r   r   
conv_nodesnconvr   flop_countsrt   successr   kwargsflop_counter_modecounted_flops	node_typeGROUPED_MULTIPLIERDEFAULT_MULTIPLIERIN_OUT_MULTIPLIERSMALL_MULTIPLIERtotal_flopsweighted_flopsdo_layout_optr   r   r   s                        @@@rP   r   zGraphLowering.decide_layout_opt   sA    ))++ xx~~
%!UYY^^5O5O5W5W)WA~ 	 
 JA: ==!8!8!:  # 
 %%--%%224
 tBHHNN#$e3IIBC 

 

 IIw 	H		 @,7,>K"(-(@(@(U(U)%v (7;L[['DKK88 ) 8 %6$E$E$GM!$'$-	)$/$+	*40$,	$-		*m;*II:;- #6 "'!& %$k0023K I&);;g&)99:h'*;;< i(+==>  +k9M 		~"
 ! & 1j11IIX  8Z88IIT  7J77IIVWq
H )[ 87s0   A M'?M'8M9	M,M9,M61M99N	c                    t               }t        | j                  j                  j                        D ]w  }|j
                  t        j                  j                  j                  j                  k(  r|j                  |       P|j                  D ]  }||v s|j                  |        w y | j                  j                  j                  D ])  }||v s|j                  D ]  }|j                  |        + |S )aC  
        The rule to decide if an node prefer channels last is simple.
        1. if it's input/output of a convolution
        2. if one of its user prefers channels last

        We have rule 1 because cudnn runs a faster convolution kernel for channels last inputs;
        Rule 2 is also important. It makes sure that indirect inputs to convolution also prefers
        channels last.

        Consider the scenario: conv -> batch-norm -> relu -> conv
        Without rule 2, batch-norm output may use a contiguous layout. That will cause 2 extra copies:
        1. the output of batch-norm should be channels last initially since its input is a conv's output.
           Forcing the batch-norm's output to be contiguous results in the first copy
        2. The second conv's input is initially contiguous. This layout is propagated from the batch-norm's output.
           We need convert it to channels last layout which results in the second copy.
        With rule 2, we makes sure all the tensors in the chain uses channels last layout. So both copies
        can be saved.
        )r   reversedmoduler   r   r   r>   r   r   r   r   rI   users)rv   
output_setr   userchilds        rP   r   z-GraphLowering.find_nodes_prefer_channels_last  s    & U
$++++112Axx599>>55===q!:%NN1%   30 ""((AJWWENN5) % )
 rR   c                     || j                   vr2| j                   j                  |       t        j                  d|       y y )NzUsing FallbackKernel: %s)r   rI   perf_hint_loginforv   r   s     rP   warn_fallbackzGraphLowering.warn_fallback  s:    t,,,!!%%d+94@ -rR   r   c                     | j                   j                  |j                         |j                  &| j                  j                  |j                         y y r=   )r   rI   typeindexr   )rv   r   s     rP   add_device_infozGraphLowering.add_device_info  sA    fkk*<<#  . $rR   c                 "    t         j                  S r=   )r9   r   )rv   s    rP   r   zGraphLowering.fake_mode  s    {{rR   buffer_namec                 x    || j                   v r| j                   |   S || j                  v r| j                  |   S y r=   )r   r   )rv   r  s     rP   
get_bufferzGraphLowering.get_buffer  sC    $---&&{33$+++$$[11rR   c                    || j                   v r| j                   |   j                  S || j                  v r| j                  |   j                         S || j                  v r| j                  |   j                         S t        j                  d|      }|r | j                  |j                  d            S t        d|       )Nz1(as_strided|reinterpret_tensor)\(([a-zA-Z0-9_]+),r   could not find )	r   rM   r   	get_dtyper   rematchgroupKeyError)rv   r  ra   s      rP   r   zGraphLowering.get_dtype  s    $..(>>+.444$---&&{3==??$+++$$[1;;==HHI;W>>!''!*--677rR   c                 `   ddl m} || j                  v r| j                  |   j                         S || j                  v r7| j                  |   }t        t        |dd       |      ry|j                         S || j                  v r| j                  |   j                         S t        d|       )Nr   )MultiOutputLayoutlayoutr  )
r   r&  r   numelr   rT   getattr	get_numelr   r$  )rv   r  r&  bufs       rP   r*  zGraphLowering.get_numel  s    )$..(>>+.4466$---%%k2C'#x68IJ==?"$+++$$[1;;==677rR   c                     t        |   | S r=   )r   run)rv   r   r   s     rP   r-  zGraphLowering.run  s    w{D!!rR   bufferc                    dt        | j                         }| j                  j                  |       || j                  |<   t	        |t
        j                        r|j                         s| j                  |j                                |S )Nr+  )
ro   r   appendr   rT   r   ComputedBufferis_zero_elementsr  
get_device)rv   r.  r   s      rP   register_bufferzGraphLowering.register_buffer	  sq    S&'(F#$*D!&""3"34F<S<S<U  !2!2!45rR   buffer_namesc                 L    ddj                  |      z   }|| j                  |<   |S )Nlist_rx   )joinr   )rv   r5  r   s      rP   register_listzGraphLowering.register_list  s(    ,//'

4rR   c                 $      fd |       y )Nc                    t        | t        t        f      r| D ]
  } |        t        | t        j                        rt        | d      rht        | j                  t        j                        rDt        | j                  d      r.t        | j                  j                  t        j                        sy | j                         D ]   }j                  |   j                  |        " y y )Ndata)
rT   r   tupler   IRNodehasattrr<  get_read_namesr   r0  )valuex	read_nameregisterrv   s      rP   rD  z1GraphLowering.register_users_of.<locals>.register  s    %$/AQK %+v.%ejj"))<

F3&uzz		B !&!5!5!7I&&y188? "8 ,rR   r   )rv   node_outputrD  s   ` @rP   register_users_ofzGraphLowering.register_users_of  s    	@$ 	rR   r   c                     t        |t              sJ | j                  j                  |       || j                  vry| j                  |   D ]  }|j                           y)z
        When a buffer is mutated we need to make sure all the reads to
        the old version are realized before the mutation happens.
        N)rT   strr   rI   r   realize)rv   r   r  s      rP   mark_buffer_mutatedz!GraphLowering.mark_buffer_mutated,  sW    
 $$$$  &t)))&&t,DLLN -rR   c                       fd} ||      }t        j                  t        j                  |t	        j
                  j                  g j                                     S )Nc                 @   j                   j                         D ]  \  }}j                  rj                         |j                         k(  s5j	                         |j	                         k(  sWj
                  |j
                  k(  sqj                  |j                  k(  st        j                  |      j                         s|c S  | dt        j                          } | d   j                         rd|  } t        j                  dd|       }|} d}| j                   v r| d| } |dz  }| j                   v rj                   | <   t        j                  t!              j#                  d            j%                         j&                  | <   | S )Nconstantr   	constant_z[^a-zA-Z0-9_]rx   r   zutf-8)r   items	is_mkldnnrl   rm   rM   r   r>   eqr   ro   isdigitr!  subhashlibsha256reprencode	hexdigestr   )r   constant_namerA  prefixcntr<  rv   s        rP   allocatez3GraphLowering.add_tensor_constant.<locals>.allocate;  s`   (,(<(<(>$u		uzz|37

ekk1u||3u-113(( )? |!#dnn"5!67Aw "4&) VV,c48FDC$..( 3%(q $..( $(DNN4 (/T
!!'*)ik % KrR   )r,   creater   ConstantBufferr'   r   rM   r|   )rv   r<  r   r\  s   ``  rP   add_tensor_constantz!GraphLowering.add_tensor_constant:  sZ    	< ~DKKVd6O6OPT6UV
 	
rR   device_overridec                     | j                   |   j                  |k(  s||S | d|j                   |j                  xs d }|| j                   vr+| j                   |   j	                  |      | j                   |<   |S )z
        We AOT copy constants to the devices they are needed on.
        If device_override doesn't match the constant's device, then
        copy it and return a different name.
        rx   r   )r   r   r  r  to)rv   r   r`  alt_names       rP   rY  zGraphLowering.constant_nameb  s     >>$&&/9_=TKV1_112?3H3H3MA2NO4>>)'+~~d';'>'>'ODNN8$rR   r   c                    t         	|   |||      }t        |t              r'|j                  j
                  }|| j                  |<   |S t        |t        t        t        f      r&t        j                  |      }|| j                  |<   |S t        |t        j                        sJ |       |j                  s| j                  |      \  }}n| j!                  |      \  }}t#        j$                  t'        |t)        |j*                  |j,                  ||                  }|| j                  |<   |j.                  j.                  | j0                  |<   | j3                  |j*                         |S r=   )r   placeholderrT   r   rt   ru   r   intrF   r   rU   sympifyr>   Tensor_has_symbolic_sizes_stridesr|   rz   r,   r]  r(   r'   r   rM   r<  r   r  )
rv   r   r   r   exampleru   sizesstridestensorr   s
            rP   re  zGraphLowering.placeholdero  s8   '%fdF;gx(<<$$D(,Df%K#tU!34==)D(,Df%K'5<<09'90
 22!66w?NE7!88ANE7!!GNNGMM5'J
 %+&!-3[[-=-=""6*W^^,rR   c                 V   |t         j                  u r/t        |d   t        t        t
        f      rt        |   |||      S t        |d      r ||i |S |t        vrt        |t        j                  j                        s
J | d       |j                         j                  d      d   }|t        v rt!        |       nt"        j$                  rKt'        |g      rt(        nt*        }t,        j/                  d|j1                  |||             t!        |       n&t'        |g      rt)        |||      t+        |||      	 t,        j3                  dt        |          t        |   |i |}|S # t4        $ r-}t7        ||||      j9                  |j:                        d d }~ww xY w)Nr   _inductor_lowering_functionz is not an OpOverload.z"Creating implicit fallback for:
%sz  via %s)operatorgetitemrT   r   r=  dictr   call_functionr?  r1   r>   _ops
OpOverloadr   splitr-   r2   r   implicit_fallbacksr   r$   r%   r   r  operator_strr   	Exceptionr#   with_traceback__traceback__)	rv   r   r   r   	base_nameerrorouter   s	           rP   rt  zGraphLowering.call_function  s   X%%%*T!WtUD>Q*R7(v>>6894*6**"

-- 0./0  ++C03I//f%** *6(3 .5 
 9&&vtV< f%#VH- 0fEE264HH	IIj)F"34F#T4V4CJ 	#AvtV<KK	s   ,E2 2	F(;(F##F(tc                 Z    t        | j                        dk(  xr | j                  d   dk  S )zM
        True if this is a small constant attr that will be inlined.
        r   r      )ro   shape)r  s    rP   can_inline_constantz!GraphLowering.can_inline_constant  s(    
 177|q 4QWWQZ1_4rR   c                    t        | j                  |      }t        j                  st	        |      r| j                  ||      S t               5  |j                  dk(  r8t        |j                         |j                  |j                        cd d d        S | j                  |      r<ddlm}  ||j                         |j                  |j                        cd d d        S 	 d d d        | j                  ||      S # 1 sw Y   xY w)Nr   r   )rm  )rM   r   )r)  r  r   always_keep_tensor_constantsr4   r_  r   r  r&   itemrM   r   r  loweringrm  tolist)rv   r   r   r   rA  rm  s         rP   get_attrzGraphLowering.get_attr  s    V,..2KE2R++E6::]{{b 

ekk5<<H ] ''.,ellnEKKU ] /  ''v66 ]s   >C>AC>>Dc                     t               r=   AssertionErrorrv   r   r   r   s       rP   call_modulezGraphLowering.call_module      rR   c                     t               r=   r  r  s       rP   call_methodzGraphLowering.call_method  r  rR   c                 b   t         
|   |||      }t        |t        t        f      sJ t        |             t        d |D              sJ |       |D cg c]!  }t        j                  j                  |      # c}| _
        | j                  j                         D ]+  \  }}t        |t        t        j                  f      sJ dt        |              t        |t              sK|j!                          t        |t              sJ |j"                  }t        |t        j$                        sJ |}|j"                  }t        |t&              r|j)                         |k7  st        j*                  j-                  || j.                  |          	 | j                  j1                  |      }	| j.                  |   | j                  |	<   . | j5                          t6        j9                  d| j:                  | j<                  | j<                         y d       y c c}w # t2        $ r Y w xY w)Nc              3      K   | ]t  }t        |t        t        j                  t	        d       t        j
                  t        j                  t        j                  j                  j                  t        f       v y wr=   )rT   r,   r   r&   r  r^  rU   rW   logicboolalgBooleanrf  )r   rB  s     rP   r   z'GraphLowering.output.<locals>.<genexpr>  s`      
  KKJ%%JJKK''// s   A:A<z'Unsupported inductor graph input type: zGForce channels last inputs for %d conv for the current graph with id %dr   )r   outputrT   r=  r   r  r   r   ExternKernelrealize_inputrf   r   rO  r,   rU   rW   rI  r<  r+   r(   get_nameMutationLayoutrealize_intor   r  
ValueErrorfinalizer   r   r   r   )rv   r   r   r   resultrB  r   rA  value_storage_boxindr   s             rP   r  zGraphLowering.output  s   f5&5$-0>$v,>0 
 
 
 	 	 
 IOO1boo;;A>O,,224KD%	5::. G8eFG  eY/MMOeY///JJEeR]]333 %JJEe[1U^^5E5M!!..ud6P6PQU6VW,,223DEC.2.H.H.ND&&s+% 5, 			U''!]]6DMM	
 =?	
3 P* " s   &H7H!!	H.-H.c                 F    | j                   D ]  }|j                           y r=   )r   decide_layout)rv   r+  s     rP   r  zGraphLowering.finalize
  s    <<C  rR   rt   c              #   b   K   | j                   }	 || _         d  || _         y # || _         w xY wwr=   )r   )rv   rt   olds      rP   set_current_nodezGraphLowering.set_current_node  s1     	$ $D #DDs   /# /	,/r   c                    fd}h}j                   dk(  r#| j                        \  }}|t        ||      z  }t        j                  j                  |      5  | j                        5  t        j                        5  j                   dk(  rMj                  t        j                  ur1t              r& |d        t        j                  d      i }nj                   dk(  rWj                  t        v rE |d       t        j                     gi \  }}| j                  j                  ||      }nt        j                        rc |d       t!        j"                  d   t$        j&                        r$j"                  d   j(                  j*                  }n't,        | ]        }n |d	       t,        | ]        }t$        j0                  j2                  j4                  j6                  t$        j0                  j2                  j8                  j6                  t$        j0                  j2                  j:                  j6                  gt=        d
 j>                  D              }t=        fdj>                  D              }|s|rt!        j"                  d   t$        j@                        r߉j"                  d   jC                         }	t$        jD                  jG                  j"                  d         }
|
rtI        |	      rt        jJ                  |	      }tI        |jM                               dk(  r8| jN                  v r*jP                  | jR                  vr|st        jT                  }t        jV                  jY                  ||      }tI        t[        j>                              }|dkD  rt!        |t\              rj>                  D ]  }|j                  t^        v rl|ja                          t$        j0                  j2                  jb                  j6                  t$        j0                  j2                  jd                  j6                  t$        j0                  j2                  jf                  j6                  g}| jh                  s=|jk                  t$        j0                  j2                  jl                  j6                         t$        jn                  jp                  r|t$        j0                  jr                  jt                  j6                  t$        j0                  jr                  jt                  jv                  t$        j0                  jr                  jx                  jv                  t$        j0                  jr                  jz                  j6                  t$        j0                  jr                  j|                  j6                  t$        j0                  jr                  j|                  jv                  t$        j0                  j2                  j~                  j6                  t$        j0                  j                  j                  j6                  t$        j0                  j                  j                  jv                  t$        j0                  j                  j                  j6                  g
z  }t$        jn                  j                  r2|t$        j0                  j                  j                  j6                  gz  }|j                  |v rNt        jV                  jY                  |t        jJ                  j"                  d   jC                                     }|j                   dk(  st!        |j                  j                  t        t        f      s|j                           |j                  tI        j>                               t!        |t\              r |j                         r|ja                          t!        |t\              rqt!        |j                  t              rW|j                  j                  }t!        |t              r1|j                         t        j                  kD  r|j                          d d d        d d d        d d d        t!        t\              rt!        |j                  t        j                        rt!        |j                  j                  t        j                        r|j                  j                  _Q        nt!        |j                  j                  t        j                        rX|j                  j                  _Q        t!        |j                  j                  t        j                        r^t!        |j                  j                  j                  t        j                        r&|j                  j                  j                  _Q        nt!        |j                  j                  t        j                        r|j                  j                  j                  sct!        |j                  j                  j                  d   t        j                        r(|j                  j                  j                  d   _Q        | j                  |       |S # 1 sw Y   5xY w# 1 sw Y   :xY w# 1 sw Y   ?xY w)Nc                 Z    t         j                  dt        j                        |        y )Nzlowering %s %s)r   r   r   format_node)msgr   s    rP   r   z%GraphLowering.run_node.<locals>.debug  s    II&
1==(A3GrR   rt  r.   F)add_to_fallback_setr0   rc   r   r   c              3   :   K   | ]  }|j                   d k(    yw)r  N)r`   )r   r  s     rP   r   z)GraphLowering.run_node.<locals>.<genexpr>G  s     DGDDGGx/Gs   c              3   :   K   | ]  }|j                   v   y wr=   )r   )r   r  as_strided_opss     rP   r   z)GraphLowering.run_node.<locals>.<genexpr>H  s      *:A$~-'s      r   r  r   )Xr`   fetch_args_kwargs_from_envr7   r   r>  current_originsr  r9   r   rq  rr  r/   r.   r0   rt  rc   rT   r   r>   rs   rt   ru   r   run_noder   r   
as_stridedr   as_strided_as_strided_scatterr   r  rh  rm   _prims_commonis_non_overlapping_and_densero   get_stride_orderget_sizer   r   r   NHWC_STRIDE_ORDERr  require_stride_orderr   r,   r3   realize_hintconvolution_backwardmm_int_mmr   r0  r   _C_has_mkldnnr   _convolution_pointwisebinary_convolution_pointwise_ _convolution_transpose_pointwise_linear_pointwisemkldnn_rnn_layeronednnqconv2d_pointwiseqlinear_pointwisehas_mklmkl_mkl_linearr<  r)   r*   rI  
mark_reusehas_exceeded_max_readsr+   inner_fn_str_lenr   realize_bytes_thresholdLoopsorigin_nodeBufferr1  MultiOutputindicesinputsrF  )rv   r   r   originsr   r   r  	is_outputis_input_for_as_stridedrl  densestride_order	num_usersr  need_fixed_layoutcurrr  r   s    `              @rP   r  zGraphLowering.run_node  sW   	H #44?"::1=LD&~dF33GYY&&w/1F1F2
a 'HHH$4$449!<()N)!((N# (QXX9K-K*+1!((;AOOOf++AHHdFC * '(affUmU\\:VVE]//44F"W-a0Fb	)!, 		))11		**22		1199N
 DAGGDDI&) *:;''* '# 4*uu||; &&-..0++HHPUW S\#%#6#6w#?LFOO-.!3!@!@@FF$*C*CC 7')';';__AA&,WF CL)I1}FI!>GGD{{&;;++- "IINN??GG!IINN--55!IINN22::-)
  $-44UYY^^5O5O5W5WX 88//- %		 0 0 G G O O %		 0 0 G G N N %		 0 0 H H O O %		 0 0 Q Q Y Y %		 0 0 B B J J %		 0 0 B B I I %		 ? ? G G %		 0 0 B B J J %		 0 0 B B I I %		 0 0 B B J J2 -  %xx// 1eiimm6O6O6W6W5X X 1;;*;;%'__%I%I &(;(;AFF5M<P<P<R(S&F ww(*%fkk&6&6I8NO"NN,S $X !!#agg,/ &),1N1N1P ##% &),FKK1T{{''dI.,,.1O1OO(y !2
/P fi(ZR]]-S&++**BHH5/0  ,FKK,,bii8/0  ,fkk..0A0ABzKK$$))288H 9:FKK$$))5 v{{//@"KK,,44!&++"2"2"9"9!"<biiHAB((//2>v&s ! 2
 2
//sJ   k/k[3k:*k&C(kkkkkk	kk%c                    t         j                  rt        d      t        j                  dk7  rt        dt        j                         | j
                  j                         D ]  }d }t        |t              r|j                         }nXt        |t        j                  t        j                  t        j                  j                  j                  f      rt!        |      }t#        || j$                        rt        d|        y )NzC++ codegen is disabledlinuxzUnsupported platform zUnsupported input dtype )r   disable_cpp_codegenr"   sysplatformr   r   rT   r,   r   rU   rV   rW   rX   rY   rZ   r^   rQ   rN   )rv   rA  rM   s      rP   !validate_can_generate_cpp_wrapperz/GraphLowering.validate_can_generate_cpp_wrapper  s    %%()BCC<<7"(+@)OPP&&--/EE%+)ejj%**2D2D2L2LM 6e<1%C,/Gw-OPP 0rR   c                    d| j                   v | _        | j                  r<| j                          | j                  rt	               | _        y t               | _        y | j                   j                         }|j                  d       t        |      dk  s%J dj                  dj                  |                   t        |      dk(  }|rdn|j                         }t        |      }|J d| d        |       | _        y )	NrN   r~   r   zDoes not support mixing {}+r   zDevice z not supported)r   rN   r   r  r    r   r   copydiscardro   formatr8  popr   )rv   r   only_cpudevice_typewrapper_code_gen_clss        rP   init_wrapper_codezGraphLowering.init_wrapper_code  s   d///	224(,		"$   8I7J  ((--/U#< A% 	
'C'J'JHH\"(
 	
% |$)'e\-=-=-?=kJ#/V7;-~1VV/02rR   c                    d| j                   v rd| _        | j                         j                  }d }t        j
                  j                  j                         5  | j                  J | j                  D cg c]
  } ||       }} ||       ddd       ~d| _        | j                  j                          | j                  j                          | j                         S | j                         S c c}w # 1 sw Y   jxY w)ad  
        For CPU, the cpp wrapper codegen is done in one pass.
        For GPU, the cpp wrapper codegen is done in two steps: JIT-compile the model with python
        wrapper code and run it to generate autotuned kernel binaries in the first pass; and then
        generate cpp wrapper code and compile it to a dynamic library in the second pass.
        rN   Fc                 (   t        | t        j                  t        j                  f      r| j                  j
                  S t        | t              rt        |       S t        | t        j                        sJ dt        t        |             z          | S )Nz&Unknown type when creating real inputs)rT   r>   rs   SymFloatrt   hintr   r   rh  rH  r  )rB  s    rP   materializez;GraphLowering.codegen_with_cpp_wrapper.<locals>.materialize  st    a%,,!?@66;;&:.!!9$%5<< O?#d1g,NO  HrR   NT)r   r   compile_to_modulecallr>   utils_python_dispatch_disable_current_modesr   r   clearr   codegen)rv   compiledr  rB  real_inputss        rP   codegen_with_cpp_wrapperz&GraphLowering.codegen_with_cpp_wrapper  s     T&&&$D--/44H
 --DDF**6667;7J7JK7J!{1~7JK% G   $D  &&(##))+<<>! <<>! L GFs   C:8C5
C:5C::Dc                 V   ddl m} | j                           || j                        | _         t        j
                  j                  | j                  | j                   j                         | j                   j                          | j                  j                  | j                        S )Nr   	Scheduler)r   r  r  r   r9   r   draw_orig_fx_graphr   r   r  r   generater   )rv   r  s     rP   r  zGraphLowering.codegen  sp    ( "4<<0	""4<<1E1EF   ))$*;*;<<rR   c                    ddl m}  || j                        }d}g }g }|j                  D ]N  }|j	                         }||z  }|j                  ||dz  f       |j                  ||j                         f       P |||fS )Nr   r  r   r  )r   r  r   r   get_read_write_buffers_sizesr0  get_estimated_runtime)rv   r  r   total_bytesnode_countsnode_runtimesrt   	num_bytess           rP   count_byteszGraphLowering.count_bytes  s    (dll+	OOD99;I9$Ki1n56  $(B(B(D!EF	 $
 K66rR   c                 n   ddl m} | j                  r| j                         n| j	                         \  }}|D cg c]  \  }}||j
                  f }}}|j                  |      \  }}|j                  |||| j                        }|| _	        || _
        || _        |j                  J t        j                  d|j                         t        j                  d|       t        j!                  d|j                         t"        j$                  r(t'        d|j                   t(        j*                         t,        j                  j/                  |j                         t,        j                  j1                  t2        j4                  j7                  |j                        d   d	z          |S c c}}w )
Nr   )PyCodeCache)linemapattrszOutput code written to: %sOutput code: 
%szCompiled module path: )filer   z.debug)	codecacher  r   r   r  stack_tracewriteload_by_key_pathr   r   r   r   __file__r   r   output_code_logr  r   benchmark_kernelprintr  stderrr9   r;   r  ospathsplitext)	rv   r  coder  line_nort   keyr  mods	            rP   r  zGraphLowering.compile_to_module%  s]   * 04/?/?D))+T\\^ 	g ELLG=7DGT--.GL%%d+	T**wdnn + 
 $ ||'''		.=14893<<H""*3<<.9

K	CLL)	RWW%%cll3A6AB
' Ms   F1c                    | j                   rddlm} | j                  sJ d       | j	                         \  }}t
        j                  d|       d }t        j                         rI| j                  r=| j                  r1| j                  | j                        }t
        j                  d|       |j                  | ||| j                        S | j                         j                  S )Nr   )AotCodeCachez"AOT mode only supports C++ wrapperr  z#Serialized Extern Kernel Nodes: 
%s)rN   )r   r  r%  r   r   r  r   r   	is_fbcoder   r   compilerN   r  r  )rv   r%  r   r  serialized_extern_kernel_nodess        rP   compile_to_fnzGraphLowering.compile_to_fnA  s    ==/##I%II# 99;MD'!!"5t<-1*  ",,//151L1L,,2.  %%:2  ''d: (   ))+000rR   c                     | j                   D cg c]F  }t        |t        j                        s*t        |t        j                        s|j                         H c}S c c}w r=   )rf   rT   r   NoneAsConstantBufferShapeAsConstantBufferr  )rv   rt   s     rP   get_output_nameszGraphLowering.get_output_names^  sU     **
*dB$;$;<tR%=%=> MMO*
 	
 
s   AAc                     || j                   j                         v xrL | j                   |   j                         dk(  xr* | j                   |   j                         j                  dk(  S )Nr   r~   )r   keysr*  r3  r  r  s     rP   is_unspec_argzGraphLowering.is_unspec_argf  sj     D%%**,, C!!$'113q8C!!$'22499UB	
rR   r=   )=__name__
__module____qualname__r	   r   r>  __annotations__r>   rh  rz   r|   r   	frozensetfxGraphModuler
   r   staticmethodrF   r   r   r  r   r  propertyr   rH  r  r   r*  r   r-  r  r4  r9  rF  rJ  r_  rY  re  rt  r  r  r  r  r  r  r   Noder  r  r  r  r   r  r  r  r)  r-  r0  __classcell__)r   s   @rP   re   re   o   s9   		?"! !Fu|| 
X 8<&[#R)HH  R) !ell!34R)h c$ c cJ1fA
/ell /
  c 
8S 
88S 8 " "bii $s) 
* &
P# 8N # @)V 5u|| 5 5 57$/
b  $UXX]] $ $c%((-- cJQ&3*&"P=7  61:

# 
rR   re   )^rT  loggingrq  r  r!  r  r   collectionsr   
contextlibr   typingr   r   r   r   r	   r
   r   r   rU   r>   torch._loggingtorch.fxtorch._decompr   torch._dynamo.utilsr   r   r   torch._subclasses.fake_tensorr   torch.fx.experimental.sym_noder   r   %torch.fx.experimental.symbolic_shapesr   r   r   torch.utils._mode_utilsr   r   r   r   codegen.commonr   r   r   codegen.wrapperr   r    r!   excr"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r  r-   r.   r/   r0   r1   r2   r3   r4   r   r5   r  r6   r7   r8   virtualizedr9   	getLoggerr1  r   _logginggetArtifactLoggerr  r  rQ   r^   rc   r6  Interpreterre   r   rR   rP   <module>rP     s       	 	 
  # % O O O     , 4 % 4 L V V /  
 S R   	 	 	 ' R R g!00<H..228]K$,$
~
EHH(( ~
rR   