
    Phɵ              "          d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZmZmZmZmZmZmZmZmZ d dlmZ d dlmZ d dlmc mZ d dlZd dlmc m Z! d dl"m#Z#mZ$mZ% d dl&m'Z'm(Z( d dl)m*Z*m+Z+ d d	l,m-Z-m.Z.m/Z/ d d
l0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 ddl8m9Z9 ddl:m;Z; ddl<mZm=Z= ddl>m?Z? ddl@mAZA ddlBmCZC ddlDmEZEmFZF ddlGmHZH ddlImJZJ ddlKmLZL ddlmMZMmNZN ddlOmPZP  ej                         rd dlRmSZS ndeTfdZS ej                  eV      ZWej                  j                  eVd      ZZej                  j                  eVd       Z[d!Z\ej                   G d" d#             Z^ej                   G d$ d%             Z_d& Z`d'ej                  d(eeb   d)ej                  fd*Zcd'ej                  d)edfd+Ze ej                  d      d,        Zg ej                  d      d-        Zhd. Zid/ej                  j                  fd0Zle?j                  	 dud/ej                  j                  d1eej                     d2ebfd3       Zn	 dvd/ej                  j                  d1eej                     d5edfd6Zoe?j                  ej>                  j                  j                          eSd78      dd d4dd4d4d4d er       ddfd/ej                  j                  d1eej                     d9ee^   d2ebd:edd;eeb   d<edd=edd>edd?ee_   d@eeT   dAeed   dBeeeeL   gef      d)ee.eTf   fdC                     Zsdd d4dd4d4d4 er       ddf
d/ej                  j                  d1eej                     d9ee^   d2ebd:edd;eeb   d<edd=edd>edd@eeT   dAeed   dBeeeeL   gef      d)ee.eTf   fdDZtdEej                  fdFZudGeej                     dHeeb   d)dfdIZvdJeeej                     eeb   f   dKeeb   d)eeb   fdLZwdMeeej                     gef   dNeeb   fdOZx	 dwdMeeej                     gef   dJeej                     dKeeb   fdQZye%j                  	 dwdPdRdMej                  j                  dJeej                     dKeeb   dSebdTeeeT      d:edd>eddUeej                  dVf   fdW       Z{dJeeej                     eeb   f   dKeeb   fdXZ|dEej                  fdYZ}dZej                  d[ej                  d(eeb   fd\Z~	 dwdMej                  j                  dJeej                     dKeeb   fd]Zd^ej                  j                  fd_Zesdfd`ej                  j                  daeej                     dbedVef   dceeeTef      fddZ e	d       Zdeej                  j                  dfeej                     dgej                  j                  dhebdbedVef   d9e^d;ebdie_fdjZesddfd`ej                  j                  daeej                     dbedVef   dceeeTef      dkeee3edVef   f      f
dlZdxdmZdJeej                     fdnZd/ej                  j                  fdoZd/ej                  j                  fdpZd/ej                  j                  dJeej                     dqedVef   fdrZd/ej                  j                  fdsZd/ej                  j                  dJeej                     dqedVef   fdtZy)y    N)count)	AnyCallableDict	FrozenSetListOptionalSequenceTupleUnion)mock)#min_cut_rematerialization_partition)compiled_autogradloggingutils)detect_fake_modelazy_format_graph_code)aot_export_modulemake_boxed_func)	code_hashCompiledFxGraphFxGraphCache)save_args_for_compile_fx_inner)
OpOverload)
FakeTensor)FakeTensorProp   )aot_autograd)_PyTreeCodeGen   )configmetrics)DebugContext)select_decomp_table)joint_graph_passes)post_grad_passesview_to_reshape)pre_grad_passes)GraphLowering)ExternKernelNode)get_dtype_sizehas_incompatible_cudagraph_ops)V)time_and_logattrc                 "    t         j                  S N)dynamo_utilsidentityr/   s    eC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/_inductor/compile_fx.pyr.   r.   =   s    $$$    
perf_hintspost_grad_graphs   c                   .    e Zd ZU eed<   d Zed        Zy)	BoxedBoolvaluec                     | j                   S r1   )r<   )selfs    r5   __bool__zBoxedBool.__bool__K   s    zzr6   c                 6    t        | t              r	d| _        | S y)NF)
isinstancer;   r<   )objs    r5   disablezBoxedBool.disableN   s    c9%CIJr6   N)__name__
__module____qualname__bool__annotations__r?   staticmethodrC    r6   r5   r;   r;   G   s     K  r6   r;   c                   $    e Zd ZU ee   ed<   d Zy)BoxedDeviceIndexr<   c                 :    |t        |t              sJ || _        y r1   )rA   intr<   )r>   
device_idxs     r5   setzBoxedDeviceIndex.setZ   s    !Z
C%@@@
r6   N)rD   rE   rF   r	   rN   rH   rP   rJ   r6   r5   rL   rL   V   s    C= r6   rL   c                     t        | t        j                        sy t        | j                        D cg c].  }| j                  |      dk(  s| j                  |      dk7  s-|0 c}S c c}w Nr   r    )rA   torchTensorrangendimstridesize)tis     r5   get_expanded_dimsr[   c   sP    a&QVV}L}!q(8QVVAY!^A}LLLs   A)A)"A)rY   expanded_dimsreturnc                 l    |D ].  }t         j                  j                  j                  | |dd      } 0 | S rR   )rS   opsatenslice)rY   r\   expanded_dims      r5   index_expanded_dimsrc   i   s/    %IINN  L!Q7 &Hr6   c                    t        | t        |             } t        j                  |       dk7  r| j	                         }| j
                  }t        t        t        |                  }t        t        ||            D cg c]  \  }}|	 }}}t        t        |            D ]6  }|dk(  rdn
|||dz
        }|dk(  rdn
|||dz
        }|||      ||z  k  s6 y yc c}}w )Nr   r    TF)rc   r[   rS   _debug_has_internal_overlaprW   shapelistrU   lensortedzip)	rY   stridessizesindices_xrZ   prev_stride	prev_sizes	            r5   complex_memory_overlaprr   o   s     	A034A((+q0((*uS\*+!'GW(=!>?!>A1!>?s7|$A Av!771q5>+BK!Vwq1u~)>Iwqz"[9%<<	 %
  @s   ?Cc                  4    t        j                  t              S r1   )dynamo_loggingget_step_loggerlogrJ   r6   r5   _step_loggerrw      s    ))#..r6   c                     t         j                  j                         rgt         j                  j                  j                  j
                  s8t         j                  j                         dk\  rt        j                  d       y y y y )N)   r   zTensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance.)	rS   cudais_availablebackendsmatmul
allow_tf32get_device_capabilitywarningswarnrJ   r6   r5   _warn_tf32_disabledr      sc     	

!##**55JJ,,.&8d	
 9 6 	"r6   c           	          i }| j                  d      D ]
  \  }}|||<    | j                  d      D ]
  \  }}|||<    ddlm}m}  ||||i       } |||t        j                         d |i |j                        }	|	S )NF)remove_duplicater   )'_construct_inp_pos_to_param_buffer_name_unlift)named_parametersnamed_bufferstorch._export.exported_programr   r   pytreeLeafSpecbuffers_to_mutate)
modgmgraph_signature
state_dictnameparamr   r   inp_pos_to_param_buffer_nameunlifted_gms
             r5   _unlift_graphr      s    J++U+Ce 
4 D((%(@e 
4 A
 $K

	$  
$
))K r6   r   c                 h   t         j                  j                  }|j                  j                  |j
                  j                  |j                  j                  |j                  j                  h}| j                  j                  D ]  }|j                  dk(  s|j                  |v s"t        |j                  j                  dd       t         j                        sW|j                  d   j                   t         j"                  k(  s|j                  d   j$                  j&                  dk(  s y y)Ncall_functionvalrz   TF)rS   r_   r`   mmdefaultaddmmbmmbaddbmmgraphnodesoptargetrA   metagetrT   dtypefloat32devicetype)r   r`   tf32_opsnodes       r5   is_tf32_warning_applicabler      s    99>>D

	H GG&x'499==5u||D		% &&%--7		% '',,6  r6   example_inputs	num_fixedc                 f   t        |      }t        | |      }t        j                  |      5  t	        | d       d d d        t        | ||      }t        j                  |      5  t        j                  |      5   |j                  |  |j                         \  }}}	t        xj                  |z  c_        t        xj                  |z  c_        t        xj                  |	z  c_        d d d        d d d        t        | j                        S # 1 sw Y   xY w# 1 sw Y   2xY w# 1 sw Y   6xY w)NF)	shape_envnum_static_inputs)_shape_env_from_inputsfake_tensor_propr-   set_fake_moder&   r)   set_graph_handlerset_real_inputsruncount_bytesr"   num_bytes_accessednodes_num_elemnode_runtimesr   forward)
r   r   r   kwargsr   	fake_moder   	num_bytesr   r   s
             r5   count_bytes_innerr      s     '~6I ^4I	
	#U# 
$ "	YOE	
		U	#Q%6%6~%F		>"383D3D3F0	>=""i/".0. &G	# 2::&& 
$	# &G%F	#	#s0   D%D';A/D*D'DD$	 D''D0Fforce_allow_non_fake_inputsc                 j   t        |      }|s<t        j                  j                  d      } t	        | |      j
                  |  |S |st        j                         n t        j                  j                  |dd      }|5   t	        | |      j                  |  ddd       |S # 1 sw Y   |S xY w)z}
    If we can not detect fake mode from the context of inputs, create one.

    The created fake mode will be returned.
    Tallow_non_fake_inputs)moder   N)r   rS   _subclassesFakeTensorModer   	propagate
contextlibnullcontextr   patchobjectpropagate_dont_convert_inputs)r   r   r   r   ctxs        r5   r   r      s     !0I%%4444P	4r	*44nE  / ""$""9.EtL 	
 LN2I.LL 
  
 s   B((B2zcompilation time (in seconds)r4   
cudagraphsis_backwardgraph_idcpp_wrapperaot_modeis_inferenceboxed_forward_device_indexuser_visible_outputs
layout_optextern_node_serializerc                 8   t        j                  | j                        dk(  r|st        | j                        S t        t        t        t        | j                  j                                    j                  d   t        t        f      sJ d| j                          t        j                  rt        | ||||||||	|
|       |#t!        t        j"                  j$                        }|||||||
||d
}t'        j&                         }t        j(                  r|st+        j,                  t.        | ||      }nt/        | |fi |}t0        j3                  dt'        j&                         |z
         t4        j6                  j8                  j;                         }|K|j<                  ?t?        |j<                        dk(  sJ |j<                  jA                  |j<                         |r|S |r-t        | j                  j                        d   }t?        |j                        dk(  sJ |j                  d   D cg c]>  }t        |t4        jB                  jD                  jF                        r|jH                  nd@ }}tK        d	 |D              }t        j"                  jL                  r tO        fd
|jP                  D               }nt?        |jR                        dk7  }tU        |jV                        dhk(  df| dftY        |        df| dftO        d |D              dft?        |jZ                        dk(  xs t        j"                  jL                   dfg}|D cg c]
  \  }}|r	| }}}|st        j"                  jL                  s-|D ](  }t        |t4        j\                        st_        |       * |	1|s/|s-|	jU                  t        t        |jZ                                     ta        |jc                         |te              t        t        |jZ                              |||t        |jf                  ji                                     |_5        nt         jm                  |       |rt        j"                  jL                  rh|	J |	jn                  J |jc                         t4        jp                  jL                  js                  |	jn                  d      J fd}||_5        d|jV                  v rtt        jw                  d|       |s=ty        |jc                         |te                    }||jc                         ur||_5         t{               t|        j~                  d|rdnd d|        d|_@        |S c c}w c c}}w )z
    Inductor API that compiles a single graph.

    If you change the argument list for this function, make sure you
    also update the call to save_args_for_compile_fx_inner below accordingly.
    r   zGinductor can only compile FX graphs which return a tuple/list, but got )
r   r   r   r   r   r   r   r   r   r   N)
r   r   r   r   r   r   r   r   r   r   z%FX codegen and compilation took %.3fsr    c              3   f   K   | ])  }t        |t        j                        rt        |       + y wr1   )rA   rS   rT   rr   .0rY   s     r5   	<genexpr>z#compile_fx_inner.<locals>.<genexpr>X  s+      ,
#!U\\* #1%#s   /1c              3   (   K   | ]	  }|k    y wr1   rJ   )r   idxr   s     r5   r   z#compile_fx_inner.<locals>.<genexpr>a  s      #+LCi+Ls   rz   znon-cuda device in graphzmutated inputszincompatible opszcomplex memory overlapc              3   p   K   | ].  }t        |t        j                  t        j                  f       0 y wr1   )rA   rS   rT   SymIntr   s     r5   r   z#compile_fx_inner.<locals>.<genexpr>m  s(      IWAJq5<<">?s   46znon-Tensor inputsz/multiple device indices without cudagraph_trees)static_input_idxsdevice_indexstack_tracesr   r   	constantsF)create_if_none_existsc                 4    j                           |       S r1   )set_to_running_backward)
new_inputscompiled_graph_callablemanagers    r5   compiled_artifactz+compile_fx_inner.<locals>.compiled_artifact  s    3352:>>r6   skipping cudagraphs due to %sztorchinductor done compiling 	BACKWARDSFORWARDS graph T)Ar2   count_callsr   r   r   rA   nextiterreversedr   argstuplerg   r!   	save_argsr   r;   tritonr   timefx_graph_cacher   loadfx_codegen_and_compilerv   debugrS   _guardsTracingContexttry_getoutput_stridesrh   extendfxr   Nodestack_traceanycudagraph_treesallmutated_input_idxsmutated_inputsrP   device_typesr,   device_idxsr   rN   cudagraphifyget_current_callablerU   r   valuescurrent_callablerC   r<   	_inductorget_managerperf_hint_logwarningalign_inputsrw   r   INFO_boxed_call)r   r   r   r   r   r   r   r   r   r   r   r   r   graph_kwargsstartcompiled_graphcontextoutputargr   complex_memory_overlap_inputshas_mutationcudagraph_testsbscudagraph_fail_reasonsrY   r   new_callabler   r   s      `                         @@r5   compile_fx_innerr$     sY   0 )Q.xrzz**T(288>>*+,11!4udm \	PQSQYQYPZ[\  &!##%'A!5!	
 v}}778
 !""$ 4 "8L IIKEX%**"B
 0
".
 II5tyy{U7JK mm**224Gw55A7))*a///%%n&C&CDbhhnn%b)6;;1$$$ {{1~
% !+30B0B CS__M% 	 

 ), ,
#,
 )
% ==((" #+9+L+L#   L ~<<=BL ,,-&9;UV/0/335GH..0HI IW  $	 223q8 9!==888A
& 1@!I1q!!I%==00'A!!U\\2A (
 +6$#*..tD9S9S4T/UV.:335"'	"2!$~'A'A"BC)') 8 8 ? ? AB	/N+ j)
 v}}<<1===177CCC*8*M*M*O'//99EE.44E F  ***? 3D/444%%35K
 #//1>5CS
 ~BBDD.:N+LN'%;:
6 7
	 "&N]
N "Js    AV(
V3Vc                    t        |       r
t                t        j                  t	        t        j
                         d              t               t        j                  d|rdnd d|        t        j                  j                  | |       t        |      }t        |        t        j                         5  t!        | |      }d d d        t        j"                        5  t%        | |       t        j                  j'                  | |       t(        j+                  dt-        d|              d d d        t        j"                  |      5  t/        | |rt        j0                  n|||||||	||	
      }t        j2                  |      5   |j4                  |  g }|j6                  d|j6                  D ]U  }t9        |d
      r6|j;                  t=        d |j>                  j@                  D                     E|j;                  d        W |jC                         }t        jD                  du r|cd d d        cd d d        S |jF                  rCtH        jK                  dt        jL                  jN                         tP        jS                  |       tU        |||      }d d d        d d d        S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   -xY w# 1 sw Y   S xY w)Ni  ztorchinductor compiling r   r   r   r   z%szAFTER POST GRAD)	r   r   r   r   r   r   r   r   r   layoutc              3   n   K   | ]-  }t         j                  j                  j                  |       / y wr1   )r-   r   sizevars	size_hint)r   r!  s     r5   r   z)fx_codegen_and_compile.<locals>.<genexpr>  s*      "GX! 0 0 : :1 =GXs   35Tr   )+r   r   syssetrecursionlimitmaxgetrecursionlimitrw   r   r  r-   r   fx_graphr   r'   rS   no_gradr   r   r&   fx_graph_transformedpost_grad_graphs_loginfor   r)   real_inputsr   r   graph_outputshasattrappendr   r'  rW   compile_to_fnaot_compilationdisable_cudagraphsr  r  r   disable_cudagraphs_reasonr;   rC   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   outcompiled_fnr  s                      r5   r   r     so    ""% #c335t<=LN"%;:
6 7
	 GGR(&~6I$ B 
$R8	 
 
	#,7	$$R8!!$(>?PRT(UV	 
$ 
	# -91==n'#!5#9%
   'EII~&>@N"". !..CsH-&--! "GJzzGXGX"  '--d3 /  --/K  D("' ('! 
$	#J ''%%3QWW5V5V !!*-,[%PN7 (! 
$Z u 
 
$	#, ('! 
$Z sK   :J2$AJ?=KB%K1	KAK K2J<?K	K	KK"ro   c                 *   t        d t        | j                         | j                               D              dz   }t	        j
                  | |fd      j                         }t	        j
                  || j                         | j                               S )Nc              3   2   K   | ]  \  }}|d z
  |z    ywr    NrJ   r   rf   rW   s      r5   r   z)clone_preserve_strides.<locals>.<genexpr>8        P6O]UFUQY& 6O   r    )r    )sumrj   rX   rW   rS   
as_stridedclonero   needed_sizebuffers      r5   clone_preserve_stridesrJ  6  sp    Pc!&&(AHHJ6OPPSTT  a+6<<>FFAFFHahhj99r6   r   check_inputs_idxsc                 j    |D ].  }| |   j                         t        z  st        | |         | |<   0 y r1   )data_ptr	ALIGNMENTrJ  )r   rK  rZ   s      r5   copy_misaligned_inputsrO  >  s5     a=!!#i/2:a=AJqM r6   inputsr   c                    d }g }t        |       D ]q  \  }}t        |t        j                        s!||vs" ||j	                         |j
                        rG|j                  j                  dk(  sa|j                  |       s |S )Nc                 2    | t        |      z  t        z  dk(  S )Nr   )r+   rN  )storage_offsetr   s     r5   
is_alignedz+get_input_idxs_to_check.<locals>.is_alignedJ  s    !66)CqHHr6   rz   )		enumeraterA   rS   rT   rS  r   r   r   r7  )rP  r   rT  ids_to_checkrZ   inputs         r5   get_input_idxs_to_checkrX  F  sx    I Lf%5uell+**!%"6"6"8%++F!!V+" & r6   modelinputs_to_checkc                 6     t              dk(  r S  fd}|S )Nr   c                 ,    t        |         |       S r1   )rO  )r   rZ  rY  s    r5   r   z)align_inputs_from_check_idxs.<locals>.runa  s    z?;Z  r6   )rh   )rY  rZ  r   s   `` r5   align_inputs_from_check_idxsr]  [  s#     ?q ! Jr6   rJ   c                 2    t        ||      }t        | |      S r1   )rX  r]  )rY  rP  r   rZ  s       r5   r  r  h  s    
 .f6GHO'??r6   )r   r   r   r   .c                    
 ddl m} t        j                  j                  rt        j                  ||||||      nt        t        d |D              s
  |      S d 

 fd}	|	S )Nr   )cudagraphify_impl)r   r   r   r   r   c              3   <   K   | ]  }t        |t                y wr1   )rA   r   )r   inps     r5   r   zcudagraphify.<locals>.<genexpr>  s     =fsz#z*fs   c                 ~    't        j                         5   |       d d d         |       S # 1 sw Y   xY wr1   )r2   preserve_rng_state)r   r=  cudagraphify_fnrY  r   s    r5   r   zcudagraphify.<locals>.run  s=    002-eZARS 3:&& 32s   3<)torch._inductor.cudagraph_treesr`  r!   r   r  	functoolspartialr  )rY  rP  r   r   r   r   r   r   new_cudagraphify_implr   r=  re  s   ` `       @@r5   r  r  q  sq    
 }}$$#++!%%#%
 , =f==uf.?@@K' Jr6   c                     g }t        ||       D ]L  \  }}t        |t        j                        s!|j	                         t
        z  dk(  s<|j                  |       N t        |      t        |      k7  r|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )rj   rA   rS   rT   rM  rN  r7  rh   )rP  r   aligned_static_input_idxsr   rW  s        r5   remove_unaligned_input_idxsrl    sr     !#+V4
UeU\\*0@90LQR/R%,,S1 5 $%->)??((r6   c                 6   t        d t        | j                         | j                               D              dz   }t	        j
                  || j                  | j                        }t	        j                  || j                         | j                               S )z1
    Copy and input while preserving strides
    c              3   2   K   | ]  \  }}|d z
  |z    ywr@  rJ   rA  s      r5   r   zstatic_input.<locals>.<genexpr>  rB  rC  r    )r   r   )	rD  rj   rX   rW   rS   emptyr   r   rE  rG  s      r5   static_inputrp    sm     	Pc!&&(AHHJ6OPPSTT  [[AGGAHHEFFAFFHahhj99r6   dstsrcc                 V    t        | |      } t        ||      }| j                  |       y)z=Index into expanded dimensions of both dst and src then copy_N)rc   copy_)rq  rr  r\   s      r5   index_expanded_dims_and_copy_ru    s'     c=
1C
c=
1CIIcNr6   c                   	
 t        |      }t        |      t        ||       t        |t              sJ t        |      D cg c]  \  }}|vrt        |      ng  c}}t        |      D cg c]@  \  }}t        |t        j                        s|n|vrt        |      n|j                         B c}}t        t        |            D ]8  \  }\  }}t        |t        j                        s$|vs)t        |   ||       : t        j                  j                          t        j                  j                         }|j!                  t        j                  j#                                t        j                  j%                  |      5   | t	                     ddd       |j                          t        j                  j#                         j!                  |       t        j                  j                          t        j                  j'                         
t        j                  j)                  
|d      5   | t	                    ddd       t        t        t*        f      sft,        j.                  r

fd}n0t1        t3                    D cg c]	  }|vs| c}		
fd}t5        ||      S c c}}w c c}}w # 1 sw Y   :xY w# 1 sw Y   xY wc c}w )zQ
    Assumes inputs[static_input_idxs[i]] are always the same memory address
    Nthread_local)streamcapture_error_modec                 ^   t              t        |       k(  sJ t        t        |             D ]Y  \  }\  }}}t        |t        j
                        s%|v r$|j                         |j                         k(  rKJ t        |||       [ | j                          j                          	S r1   )
rh   rU  rj   rA   rS   rT   rM  ru  clearreplay)
r   r   rq  rr  r\   r   inps_expanded_dimsr   static_inputsstatic_outputss
        r5   r   zcudagraphify_impl.<locals>.run  s    }%Z8882;M:/AB3..c3 "#u||4--<<>S\\^;;;
 2#sMJ3 LLN!!r6   c                     D ]  }|   }t        |   | |   |        | j                          j                          S r1   )ru  r{  r|  )r   r   r\   copy_indicesr   r}  r~  r  s      r5   r   zcudagraphify_impl.<locals>.run  sL    # 23 7-!#&
3 $
 LLN!!r6   )rX  rl  rO  rA   rg   rU  r[   rS   rT   rp  detachrj   ru  rz   synchronizeStreamwait_streamcurrent_streamrx  	CUDAGraphr   r   r!   size_assertsrU   rh   r]  )rY  rP  r   check_input_idxsr   ro   r\   rx  r   r  r   r}  r~  r  s     `      @@@@@r5   r`  r`    s    /v7HI3F<MN6#34fd###  ''FC !$+< <!"D'  ' (FC	 !U\\* 	
 '' !_XXZ		
 (M $-S9K-L#Maa&36G+G)-*<aO $N
 
JJZZ F
uzz0023			6	"d=!" 
#
	JJ++F3	JJ JJ  "E			%>		RtM23 
SntUm4(*	" 	"( !]!34
4CCT8TC4
	" 	" (-=>>S& 
#	" 
S	R4
s1   K8AKK#K01	K<;K<#K-0K9fx_gc                     d }d}g }| j                   j                  D ]0  }|j                  dk(  s ||      r|j                  |       |dz  }2 |t	        t        t        |                  k(  sJ t        |      S )z>
    Infers which inputs are static for a backwards graph
    c                 ^    d| j                   vxr d| j                   vxr d| j                   vS )Ntangentsbwd_seedbwd_base_offset)r   )ro   s    r5   is_saved_tensorz'count_tangents.<locals>.is_saved_tensor&  s5    aff$ 0!&&(0!/	
r6   r   placeholderr    )r   r   r   r7  rg   rU   rh   )r  r  	arg_countstatic_arg_idxsns        r5   count_tangentsr  !  s|    

 IOZZ44= q!&&y1NI	  d5_)=#>????r6   model_example_inputs_inner_compileconfig_patchesc           
         |ddini |ddi}d|vr5t         j                  j                  si |dt        | j                        i}|j                  dd       }t        j                  d      5  t        | |t        j                  |d|      |      }t        j                  j                  |      s
J d|        |cd d d        S # 1 sw Y   y xY w)Nr   Tzaot_inductor.output_pathr   )r   r   )r  r  z/AOTInductor compiled library does not exist at )r!   aot_inductoroutput_pathr   codepopr-   set_aot_compilation
compile_fxrg  rh  ospathexists)r  r  r  r  r   compiled_lib_paths         r5   compile_fx_aotr  9  s     ! 
44t4  	#.8##//

&	&++(>

 ,//0H$O	
		t	$&#++'=
 *	
 ww~~
 	Q<=N<OP	Q 
 ! 
%	$	$s   /AC		Caot_autograd_modelaot_example_inputsdynamo_modelnum_example_inputsforward_devicec                 f   ddl m}m}	 t        |        t	        j
                  | d      }
|
rt        | |d        ||         |	|| |      \  }D cg c]  }||   	 }}t              |z
  }t        |      }|j                  j                  ^ }}|j                  d   }|D cg c]3  }t        |t        j                  j                        s(|j                   5 }}t        j"                  j$                  j'                         }|3|j(                  }|J t+        t        |            D ]  }|vsd ||<    t,        j.                  j1                  |dd      5   ||||||d||
|	      d d d        t2        j4                  du rS fd}d|_        |S c c}w c c}w # 1 sw Y   6xY w)Nr   )%convert_conv_weights_to_channels_lastfreezeTr&  r   )r   r   r   r   r   r   r   c                 b    D cg c]  }| |   	 }}| j                           |      S c c}w r1   )r{  )r   rZ   args_newoptimized_functionpreserved_arg_indicess      r5   wrapperz%fw_compiler_freezing.<locals>.wrapper  s6    %:;%:DG%:;

!(++ <s   ,)torch._inductor.freezingr  r  r%   r)   decide_layout_optr   rh   r   r   r   r   rA   rS   r  r  r   r   r   r   params_flatrU   r   r   r   r-   r9  r  )r  r  r  r  r  r   r   r  r  r  r   	opt_modelindr   r   rn   model_outputs_nodemodel_outputsr  r   tracing_contextr  rZ   r  r  r  s                           @@r5   fw_compiler_freezingr  b  s    W )*001CRVWJ+-?F-.@A'-($I$ >SS=Rc,S1=RS)*-??I !34I '__22Q&++A.M%%1Auxx}})E  
 mm22::<O"%11&&&s;'(A--!%A ) 
		9&=t	D*!'5!!5

 
E 	D !!,
 GN] T 
E	Ds   F()F"F" F''F0decompositionsc                     |rGt        j                  |      5  t         | t        j                  |            |      cddd       S t         j                  rLt        j                  ddddd      5  t	        j
                  |      5  |}t         t        j                  j                        r j                  j                  D cg c],  }|j                  dk(  r|j                  j                  d      . }}t        d |D              rbt!        t#               ||      D ]H  \  }}	}
|	j$                  |
j$                  k7  s!t'        d	| d
|	j$                   d|
j$                   d       |}t         |t)        j*                  d      |      cddd       cddd       S t)        j*                  t        |      }t-               st/         ||      S t         t        j                  j                        r=t         j                  j0                  t2              rt5         ||      S t7         |       t9        d |D              rt;         ||      S t         j<                  rJ t?        |      tA        t         jB                  jD                        tG        d      tI        tJ              ||n	tM               }tN        jP                  dt        j                  j                  dtR        t        jT                     dtV        f fd       }t)        j*                  |d      }t         jX                  r5t        jZ                         s!t)        j*                  t\               }nt)        j*                  |d      }d }tN        jP                  dt        j                  j                  dtR        t        jT                     ffd       }t_        |      xs  t        j`                  jc                  d      }t        jd                  jf                  ji                         xs t        jd                  jg                  |      }t        jj                  du retm         |d|      \  }}to         ||      }t	        jp                  |      5  ts        jt                         5   |||      cddd       cddd       S t	        jp                  |      5  t        jd                  jw                  |      5  ts        jt                         5   ty        |||||d       |      cddd       cddd       cddd       S # 1 sw Y   xY wc c}w # 1 sw Y   nxY w	 ddd       # 1 sw Y   xY w# 1 sw Y   nxY wddd       # 1 sw Y   xY w# 1 sw Y   nxY w	 ddd       n# 1 sw Y   nxY wddd       y# 1 sw Y   yxY w)z+Main entrypoint to a compile given FX graph)r  r  NFT)r   ztriton.autotune_cublasLtztriton.cudagraphsztriton.store_cubinr  r   c              3   $   K   | ]  }|d u 
 y wr1   rJ   )r   vs     r5   r   zcompile_fx.<locals>.<genexpr>  s     :kq}ks   zBDevice mismatch between fake input and example input at position #z: z vs zx. If the model was exported via torch.export(), make sure torch.export() and torch.aot_compile() run on the same device.)r   c              3   R   K   | ]  }t        |t        t        t        f       ! y wr1   )rA   rg   r   dict)r   ro   s     r5   r   zcompile_fx.<locals>.<genexpr>  s     
G!:a$t,-s   %'rY  r   r   c           
         |rt        |        t        j                  rdnd}t        |      z
  |z
  }t	               }t
        j                  rs| j                  j                  ^ }}|j                  dk(  sJ t        j                  |j                   }t        |      }	t        j                  j                  j!                         }
|
%|
j"                  r|s|
j"                  j$                  }nd}t'        t        j(                  j*                        rXj                  j                  ^ }}|j                  dk(  sJ t        j,                  |j                        \  }}t        |      }n|	}||	k  sJ ||z   }||	k  sJ ||| D ch c]2  }t'        |t        j(                  j.                        r|j0                  4 }} | ||||      S c c}w )Nr   r   r  )r   r   r   r   r   r   )r%   functorch_configfunctionalize_rng_opsrh   rP   r!   keep_output_strider   r   r   r   arg_tree_leavesr   rS   r   r   r   fw_metadatanum_mutated_inp_runtime_indicesrA   r  GraphModuletree_flattenr  r   )rY  r   r   num_rng_seed_offset_inputsfixedr   rn   r  r  num_model_outputsr  original_output_start_indexorig_model_outputs_nodeorig_model_outputsnum_orig_model_outputsorig_output_end_idxr  r   r  r   r  r  r  s                    r5   fw_compiler_basez$compile_fx.<locals>.fw_compiler_base  s    u%*:*P*PQVW"N#&88;UU"u$$%*[[%6%6"Q"%((H444"224F4K4KLM #M 2mm22::<G"w':':<''GG , /0+&%(("6"67.4ll.@.@++.11X===(.(;(;+00)%"A *--?)@&):&)->>>> #>@V"V '*;;;; ''BCVW$WAa/ W ! $ !%'5!5	
 		
$s   	7Gr&  )r  r  r  r   r   r  c                 :    t        |        t        | |fi |ddiS )Ncompilerinductor)r%   r   )r   joint_inputsr   s      r5   partition_fnz compile_fx.<locals>.partition_fnf  s,    5!2<
#)
4>
 	
r6   c           	      8    t        |       } | ||d      S )NT)r   r   r   r   r   )r  )rY  r   r  r   r  r   r  s      r5   bw_compilerzcompile_fx.<locals>.bw_compilerl  s/    u%!'5
 	
r6   r   )trace_jointr  )fw_compilerr  inference_compilerr  r  keep_inference_input_mutations)=r!   r   r  r   r-   r   rA   rS   r  r  r   r   r   r   r   r  rj   r   r   
ValueErrorrg  rh  graph_returns_tuplemake_graph_return_tuple_codegenr   handle_dynamo_export_graphr(   r  flatten_graph_inputs_raise_error_for_testingrh   r;   r   r   rL   r   _graph_counterr$   r2   dynamo_timedr   rT   rG   freezingis_grad_enabledr  r   r   r   r   r   r   r9  r   r   r   r   rC   tracingr   )r  r  r  r  r  inputs_r   fake_inputsr   firZ   recursive_compile_fxr  r  r  r  r  r   r  r   r   r   r   r  r   r  s   ` `                   @@@@r5   r  r    s!    \\.):fll>:=I- *) \\$,1%*&*	
 _-%G&%(("6"67 !' 2 2 2ww-/ IIMM%( 2  
 :k::&)%';&H
R990","dehdiik#%99+T!(( <k!k#  'I *G'//4P-	% .-
 
@ %,,#% v&& 
 	
 &%((../fll++^<-$  !9

G
GG# 
 	
 ...._-6==334J%d+NN#H )4:M:O  I
xx##I
U\\*I
 I
 I
 I
V ##$45IKu446&.. 1'!)
 '../?dS
 

588// 

ellAS 

 

  !1 U5F5F5U5U" 6V 6I 	$$,,. 	3==''	2 
 	D /O~
O $FB@__Y'):)B)B)D%k?C *E)D'' 
	#U]]%:%:&  "
|##1)%+/
 /# #"& &	#	#m *)( .--
 
 
R *E)D'''
 #""& & &	#	#	#s   (U;U5?U1UA	UAU	U5V2	V;	V# WV9V#0	V99	WUUU'	#U55U?V	VV #V,(V90	W9W	>WWc                     t        j                  |       5  t        j                         cd d d        S # 1 sw Y   y xY wr1   )r!   r   get_config_copy)r  s    r5   get_patched_config_dictr    s&    	n	%%%' 
&	%	%s   4=c                     d }t        |       }||j                  S | D ]4  }t        |t        j                        s|j
                  j                  c S  y r1   )r   r   rA   rS   r   r   )rP  r   r   rW  s       r5   r   r     sT    I (I """ eU\\*::''' 
 r6   c                     t        t        t        | j                  j                                    }|j
                  dk(  sJ |S )z$Get the output node from an FX graphr  )r   r   r   r   r   r   )r   	last_nodes     r5   output_noder    s6    T(288>>234I<<8###r6   c                    t        | t        j                  j                        syt	        |       j
                  \  }t        |t        t        f      ryt        |t        j                  j                  j                        rst        |j                  d      r]t        |j                  j                  j                        dkD  r1t        d |j                  j                  j                  D              ryy)z"True if a FX graph returns a tupleT_schemar    c              3   L   K   | ]  }t        |j                        d k(    yw)rT   N)strr   )r   rets     r5   r   z&graph_returns_tuple.<locals>.<genexpr>  s      O5NcCHH)5Ns   "$F)rA   rS   r  r  r  r   rg   r   r   r  r6  r   rh   r  returnsr  )r   rvs     r5   r  r    s    b%((../O  ER"tUm$2uxx}}))*BIIy)		!!))*Q.ORYY5F5F5N5NOO r6   
compile_gmc                    t        |       }|j                  \  }t        j                  |      \  }| j                  j                  |      5  | j                  j                  |       ddd       | j                  j                  |       t        |       sJ  || |      t        j                        fd       }|S # 1 sw Y   ZxY w)z
    Mutate gm so it returns a tuple.  This is only needed for graphs
    not created by torchdynamo that return non-tuples.
    Nc                  <    t        j                   | i |      S r1   )r   tree_unflatten)r   r   r=  specs     r5   r  z(make_graph_return_tuple.<locals>.wrapper  s     $$[$%A&%A4HHr6   )r  r   r   r  r   inserting_beforer  
erase_noder  rg  wraps)r   rP  r  r   r  r  r=  r  s         @@r5   r  r    s     r?DIIER""2&HB		"	"4	(
 
)HHr"""R(K__[!I "I N 
)	(s   CCc                      t        j                  |      \  } G  fddt        j                  j                        } | |       |      t        j                        fd       }|S )z
    Mutate inputs so that they are flat and wrap gm such that it
    accepts those inputs.  This is only needed for graphs not created
    by torchdynamo that take bumpy inputs.
    c                   ,     e Zd Z fdZfdZ xZS )'flatten_graph_inputs.<locals>.GmWrapperc                 0    t         |           | _        y r1   )super__init__r   )r>   	__class__r   s    r5   r
  z0flatten_graph_inputs.<locals>.GmWrapper.__init__  s    GDGr6   c                 `    t        |      } | j                  t        j                  |       S r1   )rg   r   r   r   )r>   r   r  s     r5   r   z/flatten_graph_inputs.<locals>.GmWrapper.forward  s*    "4jD477F11$=>>r6   )rD   rE   rF   r
  r   __classcell__)r  r   r  s   @r5   	GmWrapperr    s    		? 	?r6   r  c                  .     t        j                  |   S r1   )r   r  )r   r=  s    r5   r  z%flatten_graph_inputs.<locals>.wrapper  s     F22D9::r6   )r   r  rS   nnModulerg  r  )r   rP  r  r  r  r=  r  s   `    @@r5   r  r    s_     &&v.LFD?EHHOO ? Y[&1K__[!; "; Nr6   c                 ,   | j                   j                  t        j                  j                   j	                         | j                   _        | j                           ||  j                  |       t        j                        fd       }|S )z
    `torch._dynamo.export` embeds pytrees in the FX graph codegen object,
    convert that to a normal FX graph so inductor can compile it.
    c                  F    j                    j                  |         S r1   )process_outputsprocess_inputs)r   codegenr=  s    r5   r  z+handle_dynamo_export_graph.<locals>.wrapper  s'    &&{4JG4J4JD4Q'RSSr6   )	r   r  rS   r  CodeGen	recompiler  rg  r  )r   rP  r  r  r  r=  s       @@r5   r  r    sx     hhG..0BHHLLNR!7!7!7!@AK__[!T "T Nr6   )r   )F)rJ   r1   )r   dataclassesrg  r   r  r+  r   r   	itertoolsr   typingr   r   r   r   r   r	   r
   r   r   unittestr   functorch.compiler   torch._functorch.config
_functorchr!   r  torch.fxrS   torch.utils._pytreer   _pytreer   torch._dynamor   rt   r2   torch._dynamo.utilsr   r   torch._functorch.aot_autogradr   r   torch._inductor.codecacher   r   r   torch._inductor.debugr   
torch._opsr   torch._subclasses.fake_tensorr    torch.fx.passes.fake_tensor_propr   _dynamo.backends.commonr   fx.graphr    r"   r   r#   decompositionr$   fx_passes.joint_graphr%   fx_passes.post_gradr&   r'   fx_passes.pre_gradr(   r   r)   irr*   r+   r,   virtualizedr-   	is_fbcodetorch._inductor.fb.utilsr.   r  	getLoggerrD   rv   _logginggetArtifactLoggerr  r2  rN  	dataclassr;   rL   r[   rT   rN   rc   rG   rr   	lru_cacherw   r   r   r  r  r   wrapr   r   _python_dispatch_disable_current_modes	frozensetr$  r   rJ  rO  rX  r]  r  r  r  rl  rp  ru  r`  r  r  r  r  r  r  r   r  r  r  r  r  rJ   r6   r5   <module>r?     s5	       	 
   
 
 
  A 2 2  $ $ 
 I L N N @ ! 4 ; 2 %   . 5 B /     A 65%3 % g!00<H~~77BTU 	         M5<< S	 ell ell t " T/ / T	
 	
<588#7#7 (  ''&' ' '2 ).& "&8 44623 '+"=A+4;!%PTGG&G #G 	G
 G smG G G G !))9 :G $C.G G %Xt4D/E.F.K%LMG ?C G 4 7 GZ '+"+4;!%PToo&o #o 	o
 o smo o o o $C.o o %Xt4D/E.F.K%LMo ?C od:ell :BU\\"B7?}B	B$u||$hsm34} c]*
T%,,'(#-.
AI#
  (*@T%,,'(#-.@@  }@  (*) +-)88))  })
 ) x}%) ) ) U\\3&') )X$u||$hsm34}"
:ELL 
:		 9 (*W?88W?W?  }W?t --  6 )9/3	#!HH  #!%,,'#! CH%#! T#s(^,	#!L qI,,IU\\*I ((&&I 	I
 CH%I I I %I^ )9/3EIi#HH  i#%,,'i# CH%i# T#s(^,	i#
 T*hsCx.@"@ABi#Z(
4#5 *EHH(( EHH00 $ c"4UXX11 6 c"r6   