
    Ph|                     @   U d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZmZmZmZmZ d dlZd dlmZ d dlmZmZ d dlmZmZm Z m!Z!m"Z" d dl#m$Z$ d dl%m&Z& d d	l'm(Z) d d
l*m+Z+m,Z,  ejZ                  d      Z.ej^                  e0d<   ejb                  jd                  Z2 e       Z3 e4       Z5ee6   e0d<    e jn                  e4      Z8ee6ee6   f   e0d<    e jn                  e4      Z9ee6ee6   f   e0d<   da:e6e0d<   dLde6fdZ;dee   dee   defdZ< ed       G d d             Z=dej|                  de=fdZ?de$deee6d f   e6f   dee=   fd!Z@d"ed#edefd$ZAd"ed#ede&fd%ZBde$d"ed&ee   d'ed(ed)edej|                  fd*ZCde$d+e=d,ee=   d-eej|                  eDf   ddf
d.ZEde$d,ee=   d-eej|                  eDf   de=fd/ZFde$d,ee=   ddfd0ZG e<g g 1      de$d2eDddfd3       ZH e<eHgg 1      de$ddfd4       ZI e<g g 1      de$ddfd5       ZJ e j                  d6g d7      ZL ed       G d8 d9             ZM ed       G d: d;             ZN ed       G d< d=             ZOd>ej|                  deOfd?ZPde$d@eee6d f   e6f   deeO   fdAZQde$dBeOdCeej|                     deeOeOf   fdDZRde$dEeOdCeej|                     deeOeOf   fdFZS e<eJgeIg1      de$dGe6dHe6ddfdI       ZTde$dJeej|                     deej|                     fdKZUy)M    N)	dataclassfield)wraps)AnyCallablecastDefaultDictDictIterableListOptionalSetTupleUnion)
FakeTensorFakeTensorMode)CommTypedump_graphs_to_files	find_node
get_outputOP)IterGraphModule)TensorMetadata)_pytree)tree_flattentree_unflattengraph_optimizationlogger_optimized_func_prerequisite_sets_apply_before_sets _dump_graph_folderfolderc                 4    | st        j                         } | ay N)tempfilemkdtempr#   )r$   s    uC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/distributed/_spmd/graph_optimization.pyenable_graph_optimization_dumpr*   4   s    !!#    prerequisitesapply_afterreturnc                 0     dt         dt         f fd}|S )a  Define the contract of a graph optimization pass.

    All the passes should be wrapped with this decorator.
    `prerequisites` is used to annotate the prerequisite passes of the this pass.
    `apply_after` means that this wrapped pass must be applied after the passes
    in `apply_after`. The difference between `prerequisites` and `apply_after`
    is that all the passes in `prerequisites` must be applied to the graph and
    must be applifed before the wrapped pass while the passes `apply_after` are
    optional. But if a pass in `apply_after` is applied to the graph, it has to
    be done before the wrapped pass.
    Optimizer pass developers are required to add these fields accordingly and
    users need to follow the restrictions to avoid the assert.

    Current design has one limitation: users can only apply the optimizations
    once.  In some cases, we may need to run multiple the same optimization
    multiple time, e.g., optimization passes -> profiling the result -> apply
    optimization passes with the profiling result again. This limitation will be
    addressed limitation in the future.

    Args:
        prerequisites (Iterable[Callable]): the list of string to the names of
            passes which are the prerequisites of this pass.
        apply_after (Iterable[Callable]): the list of string to the names of
            passes that can not be applied after the wrapped pass.
    funcr.   c           	      J    dt         dt        fd} |       D ch c]
  } ||       c}t        <   D ]   }t         ||         j	                         " t               dt        t        j                  t        f   dt        dt        dd f fd       }|S c c}w )Nr0   r.   c                 8    | j                    d| j                   S )N.)
__module____name__)r0   s    r)   make_keyz8graph_optimization_pass.<locals>.inner.<locals>.make_key^   s    oo&a77r+   gmargskwargsc                    t        j                          }t        | t        j                  t        f      sJ d       t
        vsJ d d       t           j                  t
              }|rJ | d d       t           j                  t
              s"J t           t
        z
   d dt
         d        | g|i | | j                  j                          | j                  j                          | j                          t
        j                         dj                   }t         r`t        | t              r>t#        | d	| j$                  | d
| j&                  | d| j(                  it                nt#        || it                t*        j-                  dt        j                          |z
         y )NzPThe first argument of the pass must be either fx.GraphModule or IterGraphModule.zCannot apply z twice.z must be applied after r3   z are the prerequisites of z+ but are not applified. Applied passes are after_	_setup_gm_main_gm_cleanup_gmzSpent %f seconds applying %s)time
isinstancefxGraphModuler   r   r!   intersectionr    issubsetgraphlinteliminate_dead_code	recompileaddr5   r#   r   setup_gmmain_gm
cleanup_gmr   info)r7   r8   r9   begininvalid_passesprefixr0   func_keys         r)   pass_wrapperz<graph_optimization_pass.<locals>.inner.<locals>.pass_wrapperf   s    IIKEb2>>?"CD 5D ?2UmH:W4UU2/9FFWN"E !!8
!DE"%h/88I %h//AB C$$,: .&&5%6a9I %d%f%HHMMOHH((*LLN)dmm_-F!b/2(%hi0"++%hh/%hk2BMM
 + )&"7IJKK6		e8KXVr+   )r   strr    r!   rI   r   r   rA   rB   r   r   )r0   r6   fapply_after_passrR   rQ   r-   r,   s   `    @r)   innerz&graph_optimization_pass.<locals>.inner]   s    	88 	8 	8 D>=J'K]]'K8$ +x(89:>>xH !, 
t'	Wbnno56'	W?B'	WNQ'	W'	W 
'	WR ] (Ls   B )r   )r,   r-   rV   s   `` r)   graph_optimization_passrW   ?   s    <3H 3 3j Lr+   T)unsafe_hashc                       e Zd ZU eej
                     ed<   eej                     ed<   eej                     ed<   eej                     ed<   ej                  ed<   e
ej                     ed<   y)	CommBlockshape	node_listinputs
wait_nodes	comm_nodeoutputsN)r5   r4   __qualname__r   torchSize__annotations__r   rA   Noder    r+   r)   rZ   rZ      sS    EJJBGG}MRWWww\r+   rZ   r_   c                    d}g }g }t        j                  | j                  i | j                  }|D cg c]  }t	        |t
        j                        s|! }}d}d}d}	t        j                  | dg      }
|
r|dk  r|
j                         }||dz  }|
r|
j                  d       2|j                  |       |j                  j                  |      r|j                  |       n=|j                  D ].  }t	        |t
        j                        s|
j                  |       0 |
r|dk  r|st        d      t               }t        j                  |      }
|
r|
j                         }|J |j                  D ]l  }t	        |t
        j                        r>|j                  j                  |	      r#|
j                  |       |j                  |       [|j!                  |        n |
r|d   j"                  j%                  dd      }t'        |r&t)        j*                  d	 |j,                  D              nd||| ||
      S c c}w )a-  Find out all the nodes belong to this communcation given a collective node (e.g., allreduce).

    Args:
        comm_node(fx.Node): The target communication/collective node.

    Returns:
        The CommBlock that encapsulates the related nodes (e.g., wait_node) of
        the given comm_node.
       r   )	wait_commwait_tensor)splitreshapegetitemdetachaliasN   z?The wait nodes are too far away from the comm node {comm_node}.tensor_metac              3   2   K   | ]  }t        |        y wr&   )int).0ss     r)   	<genexpr>z!get_comm_block.<locals>.<genexpr>   s     ;):AQ):s   r[   r\   r^   r_   r]   r`   )pytreearg_tree_leavesr8   r9   r@   rA   re   collectionsdequepopleftappendname
startswithusersRuntimeErrorsetrI   metagetrZ   rb   rc   r[   )r_   MAX_WAIT_DISTANCEr\   r^   r]   inpinput_nodesdistancewait_prefixesnon_end_users_nodesnodesnodechildr`   userrq   s                   r)   get_comm_blockr      s    IJ##Y^^Hy7G7GHF"(E&3JsBGG,D3&KEH0MLy$/0E
HqL}}<MHT"99.d#eRWW-LL' $ HqL M
 	

  EGj)E
}}JJD$(TYY-A-ABU-VT"  &D!   a.%%))->K?Jejj;):):;;PT U Fs   IIr7   comm_ops.c                     | j                   j                  D cg c](  }|j                  j                  |      rt	        |      * c}S c c}w r&   )rE   r   r~   r   r   )r7   r   r   s      r)   get_all_comm_blocksr      sH    
 HHNN"D99) 	t"     -A	fake_tensor_modevalc           	          t        | t        j                  |j                  |j                  d|j
                        |j                        S )Nr   )dtypedevicerequires_grad)r   rb   emptyr[   r   r   r   r   r   s     r)   _create_meta_valr      sC    
 II))++		
 	

	 	r+   c           	      v    t        |j                  |j                  |j                  |j                  d di       S )NF)r[   r   r   stridememory_formatis_quantizedqparams)r   r[   r   r   r   r   s     r)   _create_meta_tensor_metar      s8     iiii''zz	 	r+   meta_valfunctionr8   r9   c                    | j                   j                  |||      }|t        ||f      \  }}g }	d }
|D ]X  }t        |t        j
                        s|	j                  |       /|j                  d   }|	j                  t        ||             Z t        |	|      \  }} ||i |}n|}||j                  d<   t        ||      |j                  d<   |S )Nr   rq   )rE   call_functionr   r@   rA   re   r}   r   r   r   r   )r7   r   r   r   r8   r9   r   	flat_argsspecnew_flat_argsr   argr   	fake_argsfake_kwargsnew_meta_vals                   r)   _call_functionr     s     88!!(D&9D&f~6	4Cc277+$$S)((5/C  !12BC!HI  "0t!D	;:k:#DIIe78H,WDIImKr+   fused_comm_blockcomm_blocksnode_indicesc                    d}| j                   j                  D ]/  }||j                  k(  r nt        j	                  ||      |      }1 |j                  }|j
                  d   }| j                   j                  |      5  | j                   j                  t        j                  ||D cg c]=  }t        t        t        j                  |j                        j                               ? c}f      }	ddd       g }
	}| j                   j                  |	      5  t!        |      D ]L  \  }}|j
                  d   }t#        j$                  t'        |j(                              }|rk|j+                         }t-        |t.        j0                        s-|   |k  r5|
j3                  |       |j5                  t'        |j(                               |rk| j                   j                  t6        j8                  |	|f      }| j                   j                  |      5  | j                   j                  t        j:                  ||j                  f      }ddd       | j                   j=                  |       O ||	k(  r}ddd       t?        |
fd      }
| j                   jA                  |
|       | j                   jC                          yc c}w # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   nxY w)zzScatter the result of the fused communication node to the original users -- splitting the output and reshape each subitem.r   Nc                     |    S r&   rf   )r   r   s    r)   <lambda>z&_scatter_wait_result.<locals>.<lambda>f  s
    |D?Qr+   )key)"rE   r   r_   maxr   r^   inserting_afterr   atenrk   rs   r   rb   rc   r[   numel	enumeraterz   r{   listr   r|   r@   rA   re   r}   extendoperatorrm   rl   node_replace_all_uses_withsorted
move_afterrG   )r7   r   r   r   last_wait_node_idxr   fused_comm_nodefused_wait_nodecb
split_nodeneed_sort_nodeslast_split_reshape_nodeidx
comm_block	orig_waitr   	user_nodesplit_idx_nodewait_output_nodes      `               r)   _scatter_wait_resultr   -  sl    #--- T#568J
  '00O&11!4O		!	!/	2XX++JJCNO;RT%**bhh/5578;O

 
3 O(		!	!*	-(5OC
 #--a0I%%d9??&;<E!MMO	!)RWW5	*-??#**95LLioo!67   XX33H4D4DzSVFWXN)).9#%88#9#9LL>:3C3C"D$  : HH//	;KL)  6, #j0&6#1 
.4 _2QROHH)@AHH  "I P 
3	2< :9# 
.	-sK   *K,AK.KB6K-AK-7K!.K-KK!K*&K--K6c                 4   |d   j                   d   }d}g }|D ]z  }|j                   d   }|j                  j                  d      r't        t        j
                  |j                  d         }|j                  |       ||   }||k\  sp||k7  sJ |}|}| | j                  j                  |      5  g }	|D ]<  }|	j                  t        | t        dt        j                  j                  |             > 	 ddd       | j                  j                  	d         5  t        | t        dt        j                  |	      }
ddd       |d   }|j                   }|j"                  d   }| j                  j                  
      5  t%        |j                  |j&                  f      \  }}|
|d<   t)        ||      \  }}t        | t        |
j*                  d   |j,                  g|i |}ddd       | j                  j                        5  t%        |j                  |j&                  f      \  }}||d<   t)        ||      \  }}t        | t        |
j*                  d   |j,                  g|i |}ddd       |	|
|gz   }| j                  j/                  ||       |
j*                  j1                  d      }t3        |j4                  ||g|g||
g|h      }t7        | |||       |S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w)zLFuse the CommBlocks using concat given a list of CommBlock (only allreduce).r   cloneNr   rq   rw   )r]   r~   r   r   rA   re   r8   r}   rE   r   r   r   r   flatten
using_intscatr_   r^   r   r9   r   r   targetr   r   rZ   r[   r   )r7   r   r   last_input_nodelast_input_indexall_input_nodesr   
input_nodeindex
cat_inputscat_node	last_commlast_comm_nodelast_wait_nodeflatten_argsr   r8   r9   r   r   nodes_to_moverq   r   s                          r)   _fuse_with_catr   l  s    "!n++A.OO!
&&q)
??%%g.bggzq'9:Jz*Z($$,,,,(O$ " 
	!	!/	2
)J($0G0G * 
3 
	!	!*Q-	0!"&6dhh
S 
1 BI((N))!,N		!	!(	+)>+>+>@U@U*VWd"Q%lD9f(MM% !!	

 
 
	 
, 
	!	!/	2)>+>+>@U@U*VWd)Q%lD9f(MM% !!	

 
 
	 
3 (O_!MMMHH7--##M2K "O4#$!z ! -{LIw 
3	2 
1	0 
,	+ 
3	2s3   1AK'"K4A&L
A&L'K14K>LLc                 *   t        | j                  j                        D ci c]  \  }}||
 }}}|D ]V  }|j                  }d}|j                  D ]  }||   }	|	|kD  s|}|	} | j                  j                  ||j                         X y c c}}w )Nr   )r   rE   r   r_   r]   node_append)
r7   r   ir   r   r   
last_inputlast_input_idxinput	input_idxs
             r)   _expedite_comm_opsr     s    +4RXX^^+DE+D4D!G+DLE!
))
&&E$U+I>)"
!*	 '
 	Z)=)=> " Fs   B)r,   r-   bucket_size_mbc                 F   t        | t        j                  df      }t        | |       t        | t        j                  df      }t	        | j
                  j                        D ci c]  \  }}||
 }}}d}|dz  }dx}x}	}
|	t        |      k  rk|
t        t        j                  ||	   j                        j                         dz  z  }
|	dz  }	|
|k  rTt        | |||	 |       |}|	}d}
|	t        |      k  rk|t        |      k  rt        | |||	 |       yyc c}}w )zoRun fuse communication with concat.

    This implementation uses concat to concat the bucketed gradients.
    
all_reducei   r      rp   N)r   r   	ALLREDUCEr   r   rE   r   lenr   rb   rc   r[   r   r   )r7   r   r   r   r   r   bucket_sizebucket_cap_sizerN   end	curr_sizes              r)   comm_fusion_with_concatr     s2    &b8+=+=|*LMKr;'%b8+=+=|*LMK+4RXX^^+DE+D4D!G+DLEK$w.OEC)
K 
 T%**k#&6&<&<=CCEII	q{"r;uS1<@%	 K 
  3{##2{55|D $! Fs   'Dc           
         t        | t        j                  df      }t               }|D ].  }|j                  D ]  }|j                  |j                          0 t        | j                  j                        D ci c]  \  }}||
 }}}|D ]  }t        |j                        dk\  sJ d| d       t        t        t        t        |j                              j                              }d}	d |j                  D        D ]  }
||
   }||	k  s|
}|}	 d}t        |j                        D ]  \  }}||j                  d   k(  s n |dk\  sJ | j                  j                  |j                  |d	 |        y	c c}}w )
zFDelay the execution of wait tensors of allreduce until its first user.r   rp   z1Found a allreduce that has zero outputs/users -- r3   l        c              3   B   K   | ]  }|j                   D ]  }|   y wr&   )r   )rt   outputr   s      r)   rv   z%schedule_comm_wait.<locals>.<genexpr>
  s     S(9fflldTlT(9s   r   r   N)r   r   r   r   r`   updater   r   rE   r   r   nextiterr\   r^   move_before)r7   r   allreduce_users	allreducer   r   r   r   target_nodetarget_node_indexr   r   wait_idxs                r)   schedule_comm_waitr    s    &b8+=+=|*LMK %(EO 	''F""6<<0 ( ! ,5RXX^^+DE+D4D!G+DLE 	 	!!"a'	L>ykK	L' 4T)*;*;%< = C CDE!S	(9(9SD &E(("$)!	 T '	(;(;<NHdy++A.. = 1}}
Y00;[I+ ! Fs   <E=c                    d}t               }t        | j                  j                        D ]y  }|j                  r|j
                  t        j                  k7  s'|j                  t        j                  j                  k7  rVt               }t        j                  |dg      }d}d}|r||k  r|j                         }||dz  }|r|j                  d       2|j!                  |       |j
                  t        j                  k(  r&t#        |j                        j%                  d      rd}t'        j(                  |j*                  i |j,                  }	|	D ].  }
t/        |
t0        j2                        s|j                  |
       0 |r||k  r|si|j5                  |       | t        | j                  j                        D ]/  }|j                  r||vr| j                  j7                  |       1 y)a  Erase the orphant copy_ that generated when tracing optimizer.

    Two reasons why we could not simply use the DCE of fx.Graph.
    1. fx.Graph treats copy_ as a side-effect node and does not erase it.
    2. Users may want to preserve some orphan `copy_` that is not from the
       optimizer.
    If the second reason does not hold, this pass can be rewritten as using
    DCE from fx.Graph (with the overwrite to the side-effect node list).
    rh   Nr   Frp   )zaten._foreach_zaten._fused_T)r   reversedrE   r   r   opr   CALL_FUNCTIONr   r   copy_defaultrz   r{   r|   r}   rI   rS   r   rx   ry   r8   r9   r@   rA   re   r   
erase_node)r7   MAX_COPY_DISTANCEremove_candidatesr   copy_ancestorsr   r   should_removevisitingparentsparents              r)   remove_copy_from_optimizerr    s    &)e(::77b&&&$++9K9K*K'*u!!4,/#44}}HALL&x({{b...3x3G3R3R24 !%,,hmmOxOG!fbgg.LL( " #44   $$^4= )@ (::((
D! )r+   AdamArgs)paramsgradsexp_avgsexp_avg_sqsmax_exp_avg_sqsstate_stepsc                   V   e Zd ZU ej                  ed<   eed<    ee      Z	e
ej                     ed<    ee      Ze
ej                     ed<    ee      Ze
ej                     ed<    ee      Ze
ej                     ed<    ee      Ze
ej                     ed<   d	 Zd
 Zd Zy)FusedAdamBlock
optim_nodegenerate_outputdefault_factoryparam_outputsgrad_outputsexp_avgs_outputsexp_avg_sqs_outputsr  c                       fd} |d j                           |d j                          |d j                         y )Nc                    j                   j                  }|j                  j                         5  |j                  t        j
                  j                   | f      }d d d        t        j                   j                  |          D ]  \  }}|j                        5  |j                  t        j
                  ||f      }d d d        |j                        5  |j                  t        j                  ||f      }d d d        |j                          y # 1 sw Y   xY w# 1 sw Y   exY w# 1 sw Y   5xY wr&   )r  rE   r   r   r   rm   r   r8   r   r  r}   )	arg_idxoutput_listrE   optim_getitemr   r   updated_argoutput_copyselfs	           r)   _generate_outputsz:FusedAdamBlock.generate_outputs.<locals>._generate_outputsi  s    OO))E&&t7 % 3 3$$t&@! 8 $DOO$8$8$AB3**=9"'"5"5 ((=!*<#K : **;7"'"5"5djj3BT"UK 8"";/ C	 87
 :9 87s#   -D!#D(#D4D%(D1	4D=	r         r  r!  r"  )r*  r+  s   ` r)   generate_outputszFusedAdamBlock.generate_outputsf  s:    	0 	!T//0!T223!T556r+   c                       fd} |d j                           |d j                          |d j                         y )Nc           
         	j                   }	j                   j                  D ]J  }|j                  t        j                  k(  sJ d	j                    d       |j
                  d   | k(  sH|} n |	j                   k7  sJ d	j                           |j                  	j                   gt        t        t        t        j                     	j                   j
                  d               z         |j                  D ]  }|j                  t        j                  k(  sJ d|j                   d       |j
                  d   }t        t        |j                              }t        |j                        j                  d      sJ d|j                   d       |||<    t!        |      D ]  \  }}|	j                   k7  rJ | d	        |sJ d
	j                    d       y )NzThe user of z is not getitem.rp   z!Cannot find the getitem node for r   Unexpected node target r3   
aten.copy_th output is not replaced.The output for 
 is empty.)r  r   r   r   rm   r8   r   r   r   r   rA   re   r   r   rS   r   r   )
args_idxr&  r'  r   r(  r   r)  r   r   r*  s
            r)   _populate_outputsz:FusedAdamBlock.populate_outputs.<locals>._populate_outputs  s    OOM--KK8#3#33D!$//!22BCD399Q<8+$(M . 0E24??2CDE0!CT"'']DOO<P<PQR<S(T$UU  -22&&(*:*::C,[-?-?,@BC:!&&q)"4(9(9#:;;--.99  C,[-?-?,@BC  $/C   3 '{3	60RQC7Q2RR0 4 M/$//1B* MM;r+   r   r,  r-  r.  )r*  r8  s   ` r)   populate_outputszFusedAdamBlock.populate_outputs}  s;    	N< 	!T//0!T223!T556r+   c                 x    | j                   ry | j                  r| j                          y | j                          y r&   )r  r  r/  r9  r*  s    r)   __post_init__zFusedAdamBlock.__post_init__  s/    !!#!!#r+   N)r5   r4   ra   rA   re   rd   boolr   r   r  r   r   r!  r"  r  r/  r9  r<  rf   r+   r)   r  r  Z  s    #(#>M4=>"'"=L$rww-=&+D&Ad277mA).t)DbggD%*4%@OT"'']@7."7H$r+   r  c                   ~    e Zd ZU ej                  ed<   eed<    ee      Z	e
ej                     ed<   d Zd Zd Zy)	ForeachAddBlockadd_noder  r  r`   c                 r   | j                   j                  }t        t        t        t
        df   | j                   j                  d               D ]  \  }}|j                  | j                         5  |j                  t        j                  | j                   |f      }d d d        |j                        5  |j                  t        j                  ||f      }d d d        | j                  j                          | j                  sJ d| j                    d       y # 1 sw Y   xY w# 1 sw Y   TxY w)N.r   r5  r6  )r@  rE   r   r   r   r   r8   r   r   r   rm   r   r  r`   r}   )r*  rE   r   r   r(  r)  s         r)   r/  z ForeachAddBlock.generate_outputs  s     ##U38_dmm6H6H6K LMFAs&&t}}5#11(2B2BT]]TUDVW 6&&{3#11$**sK>PQ 4LL, N ||Ht}}oZHH| 6533s   1-D!7#D-!D*	-D6	c                    t        t        t        df   | j                  j                  d         D cg c]  }| j                   c}| _        | j                  j                  D ]  }|j                  t        j                  k(  sJ d|j                          t        t        |j                  d         }t        t        |j                              }t        |j                        j                  d      sJ dt        |j                                || j
                  |<    t        | j
                        D ]  \  }}|| j                  k7  rJ | d        y c c}w )N.r   r2  rp   r3  z'The execpted output node is different, r4  )r   r   r   r@  r8   r`   r   r   r   rm   rs   r   r   rS   r   r   )r*  _r(  r   r)  r   r   s          r)   r9  z ForeachAddBlock.populate_outputs  sF    $(c3h9K9KA9N#O
#OaDMM#O
  ==..K""h&6&66>(););(<=>6sK,,Q/0CtK$5$567K{))*55 S8[=O=O9P8QRS  !,DLL / #4<<0IAvT]]*Lqc1K,LL* 1
s   Ec                 x    | j                   ry | j                  r| j                          y | j                          y r&   )r`   r  r/  r9  r;  s    r)   r<  zForeachAddBlock.__post_init__  s-    <<!!#!!#r+   N)r5   r4   ra   rA   re   rd   r=  r   r   r`   r   r/  r9  r<  rf   r+   r)   r?  r?    s:    gg"48GT"'']8
IM$$r+   r?  c                   "    e Zd ZU eed<   eed<   y)FusedOptimizerBlockstepoptimN)r5   r4   ra   r?  rd   r  rf   r+   r)   rF  rF    s    
r+   rF  r  c                 0   d}t        j                  | dg      }| }d}|r||k  r|j                         }||dz  }|r|j                  d       2|j                  t
        j                  k(  r't        |j                        j                  d      r|}nG|j                  d t        j                  |j                  i |j                  D               |r||k  r|| k(  rt        d|  d| d	      t!        |d
      }t#        | d
      }t%        ||      S )z@Given a fused optimizer node and return the FusedOptimizerBlock.rh   Nr   rp   zaten._foreach_addc              3   T   K   | ]   }t        |t        j                        r| " y wr&   )r@   rA   re   )rt   as     r)   rv   z,get_fused_optimizer_block.<locals>.<genexpr>  s&      JAa) Js   &(z;Cannot find step node (foreach_add) for the optimizer node z with z? BFS distance. The API design does not match the tracing graph.Fr  )rz   r{   r|   r}   r  r   r  rS   r   r   r   rx   ry   r8   r9   r   r?  r  rF  )r  MAX_STEP_DISTANCEr   	step_noder   r   rG  rH  s           r)   get_fused_optimizer_blockrO    s(   z401EIH
H00}}<MHT"WW(((S-=-H-H.
 ILL //JdkkJ  H00$ JIl&!2 3 4??
 	
 9e<D:u=EtU++r+   	optim_opsc                     | j                   j                  D cg c](  }|j                  j                  |      rt	        |      * c}S c c}w )zQFind all the FusedOptimizerBlock that the optimizer operators are in `optim_ops`.)rE   r   r~   r   rO  )r7   rP  r   s      r)   get_all_fused_optimizer_blocksrR    sH     HHNN"D99	* 	"$'"  r   orig_optim_blocksplit_gradientsc           
        %& t        |j                  j                  j                   }t        g g g g g g       t        g g g g g g       f}g g f}g g f}t	        |j
                        D ]  \  }}||v rdnd}	||	   j                  |       t        |||	         D ]  \  }
}|
s	|j                  |
|           ||	   j                  d   }t        |j                        j                  d      sJ d|j                   d       |j                  d   }dt        |j                        v sJ d|j                   d	       |j                  d   }||	   j                  |        t        d
 ||z   D              st        d      t        | j                        }g }t!        |j                  |j"                  f      \  %}t%        j&                  t(              &t	        %      D ]4  \  }}t+        |t,        j.                        s!&|   j1                  |       6 dt,        j.                  dt,        j.                  f%&fd}t3        d      D ]x  }	g }g }| j                  j5                  |j                  j                        5  ||	   D ]  }|j                  t7        t8        t,        j.                  df   |j:                  j<                  j                  d         |          |j                  |j:                  j>                  |           | j                  jA                  tB        jD                  jF                  |df      }ddd       tI        d      }t	        |j>                        D ]W  \  }}||   } |||       ||	   j                  |   |k(  sJ d| d||	   j                  |           |||	   j                  |<   Y | j                  j5                  |j>                  d         5  | j                  jA                  tB        jJ                  jL                  ||	   |j                  j                  j"                        }ddd       tO        d      }t	        ||	         D ]?  \  }}d}|D ]3  }tQ        |j                  |      } tQ        ||      }! || |   |!|          5 A |j                  tS        ||             { tU        %|      \  }"}#| j                  jW                  ||"       | j                  jY                  ||#       t[        j\                  |j                  j^                  |j                  j`                  |j                  jb                        D ]  }$| j                  je                  |$        | j                  jg                          |j:                  j>                  D ]  }$| j                  je                  |$        | j                  jg                          |d   |d   fS # 1 sw Y   xY w# 1 sw Y   xY w)a  Split the `orig_optim_block` into two FusedOptimizerBlock.

    The first one will be the optimizer that optimize `split_gradients`. The second one is
    used to optimize the remaining gradients.
    An assert will be raised if one of the optimizer optimize zero gradients.
    r   rp   r   r3  zThe copy output is z, expect aten.copy_rm   zThe copy getitem is z, expect operator.getitemc              3       K   | ]  }|  y wr&   rf   )rt   ls     r)   rv   z$_split_fused_adam.<locals>.<genexpr>9  s     CAQqAs   z1At least one split optimizer does not have input.	orig_nodenew_nodec                 $    |    D ]  }||<   	 y r&   rf   )rX  rY  r   flatten_output_argsflatten_output_args_indicess      r)   replace_flatten_output_argsz6_split_fused_adam.<locals>.replace_flatten_output_argsF  s    .y9C'/$ :r+   r,  .NTrL  z*The expected step output node mismatched,  r.  )4r  rH  r  r8   r   r  r}   zipr  rS   r   r   all
ValueErrorr   rE   r   r9   rz   defaultdictr   r@   rA   re   rI   ranger   r   r   rG  r@  r`   r   r   _foreach_addScalarr?  _fused_adamr  r  getattrrF  r   node_set_argsnode_set_kwargs	itertoolschainr  r!  r"  r	  rG   )'r7   rS  rT  orig_optim_args
optim_argsorig_optim_indicesorig_step_indicesr   gradient	group_idxorig_arg	optim_argorig_step_outputorig_step_getitemorig_step_idxr   resultsr   
output_argr]  	step_argsorig_step_outputsrG  
step_blockr   step_outputrH  optim_blockcurr_idxorig_idx
list_namesr~   	orig_list	curr_listoutput_argsoutput_kwargscopy_outputr[  r\  s'                                        @@r)   _split_fused_adamr    s     0 6 6 A A F FGO2r2r2r2HRRRQS4TUJ792h68"X"?#8#89X!_4A!	9%,,S1#&
98M#NHi   #/ $O &i0<<R@#**+66
 	N !1!8!8 99LM	N 
 -11!4C$$
 
 	V!"3":":!;;TU	V 
 *..q1)$++M:/ :2 C,/AACCLMM!F)+G ,fkk6==-I J 	$   %%89Zj"''*'
377< :0rww 0"'' 0
 1X	#%	+- XX%%&6&<&<&G&GH(3  rww|,.>.C.C.L.L.Q.QRS.TU
 "(()9)>)>)F)Fs)KL 4 88))!!((AD I %T4@
'
(:(:;NA{  13'(8+Fi(44Q7;KK <=M<Nai(44Q78:K 4?Jy!--a0 < XX%%j&8&8&;<HH**  ((9% &&1188E = %UDA"+,>y,I"JHhUJ"#$4$:$:DA	#K6	+Ih,?8ATU # #K 	*:{CD] d "00CT!JKHH6;/HHV]3 ,,//22
 	K( HH  "',,44
K( 5 HH  "1:wqz!! IH6 =<s    B?W3+AX 3W=	 X
	r}  c                     |st        d      t        |j                  j                  j                        j                  d      rt        | ||      S t        d      )Nz#The given split_gradients is empty.zaten._fused_adamz Only fused_adam is supported now)ra  rS   rH  r  r   r   r  NotImplementedError)r7   r}  rT  s      r)   split_fused_optimizerr    sT    
 >??
;''../::;MN [/BB!"DEEr+   target_comm_nodetarget_dest_nodec                 `   t        | d      D ]  }|j                  j                  |k(  s n t        d|       t	        | d      }|D ]V  }t        |j                  j                  j                   }t        t        |j                              }||j                  v sV n t        | d      t        | ||j                        \  }}	t        | |j                  |j                  j                   g      }
t#        | j$                  fd      d   }| j$                  j'                  |
|       y)zExtract a comm block and split out a new optimizer and step for it.

    This subgraph is then moved to the forward graph.
    r   zCannot find rf  z$ is not used by any fused optimizer.c                 "    | j                   k(  S r&   )r~   )nr  s    r)   r   z0iter_move_grads_and_optimizers.<locals>.<lambda>  s    aff8H.Hr+   r   N)r   r_   r~   ra  rR  r  rH  r  r8   r   r   r`   r  r  find_all_descendantsrG  r@  r   rE   move_to_next_iter_before)r7   r  r  r   optim_blocksr}  rm  
one_output
move_optimrC  
move_nodes	stop_nodes     `         r)   iter_move_grads_and_optimizersr    s    *"l;
$$(88 < <(8'9:;;1"mDL#{00;;@@A
$z1123
)))	 $ ,--QRSS)"k:;M;MNMJ%
Z!!:??#;#;<J "(($HI!LIHH%%j)<r+   parent_nodesc                    t        |      dkD  sJ d       t        | j                        }t        j                  |      }t               }|rb|j                         }|j                  |       ||j                  D cg c]%  }t        |t        j                        s||k7  s$|' c}z  }|rb| j                  j                  D cg c]	  }||v s| }}|S c c}w c c}w )zBIdentify the list of nodes to move during FX graph transformation.r   zNo parent nodes are given.)r   r   rE   rz   r{   r   r|   rI   r   r@   rA   re   r   )r7   r  r   dq_parent_nodesmove_node_setr   ur  s           r)   r  r    s    
 |q >">> !F!''5OEM
&&($zz
!!Z277%;VAz
 	
  $&88>>K>4T]5J$>JK
 Ls   <CC"C	CC)r"   )Vrz   rj  loggingr   r'   r?   dataclassesr   r   	functoolsr   typingr   r   r   r	   r
   r   r   r   r   r   r   rb   torch.fxrA   torch._subclasses.fake_tensorr   r   #torch.distributed._spmd.graph_utilsr   r   r   r   r   )torch.distributed._spmd.iter_graph_moduler   torch.fx.passes.shape_propr   torch.utilsr   rx   torch.utils._pytreer   r   	getLoggerr   Loggerrd   opsr   r   r   r   rS   rb  r    r!   r#   r*   rW   rZ   re   r   r   r   r   r   rs   r   r   r   r   r  r  
namedtupler  r  r?  rF  rO  rR  r  r  r  r  rf   r+   r)   <module>r     s-         (       D  F 5 ) <***+?@ @yy~~!# ES !1H1H1H1M KSX. M 2I1H1H1M KSX. M C  3  SH%S(#S Sl t  Bbgg B) BJ#(sCx#)=#>	)_$	 "$	  $ z" 	
   WW<<#<#<# i<# rww|$	<#
 
<#~RRiR rww|$R 	Rj
?? 
?i 
?T 
? EEE 
E	EB *+ J?  Jt  J	 JF 1"? 1"t 1"	1"j ";!!T tL$ L$ L$^ t+$ +$ +$\ t  
","'' ",6I ",J$)%S/3*>$?	
@"@")@" \@"  334	@"F
F
F$
F \
F  334	
F -.#$ = = =  = 
	 =	 =Frww- 
"'']r+   