
    Ph                     2   d dl Z d dlZd dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZmZ d dlmZmZmZmZmZmZmZmZ d dlmZ d dlZd dlmZ d dlmZ d dlm Z m!Z" d d	l#m$Z$ d d
l%m&Z& d dl'm(Z(m)Z)m*Z* d dl+m,Z, d dl-m.Z.m/Z/m0Z0 d dlm1Z1m2Z2 d dl3m4Z5 d dl6m7Z7m8Z8m9Z9 d dl:m;Z;m<Z<  G d de      Z= G d de      Z> G d dej~                  e      Z@dej~                  dej                  defdZB	 	 dGdej~                  deCfdZDdHdZEd ZFd  ZGdIdej~                  d!eCfd"ZHdej~                  d#eCfd$ZIdej~                  d%eCfd&ZJ G d' d(      ZK G d) d*e@      ZL G d+ d,e@      ZM G d- d.eM      ZN G d/ d0eM      ZO G d1 d2e@      ZP G d3 d4eP      ZQ G d5 d6ej~                        ZR G d7 d8eM      ZSd9eeTee   f   d:ed;efd<ZU G d= d>e8      ZV G d? d@e7      ZW G dA dBej~                        ZX G dC dDej~                        ZY G dE dFej~                        ZZy)J    N)ABCabstractmethod)nullcontext)deepcopy)autoEnum)AnyCallableDictListOptionalTupleTypeUnion)mock)
CPUOffloadFullyShardedDataParallel)TrainingState)#NO_RESHARD_AFTER_FORWARD_STRATEGIES)BackwardPrefetchMixedPrecisionShardingStrategy)ShardedGradScaler)always_wrap_policyModuleWrapPolicywrap)TransformerDecoderLayerTransformerEncoderLayer)DistributedDataParallel)MultiProcessTestCaseMultiThreadedTestCase
TEST_SKIPS)FILE_SCHEMAget_cycles_per_msc                   (    e Zd Z e       Z e       Zy)FSDPInitModeN)__name__
__module____qualname__r   NO_FSDP	RECURSIVE     nC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/testing/_internal/common_fsdp.pyr&   r&   %   s    fGIr-   r&   c                   6    e Zd Z e       Z e       Z e       Zy)CUDAInitModeN)r'   r(   r)   r   CUDA_BEFORE
CUDA_AFTER
CUDA_NEVERr,   r-   r.   r0   r0   .   s    &KJJr-   r0   c                       e Zd ZdZedeej                  df   fd       Zedej                  fd       Z	edd       Z
eeddd	d
ej                  dedededeeeef      dededej,                  fd              Zy)FSDPTestModelzZThis defines the interface expected from all models used commonly for
    FSDP unit tests.return.c                      y)z+Returns an input for the model as as tuple.Nr,   selfdevices     r.   	get_inputzFSDPTestModel.get_input;        	r-   c                      y)z,Returns the loss given the input and output.Nr,   )r9   inputoutputs      r.   get_losszFSDPTestModel.get_loss@   r<   r-   Nc                      y)z<Runs the backward pass (e.g. including ``loss.backward()``).Nr,   r9   losss     r.   run_backwardzFSDPTestModel.run_backwardE   r<   r-   F)fsdp_kwargsdeterministicgroupfsdp_init_mode	init_argscuda_init_moderE   rF   init_kwargsc                     y)z&Initializes an instance of this model.Nr,   )rG   rH   rJ   rE   rF   rI   rK   s          r.   initzFSDPTestModel.initJ   s     	r-   r6   N)r'   r(   r)   __doc__r   r   torchTensorr;   r@   rD   staticmethoddistProcessGroupr&   r	   r0   r   r   strboolnnModulerM   r,   r-   r.   r5   r5   7   s     5s):#;         15#
  
$
 
 %	

 d38n-
 
 
 

  
r-   r5   modelprocess_group	assert_fnc                    | j                         D cg c]%  \  }}||j                         j                         f' }}}|| j                         D cg c]%  \  }}||j                         j                         f' c}}z  }t	        j
                  |      }t        |      D 	cg c]  }	d }
}	t	        j                  |
||       |
d   }|
dd D ]%  }t        ||      D ]  \  \  }	}\  }	} |||        ' yc c}}w c c}}w c c}	w )a  
    All-gathers module states across ranks and calls ``assert_fn`` on each pair
    of corresponding states from rank 0 and a nonzero rank. For example, if
    ``assert_fn`` is ``self.assertEqual()``, then this checks that all module
    states are equal across ranks.
    NrG   r      )	named_parametersdetachcpunamed_buffersrS   get_world_sizerangeall_gather_objectzip)rY   rZ   r[   
param_nameparamnamed_module_statesbuffer_namebuffer
world_size_olistrank0_statesstatep1p2s                  r.   _assert_module_statesrs   Y   s     "'!7!7!9!9J 
U\\^'')*!9   #(#6#6#8#8K 
fmmo))+,#8  $$]3J ,-,aT,E-5"5]K8Lqr #L% 8GQWab" !9 
 .s   *C=*D'	D	zero_buffersc                    |rt        j                  |       n	t               }|5  | j                         D ]/  }t	        j
                         5  |j                          ddd       1 |rB| j                         D ]/  }t	        j
                         5  |j                          ddd       1 ddd       y# 1 sw Y   xY w# 1 sw Y   PxY w# 1 sw Y   yxY w)zBZeros the parameters and optionally buffers of ``model`` in place.N)FSDPsummon_full_paramsr   
parametersrP   no_gradzero_buffers)rY   rt   summon_fullctxrh   rk   s         r.   _zero_modelr~   v   s     -8$
!
!%
([]C	%%'E ! ( --/]]_LLN %_ * 
  %_ 
s;   (CB43CC !
C4B=9C C	CCc                 j    |s| j                         } |r| j                          | j                         S N)cudahalf
state_dict)rY   cpu_offloadr   s      r.   _get_state_dictr      s+    



r-   c           	      j    dj                  |D cg c]  }|| t        |         nd c}      S c c}w )Nrm   none)joinrU   )test_name_mappingargsss      r.   subtest_namer      s;    88IMNAam	3q6	"	?N Ns   0c                 @   |j                         D ];  \  }}|j                  t        j                  d      k7  s)|j                         ||<   = | dk(  r|nd g}t	        j
                  |       |d   }|j                         D ]  }||   j                         ||<    |S )Nra   r   )itemsr:   rP   ra   rS   broadcast_object_listkeysr   )rankr   rg   rh   rn   s        r.   _broadcast_state_dictr      s     (--/
E<<5<<..%*YY[Jz" 0  19Z$/Eu%qJ oo'
!+J!7!<!<!>
: (r-   recursec                     t        j                  | |      5  t        t        | j	                                     cddd       S # 1 sw Y   yxY w)a[  
    Returns the full unsharded parameters of ``model``. Any FSDP-managed
    parameters offloaded to CPU are moved to GPU in the returned list.

    Args:
        recurse (bool): If ``False``, only unshards the parameters immediate to
            ``model``; if ``True``, recurses through the module hierarchy
            rooted at ``model``.
    )r   N)rv   rw   r   listrx   )rY   r   s     r.   get_full_paramsr      s6     
	 	 	8U--/01 
9	8	8s   "AAmove_to_cudac                 *    |r| j                         S | S r   )r   )rY   r   s     r.   _maybe_cudar      s    '5::<2U2r-   	wrap_fsdpc                 (    |s| S t        | g|i |S r   rv   )rY   r   r   kwargss       r.   _maybe_wrap_fsdpr      s    !5CtE'CD'CF'CCr-   c                   :    e Zd ZdedefdZdefdZdefdZd Zy)	DummyProcessGroupr   sizec                      || _         || _        y r   )_rank_size)r9   r   r   s      r.   __init__zDummyProcessGroup.__init__   s    

r-   r6   c                     | j                   S r   )r   r9   s    r.   r   zDummyProcessGroup.rank       zzr-   c                     | j                   S r   )r   r   s    r.   r   zDummyProcessGroup.size   r   r-   c                 B    t        j                         }d }||_        |S )Nc                  d    t         j                  j                         } | j                  d       | S )Nr^   )rP   futuresFuture
set_result)futures    r.   
get_futurez/DummyProcessGroup.allreduce.<locals>.get_future   s'    ]]))+Fa Mr-   )r   Mockr   )r9   r   r   	dist_waitr   s        r.   	allreducezDummyProcessGroup.allreduce   s"    IIK		
  *	r-   N)r'   r(   r)   intr   r   r   r   r,   r-   r.   r   r      s2    S  c c 	r-   r   c                        e Zd Zdej                  dededef fdZd Zd Z	d Z
d	 Ze	 	 	 ddej                  d
ededeeeef      dededeej(                  ef   fd       Zd Z xZS )TransformerWithSharedParamsrG   rJ   add_bnrF   c                    t         |           |j                         | _        |j                         | _        |rt        j                  d       d}d}t        j                  ||      | _	        t        j                  |dddd      | _        t        j                  ||      | _        | j                  j                  | j                  _        | j                  d| j                  j                  j!                  |f             | j                  d	t        j"                  | j$                  t
        j&                  
             d| _        |r)t
        j                  j+                  | j(                        nt
        j                  j-                         | _        |t0        j2                  k(  r| j5                         } |r| j7                          y y )Nr               g?)d_modelnum_encoder_layersnum_decoder_layersdim_feedforwarddropout
vocab_biaslong_buffer)dtype)superr   r   r   rl   rP   manual_seedrW   	Embeddingembed_tokensTransformertransformerLinearoutput_projweightregister_buffernew_ones
zeros_liker   longbsBatchNorm1dIdentitybnr0   r1   r   eval)r9   rG   rJ   r   rF   d_vocabr   	__class__s          r.   r   z$TransformerWithSharedParams.__init__   s^    	JJL	**,a LL':>>  
 99Wg6 #'"3"3":":$++22;;WJG	
 	T__EJJ?	

 39%((&&tww/uxx?P?P?R\55599;DIIK r-   c                 ,   t        j                  d| j                  z          t        j                  d|      j	                  d| j
                        }t        j                  | j
                  dz  |      j	                  d| j
                        }||fS )Nr^      r:         )rP   r   r   arangeviewr   )r9   r:   srctgts       r.   r;   z%TransformerWithSharedParams.get_input   sl    !dii-(ll2f-221dgg>ll477Q;v6;;AtwwGSzr-   c                    | j                  |      }|| j                  z   | j                  j                  |      z   }| j                  |      }| j	                  |      }| j                  ||      }| j                  |      S r   )r   r   r   type_asr   r   r   )r9   src_idstgt_idsr   r   xs         r.   forwardz#TransformerWithSharedParams.forward  sv    (DOO#d&6&6&>&>s&CC(ggclS#&""r-   c                     |\  }}t         j                  j                  |j                  d|j	                  d            |j                  d      d      S )Nsum)	reduction)rW   
functionalcross_entropyr   r   )r9   r>   r?   rm   r   s        r.   r@   z$TransformerWithSharedParams.get_loss  sI    3}}**KKFKKO,chhrle + 
 	
r-   c                 $    |j                          y r   backwardrB   s     r.   rD   z(TransformerWithSharedParams.run_backward      r-   rH   rE   r6   c                 D   |i }|t         j                  k(  r&t        | t              r| d   }n| }t	        ||||      S |t         j
                  k(  rd|vrt        t        t        h      }n|j                  d      }d|v r8|d   t        j                  t        j                  hv rt        | t              sd}n| }t        | t              r| d   }	n| }	t	        |	|||      }
t        |
|fd|i|}|t        j                  k(  r|j!                         }|S t#        d|       )ao  
        Initializes a :class:`TransformerWithSharedParams` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps with
                top-level FSDP. By default, the top-level FSDP uses the
                ``ModuleWrapPolicy`` for encoder and decoder layers, but a
                different auto wrap policy may be specified via
                ``fsdp_kwargs``.
            cuda_init_mode (CUDAInitMode): Determines model movement to CUDA.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
            add_bn (bool): Whether to include batch norm in the model.
        Nr   auto_wrap_policysharding_strategyUnsupported FSDP init mode: )r&   r*   
isinstancetupler   r+   r   r   r   popr   HYBRID_SHARD_HYBRID_SHARD_ZERO2rv   r0   r2   r   
ValueError)rG   rH   rJ   rE   rF   r   pgr   fsdp_pg
tformer_pgm
fsdp_models               r.   rM   z TransformerWithSharedParams.init  sT   6 K\111%'1X.NFM  |555!4#3//$  $/??3E#F  ${2 34$113C3W3WXY"5%0%'"1X
"
+NFMA  "2 	J !8!88'__.
77GHIIr-   c                     | j                   gS r   )r   r   s    r.   get_ignored_modulesz/TransformerWithSharedParams.get_ignored_modulesd  s      !!r-   )NFT)r'   r(   r)   rS   rT   r0   rV   r   r;   r   r@   rD   rR   r&   r   r   rU   r	   r   rW   rX   rv   rM   r  __classcell__r   s   @r.   r   r      s    (  ( %( 	(
 (T#
 
 15#KJ  KJ$KJ %KJ d38n-	KJ
 KJ KJ 
ryy$	KJ KJZ"r-   r   c                        e Zd Zdej                  dededef fdZd Zd Z	d Z
d	 Ze	 	 ddej                  d
ededeeeef      dedej&                  fd       Z xZS )NestedWrappedModulerG   r   rJ   rF   c                    t         |           j                         | _        j                         | _        |t
        j                  k(  }fd}|rt        j                  d       t        j                  t        t        j                  dd      |       |t        j                   |t        t        j                  dd      |            t        t        j                  dd      |                   |t        t        j                  dd      |            t        t        j                  dd      |            | _        y )Nc                 &    rt        | fi S | S r   r   layerrE   rG   r   s    r.   _maybe_wrapz1NestedWrappedModule.__init__.<locals>._maybe_wrapv      E58K88Lr-   r   r   r   r   )r   r   r   r   rl   r0   r1   rP   r   rW   
Sequentialr   r   module	r9   rG   r   rJ   rF   rE   r   r  r   s	    ``  `  r.   r   zNestedWrappedModule.__init__i  s     	JJL	**,%)A)AA	
 a mm		!Q6BIIa,<l KL		"b 1<@ BIIb!$4lCD		!Q6

r-   c                 x    t        j                  d| j                  z          t        j                  dd|      fS )Nr^   r   r   r   )rP   r   r   randr8   s     r.   r;   zNestedWrappedModule.get_input  s.    !dii-(

1a/11r-   c                 $    | j                  |      S r   r  r9   r   s     r.   r   zNestedWrappedModule.forward      {{1~r-   c                 &    |j                         }|S r   )r   r9   r>   r?   rC   s       r.   r@   zNestedWrappedModule.get_loss  s    zz|r-   c                 $    |j                          y r   r   rB   s     r.   rD   z NestedWrappedModule.run_backward  r   r-   rH   rE   r6   c                     |i }|t         j                  k(  rt        | d||      S |t         j                  k(  r5t        | fd||d|}|t        j
                  k(  r|j                         }|S t        d|       )a  
        Initializes a :class:`NestedWrappedModule` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps some nested
                modules with FSDP but not the top-level module. The model may
                later be wrapped with a top-level FSDP external to this method
                if desired.
            cuda_init_mode (CUDAInitMode): Determines model movement to CUDA.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
        Fr   rJ   rF   Tr   )r&   r*   r  r+   r0   r2   r   r   )rG   rH   rJ   rE   rF   r   s         r.   rM   zNestedWrappedModule.init  s    . K\111&-+	  |555,-+	
 J !8!88'__.
77GHIIr-   NF)r'   r(   r)   rS   rT   rV   r0   r   r;   r   r@   rD   rR   r&   r   r   rU   r	   rW   rX   rM   r  r  s   @r.   r  r  h  s    
  
 
 %	

 
@2 
 15#+J  +J$+J %+J d38n-	+J
 +J 
+J +Jr-   r  c                   h     e Zd Ze	 	 ddej
                  dededee	e
ef      def
 fd       Z xZS )AlwaysWrapNestedWrappedModulerG   rH   rJ   rE   rF   c                 4   t        t        t              }|j                  | t        j                  |||      }|t        j                  k(  r|S |t        j
                  k(  r7t        |fdt        i|}|t        j                  k(  r|j                         }|S y)z
        Initializes a :class:`NestedWrappedModule` instance, but unlike
        :meth:`NestedWrappedModule.init`, for the ``RECURSIVE`` init mode, this
        wraps with top-level FSDP and the ``always_wrap_policy()`` auto wrap
        policy.
        )rG   rH   rJ   rE   rF   r   N)r   r  rM   r&   r*   r+   rv   r   r0   r2   r   )	rG   rH   rJ   rE   rF   super_rY   r   r   s	           r.   rM   z"AlwaysWrapNestedWrappedModule.init  s     46ST'//)#'  
 \111L|555eX6HXKXJ!8!88'__.
	 6r-   r  )r'   r(   r)   rR   rS   rT   r&   r0   r   r   rU   r	   rV   rM   r  r  s   @r.   r  r    s^    
 15#  $ % d38n-	
  r-   r  c                        e Zd Zdej                  dededef fdZed
d       Z	e	 	 ddej                  de
dedeeeef      def
d	       Z xZS )NonUniformReqGradNWMrG   r   rJ   rF   c                    t         t        |           j                         | _        j	                         | _        |t        j                  k(  }fd}|rt        j                  d       t        j                  t        t        j                  dd      |       |t        j                   |t        t        j                  dd      |            t        t        j                  dd      |                   |t        j                  t        t        j                  dd      |      t        t        j                  dd      |                        | _        y )Nc                 &    rt        | fi S | S r   r   r	  s    r.   r  z2NonUniformReqGradNWM.__init__.<locals>._maybe_wrap  r  r-   r   r   r   r   )r   r  r   r   r   rl   r0   r1   rP   r   rW   r  r   r   r  r  s	    ``  `  r.   r   zNonUniformReqGradNWM.__init__  s     	!413 JJL	**,%)A)AA	
 a mm		!Q6BIIa,<l KL		"b 1<@ 		"a 0,?		!Q>
r-   c                     | j                         D ]-  \  }}t        j                  ||      r|j                  d       / y r  )r_   rematchrequires_grad_)rY   req_grad_masknps       r.   _set_nonuniform_req_gradz-NonUniformReqGradNWM._set_nonuniform_req_grad  s5    **,DAq88M1-  ' -r-   rH   rE   c                    t        j                  d      }|t        j                  k(  r't	        | d||      }t        j                  ||       |S |t        j                  k(  rO|i }t	        | fd||d|}|t        j                  k(  r|j                         }t        j                  ||       |S t        d|       )a  
        Initializes a :class:`NestedWrappedModule` instance, but unlike
        :meth:`NestedWrappedModule.init`, it wraps a second :class:`torch.nn.Sequential`
        container to enable the desired non-uniform ``requires_grad``
        ``use_orig_params=True`` tests. For both ``RECURSIVE`` and ``NO_FSDP``
        init modes, freezes all parameters except the last two to validate
        ``ShardedGradScaler`` support for ranks with no (non-zero sized) local shards in
        FSDP ``use_orig_params=True`` mode.
        zmodule\.2.*\.1.*Fr  Tr   )r%  compiler&   r*   r!  r+  r+   r0   r2   r   r   )rG   rH   rJ   rE   rF   req_grad_pattern	ddp_modelr   s           r.   rM   zNonUniformReqGradNWM.init  s    ( ::&9:\111,-+	I !99)EUV|555" --+	
 J !8!88'__.
 99*FVW77GHIIr-   rN   r  )r'   r(   r)   rS   rT   rV   r0   r   rR   r+  r&   r   r   rU   r	   rM   r  r  s   @r.   r!  r!    s    (
  (
 (
 %	(

 (
T ( (
 
 15#+J  +J$+J %+J d38n-	+J
 +J +Jr-   r!  c                        e Zd ZdZdej
                  dedef fdZd Zd Z	d Z
d	 Zed
ee   dedededef
d       Z xZS )ModuleWithDelayzThis class wraps a :class:`FSDPTestModel` to optionally add a delay
    after computing the loss and/or before the gradient reduction.r  delay_after_loss_msdelay_before_reduction_msc                 L    t         |           || _        || _        || _        y r   )r   r   r2  r3  r  )r9   r  r2  r3  r   s       r.   r   zModuleWithDelay.__init__J  s'     	#6 )B&r-   c                 8    | j                   j                  |      S r   )r  r;   r8   s     r.   r;   zModuleWithDelay.get_inputU  s    {{$$V,,r-   c                 $    | j                  |      S r   r  r  s     r.   r   zModuleWithDelay.forwardX  r  r-   c                     | j                   j                  ||      }| j                  dkD  r=t        j                  j                  t        | j                  t               z               |S Nr   )r  r@   r2  rP   r   _sleepr   r$   r  s       r.   r@   zModuleWithDelay.get_loss[  sQ    {{##E62##a'JJc$":":=N=P"PQRr-   c                      t         j                  j                   fd}t        j                  d|      5   j
                  j                  |       d d d        y # 1 sw Y   y xY w)Nc                      j                   dkD  r=t        j                  j                  t	        j                   t               z                | i |S r8  )r3  rP   r   r9  r   r$   )r   r   orig_reduce_scatterr9   s     r.   _delayed_reduce_scatterz=ModuleWithDelay.run_backward.<locals>._delayed_reduce_scatterd  sL    --1

!!669J9LLM '777r-   z'torch.distributed.reduce_scatter_tensor)rP   distributedreduce_scatter_tensorr   patchr  rD   )r9   rC   r=  r<  s   `  @r.   rD   zModuleWithDelay.run_backwarda  sR    #//EE	8 ZZ57N
 KK$$T*
 
 
s   AA'module_class
model_argsmodel_kwargsc                <    t         | j                  |i |||      S )aA  
        Args:
            module_class (Type[FSDPTestModel]): Wrapped module class to which
                to add delays.
            model_args: Positional arguments forwarded to the ``module_class``
                ``init()``.
            delay_after_loss_ms (int): Delay after computing the loss/before
                the optimizer step (in ms).
            delay_before_reduction_ms (int): Delay before reduce-scattering
                gradients (in ms).
            model_kwargs: Keyword arguments forwarded to the ``module_class``
                ``init()``.
        )r1  rM   )rA  r2  r3  rB  rC  s        r.   rM   zModuleWithDelay.initp  s,    * Lz:\:%
 	
r-   )r'   r(   r)   rO   rW   rX   r   r   r;   r   r@   rD   rR   r   r5   r	   rM   r  r  s   @r.   r1  r1  F  s    F				 !	 $'		-+ 
=)

 !
 $'	

 
 
r-   r1  c                        e Zd Zeej
                  ddddfdej                  dedede	e
eef      ded	ed
ef fd       Z xZS )NestedWrappedModuleWithDelayNFr   rG   rH   rJ   rE   rF   r2  r3  c           
      J    t         t        t          t        | ||||||      S )N)rG   rH   rJ   rE   rF   r2  r3  )r   rF  rM   r  )rG   rH   rJ   rE   rF   r2  r3  r   s          r.   rM   z!NestedWrappedModuleWithDelay.init  s9     13OU))#' 3&? V 	
 		
r-   )r'   r(   r)   rR   r0   r2   rS   rT   r&   r   r   rU   r	   rV   r   rM   r  r  s   @r.   rF  rF    s     (4'>'>04##$)*
  
$
 %
 d38n-	

 
 !
 $'
 
r-   rF  c                   $     e Zd Z fdZd Z xZS )DummyDDPc                 0    t         |           || _        y r   )r   r   r  )r9   r  r   s     r.   r   zDummyDDP.__init__  s    r-   c                 &     | j                   |i |S r   r  r9   r   r   s      r.   r   zDummyDDP.forward  s    t{{D+F++r-   r'   r(   r)   r   r   r  r  s   @r.   rI  rI    s    ,r-   rI  c                        e Zd Zdej                  dedededef
 fdZd Z	d Z
e	 	 	 ddej                  d	eded
eeeef      dedefd       Z xZS )MixtureOfExpertsrG   r   rJ   delay_before_free_msrF   c                    t         |   ||||       || _        || _        || _        |t
        j                  k(  | _        |r"t        j                  d| j                  z          d}d}d}	t        t        j                  ||      | j                        }
t        |
j                         D cg c]  }|j!                          c}      | _        |
j                         D ]	  }d|_         |rt        j                  d       t        t        j                  ||      | j                        }|rHt        j&                  j)                  |j                         g      }t+        |
|fi |}
t+        ||fi |}t        j,                  t        t        j                  |	|      | j                        ||
t        t        j                  ||	      | j                              | _        y c c}w )N)rG   r   rJ   rF   *   r   r   r   Tr   )r   r   rG   rP  r   r0   r1   r   rP   r   r   r   rW   r   r   rx   numelnum_expert_paramsexpertr>  	new_grouprv   r  r  )r9   rG   r   rJ   rP  rF   rE   d_expertd_sharedd_inputrU  r*  sharedexpert_groupr   s                 r.   r   zMixtureOfExperts.__init__  s    	)'	 	 	
 
$8!"*l.F.FFb499n-RYYx:D<M<MN!$9J9J9L%M9LAaggi9L%M!N""$AAH % a RYYx:D<M<MN ,,66L &,>+>F&%7;7Fmm		'84d6G6GH		(G4d6G6GH	
% &Ns   -G%c                 f     j                   dkD  r j                  d   }t        |t              ret        j
                  j                  j                  j                   fd}t        j                  d|      5   j                  |      cd d d        S  j                  |      S # 1 sw Y   xY w)Nr   r   c                      t         j                  j                  t        j                  t               z                | i |S r   )rP   r   r9  r   rP  r$   )r   r   orig_reshardr9   s     r.   _delayed_reshardz2MixtureOfExperts.forward.<locals>._delayed_reshard  s>    JJ%%D558I8KKL (888r-   z.torch.distributed.fsdp._runtime_utils._reshard)rP  r  r   rv   rP   r>  fsdp_runtime_utils_reshardr   r@  )r9   r   rU  r_  r^  s   `   @r.   r   zMixtureOfExperts.forward  s    $$q([[^F&$'$0055DDMM9 ZZDFV  ;;q> 
 {{1~ s   ;B''B0c                    |j                          | j                  st        j                         5  | j	                         D ]i  }t        |d      r|j                  j                  | j                         t        j                  j                  |j                  | j                         k 	 d d d        y y # 1 sw Y   y xY w)NrU  r]   )r   r   rP   ry   rx   hasattrgraddiv_rl   r>  
all_reducerG   )r9   rC   r*  s      r.   rD   zMixtureOfExperts.run_backward  s    ~~*Aq(+ FFKK0%%00tzz0J	 + !  s   A=B99CrH   rE   c                     |i }|t         j                  k(  rt        | d|||      S |t         j                  k(  r6t        | fd|||d|}|t        j
                  k(  r|j                         }|S t        d|       )a  
        Initializes a :class:`MixtureOfExperts` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps some nested
                modules with FSDP, including the expert and shared layers, but
                not the top-level module. The model may later be wrapped with a
                top-level FSDP external to this method if desired.
            cuda_init_mode (CUDAInitMode): Determines model movement to CUDA.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
            delay_before_free_ms (int): Delay before resharding expert
                parameters in the forward pass (in ms).
        F)r   rJ   rP  rF   Tr   )r&   r*   rO  r+   r0   r2   r   r   )rG   rH   rJ   rE   rF   rP  r   s          r.   rM   zMixtureOfExperts.init   s    4 K\111#-%9+  |555)-%9+ J !8!88'__.
77GHIIr-   )NFr   )r'   r(   r)   rS   rT   rV   r0   r   r   r   rD   rR   r&   r   r   rU   r	   rM   r  r  s   @r.   rO  rO    s    2
  2
 2
 %	2

 "2
 2
h(	K 
 15#$%0J  0J$0J %0J d38n-	0J
 0J "0J 0Jr-   rO  subtest_configtest_fntest_kwargsc                 t   t        |j                               }|D cg c]  }|d   	 }}|D cg c]  }|d   	 }}t        j                  | D ]P  }	t	        t        ||	            }
 | j                  di |
5   ||i ||
 ddd       t        j                          R yc c}w c c}w # 1 sw Y   *xY w)a\  
    Runs a test function given by ``test_fn`` as a subtest according to the
    configurations specified by ``subtest_config``. This amortizes the
    costly setup overhead (including process spawn and initializing the
    process group) over the subtests.

    Args:
        subtest_config (Dict[str, List[Any]]): A mapping from subtest
            keyword argument name to a list of its possible values.
        test_fn (Callable): A callable that runs the actual test.
        test_args: Positional arguments to pass to ``test_fn``.
        test_kwargs: Keyword arguments to pass to ``test_fn``.
    r   r^   Nr,   )	r   r   	itertoolsproductdictrf   subTestrS   barrier)cls_instri  rj  	test_argsrk  subtest_config_itemsitemsubtest_config_keyssubtest_config_valuesvaluessubtest_kwargss              r.   run_subtestsrz  4  s    * 9=^=Q=Q=S8T:N%O:N$d1g:N%OBV-WBV$d1gBV-W##%:;c"5v>?X//Y@+@@ 0 < &P-W 0/s   B$B):B..B7	c                   4     e Zd Zed        Z fdZd Z xZS )FSDPTestMultiThreadc                 ~    t         j                  j                         rt         j                  j                         S dS )Nr   )rP   r   is_availabledevice_countr   s    r.   rl   zFSDPTestMultiThread.world_sizeU  s)    ,1JJ,C,C,Euzz&&(L1Lr-   c                 B    t         |           | j                          y r   )r   setUp_spawn_threadsr9   r   s    r.   r  zFSDPTestMultiThread.setUpY  s    r-   c                      t        | g|i |S r   rz  rL  s      r.   rz  z FSDPTestMultiThread.run_subtests]      D242622r-   )r'   r(   r)   propertyrl   r  rz  r  r  s   @r.   r|  r|  T  s!    M M3r-   r|  c            $           e Zd Z fdZed        Zed        Zed        Zd Zd Z	d Z
d Zed	        Z	 	 	 	 	 	 	 d%dej                  dedededee   dedee   dededeeeef      fdZd
dd e       d
d
d
ddddd
d
fdee   dededee   dedededee   d ee    dee   d!ed"ededed#eeeef      deeeef      f d$Z! xZ"S )&FSDPTestc                 h    t         |           dt        j                  d<   | j	                          y )N0TORCH_NCCL_DESYNC_DEBUG)r   r  osenviron_spawn_processesr  s    r.   r  zFSDPTest.setUpb  s)     14

,-r-   c                     t         j                  j                         r(t        t         j                  j	                         d      S dS )Nr   r   )rP   r   r~  minr  r   s    r.   rl   zFSDPTest.world_sizej  s1    49JJ4K4K4Ms5::**,a0TSTTr-   c                 >    t         j                  j                         S r   )rS   distributed_c10d_get_default_groupr   s    r.   rZ   zFSDPTest.process_groupn  s    $$7799r-   c                 *    t          | j                   S r   )r#   	file_namer   s    r.   init_methodzFSDPTest.init_methodr  s    t~~.//r-   c                 <    | j                  ||j                         y r   )assertEqualr   )r9   r   r   s      r.   _check_cpu_offloadzFSDPTest._check_cpu_offloadv  s    j&<&<=r-   c                 <    | j                  ||j                         y r   )r  backward_prefetch)r9   r   r  s      r.   _check_backward_prefetchz!FSDPTest._check_backward_prefetchy  s    *J,H,HIr-   c                 <    | j                  ||j                         y r   )r  forward_prefetch)r9   r   r  s      r.   _check_forward_prefetchz FSDPTest._check_forward_prefetch|  s    ):+F+FGr-   c                      t        | g|i |S r   r  rL  s      r.   rz  zFSDPTest.run_subtests  r  r-   c                 v    | |      }||_         ||_        t        d|j                    d|j                          t        j
                  j                         rdnd}	 t        j                  |j                  |t        |j                        |j                          t        j
                  j                         rft        j
                  j#                         rHt        j
                  j%                  |j                   t        j
                  j#                         z         t        j&                          |j)                  ||       t        j&                          t        j*                          y # t        $ r=}d|j                  d   v r&t        j                  t        d   j                           d }~ww xY w)	Nzdist init r=z, world=ncclgloo)r  backendrl   r   	recompiler   backend_unavailable)r   r  printrl   rP   r   r~  rS   init_process_groupr  r   RuntimeErrorr   sysexitr"   	exit_coder  
set_devicerq  run_testdestroy_process_group)clsr   	test_namer  piper9   r  es           r.   _runzFSDPTest._run  s5   9~	"TYYKx/@AB "JJ335&6	## ,,t/YY	 ::""$)@)@)BJJ!!$))ejj.E.E.G"GH
 	i&""$%  	affQi'$9:DDE		s   A E2 2	F8;8F33F8NFrY   	num_stepsautocastlrfsdp_cpu_offload
save_modelmixed_precisionenable_sharded_grad_scaleruse_pure_fp16sharded_grad_scaler_kwargsc           	         |xr |j                   }t        |j                               j                  }|
i }
t	        d	d|i|
}t
        j                  j                  |j                         |d      }t        |      D ]  }|j                          t
        j                  j                  j                  |      5  |j                  j                  t        j                  d            }|	s|rMt        |t               s=t        |t
        j"                        r|j%                         }nt'        d |D              } || }|rft        |t               rV|j(                  t*        vrD|j                         D ]1  }| j-                  |j                  t        j                  d             3 |j                  j/                  ||      j1                  |      }d d d        |j3                        }|s&|	s$|j4                  t
        j6                  k(  sJ d       |	r+| j-                  |j4                  t
        j8                         nat        |t               r'| j-                  |j4                  |j:                         n*| j-                  |j4                  t
        j6                         |j                  j=                  |       |rTt        |t               rD|j                         D ]1  }| j-                  |j                  t        j                  d             3 |j?                  |       |jA                          |s|jC                         jE                         D ci c]  \  }}||jG                          }}}tI        |       |jK                  |        t        |t               r|jM                  tN        jP                         jS                         S # 1 sw Y   xY wc c}}w )
Nenabledg?)r  momentum)r  r   c              3   <   K   | ]  }|j                           y wr   )r   ).0r   s     r.   	<genexpr>z4FSDPTest._train_for_several_steps.<locals>.<genexpr>  s     %>1affhs   ra   zeloss data type should be float32, as the original                     parameter data type is float32.r,   )*offload_paramsnextrx   r:   r   rP   optimSGDrd   	zero_gradr   ampr  r  r;   r   rv   rQ   r   r   r   r   r  r@   toscaler   float32float16param_dtyperD   stepupdater   r   cloner~   load_state_dict_assert_stater   IDLEr`   )r9   rY   r  r  r  r  r  r  r  r  r  cpu_offload_paramsmodel_devicesharded_grad_scalerr  rm   r>   r?   r*  rC   kvr   s                          r.   _train_for_several_stepsz!FSDPTest._train_for_several_steps  s    .Q2B2Q2QE,,./66%-)+&/ 
.
2L

  0 0 2rCHy!AOO(((:..u||F/CD _Zt=T!%6 %

 %%>%> > '"5$/ //>? #--/((5<<3FG 0 ||,,UF;>>|L- ;. ',,T2D"=JJ%--/555/ !$$TZZ?t,$$TZZ1L1LM$$TZZ?LL%%d+!j&=))+A$$QXXu||E/BC ,  $$U+&&(7<7G7G7I7O7O7QR7Qtq!al7Q
R E"%%j1q "t eT" 2 23{{}u ;:d Ss   >DOO%O"	r   Tmodel_classrH   rJ   ref_init_fn	num_itersr   r  r   r  use_orig_paramsrK   c                    |t         j                  k7  sJ d       |i }d}| j                  j                         } |j                  | j                  t         j                  t
        j                  fddi|}|t        ||g|      }n ||      }|r|j                         }| j                  |||
du|||
|||	      }t        |j                               }|j                  |||	|
||d       	  |j                  | j                  |||fddi|}t!        |t"              st#        || j                  fi |}|r|j                         }|t
        j$                  k(  r|j'                         }|duxr |j(                  }|xr |t
        j$                  k(  }|xr |t
        j$                  k7  }|rFt+        j,                  d      }|j                         D ]  }| j/                  |j,                  |         |r| j1                  t2        d      n	t5               }|5  | j                  ||d||||
|||
      } ddd       |ry|rA|j                         D ]  }| j/                  |j,                            j'                         } t7        |      }!t*        j8                  j;                  | d       |
|s| j/                  ||!dd       yyy# t        $ r }t        d	| d
t        |             |d}~ww xY w# 1 sw Y   xY w)a  
        Tests FSDP training against a reference, which defaults to DDP but
        may be customized with ``ref_init_fn``.

        Args:
            model_class (Type[FSDPTestModel]): A model class that inherits from
                ``FSDPTestModel``, which defines the expected interface.
            fsdp_init_mode (FSDPInitMode): The mode to initialize the
                FSDP-wrapped model. This should not be ``NO_FSDP``.
            ref_init_fn (Optional[Callable]): A callable to invoke that wraps a
                non-wrapped model to construct the reference model, where this
                wrapper should provide data parallel semantics. If ``None``,
                then the callable defaults to the DDP constructor.
        z.Expects an FSDP init mode that wraps with FSDPN{Gz?rF   T)
device_idsoutput_device)r  r  r  r  r  r  r  )r   r  r   r  r  r  zInitializing z raised error ra   zSAn FSDP-managed module with parameter CPU offloading enabled has parameters on cudaF)r  r  r  r  r  r  r  r  )check_dtypezFSDP did not match DDP)exact_devicemsg)r&   r*   rZ   r   rM   r0   r1   DDPr   r  r   rx   r  	Exceptionr   rU   r   rv   r2   r   r  rP   r:   r  assertRaisesRegexr  r   r   testingassert_close)"r9   r  rH   rJ   r  r  r  r   r  r   r  r  r  r  r  rK   r  rE   r  r   rY   	ref_modelref_loss
ddp_paramsr   r  r  expects_device_errorexpects_cpu_device
cpu_devicerh   context	fsdp_lossfsdp_unsharded_paramss"                                     r.   _test_fsdp_parityzFSDPTest._test_fsdp_parity  sq   F l222	<;	<2K!!&&(     $$
 	

 
 EtfDII#E*I!(I00$D0(+'A''A 1 

 )..01
*%6%6#2$4#2		

	Y)))""	
 # J *d+ j$*<*<LLJ#*J\444#*J$D0O[5O5O
 H~1H1HH 	 H~1H1HH 	 e,J#..0  z: 1 $ "")  	 55!,% /+E++E 6 I    #..0  z: 1!(I /
 ; 	""8YE"J "=%!,	   ,9"I  	Y}[MAxPQWXX	YF Ws$   2"K K;	K8K33K8;L)r  NFNFFN)#r'   r(   r)   r  r  rl   rZ   r  r  r  r  rz  classmethodr  rW   rX   r   rV   floatr   r   r   r   rU   r	   r  r   r5   r&   r0   r
   r   r   r  r  r  s   @r.   r  r  a  s*     U U : : 0 0>JH3 $% $%V 15 48+0#?CTyyT T 	T
 T #:.T T ".1T %)T T %-T#s(^$<Tv +/",,8<8<48!& %+0#04?C#a-(a %a %	a
 h'a a a  a $$45a $$45a ".1a a a %)a a  d38n-!a" %-T#s(^$<#ar-   r  c                   $     e Zd Z fdZd Z xZS )
SkipModulec                 \    t         |           t        j                  ddd      | _        y N
   Fbias)r   r   rW   r   linr  s    r.   r   zSkipModule.__init__  s"    99R%0r-   c                 $    | j                  |      S r   )r  r  s     r.   r   zSkipModule.forward  s    xx{r-   rM  r  s   @r.   r  r    s    1r-   r  c                   $     e Zd Z fdZd Z xZS )NestedLinearc                     t         |           |r5t        t        j                  ddd      j                               | _        y t        j                  ddd      j                         | _        y r  )r   r   r   rW   r   r   nested_linear)r9   	fsdp_wrapr   s     r.   r   zNestedLinear.__init__  sR    !%biiBU&C&H&H&J!KD!#2r!>!C!C!EDr-   c                 $    | j                  |      S r   )r  r  s     r.   r   zNestedLinear.forward  s    !!!$$r-   rM  r  s   @r.   r  r    s    F%r-   r  c                   $     e Zd Z fdZd Z xZS )	SkipModelc                     t         |           t        j                  ddd      j	                         | _        t               j	                         | _        t        t        |            | _
        y )Nr  Fr  )r  )r   r   rW   r   r   linearr  linear_skipr   r  r  )r9   double_nestr   s     r.   r   zSkipModel.__init__  sP    iiBU388:%<,,.!,"EFr-   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r
  r  r  r  s     r.   r   zSkipModel.forward  s4    KKNQq!r-   rM  r  s   @r.   r  r    s    Gr-   r  )FT)FF)T)[rm  r  r%  r  abcr   r   
contextlibr   copyr   enumr   r   typingr	   r
   r   r   r   r   r   r   unittestr   rP   torch.distributedr>  rS   torch.nnrW   torch.distributed.fsdpr   r   rv   $torch.distributed.fsdp._common_utilsr   "torch.distributed.fsdp._init_utilsr   2torch.distributed.fsdp.fully_sharded_data_parallelr   r   r   *torch.distributed.fsdp.sharded_grad_scalerr   torch.distributed.fsdp.wrapr   r   r   r   r   torch.nn.parallel.distributedr   r  *torch.testing._internal.common_distributedr    r!   r"   $torch.testing._internal.common_utilsr#   r$   r&   r0   rX   r5   rT   rs   rV   r~   r   r   r   r   r   r   r   r   r  r  r!  r1  rF  rI  rO  rU   rz  r|  r  r  r  r  r,   r-   r.   <module>r     s8    	 	 
 # "   J J J      O > R 
 I R R E H 
 P4 4 BIIs D99$$ > #99##""2299 2t 23ryy 3 3DBII D$ D .Q"- Q"h[J- [J|$7 @]J. ]J@C
m C
L
? 
.,ryy ,EJ* EJPd3i( 
 @
3/ 
3# D
 	%299 	%		 r-   