
    Phc                        d dl Z d dlZd dlZd dlZd dlmZmZmZmZm	Z	m
Z
mZmZ d dlZd dlmZ d dlmc mc mc mZ d dlmZ d dlmc mZ d dlmZmZmZ d dlmZ d dl m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* d dl+m,Z, d dl-m.Z.m/Z/m0Z0m1Z1 d d	l2m3Z3m4Z4m5Z5 d d
l6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z<m=Z= ddl>m?Z?m@Z@  ej                  eB      ZCdeDdeDfdZEdej                  de#de
eeDeDeDf      fdZGdej                  de
eeDeDeDf      fdZHe	 	 	 	 d:dej                  de#deIdeIdeIdeIddfd       ZJedej                  de#ddfd       ZKdej                  de#ddfdZLdej                  de#deIdeIddf
dZMedej                  de#deeDef   deDdedeeDef   fd        ZNede#dej                  ddfd!       ZOedej                  de#deeDef   deDdeeDef   f
d"       ZPdej                  de#deeDef   deDddf
d#ZQdej                  de#ddfd$ZRde#dej                  ddfd%ZSedej                  de#deeDef   deDdeeDef   f
d&       ZTdej                  de#ddfd'ZUdej                  de#deeDef   deDddf
d(ZVde#dej                  ddfd)ZWedej                  de#deeDef   deDdeeDef   f
d*       ZXedej                  de#ddfd+       ZYedej                  de#deeDef   deDddf
d,       ZZe j                  de#de	fd-       Z\e ej                         dej                  deeDef   deDd.edeeDef   f
d/              Z^e ej                         dej                  ddfd0              Z_ede#ddfd1       Z`e ej                         dej                  deeDef   deDd.eddf
d2              Zae ej                         dej                  d.eddfd3              Zbd4e#fd5Zced4e#d6eDd7ed8eeDef   ddf
d9       Zdy);    N)AnyCallablecastDict	GeneratorIteratorno_type_checkTuple)init_from_local_shardsShardShardedTensor)DTensor)_mesh_resources)
_FSDPState._get_module_fsdp_state_if_fully_sharded_module_has_fsdp_params_is_composable_module_handleclean_tensor_nameFSDP_PREFIXFSDP_WRAPPED_MODULE)SimpleProfiler)!_cast_buffers_to_dtype_and_device_get_orig_buffer_dtypes
_lazy_init%_reset_flat_param_grad_info_if_needed)FullStateDictConfigShardingStrategyStateDictType)_replace_by_prefix   )_ext_all_gather_dtensor_ext_chunk_dtensor_ext_chunk_tensor_ext_post_unflatten_transform"_ext_pre_load_state_dict_transform)_unshard_fsdp_state_params
FLAT_PARAMmodule_namereturnc                     | j                  t         d      } | j                  t         d      } | r|  d} | j                  t        j                  d      } | S )N .)replacer   r   checkpoint_wrapper_CHECKPOINT_PREFIX)r)   s    sC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/distributed/fsdp/_state_dict_utils.py_convert_to_wrapped_module_namer2   ;   sZ    %%;K%%)<(=CK$Q'%%&8&K&KRPK    module
fsdp_statec              #      K   t        ||       sy t        ||       j                         D ]  \  }}t        |      }| | }|||f  y wN)r   r   param_module_namesr2   r4   r5   
param_namer)   fqns        r1   _param_name_infosr<   E   sc      J/#1F$$
K 6kBj\*:{**$s   A	Ac              #      K   t        ||       j                         D ]  \  }}t        |      }| | }|||f  y wr7   )r   shared_param_module_namesr2   r9   s        r1   _shared_param_name_infosr?   R   sV      $2F$!$"
K 6kBj\*:{**$"s   <>	writeback
rank0_onlyoffload_to_cpu
with_gradsc                     | |j                   vsJ d       t        | |||||      |j                   | <   |j                   |    j                          y)z
    state_dict hooks cannot use the pure context call as the checkpoint flow
    requires to enter the context in the pre-hook but leave the context in the
    post-hook. This API enters the context of ``_unshard_fsdp_state_params``.
    z`Entering the ``_unshard_fsdp_state_params`` context but _unshard_params_ctx[module] is not None.)r@   rA   rB   rC   N)_unshard_params_ctxr'   	__enter__)r4   r5   r@   rA   rB   rC   s         r1   _enter_unshard_params_ctxrG   ]   sg     777 	7 .H%.J""6* ""6*446r3   c                 z    |j                   |    j                  ddd       |j                   j                  |        y)zAA helper function to exit ``_unshard_fsdp_state_params`` context.N)rE   __exit__popr4   r5   s     r1   _exit_unshard_params_ctxrL   z   s5     ""6*33D$E""&&v.r3   c                     |j                   j                         r|j                   j                          t        ||        |j                  rt        |j                         yy)zAPerforms the pre-state_dict tasks shared by all state_dict types.N)_device_handleis_availablesynchronizer   _is_rootr   _all_handlesrK   s     r1   _common_pre_state_dict_hookrS      sN    
   --/!!--/z6"-j.E.EF r3   c                 v    t        |      r|j                  t        j                  k(  ryt	        | |d||       y)z
    Performs the pre-state_dict tasks shared by all state_dict types that require
    ``_unshard_fsdp_state_params()``. FULL_STATE_DICT and SHARDED_STATE_DICT use this hook.
    NF)r@   rB   rA   )r   sharding_strategyr   NO_SHARDrG   )r4   r5   rB   rA   s       r1   #_common_unshard_pre_state_dict_hookrW      s;     	z"((,<,E,EE%r3   
state_dictprefix
param_hookc                    t        ||t         z   |       |rt        ||       s6t        |      r|j                  t
        j                  k(  st        | |       |S |j                  t        j                  k(  xr$ t        t        |j                        j                  }|xr |j                  dk7  }|r}|j                   sq|j"                  D ];  }|j%                  t&        j(                   dd      }|j+                  | | d       = |j+                  | t,                t        | |       |S t/        | |      D ]`  \  }}	}
| | }|r|j+                  |       !||v s2J d| d|j1                          d| d|
 d	|	 d
|j                   d        ||||       b t        |      r|j                  t
        j                  k(  st        | |       t3        j4                  d      }g }g }|j"                  D ]  }t7        |      }| | }||vr|r|j+                  |       -||   }|j                  j8                  r#|j4                  |k7  r|j;                  |      ||<   ||j<                  vsz|j?                  |       |j?                  ||           |rt        |      s|jA                         n|jB                  jD                  du}|rqtG        ||      }tI        |||jJ                         tM        ||      D ]?  \  }}| | }tN        jQ                  d||jR                         |jU                         ||<   A |S )z
    The post-state_dict flow that shared by all state_dict types that require
    ``_unshard_fsdp_state_params()``. FULL_STATE_DICT and SHARDED_STATE_DICT use this
    hook.
    r   r-   r,   NzFSDP assumes z2 is in the state_dict but the state_dict only has z	. prefix=z, module_name=z, param_name=z rank=cpuz%FSDP is casting the dtype of %s to %s)+r    r   r   r   rU   r   rV   rL   _state_dict_typer   FULL_STATE_DICTr   r   _state_dict_configrA   rank_use_orig_params_buffer_namesr.   r/   r0   rJ   r(   r<   keystorchdevicer   rB   to_ignored_buffer_namesappend$_mixed_precision_enabled_for_buffersmixed_precisionbuffer_dtyper   r   compute_deviceziploggerinfodtypeclone)r4   r5   rX   rY   rZ   rA   no_fsdp_return	clean_keyr;   r:   r)   
cpu_devicebuffer_clean_fqnsbuffersbuffer#mixed_precision_enabled_for_buffersbuffer_dtypes	clean_fqns                     r1   $_common_unshard_post_state_dict_hookr{      sV    z6{m#<fE-j&A:&,,0@0I0II$VZ8 	##}'D'DD 	P$j&C&CDOO   8JOOq$8Nj99#11I!))%889;RI NNfXi[148 2 	&*./ 4 ):&*(M$ZNN3j  	
C5 !??$% &X^K= 9$VJOO+<A?	
  	:vs+ )N 	z"((,<,E,EE 4e$JG--	%i0	$j NN3_F--<<MMZ/"())J"7
3
 @ @@!((3z#/' .*  "*- ;;=,,99E 	,
 /3J@QRM-
(A(A &)2C%D!	,CS&,,W"(,,.
3 &E r3   c           	      <   t        | dd      r>t        j                  | j                        }|rt	        d| j                   d| dd      t        ||        t        || | j                  j                  t        t        | j                        j                         y)	aU  
    Hook that runs before model.state_dict() is called. pre-state_dict hook is
    not actually supported by ``nn.Module``. As a result, this API is called
    from ``_full_post_state_dict_hook()`` to simulate the case. Once pre-state_dict
    is supported in ``nn.Module``, this hook will be registered as a hook in
    ``nn.Module``.
    _device_meshFzFound FSDP's device_mesh z has a parent device_mesh r-   z_We do not support FULL_STATE_DICT for 2D FSDP + TP. Please use FSDP SHARDED_STATE_DICT instead.rB   rA   N)getattrr   get_parent_meshr}   RuntimeErrorrS   rW   r_   rB   r   r   rA   )r5   r4   argskwargsparent_meshs        r1   _full_pre_state_dict_hookr     s     z>51%55j6M6MN+J,C,C+DD^_j^kklmq 
  
3'!44CC+Z-J-JKVV	r3   c                 p    dt         t        t        f   dt        dt        ddffd}t        | |||      S )a!  
    Hook that runs after model.state_dict() is called before returning result to
    user. For FSDP, we may have to clone the tensors in state_dict as params go
    back to sharded version after _unshard_fsdp_state_params ends, and also remove
    the ``FSDP_WRAPPED_MODULE`` prefix.
    rX   rY   r;   r*   Nc                 r   |}t        |      }|j                  |      r|t        |      d  }t        | |   dd      s0	 | |   j	                         j                         | |<   d| |   _        y y # t        $ r>}t        j                  d| dj                   d| dt        |              Y d }~y d }~ww xY w)N_has_been_clonedFTz#Failed to clone() tensor with name z	 on rank z. This may mean that this state_dict entry could point to invalid memory regions after returning from state_dict() call if this parameter is managed by FSDP. Please check clone implementation of z	. Error: )r   
startswithlenr   rq   detachr   BaseExceptionwarningswarnr`   str)rX   rY   r;   rs   clean_prefixer5   s         r1   rZ   z._full_post_state_dict_hook.<locals>.param_hookD  s    
 	(0 -!#l"3"56I z#(:EB
",S/"7"7"9"@"@"B
337
30 C ! 9#i
GX Y) *-Ys1vh	@ s   .A/ /	B684B11B6r   r   r   r{   r4   r5   rX   rY   rZ   s    `   r1   _full_post_state_dict_hookr   6  sN    cN  
	6 0
J
 r3   c                    t        ||        t        |      r|j                  t        j                  k(  s,t        j                  d      5  t        | |d       d d d        t        |      st        |||t         z          y y # 1 sw Y   +xY w)NrG   Tr@   )
r   r   rU   r   rV   r   profilerG   r    r   )r4   r5   rX   rY   s       r1   _full_pre_load_state_dict_hookr   d  sx     z6"z"((,<,E,EE##$?@%fjDI A *%:vv;-/HI & A@s   
BBc                     t        |      r|j                  t        j                  k(  s+t	        j
                  d      5  t        | |       d d d        y y # 1 sw Y   y xY wNrL   )r   rU   r   rV   r   r   rL   r4   r5   r   r   s       r1   _full_post_load_state_dict_hookr   v  sP     	z"((,<,E,EE##$>?$VZ8 @? F??s   AAc                 v    t        | |      r!t        | |      j                  st        d      t	        ||        y)z
    Hook that runs before model.state_dict() is called. Right now, pre-state_dict
    hook is not supported by the PyTorch core. So this API is called from
    `_local_post_state_dict_hook()` to simulate the case.
    zN``local_state_dict`` can only be used when parameters are flatten and sharded.N)r   r   uses_sharded_strategyr   rS   r5   r4   r   r   s       r1   _local_pre_state_dict_hookr     s<     	V,z62HH
 	
  
3r3   c                 Z   t        || t         |       t        ||       s|S t        ||       sJ d       t        ||       j                  }|j
                  j                         }|j                         |j                  z  }|j                         |j                  z
  }|dkD  r8|d| j                  |      }t        j                  ||g|j                        g}ng }t        |||j                        }	|j                  j                  r|	j!                         }	|	|| t"         <   |S )z
    This hook create a ShardedTensor from the local flat_param and replace
    the state_dict[f"{prefix}{FLAT_PARAM}] with the ShardedTensor. No copy
    will happen. The underlying storage is the same.
    zShould have returned earlyr   N)process_group)r    r   r   r   
flat_param_unpadded_unsharded_sizenumelr`   _shard_numel_paddedviewr   from_tensor_and_offsetsr   r   r_   rB   r\   r(   )
r4   r5   rX   rY   r   
full_numelshard_offsetvalid_data_sizelocal_shardssharded_tensors
             r1   _local_post_state_dict_hookr     s0    zfXk]#;VDJ/ *f-K/KK-
F3>>J 44::<J##%
7L &&(:+I+IIO
   0166G
))*|njooV
 +j
0H0HN $$33'++-*8J&*&'r3   c                      y r7    r   s       r1    _local_post_load_state_dict_hookr     s     	r3   c                    t        ||        t        ||| t                | t         t         }||vrt	        ||       rJ d       y||   }t        |t              sJ d       t        ||       j                  }|J |j                         |j                  z
  }|j                         }|dkD  rt        |      sJ d       |d   j                  }|j                  dkD  rp|j                         |j                         k  s*J d|j                          d|j                          d       t        j                  |d|j                  g      }n|}|||<   y)	z
    This hook finds the local flat_param for this FSDP module from the
    state_dict. The flat_param should be a ShardedTensor. This hook converts
    the ShardedTensor to a tensor. No copy happen unless padding is required.
    zONo `FlatParameter` in `state_dict` for this FSDP instance but it has parametersNz4Tensors in local_state_dict should be ShardedTensor.r   z9load_local_state_dict assume one shard per ShardedTensor.zLocal shard size = z% and the tensor in the state_dict is r-   )r   r    r   r(   r   
isinstancer   r   r   r   r   r   r   tensorFpad)	r4   r5   rX   rY   r;   load_tensorr   r   shardss	            r1   _local_pre_load_state_dict_hookr     s    z6"z6fXk]+CDH[M*
.C
*#J7 	
$	
7 	S/K] >=> 
  
F3>>J!!! &&(:+I+IIO%%'F6{WWW{Qi&& ))A-$$&)9)9);; %j&6&6&8%9 :%%0%6%6%8$9<; %%a1O1O-PQK !JsOr3   c                     t        | |      r!t        | |      j                  st        d      t	        ||        t        || dd       y)zz
    Hook that runs before model.state_dict() is called. Check
    ``_full_pre_load_state_dict_hook`` for the detail.
    zP``sharded_state_dict`` can only be used when parameters are flatten and sharded.Fr~   N)r   r   r   r   rS   rW   r   s       r1   _sharded_pre_state_dict_hookr     sR     	V,z62HH
 	
  
3 (	r3   c                 l    dt         t        t        f   dt        dt        ffd}t        | |||      S )z
    The hook replaces the unflattened, unsharded parameter in the state_dict
    with a unflattened, sharded parameter (a ShardedTensor).
    rX   rY   r;   c                    | |   }j                   j                  sRt        |j                  j                  j
                  j                         j                  j                        }n-t        |j                  j                  j                        }j                   j                  r|j                         }|| |<   y )N)r   r`   
world_sizenum_devices_per_nodepgfsdp_extension)r   r`   device_meshr   )r_   _use_dtensorr$   r`   r   rN   device_countr   _fsdp_extensionr#   r}   rB   r\   )rX   rY   r;   paramr   r5   s        r1   rZ   z1_sharded_post_state_dict_hook.<locals>.param_hook(  s    3,,99.__%00%/%>%>%K%K%M++)99N 0__&33)99	N ((77+//1N(
3r3   r   r   s    `   r1   _sharded_post_state_dict_hookr     s?    )tCH~ )s ) ), 0
J
 r3   c                     t        ||       r+t        j                  d      5  t        | |       d d d        y y # 1 sw Y   y xY wr   )r   r   r   rL   r   s       r1   "_sharded_post_load_state_dict_hookr   C  s<     
F+##$>?$VZ8 @? ,??s	   9Ac                    t        ||        t        |      st        |||t         z          t	        ||       syt        ||       }|j                  st        d      t        t        |j                  j                  |j                  j                              }t        | |      D ]@  \  }}}t        |      s| t         | }n| | }	 |j                  |      }	|j$                  j&                  sBt)        |	|j*                        \  }	}
t-        |
      dk  s!J dt-        |
       d|j.                   d       |	j1                         j3                         }|	j1                         d	   }t5        j6                  ||j8                  z        |z  |z  }t-        |
      d
k(  r|
d	   j:                  j=                         }t?        j@                  t>        jB                  jD                        5  |jG                  |jH                        }ddd       ||j3                         z
  }|d	kD  rEtK        jL                  |d	|g      }n,tO        jP                  ||	jR                  |jH                        }tO        jT                  ||j8                  z  |jR                  |jH                        }t?        j@                  t>        jB                  jV                        5  tY        jZ                  |||j\                         ddd       |j_                  d	d	|      ja                  |	j1                               }|||<   |	jb                  |jd                  jf                  k7  r%|	jG                  |jd                  jf                        }	ti        jj                  |jd                        }tm        |	||j*                        }|jo                  |      ||   }tq        |||j*                        }|||<   C t?        j@                  d      5  ts        | |d       ddd       y# t        $ r t         j#                  d| d       Y w xY w# 1 sw Y   8xY w# 1 sw Y   WxY w# 1 sw Y   yxY w)z
    The hook combines the unflattened, sharded parameters (ShardedTensor) to
    a new FlatParameter and shards the new FlatParameter to the local chunk.
    NzUload_sharded_state_dict can only be called when parameters are flattened and sharded.zDid not find param with FQN zD, skipping it. The weight will not be filled if you expect it to be.   z&Expects 0 or 1 shard per rank but got z shards on rank r-   r   r!   )rp   re   )grouprG   Tr   ):r   r   r    r   r   r   r   r   dictrm   r   _fqns_param_extensionsr<   rJ   KeyErrorrn   warningr_   r   r&   r   r   r`   sizer   mathceilr   r   flattenr   r   TypeH2Drf   rl   r   r   rd   zerosrp   empty	ALLGATHERdistall_gather_into_tensorr   narrowreshapere   r}   device_typer   r   r"   getr%   rG   )r4   r5   rX   rY   handlefqn_to_param_extr;   _fqn_from_global_rootr   r   param_numel
dim_0_size
chunk_sizelocal_tensornum_paddingr   r   exts                      r1   !_sharded_pre_load_state_dict_hookr   L  s    z6"*%:vv;-/HIJ/J/F'')
 	
 F##V%6%6%H%HI 'vz:	Qj)&,Xk]3%#@ &,XcU#3 	NN#78E ,,99>z11ME6 v;? v;-'7
7HK?  **,,,.KaJ		*z'<'<<= 
 6{a%ay//779#++N,?,?,C,CD#/??:3L3L#ML E(<+=+=+???#$55;7G#HL${{ekk*:S:S  [[Z222"((!00F
  ''(;(;(E(EF++L
0H0H G ]]1a5==ejjlKF/5J+,||z66BBB!8!8!D!DE)99*:Q:QRK2{J$>$>L  ##C(4&s+< #z'A'A  0<J+,I ;L 
		 ;	<!&*E 
=	<  	NN./C.D EH H 	6 ED GF, 
=	<s6   P.Q #Q?Q"P=<P= Q
	Q	Q#c              #      K   | j                   }| j                  }t               | _         t        j                  | _        d  || _         || _        y wr7   )r_   r]   r   r   r^   )r5   old_state_dict_configold_state_dict_types      r1   "_replace_with_full_state_dict_typer     sJ     &99$55$7$9J!"/"?"?J	$9J!"5Js   AAr   c           
         t        |       }|j                  t        j                  k(  r!t	        |      }t        j                  d       nt        j                         }|5  t        j                  t        t        j                  t        t        j                  t        i} ||j                      | |||      }ddd       |j"                  rt$        j'                  d|       t)        j+                               D ]  \  }}	|j-                  |      st/        |	t0        j2                        s3|	j4                  }
t/        |	t6              r.d}
|	j9                         }|rD|d   j:                  j4                  }
n*t/        |	t<              r|	j?                         j4                  }
t$        j'                  d|tA        |	      |	j4                  |
|	jB                  |	jD                          S # 1 sw Y   3xY w)z
    _post_state_dict_hook() is called after the state_dict() of this
    FSDP module is executed. ``fsdp_state._state_dict_type`` is used to decide
    what postprocessing will be done.
    RWhen using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.Nz0FSDP finished processing state_dict(), prefix=%sr   z>FQN=%s: type=%s, shape=%s, local_shape=%s, dtype=%s, device=%s)#r   rU   r   rV   r   r   r   
contextlibnullcontextr   r^   r   LOCAL_STATE_DICTr   SHARDED_STATE_DICTr   r]   rQ   rn   ro   sorteditemsr   r   rd   Tensorshaper   r   r   r   to_localtyperp   re   )r4   rX   rY   r   r5   context_post_state_dict_hook_fnprocessed_state_dictkeyr   local_shaper   s               r1   _post_state_dict_hookr    s    @GJ##'7'@'@@4Z@	

 ((*	))+E**,G,,.K$
 
  U7
8S8STJ
F 
 
 FO!"6"<"<">?KC~~f%*VU\\*J$llfm4"&K#002F&,Qi&6&6&<&<0"(//"3"9"9KTLLLLLMM @(  A 
s    AG((G2c                    t        |       }|j                  t        j                  k(  r!t	        |      }t        j                  d       nt        |       t        j                         }|5  t        j                  t        t        j                  t        t        j                  t         i} ||j"                     || g|i | ddd       y# 1 sw Y   yxY w)z
    This is called before the core state dict saving logic of ``module``.
    ``fsdp_state._state_dict_type`` is used to decide what postprocessing will
    be done.
    r   N)r   rU   r   rV   r   r   r   _set_use_dtensorr   r   r   r^   r   r   r   r   r   r]   )r4   r   r   r5   r   _pre_state_dict_hook_fns         r1   _pre_state_dict_hookr    s     @GJ##'7'@'@@4Z@	

 	$((*	))+D**,F,,.J#

 	=
 ; ;<	
 	
 		
 
s   +ACCc                     t        | dd       rg| j                  }|t        j                  k(  rt	        ddd      |t        j
                  k(  rt        j                  d       y d| j                  _	        y y )Nr}   z&Found state_dict_type LOCAL_STATE_DICTz3DeviceMesh is not compatible with LOCAL_STATE_DICT.zKPlease set state_dict_type to SHARDED_STATE_DICT to get DTensor state_dict.zFound both state_dict_type FULL_STATE_DICT and device_mesh. Please set state_dict_type to SHARDED_STATE_DICT to get DTensor state_dict.T)
r   r]   r   r   r   r^   rn   r   r_   r   )r5   state_dict_types     r1   r  r    sv     z>40$55m<<<8E] 
  = ==NN^
 :>J))6 1r3   c                 n   t        |       }|j                  t        j                  k(  r!t	        |      }t        j                  d       nt        |       t        j                         }t        ||        |j                  rt        j                          |5  t        j                  t         t        j"                  t$        t        j&                  t(        i}|j*                  j-                         r|j*                  j/                           ||j0                     | |||       ddd       y# 1 sw Y   yxY w)z
    This is called before ``module._load_from_state_dict()``.
    ``fsdp_state._state_dict_type`` is used to decide what preprocessing will
    be done.
    r   N)r   rU   r   rV   r   r   r   r  r   r   r   rQ   r   resetr   r^   r   r   r   r   r   rN   rO   rP   r]   )r4   rX   rY   r   r5   r   _pre_load_state_dict_hook_fns          r1   _pre_load_state_dict_hookr  0  s     @GJ##'7'@'@@4Z@	

 	$((*z6"	))+I**,K,,.O(
$ $$113%%113A$Z%@%@AJ
F	
 
s   BD++D4c                    t        |       }|j                  t        j                  k(  r!t	        |      }t        j                  d       nt        j                         }|5  t        j                  t        t        j                  t        t        j                  t        i} ||j                      | |       d d d        |j"                  rt%        j&                  d       y y # 1 sw Y   ,xY w)Nr   z&FSDP model load_state_dict profiling: )r   rU   r   rV   r   r   r   r   r   r   r^   r   r   r   r   r   r]   rQ   r   dump_and_reset)r4   r   r5   r   _post_load_state_dict_hook_fns        r1   _post_load_state_dict_hookr  [  s     @GJ##'7'@'@@4Z@	

 ((*	))+J**,L,,.P)
% 	C%j&A&AB6:V 
 %%&NO  
s    AC  C)statec                 z    dt         i fdt        i fdt        ddifdt        i ffD ]  \  }}}t	        | |||        y)zR
    Registers pre-save, post-save, pre-load, and post-load state dict hooks.
    register_state_dict_pre_hook_register_state_dict_hook"_register_load_state_dict_pre_hookwith_moduleT"register_load_state_dict_post_hookN)r  r  r  r  _register_state_dict_hooks_base)r  hook_registration_fn_strhookhook_registration_fn_kwargss       r1   _register_all_state_dict_hooksr  z  sf    
 
()=rB	$&;R@0%D!	

 
./I2N	HC $(C 	(+T3N	
	Hr3   hook_registration_fn_namer  r  c                     t        |       s t        | |      |fi | y| j                  }|r t        |j                  |      |fi | yy)z2Registers ``hook`` using ``hook_registration_fn``.N)r   r   _handle_fully_sharded_module)r  r  r  r  r   s        r1   r  r    sX     % 101$V:UVLGF002KL3 r3   )FFFF)er   loggingr   r   typingr   r   r   r   r   r   r	   r
   rd   torch.distributeddistributedr   ;torch.distributed.algorithms._checkpoint.checkpoint_wrapper
algorithms_checkpointr/   torch.nnnntorch.nn.functional
functionalr   'torch.distributed._shard.sharded_tensorr   r   r   torch.distributed._tensorr   torch.distributed.device_meshr   $torch.distributed.fsdp._common_utilsr   r   r   r   r   r   r   r   #torch.distributed.fsdp._debug_utilsr   %torch.distributed.fsdp._runtime_utilsr   r   r   r   torch.distributed.fsdp.apir   r   r   torch.distributed.utilsr    _fsdp_extensionsr"   r#   r$   r%   r&   _unshard_param_utilsr'   r(   	getLogger__name__rn   r   r2   Moduler<   r?   boolrG   rL   rS   rW   r{   r   r   r   r   r   r   r   r   r   r   r   r   contextmanagerr   no_gradr  r  r  r  r  r  r  r   r3   r1   <module>r;     s       W W W    X X    
 . 9	 	 	 ?  
 7  I 
		8	$  
+II
+#-
+eCcM"#
++II+eCcM"#+   7II77 7 	7
 7 7 
7 78 /RYY /J /4 / /
GII
G
G 

GII  	
 
2 lIIll S#Xl 	l
 l 
#s(^l l^ II
 
 < *II** S#X* 	*
 
#s(^* *ZJIIJJ S#XJ 	J
 
J$9II9#-9	944II4
 
4, .II.. S#X. 	.
 
#s(^. .b	II	#-			-"II-"-" S#X-" 	-"
 
-"`II
 
: #II## S#X# 	#
 
#s(^# #L 9II9#-9	9 9 aFIIaFaF S#XaF 	aF
 
aF aFH 6: 6) 6 6 5 II5 S#X5  5  	5 
 
#s(^5   5 p  
II 
 
	 
   
F > > > >( &
II&
S#X&
 &
 	&

 
&
  &
R PIIPP 
P  P:
* 
& "  "&c3h	
 
 r3   