
    Ph                     0
   U d dl Z d dlZd dlZd dlmZmZmZ d dlmZ d dl	m
Z
mZmZmZmZmZmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlm Z m!Z! d dl"m#Z#m$Z$m%Z&m'Z'm(Z(m)Z)m*Z*m+Z+ d d	l,m-Z-m.Z. d d
l/m0Z0 d dl1m2Z3 dZ4dZ5e5 dZ6dZ7e7 dZ8dZ9ee:   Z; e<       Z=ee   e>d<   eeeej~                  e@eAe:f   ZBeeBeeB   eeB   ee:df   f   ZCee:eCf   ZDeeD   ZEee:eeDeEf   f   ZFe j                  d        ZHe G d d             ZIe G d deI             ZJd>dej                  de:deLde;fdZMddddej                  deej                  j                  df   d eLd!eeej                        d"eeI   deJfd#ZPd$ee:eCf   d%eFd&eJddfd'ZQd(eej                  ej                  j                  f   d)e:defd*ZRdej                  d&eJdee:eCf   fd+ZSdej                  d,ee:eCf   d&eJde0fd-ZTd.ej                  j                  ddfd/ZUdej                  d0eej                  j                  df   d&eJdeFfd1ZVdej                  d.ej                  j                  d%eFd&eJdeFf
d2ZWdej                  d0eej                  j                  df   d,eFd&eJddf
d3ZXddddej                  d!eeej                        d"eeI   dee:eCf   fd4ZYddddej                  d0eej                  j                  eej                  j                     f   d!eeej                        d"eeI   deFf
d5ZZddddej                  d0eej                  j                  eej                  j                     f   d!eeej                        d"eeI   deee:eCf   eFf   f
d6Z[dej                  d,eeej                  ee:eCf   f   ee:eCf   f   dee:eCf   fd7Z\dd8dej                  d$eeej                  ee:eCf   f   ee:eCf   f   d"eeI   de0fd9Z]dd8dej                  d0eej                  j                  eej                  j                     f   d%eFd"eeI   ddf
d:Z^dd8dej                  d0eej                  j                  eej                  j                     f   d$eeej                  ee:eCf   f   ee:eCf   f   d%eFd"eeI   de0fd;Z_edd8dej                  d"eeI   ddfd<       Z`edd8dej                  d0eej                  j                  df   d"eeI   ddfd=       Zay)?    N)asdict	dataclassfield)chain)AnyCallablecastDictIterableListno_type_checkOptionalSetTupleUnion)ShardedTensor)DTensor)_gather_state_dict_offload_state_dict_to_cpu)FullOptimStateDictConfigFullStateDictConfigFullyShardedDataParallelOptimStateDictConfigShardedOptimStateDictConfigShardedStateDictConfigStateDictConfigStateDictType)._get_module_fsdp_state_if_fully_sharded_moduleFSDP_WRAPPED_MODULE)_IncompatibleKeys)DistributedDataParallel_flat_paramparam_groups.stateparams_patched_state_dict	ValueTypec               #     K   t        j                         } t        j                          	 d  t        j                          | rt        j                          y y # t        j                          | rt        j                          w w xY wwN)gc	isenableddisablecollectenable)
is_enableds    rC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/distributed/checkpoint/state_dict.py
gc_contextr2   C   s[     JJJL 	

IIK  	

IIK s   )BA ,B-B		Bc                   X    e Zd ZU dZdZeed<   dZeed<   dZeed<   dZ	eed<   dZ
eed<   y	)
StateDictOptionsa  
    This dataclass specifies how get_state_dict/set_state_dict will work.

    - ``full_state_dict``: if this is set to True, all the tensors in the
      returned state_dict will be gathered. No ShardedTensor and DTensor
      will be in the returned state_dict.

    - ``cpu_offload``: offload all the tensors to cpu. To prevent CPU OOM, if
      ``full_state_dict`` is also true, then only the rank0 will get the
      state_dict and all other ranks will get empty state_dict.

    - ``ignore_frozen_params``: if the value is True, the returned state_dict
      won't contain any frozen parameters -- the ``requires_grad`` is False.
      The default value is False.

    - ``keep_submodule_prefixes``: when ``submodules`` is not None, this option
      indicates whether to keep the submodule prefixes from the state_dict keys.
      or example, if the submodule is ``module.pretrain`` and the full FQN of
      the parameter is ``pretrain.layer1.weight`` of the param. When this option
      is True, the parameter's key in the returned state_dict will be
      ``pretrain.layer1.weight``. If the options is False, the key will be
      ``layer1.weight``.
      Note that if ``keep_submodule_prefixes`` is False, there may be conflicted
      FQNs, hence there should be only one submodule in ``submodules``.

    - ``strict``: the ``strict`` option when ``set_state_dict`` calls
      model.load_state_dict().
      The default value is False.
    Ffull_state_dictcpu_offloadignore_frozen_paramsTkeep_submodule_prefixesstrictN)__name__
__module____qualname____doc__r5   bool__annotations__r6   r7   r8   r9        r1   r4   r4   P   s=    < "OT!K!&$&$(T(FDrA   r4   c                   (   e Zd ZU  ee      Zeeee	j                  f   eee	j                  f   f   ed<    ee      Zee   ed<    ee      Zee   ed<   dZeed<   dZeed<   ej*                  Zeed<    ee      Zeej8                     ed	<   y
)_StateDictInfo)default_factoryfqn_param_mappingall_fqnssubmodule_prefixesThandle_modelhandle_optimfsdp_contextfsdp_modulesN)r:   r;   r<   r   dictrE   r
   r   strtorchTensorFQNS_Tr?   setrF   r   rG   rH   r>   rI   
contextlibnullcontextrJ   r   listrK   r   nnModuler@   rA   r1   rC   rC   w   s     	d# tc5<< %(<"== $ s3Hc#h3#(#=C=L$L$'33L(3$)$$?L$ryy/?rA   rC   modelnameskip_ddp_prefixreturnc                 v   d|vr|hS |j                  d      }g }| }t        |      D ]  \  }}t        |t              r(|dk(  sJ |j                  }|r,|j                  |       >t        |t              r||dz      t        k(  rHdj                  |      }t        |t              }	|r| d}|	j                  D 
ch c]  }
| |
 
 c}
c S t        |t              }|t        k7  s|j                  |       t        ||      }|j                  |       t        ||      } dj                  |      hS c c}
w )a  
    This API is used to convert the name of a parameter to the FQNs. For FSDP
    without `use_orig_params`, the name of FlatParameter can be mapped to
    multiple original parameters. As a result, the return type of this function
    is `Set[str]`.

    Args:
        module (nn.Module): the root model.
        name (str): the name
        skip_ddp_prefix (bool): whether to skip DDP's `module` prefix

    Returns:
        The canonical FQNs based on the model traversal.
    r$   module   )split	enumerate
isinstanceDDPr\   appendFSDP
FLAT_PARAMjoingetattr_fqnsr   )rW   rX   rY   	obj_namesfqn_obj_namescurr_objicurr_obj_nameprefix
flat_paramfqns              r1   	_get_fqnsrp      s<    $v

3IMH%i0=h$ H,,,H"$$]3$'Q:--0$Xz:
 &xq\F4>4D4DE4DS6(3%(4DEEx)<=H 33$$]3"8];  /x7H' 1* HH]#$$ Fs   ;D6)
submodulesoptionsoptims.
optim_onlyrq   rr   c                   |r|st        d      |xs
 t               }i }t               }| j                         D ]3  \  }}t	        | |      }	|	||<   |	D ]  }
|||
<   |j                  |
        5 t               }|rdt        |      }| j                         D ]F  \  }}||vrt	        | |      }	t        |	      dk(  sJ d       |	D ]  }
|j                  |
 d        H t        j                  |       }|r|j                  rSt        |j                  |j                        }t        |j                  |j                        }t        j                  }n0t!               }t#        |j                        }t        j$                  }t'        j(                  t        j*                  | |||      }nt,        j.                  }t1        d
i t3        |      ||||t5        t6        t8        j:                     |      | t        |      dkD  d	S )zW
    Verify the model and options passed by the user and generates _StateDictInfo.
    z;Optimizers are not passed in but optim_only is set to True.r]   z)Submodule FQN should only have 1 instancer$   )offload_to_cpu
rank0_only)rv   )r\   state_dict_typestate_dict_configoptim_state_dict_configr   )rE   rF   rG   rJ   rK   rH   rI   r@   )RuntimeErrorr4   rQ   named_parametersrp   addnamed_moduleslenrc   rK   r5   r   r6   r   r   FULL_STATE_DICTr   r   SHARDED_STATE_DICT	functoolspartialrx   rR   rS   rC   r   r	   r   rU   rV   )rW   rs   rt   rq   rr   rE   rF   rX   paramfqnsro   rG   r\   rK   ry   rz   rx   rJ   s                     r1   _verify_optionsr      s    &I
 	
 +)+G 	  uH--/e%#'% C%*c"LL  0 _
!//1LD&Z'UD)Dt9>N#NN>"&&#ay1  2 $$U+L "" 3&22w?R?R! '?&22w?R?R'# ,;;O 6 8&A&22'# ,>>O ((  +/$;
 "-- 	
/	+-!$ryy/<8#^&kAo	 	rA   model_state_dictoptim_state_dictinfoc                 X   d}|j                   D ]&  }t        |      }|J d       |j                  s$d} n |j                   r|st        d      |j                  r_| s]|j
                  sQ|j                  sE|j                  r|j                  s-|j                  r!t        dt        j                         d      |j                  r1|r	|t           s&|j                  r|j                  st        d|       | j                         D ]  }t        |v st        | dt         d	       y )
NFz)Expected a fsdp_state with a fsdp module.Tz:The model has FSDP modules but no FSDP root module exists.z}The option indicates that model state_dict is required to save or load, but model state_dict is empty.rank = dist.get_rank()=r$   zgThe option indicates that model state_dict is required to save, or load but optim state_dict is empty. z
 contains z6. This can happen if the model is not the root module.)rK   r   _is_rootr{   rH   rG   r7   r6   r5   r9   distget_rankrI   STATEkeysrd   )r   r   r   has_fsdp_rootr\   
fsdp_statekeys          r1   _verify_state_dictr     s@    M##CFK
%R'RR% M $ WXX
 	 ''))!!d&:&:KK'mmo'q*
 	
  %5e%<!5!5::J9KM 
  $$&%z* .* *  'rA   objapic                     t        | |      }|t        v r+t        j                  t        | j                  |      |       }|S )N)self)rf   r'   r   r   	__class__)r   r   calls      r1   _state_dict_fnr   5  s9    3D""  !<3GKrA   c                 \   |j                   si S |j                         5   t        | d             }d d d        t        j	                               D ]p  }t        | |      }t        |      dk(  sJ t        t        |            }||k7  s9dt        fd} |||      st        d| d|       |j                  |      ||<   r |j                  rgi }|j	                         D ]P  }|j                  D ]?  }|j                  |      s|j                  r	||   ||<   *|t        |      d  }	||   ||	<   A R |}|j                  rI| j!                         D ]6  \  }}
|
j"                  rt        | |      }|D ]  }|j                  |        8 t        |j%                               D ]#  \  }}|j&                  s|j                  |       % |j(                  r0|j*                  s
t-               nd}t/        ||j*                  |      S |j*                  rt1        |      S |S # 1 sw Y   xY w)	N
state_dictr]   rZ   c                    t        |      t        |       k\  ry|j                  d      }| j                  d      }d}t        |      D ];  \  }}|||   k(  r'|dz  }|t        |      k(  s"|t        |      dz
  k(  c S |dk(  r; y y)NFr$   r   r]   r\   T)r   r^   r_   )r   ro   	fqn_split	key_splitfqn_idxkey_idxkey_names          r1   verifyz%_get_model_state_dict.<locals>.verifyL  s    s8s3x' IIcN	IIcN	)29)=%GX9W#551"c)n4#*c)nq.@#@@!X- $ *> rA   zAn unexpected key, z, exists. FQN is r   r6   
ranks_only)rH   rJ   r   rT   r   rp   r   nextiterr>   r{   poprG   
startswithr8   r7   r|   requires_graditemsis_metar5   r6   tupler   r   )rW   r   r   r   r   ro   r   new_state_dictrm   new_fqnr   pr   s                r1   _get_model_state_dictr   <  s    					8^E<8:
 
 JOO%&$4yA~~4:#:D " #s#"%8=Nse#TUU(nnS1JsO5 '8 /1??$C11~~f-//*4S/N3'!#f+-0G.8oN7+ 2 % $
  002JC""UC(Ds# 	 3 z'')*Q99NN3 + $($4$4UW$
!D$4$4
 	
 
		)*55C 
	s   H!!H+r   c           	         |j                   r|st        i i       S | j                         D ]M  \  }}t        | |      }t        | |d      }t	        ||      D ]  \  }}||k7  s|j                  |      ||<   ! O |j                         5  t        t         t        | d      ||j                              cd d d        S # 1 sw Y   y xY w)NF)rY   load_state_dict)r   r9   )
rH   r    r|   rp   zipr   rJ   r	   r   r9   )	rW   r   r   r   _r   fqns_with_ddp_prefixro   fqn_with_ddp_prefixs	            r1   _load_model_state_dictr     s    
 J R((((*Q$(UK(+D2F(G$C$))2<..2E
./ )H + 
			4N5"34%dkk
 
		s   ,CC
optimc                     | j                   ry| j                  D ]N  }|t           D ]@  }|j                  t	        d      |j
                  s't        j                  |      |_        B P | j                  d       | j                  d       y)zH
    Initialize optim states by calling the step() with zero grads.
    Na  state_dict can only be used if the optimizer states are initialized (usually after one step() with gradients) or gradients are None. For the later case, state_dict will fake the gradients as zero to initialize the optimizer states. However, the gradients are not None.)closureT)set_to_none)
r%   r#   PARAMSgradr{   r   rN   
zeros_likestep	zero_grad)r   param_groupr   s      r1   _init_optim_stater     s     {{)) (Ezz%".  """--e4
 ) * 
JJtJ	OOO%rA   
optimizersc                    |j                   si S t        i t        g i}|D ]  }t        |        t	        |d             }|j
                  r2|j                         5  t        j                  | ||      }d d d        n/t        t        j                  d |j                  D                    }t        t        |t        t!        |                        }i }| j#                         D ]I  \  }	}
t%        | |	      }t!        |      dk(  sJ t'        t)        |            }|
|vr;||
   }|||<   |||<   K t        |t           j+                               D ])  }	||	   }|t           j-                  |	      |t           |<   + |t           D ]#  }|t.           D cg c]  }||   	 c}|t.        <   % |st1        t2        |t                 j5                  |t                  t1        t6        |t                 j9                  |t                   |j:                  r0|j<                  s
t?               nd}tA        ||j<                  |      S |j<                  rtC        |      S |S # 1 sw Y   xY wc c}w )Nr   c              3   .   K   | ]  }|t              y wr*   )r   ).0gs     r1   	<genexpr>z(_get_optim_state_dict.<locals>.<genexpr>  s     -TASAaiASs   r]   r   r   )"rI   r   PGr   r   rK   rJ   rc   r   rT   r   from_iterabler#   rL   r   ranger   r|   rp   r   r   r   r   r   r	   DictValueTypeupdateListDictValueTypeextendr5   r6   r   r   r   )rW   r   r   r   r   osdr&   param_pid_mappingfqn_pid_mappingr   r   r   ro   pidgroupr   s                   r1   _get_optim_state_dictr     s>   
 	,12r2+>% 1nUL13""$++E5#> %$ %---TASAS-TTUF $Ss6{1C%D E O#446
U ,4yA~%~4:& 11'.'*$'*$ 7 CJOO-.%c*"%e*.."5E
3 / RAFv O#!5 Of ! ],U34;;CJG 0 45<<SWE? B $($4$4UW$
!$*:*:z
 	
 
		)*:;;K %$* !Ps   I"I."I+	c           	         i }g }t         |t        |i}i }|j                  D ]  }|j                  t        g i       |t           D ]  }	|j
                  |	   D ]  }
|d   t           }t        |t              sJ |j                  |
       |	j                  rt        t        |t                  |
   ||
<   t        t        |t                 D ]C  }|t           }t        |t              sJ |
|v s#t        |t                 dz
  |t        |      <   E    t        t        |t                 D ]M  }|j                  t        |      d      }|dk(  r$|j                         D ]  \  }}|t        k(  r|||   |<    O |S )a  
    Extract the corresponding optim state_dict from ``optim_state_dict`` for
    ``optim`` and return the result optim state_dict.

    Args:
        model (nn.Module): the root model.
        optim (torch.optim.Optimizer): the optimizer.
        optim_state_dict (Dict[str, ValueType]): the superset optim state_dict that
            contains the optim state_dict of ``optim``.
        info (_StateDictInfo): state dict information.

    Returns:
        The optim state_dict of ``optim``.
    r]   )r   r   r#   rb   r   rE   r`   rT   r   r	   r   r   r   idgetr   )rW   r   r   r   r%   pg_state
return_osd
pg_mappingr   r   ro   r&   loaded_param_groupidxr   values                   r1   _split_optim_state_dictr     s~   * E"$H&+UB%AJ!#J))% (E--e4!"f-!&$///c"&&!%m5Ee5L!Mc!RE#J*./@BRSUBV*W&/7F%fd333f}=@B=PST=T
2&8#9:	 +X 5 ) * -/?/CDnnR_b1"9%++-JCf}!&HSM#	 .	 E rA   c                    |j                   sy |D ]j  }t        | |||      }|j                  r0|j                         5  t	        j
                  | ||      }d d d        t        |        t        |d      |       l y # 1 sw Y   *xY w)Nr   )r   )rI   r   rK   rJ   rc   optim_state_dict_to_loadr   r   )rW   r   r   r   r   r   s         r1   _load_optim_state_dictr   "  s     25%TR""$#'#@#@5"2$  % 	% 0u/0<LM  %$s   A>>B	c                    t               5  t        | t               d||      }t        | |      }t	        |i |       |cddd       S # 1 sw Y   yxY w)a  
    Return the model state_dict of ``model``.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        submodules: Optional[Set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``model``.
    Frt   rq   rr   N)r2   r   r   r   r   )rW   rq   rr   r   r   s        r1   get_model_state_dictr   :  sM    , 
G!
 1=+R6 
s   3AAc                    t               5  t        |t        j                  j                        r|fn
t        |      }t        | |d||      }t        | ||      }t        i ||       |cddd       S # 1 sw Y   yxY w)a  
    Return the combined state_dict for optimizers.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules: Optional[Set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``optimizers``.
    Tr   N)	r2   r`   rN   r   	Optimizerr   r   r   r   )rW   r   rq   rr   r   r   s         r1   get_optimizer_state_dictr   ]  sy    2 
 *ekk&;&;< Mz" 	
 !
 1
DI2/6 
s   AA33A<c                   t               5  t        |t        j                  j                        r|fn
t        |      }t        | |d||      }t        | |      }t        | ||      }t        |||       ||fcddd       S # 1 sw Y   yxY w)a)  
    Return the model state_dict and optimizers state_dict.

    ``get_state_dict`` can process any module that is parallelized by PyTorch
    FSDP/fully_shard, DDP/replicate, tensor_parallel/parallelize_module, and any
    combination of these parallelisms. The main functions of ``get_state_dict``
    are: 1.) returning a model and optimizer state_dict that can be resharded
    with a different number of trainers and/or different parallelisms.
    2.) hiding the parallelism-specific state_dict APIs. Users don't have to call
    these APIs.
    3.) sanity checking the result state_dict.

    The keys of the result state dictionary are the canonical FQNs (Fully
    Qualified Names).  A canonical FQN refers to the FQN based on a parameter's
    position in an nn.Module hierarchy. More specifically, a canonical FQN to a
    parameter is the FQN returned by ``module.named_parameters()`` or
    ``module.named_buffers()`` when the module is not distributed by any
    parallelisms. Since the optimizer internally uses parameter IDs to represent
    a parameter, there will be a conversion from the parameter IDs to the
    canonical FQNs when calling this API.

    ``get_state_dict`` can also process a module that is not parallelized. In
    such a case, ``get_state_dict`` only performs one function -- converting the
    optimizer parameter IDs to the canonical FQNs.

    Example:

        import torch
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.nn.parallel import DistributedDataParallel as DDP
        from torch.distributed.checkpoint.state_dict import get_state_dict

        fsdp_model = FSDP(copy.deepcopy(model))
        fsdp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)
        ddp_model = DDP(copy.deepcopy(model))
        ddp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)


        ddp_state_dict, ddp_optim_state_dict = get_state_dict(ddp_model, ddp_optim)
        fsdp_state_dict, fsdp_optim_state_dict = get_state_dict(fsdp_model, fsdp_optim)

        # if we simply call ddp_model.state_dict() and fsdp_model.state_dict(),
        # the asserts will fail.
        assert ddp_state_dict == fsdp_state_dict
        assert ddp_optim_state == fsdp_optim_state_dict


    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules: Optional[Set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        ``Tuple`` that contain model state_dict and optimizer state_dict.
    Fr   N)
r2   r`   rN   r   r   r   r   r   r   r   )rW   r   rq   rr   r   r   r   s          r1   get_state_dictr     s    H 
 *ekk&;&;< Mz" 	
 !
 1=0
DI+-=tD!11! 
s   A,BB
c           
      |   |si S t        t        t        |j                                     t        j
                        rt        t        t        j
                  t        t        t        f   f   |      }i }|j                         D ]  \  }}| j                         D ]y  \  }}||k7  rt        | |      }t        |      dk(  sJ d       t        t        |             d}	|j                  |j                         D 
ci c]  \  }
}|	|
z   | c}}
       {  |S t        t        t        t        f   |      S c c}}
w )Nr]   z/FQNs for a submodule should only have 1 elementr$   )r`   r   r   r   rU   rV   r	   r
   rM   r(   r   r~   rp   r   r   )rW   r   cast_state_dictr   	submodulesub_state_dictrX   mr   rm   subfqnr   s               r1   _unflatten_model_state_dictr     s!    	$tJOO-./;tBIItCN/C$CDjQ/1)8)>)>)@%I~ ..0a	> -4yA~X'XX~ d,-Q/%%AOAUAUAWXAWVf_e+AWX 1 *A Di(*55	 Ys   =D8)rr   c                    t        | |      }t               5  t        | t               d|      }t	        |i |       t        | ||      cddd       S # 1 sw Y   yxY w)a2  Load the model state_dict.

    The counterpart of ``get_model_state_dict`` to set the state_dict to the
    model. See ``set_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        model_state_dict: (Union[Dict[nn.Module, Dict[str, ValueType]], Dict[str, ValueType]]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys
            * **unexpected_keys** is a list of str containing the unexpected keys
    Frt   rr   N)r   r2   r   r   r   r   )rW   r   rr   r   s       r1   set_model_state_dictr     sS    : .I. 
ueg%Q+R6%e-=tD	 
s   1AAc                    t               5  t        |t        j                  j                        r|fn
t        |      }t        | |d|      }t        i ||       t        | |||       ddd       y# 1 sw Y   yxY w)a  Load the optimizers state_dict.

    The counterpart of ``get_optimizer_state_dict`` to set the state_dict to the
    optimizers. See ``set_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        None
    Tr   N)	r2   r`   rN   r   r   r   r   r   r   )rW   r   r   rr   r   s        r1   set_optimizer_state_dictr      sk    2 
 *ekk&;&;< Mz" 	
 ujT7S2/6uj2BDI 
s   AA11A:c                .   t        | |      }t               5  t        |t        j                  j
                        r|fn
t        |      }t        | || |      }t        |||       t        | |||       t        | ||      cddd       S # 1 sw Y   yxY w)a  Load the model state_dict and optimizers state_dict.

    The counterpart of ``get_state_dict`` to set the state_dict to the model and
    optimizers.  The given ``model_state_dict`` and ``optim_state_dict`` do not
    have to be returned by ``get_state_dict`` but must meet the following
    requirements: 1) all FQNs are canonical FQNs as defined in ``get_state_dict``,
    2) if a tensor is sharded, it must be either a ShardedTensor or DTensor,
    3) optimizer state_dict cannot contain the parameter IDs; the keys should be
    the canonical FQNs.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        model_state_dict: (Union[Dict[nn.Module, Dict[str, ValueType]], Dict[str, ValueType]]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys of the model state_dict.
            * **unexpected_keys** is a list of str containing the unexpected keys of the model state_dict.
    r   N)r   r2   r`   rN   r   r   r   r   r   r   r   )rW   r   r   r   rr   r   s         r1   set_state_dictr   E  s    R .I. 
 *ekk&;&;< Mz" 	
 :.>*>
 	+-=tDuj2BDI%e-=tD 
s   A*BBc                $   t        j                  t        | |      fd}|| _        t        j                  t        | |      dt
        t        t        f   ffd}|| _        t        j                  |       t        j                  |       y)a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )rW   rr   c                               S r*   r@   _state_dict_calls   r1   state_dict_callz0_patch_model_state_dict.<locals>.state_dict_call      !!rA   r   c                      |        y )N)r   r@   r   _load_state_dict_calls    r1   load_state_dict_callz5_patch_model_state_dict.<locals>.load_state_dict_call      z:rA   N)r   r   r   r   r   r
   rM   r   r   r'   r}   )rW   rr   r   r  r  r   s       @@r1   _patch_model_state_dictr    s    6 !((" 'E%--;c3h ; 1EO,01rA   c                   t        j                  t        | ||      fd}t        j                  t        | ||      dt        t
        t        f   ffd}t        j                  |       t        j                  |       t        |t        j                  j                        r|fn
t        |      }|D ]  }||_        ||_         y)a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Note that if there are multiple optimizers, all of the optimizers will be patched.
    So users only need to call one of the state_dict() to get the full result.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )rW   r   rr   c                               S r*   r@   r   s   r1   r   z4_patch_optimizer_state_dict.<locals>.state_dict_call  r  rA   r   c                      |        y )N)r   r@   r  s    r1   r  z9_patch_optimizer_state_dict.<locals>.load_state_dict_call  r  rA   N)r   r   r   r   r
   rM   r   r'   r}   r`   rN   r   r   r   r   r   )rW   r   rr   r   r  r   r  r   s         @@r1   _patch_optimizer_state_dictr    s    > !(( 	" &-- 	;c3h ; O,01 j%++"7"78 
: 
 * 4 rA   )T)brR   r   r+   dataclassesr   r   r   	itertoolsr   typingr   r   r	   r
   r   r   r   r   r   r   r   rN   torch.distributeddistributedr   torch.nnrU   'torch.distributed._shard.sharded_tensorr   torch.distributed._tensorr   .torch.distributed.checkpoint._state_dict_utilsr   r   torch.distributed.fsdpr   r   r   rc   r   r   r   r   r   $torch.distributed.fsdp._common_utilsr   r   torch.nn.modules.moduler    torch.nn.parallelr!   ra   rd   r   	PG_PREFIXr   STATE_PREFIXr   rM   rP   rQ   r'   r?   rO   intfloatPrimitiveTyper(   r   r   OptimizerStateTypecontextmanagerr2   r4   rC   rV   r>   rp   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r@   rA   r1   <module>r      s     	 0 0         A -	 	 	 6 < 
d!H	{		S%(U S] * g}ellCKL4&m(<d3CS>TT	 S)^$' #u]4E%EFFG  	 	 # # #L 	@% 	@ 	@*%RYY *%c *%D *%F *%d ,0*.P99P%++'',-P P
 RYY(P &'P Pf.3	>*.(. . 
	.bbii)>)>>? c h G99G*G	#y.GT
99
S)^$
 
 	
0&U[[22 &t &22 992 ekk++S012  2  	2 j3993;;  3 )3 	3
 3lN99Nekk++S01N #N 	N
 
N6 ,0*.	  99   RYY(   &'	  
 
#y.  N ,0*.( 99( ekk++Xekk6K6K-LLM(  RYY(	( 
 &'(  ( ^ ,0*.T299T2ekk++Xekk6K6K-LLMT2 RYY(	T2
 &'T2 4Y!334T2n6996d299d3	>&::;T#y.=QQR6 
#y.6@ +/$E99$ERYYS)^,,-tCN/CC$E &'$E $EX +/"J99"Jekk++Xekk6K6K-LLM"J )	"J
 &'"J 
"JZ +/8E998Eekk++Xekk6K6K-LLM8E RYYS)^,,-tCN/CC	8E )8E &'8E 8Ez  +/129912 &'12 
	12 12l 
 +/	;599;5 ekk++S01;5 &'	;5
 
;5 ;5rA   