
import dataclasses
from typing import Dict, List, Optional, Sequence, Tuple, Union, cast

import torch
import torch.distributed as dist
import torch.distributed.checkpoint as dist_cp
from torch._utils import _get_device_module
from torch.distributed._shard.sharded_tensor.api import ShardedTensor
from torch.distributed._shard.sharded_tensor.metadata import TensorProperties
from torch.distributed._shard.sharded_tensor.shard import Shard
from torch.distributed._shard.sharding_spec.chunk_sharding_spec import (
    ChunkShardingSpec,
)
from torch.distributed._tensor import DTensor
from torch.distributed.checkpoint._nested_dict import unflatten_state_dict
from torch.distributed.checkpoint.default_planner import DefaultLoadPlanner
from torch.distributed.checkpoint.metadata import (
    BytesStorageMetadata,
    ChunkStorageMetadata,
    Metadata,
    MetadataIndex,
    STATE_DICT_TYPE,
    TensorStorageMetadata,
)
from torch.distributed.checkpoint.planner import LoadPlan, LoadPlanner
from torch.distributed.checkpoint.planner_helpers import (
    _create_read_items,
    create_read_items_for_chunk_list,
)
from torch.distributed.checkpoint.utils import (
    _element_wise_add,
    _element_wise_sub,
    _normalize_device_info,
)
from torch.distributed.distributed_c10d import _get_default_group
from torch.distributed.fsdp._shard_utils import _create_chunk_sharded_tensor
from torch.distributed.remote_device import _remote_device

# Maps each FQN to the (offset, size) of this rank's TP slice; the offset is
# None for tensors that are not tensor-parallel sharded.
STATE_DICT_2D_LAYOUT = Dict[str, Tuple[Optional[Sequence[int]], Sequence[int]]]

__all__ = [
    "load_sharded_optimizer_state_dict",
]


def _gen_rank_device(global_rank: int, device_type: str = "cuda") -> str:
    if device_type == "cpu":
        return "cpu"
    device_module = _get_device_module(device_type)
    if device_module.is_available():
        return _normalize_device_info(
            device_type, global_rank % device_module.device_count()
        )
    return "cpu"


def _create_colwise_spec(
    pg: Optional[dist.ProcessGroup] = None,
) -> ChunkShardingSpec:
    pg_device_type = dist.distributed_c10d._get_pg_default_device(pg).type
    if pg is None:
        placements = [
            f"rank:{idx}/{_gen_rank_device(idx, pg_device_type)}"
            for idx in range(dist.get_world_size())
        ]
    else:
        placements = [
            f"rank:{idx}/{_gen_rank_device(dist.get_global_rank(pg, idx), pg_device_type)}"
            for idx in range(pg.size())
        ]
    return ChunkShardingSpec(
        dim=0,
        placements=cast(List[Union[_remote_device, str]], placements),
    )


def _is_nested_tensor(val: torch.Tensor) -> bool:
    if type(val) is ShardedTensor:
        if len(val.local_shards()) == 0:
            return False
        if type(val.local_shards()[0].tensor) is ShardedTensor:
            return True
        if type(val.local_shards()[0].tensor) is DTensor:
            raise ValueError("Cannot handle DTensor nested inside ShardedTensor")
    elif type(val) is DTensor and (
        type(val._local_tensor) is DTensor or type(val._local_tensor) is ShardedTensor
    ):
        raise ValueError("Cannot handle nested DTensor")
    return False


def _alloc_tensor(
    props: TensorProperties, size: Sequence[int], device_type: str = "cuda"
) -> torch.Tensor:
    return torch.empty(
        size=size,
        dtype=props.dtype,
        layout=props.layout,
        requires_grad=props.requires_grad,
        pin_memory=props.pin_memory,
        device=cast(torch.device, _get_device_module(device_type).current_device()),
    )


def _get_state_dict_2d_layout(
    state_dict: STATE_DICT_TYPE,
) -> Tuple[STATE_DICT_2D_LAYOUT, Optional[dist.ProcessGroup]]:
    """
    Load the right TP slice of the optimizer state.

    This is not easy since the per-tensor slicing can't be inferred from checkpoint metadata.
    We take advantage of the model state_dict producing a sliced ST to figure out what we need to load.
    This is pretty fragile and it might be easier for FSDP to compute this info for us.
    Returns a dictionary whose keys match those of the state_dict and whose values are tuples of
    (offset, size) describing the current rank's TP slice.
    N.B. The state_dict *MUST* come from FSDP.sharded_state_dict.
    """
    specs: STATE_DICT_2D_LAYOUT = {}
    dp_pg: Optional[dist.ProcessGroup] = None
    for key, value in state_dict.items():
        # Default: not TP-sharded, so no offset and the full tensor size.
        specs[key] = (None, value.size())
        if _is_nested_tensor(value):
            assert (
                len(value.local_shards()) == 1
            ), "Cannot handle ST with multiple shards"
            assert isinstance(
                value, ShardedTensor
            ), "Can only handle nested ShardedTensor"
            shard = value.local_shards()[0]
            specs[key] = (
                shard.metadata.shard_offsets,
                shard.metadata.shard_sizes,
            )
            dp_pg = shard.tensor._process_group

    return (
        specs,
        dp_pg,
    )


class _ReaderWithOffset(DefaultLoadPlanner):
    """Load planner that shifts read offsets by the rank's TP slice offset."""

    translation: Dict[MetadataIndex, MetadataIndex]
    state_dict: STATE_DICT_TYPE
    metadata: Metadata

    def __init__(self, fqn_to_offset: Dict[str, Sequence[int]]) -> None:
        super().__init__()
        self.fqn_to_offset = fqn_to_offset
        self.metadata = Metadata({})
        self.state_dict = {}
        self.translation = {}

    def create_local_plan(self) -> LoadPlan:
        requests = []
        self.translation = {}
        for fqn, obj in self.state_dict.items():
            md = self.metadata.state_dict_metadata[fqn]
            if not isinstance(obj, ShardedTensor):
                requests += _create_read_items(fqn, md, obj)
                continue

            if fqn not in self.fqn_to_offset:
                requests += _create_read_items(fqn, md, obj)
                continue

            offset = self.fqn_to_offset[fqn]

            assert len(obj.local_shards()) == 1
            original_shard = obj.local_shards()[0]
            # Describe the local shard in checkpoint-global coordinates by
            # shifting its offsets with the TP slice offset.
            local_chunks = [
                ChunkStorageMetadata(
                    offsets=torch.Size(
                        _element_wise_add(original_shard.metadata.shard_offsets, offset)
                    ),
                    sizes=torch.Size(original_shard.metadata.shard_sizes),
                )
            ]

            reqs = create_read_items_for_chunk_list(
                fqn, cast(TensorStorageMetadata, md), local_chunks
            )
            # The read items carry the shifted offsets; remember how to map
            # each one back to the coordinates of the local shard.
            for ri in reqs:
                assert ri.dest_index.offset is not None
                original_offset = _element_wise_sub(ri.dest_index.offset, offset)
                original_index = dataclasses.replace(
                    ri.dest_index, offset=torch.Size(original_offset)
                )
                self.translation[ri.dest_index] = original_index

            requests += reqs
        return LoadPlan(requests)

    def lookup_tensor(self, index: MetadataIndex) -> torch.Tensor:
        return super().lookup_tensor(self.translation.get(index, index))


def load_sharded_optimizer_state_dict(
    model_state_dict: STATE_DICT_TYPE,
    optimizer_key: str,
    storage_reader: dist_cp.StorageReader,
    planner: Optional[LoadPlanner] = None,
) -> STATE_DICT_TYPE:
    """
    Load a state_dict in conjunction with FSDP sharded optimizer state.

    This is the current recommended way to checkpoint FSDP.
    >>> # xdoctest: +SKIP
    >>> import torch.distributed.checkpoint as dist_cp
    >>> # Setup
    >>> model: torch.nn.Module
    >>> optim_params = model.parameters()
    >>> optim = torch.optim.SGD(optim_params, lr=0.01)
    >>> # Save
    >>> with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
    >>>     state_dict = {
    >>>         "optimizer": FSDP.optim_state_dict(model, optim),
    >>>         "model": model.state_dict()
    >>>     }
    >>>     dist_cp.save_state_dict(
    >>>         state_dict=state_dict,
    >>>         storage_writer=dist_cp.FileSystemWriter("checkpoint"),
    >>>         planner=dist_cp.DefaultSavePlanner(),
    >>>     )
    >>>
    >>> # Load
    >>> with FSDP.state_dict_type(model_tp, StateDictType.SHARDED_STATE_DICT):
    >>>     model_state_dict = model_tp.state_dict()
    >>>     checkpoint = {
    >>>         "model": model_state_dict
    >>>     }
    >>>     dist_cp.load_state_dict(
    >>>         state_dict=checkpoint,
    >>>         storage_reader=dist_cp.FileSystemReader("checkpoint"),
    >>>         planner=dist_cp.DefaultLoadPlanner(),
    >>>     )
    >>>     model.load_state_dict(checkpoint["model"])
    >>>
    >>>     optim_state = dist_cp.load_sharded_optimizer_state_dict(
    >>>         model_state_dict,
    >>>         optimizer_key="optimizer",
    >>>         storage_reader=dist_cp.FileSystemReader("checkpoint"),
    >>>     )
    >>>
    >>>     flattened_osd = FSDP.optim_state_dict_to_load(
    >>>        model, optim, optim_state["optimizer"]
    >>>     )
    >>>
    >>>     optim.load_state_dict(flattened_osd)
    """
    metadata = storage_reader.read_metadata()

    layout_specs, dp_pg = _get_state_dict_2d_layout(model_state_dict)
    dp_pg_device_type = dist.distributed_c10d._get_pg_default_device(dp_pg).type
    device_module = _get_device_module(dp_pg_device_type)

    if dp_pg is None:
        placements = []
        for i in range(dist.get_world_size()):
            device_info = _normalize_device_info(
                dp_pg_device_type, i % device_module.device_count()
            )
            placements.append(f"rank:{i}/{device_info}")
        sharding_spec = ChunkShardingSpec(dim=0, placements=placements)
    else:
        sharding_spec = _create_colwise_spec(dp_pg)

    # Build an optimizer state_dict skeleton with correctly sized (sharded) tensors.
    state_dict: STATE_DICT_TYPE = {}

    fqn_to_offset: Dict[str, Sequence[int]] = {}
    for key, value in metadata.state_dict_metadata.items():
        key_path = metadata.planner_data[key]
        if key_path[0] != optimizer_key:
            continue

        if isinstance(value, BytesStorageMetadata):
            state_dict[key] = "<bytes_io>"
            continue

        # value: TensorStorageMetadata
        if value.size.numel() == 1:
            # Single-element state (e.g. step counters) is kept unsharded.
            state_dict[key] = _alloc_tensor(
                value.properties, value.size, dp_pg_device_type
            )
        elif dp_pg is None:
            state_dict[key] = _create_chunk_sharded_tensor(
                _alloc_tensor(value.properties, value.size, dp_pg_device_type),
                rank=dist.get_rank(),
                world_size=dist.get_world_size(),
                num_devices_per_node=device_module.device_count(),
                pg=_get_default_group(),
            )
        else:
            spec_key = key_path[2]
            alloc_size = layout_specs.get(spec_key, (None, value.size))[1]

            st_md = sharding_spec.build_metadata(
                torch.Size(alloc_size), value.properties
            )
            local_shards = []
            current_rank = dist.get_rank(dp_pg)
            for shard_md in st_md.shards_metadata:
                if cast(_remote_device, shard_md.placement).rank() != current_rank:
                    continue
                local_shards.append(
                    Shard(
                        tensor=_alloc_tensor(
                            value.properties, shard_md.shard_sizes, dp_pg_device_type
                        ),
                        metadata=shard_md,
                    )
                )

            st = ShardedTensor._init_from_local_shards_and_global_metadata(
                local_shards, st_md, process_group=dp_pg
            )

            if spec_key in layout_specs and layout_specs[spec_key][0] is not None:
                fqn_to_offset[key] = cast(Sequence[int], layout_specs[spec_key][0])

            state_dict[key] = st

    # Read the checkpoint data into the skeleton; when there is a TP offset,
    # _ReaderWithOffset remaps the read requests onto the local shards.
    dist_cp.load_state_dict(
        state_dict=state_dict,
        storage_reader=storage_reader,
        planner=_ReaderWithOffset(fqn_to_offset) if dp_pg is not None else planner,
    )

    state_dict = unflatten_state_dict(state_dict, metadata.planner_data)

    return state_dict
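# ---------------------------------------------------------------------------
# Illustrative sketch of the 2D-layout bookkeeping above, using hypothetical
# FQNs and shapes (nothing below is produced by this module): for a parameter
# "layer.weight" with global size [16, 8], sharded along dim 0 across 2 TP
# ranks, rank 1's sharded model state_dict would lead _get_state_dict_2d_layout
# to report roughly
#
#     specs == {"layer.weight": ([8, 0], [8, 8])}   # (offset, size) of the TP slice
#
# while non-TP entries keep (None, full_size). That offset ([8, 0] here) is what
# _ReaderWithOffset adds to each read request so it addresses the right region
# of the checkpoint, and subtracts again when resolving the request back onto
# the locally allocated shard.
# ---------------------------------------------------------------------------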