
import logging
from collections import abc, defaultdict
from typing import Any, Dict, Iterable, List, Optional, overload, Sequence, Tuple, Union

import torch
import torch.distributed as dist
from torch.cuda.amp.grad_scaler import _MultiDeviceReplicator, GradScaler, OptState
from torch.distributed.distributed_c10d import ProcessGroup

log = logging.getLogger(__name__)


def _refresh_per_optimizer_state() -> Dict[str, Any]:
    return {"stage": OptState.READY, "found_inf_per_device": {}}


def _is_supported_device(tensor: torch.Tensor) -> bool:
    return tensor.is_cuda or tensor.device.type in ("xla", "cpu")


class _GeneralMultiDeviceReplicator(_MultiDeviceReplicator):
    """
    Lazily serves tensor to request device. This class extends
    _MultiDeviceReplicator to allow support for "cpu" as a device.
    """

    # Only ``__init__`` differs from the parent class: the device assert is
    # relaxed so CPU (and XLA) master tensors are accepted. ``get(device)`` is
    # inherited and lazily caches a per-device copy of ``master``.
    def __init__(self, master_tensor: torch.Tensor) -> None:
        assert _is_supported_device(master_tensor)
        self.master = master_tensor
        self._per_device_tensors: Dict[torch.device, torch.Tensor] = {}


class ShardedGradScaler(GradScaler):
    """
    ShardedGradScaler helps perform gradient scaling in a shard-aware manner. It extends
    functionality from GradScaler:
    * Supports PyTorch DDP and FSDP implementations
    * Supports CPU offloaded tensors (as used in fully sharded data parallel [FSDP])
    * Supports the custom Mixed Precision loss dtype (fp16, bf16) that FSDP returns
    * Syncs inf/nan for scaled gradient tensors on any torch.device (where tensors are placed) across
      nodes

    Example::

        # Creates a ShardedGradScaler once at the beginning of training.
        scaler = ShardedGradScaler()

        for epoch in epochs:
            for input, target in data:
                optimizer.zero_grad()
                output = model(input)
                loss = loss_fn(output, target)

                # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
                scaler.scale(loss).backward()

                # scaler.step() first unscales gradients of the optimizer's params.
                # If gradients don't contain infs/NaNs, optimizer.step() is then called,
                # otherwise, optimizer.step() is skipped.
                scaler.step(optimizer)

                # Updates the scale for next iteration.
                scaler.update()

    See :class:`GradScaler` for explanation of scaling/unscaling and more use cases.
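
    Gradients can also be unscaled explicitly before :meth:`step`, for example to clip
    them at their original scale. A minimal sketch of that pattern (``model`` and
    ``max_norm`` are assumed to be defined by the caller)::

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        scaler.step(optimizer)
        scaler.update()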

    Args:
        init_scale (float, optional, default=2.**16):  Initial scale factor.
        growth_factor (float, optional, default=2.0):  Factor by which the scale is multiplied during
            :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
        backoff_factor (float, optional, default=0.5):  Factor by which the scale is multiplied during
            :meth:`update` if inf/NaN gradients occur in an iteration.
        growth_interval (int, optional, default=2000):  Number of consecutive iterations without inf/NaN gradients
            that must occur for the scale to be multiplied by ``growth_factor``.
        enabled (bool, optional):  If ``False``, disables gradient scaling. :meth:`step` simply
            invokes the underlying ``optimizer.step()``, and other methods become no-ops.
            Default: ``True``
        process_group (ProcessGroup, optional, default=torch.distributed.group.WORLD):
            process group for sharding
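
    The ``process_group`` limits the ranks across which :meth:`unscale_` synchronizes the
    found-inf/NaN flags. An illustrative sketch of passing one explicitly (assuming
    ``torch.distributed`` has already been initialized)::

        pg = torch.distributed.new_group(ranks=[0, 1])
        scaler = ShardedGradScaler(process_group=pg)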
    """

    def __init__(
        self,
        init_scale: float = 2.0**16,
        backoff_factor: float = 0.5,
        growth_factor: float = 2.0,
        growth_interval: int = 2000,
        enabled: bool = True,
        process_group: Optional[ProcessGroup] = dist.group.WORLD,
    ) -> None:
        super().__init__(
            init_scale=init_scale,
            backoff_factor=backoff_factor,
            growth_factor=growth_factor,
            growth_interval=growth_interval,
            enabled=enabled,
        )
        if self._enabled:
            self.process_group = process_group
            self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)

    @overload
    def scale(self, outputs: torch.Tensor) -> torch.Tensor:
        ...

    @overload
    def scale(self, outputs: List[torch.Tensor]) -> List[torch.Tensor]:
        ...

    @overload
    def scale(self, outputs: Tuple[torch.Tensor, ...]) -> Tuple[torch.Tensor, ...]:
        ...

    @overload
    def scale(self, outputs: Iterable[torch.Tensor]) -> Iterable[torch.Tensor]:
        ...

    def scale(
        self, outputs: Union[torch.Tensor, Iterable[torch.Tensor]]
    ) -> Union[torch.Tensor, Iterable[torch.Tensor]]:
        if not self._enabled:
            return outputs

        if isinstance(outputs, torch.Tensor):
            assert _is_supported_device(outputs)
            if self._scale is None:
                self._lazy_init_scale_growth_tracker(outputs.device)
            assert self._scale is not None
            scaled_output = outputs * self._scale.to(
                device=outputs.device, non_blocking=True
            )
            # Keep the return dtype equal to the outputs dtype: for the FSDP use
            # case the loss may be fp16/bf16 and must stay that way.
            return scaled_output.type(outputs.dtype)

        stash: List[_GeneralMultiDeviceReplicator] = []

        def apply_scale(val: Union[torch.Tensor, Iterable[torch.Tensor]]):
            if isinstance(val, torch.Tensor):
                assert _is_supported_device(val)
                if len(stash) == 0:
                    if self._scale is None:
                        self._lazy_init_scale_growth_tracker(val.device)
                    assert self._scale is not None
                    stash.append(_GeneralMultiDeviceReplicator(self._scale))
                scaled_val = val * stash[0].get(val.device)
                # Keep the return dtype equal to the input dtype (see above).
                return scaled_val.type(val.dtype)
            if isinstance(val, abc.Iterable):
                iterator = map(apply_scale, val)
                if isinstance(val, (list, tuple)):
                    return type(val)(iterator)
                return iterator
            raise ValueError("outputs must be a Tensor or an iterable of Tensors")

        return apply_scale(outputs)

    def _foreach_non_finite_check_and_unscale_cpu_(
        self,
        grads: Sequence[torch.Tensor],
        found_inf: torch.Tensor,
        inv_scale: torch.Tensor,
    ) -> None:
        if len(grads) == 0:
            return
        assert inv_scale.numel() == 1, "inv_scale must be a 1-element tensor."
        assert found_inf.numel() == 1, "found_inf must be a 1-element tensor."

        for grad in grads:
            if grad.device.type != "cpu":
                log.error(
                    "tensor device is %s but was expected to be ``cpu``",
                    grad.device,
                )
                raise ValueError(
                    "Gradients were found on a non-CPU device when expected to be on CPU."
                )
            if (
                torch.isinf(grad).any().item() is True
                or torch.isnan(grad).any().item() is True
            ):
                found_inf.data = torch.tensor([1.0])
                break
            else:
                grad.data *= inv_scale.item()

    def _unscale_grads_(
        self,
        optimizer: torch.optim.Optimizer,
        inv_scale: torch.Tensor,
        found_inf: torch.Tensor,
        allow_fp16: bool = True,
    ) -> Dict[torch.device, torch.Tensor]:
        per_device_inv_scale = _GeneralMultiDeviceReplicator(inv_scale)
        per_device_found_inf = _GeneralMultiDeviceReplicator(found_inf)

        # Group the gradients by device and dtype so that each group can be
        # checked and unscaled with a single foreach-style call.
        per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list))  # type: ignore[var-annotated]
        with torch.no_grad():
            for group in optimizer.param_groups:
                for param in group["params"]:
                    if param.grad is None:
                        continue
                    if (not allow_fp16) and param.grad.dtype == torch.float16:
                        raise ValueError("Attempting to unscale FP16 gradients.")
                    if param.grad.is_sparse:
                        # Coalescing scaled fp16 values can overflow, so the
                        # coalesce is carried out in fp32.
                        if param.grad.dtype is torch.float16:
                            param_grad_fp32 = param.grad.type(torch.float32).coalesce()
                            param.grad = param_grad_fp32.type(torch.float16)
                        to_unscale = param.grad._values()
                    else:
                        to_unscale = param.grad

                    per_device_and_dtype_grads[to_unscale.device][
                        to_unscale.dtype
                    ].append(to_unscale)

            for device, per_dtype_grads in per_device_and_dtype_grads.items():
                for grads in per_dtype_grads.values():
                    if grads[0].device.type == "cpu":
                        self._foreach_non_finite_check_and_unscale_cpu_(
                            grads,
                            per_device_found_inf.get(device),
                            per_device_inv_scale.get(device),
                        )
                    else:
                        torch._amp_foreach_non_finite_check_and_unscale_(
                            grads,
                            per_device_found_inf.get(device),
                            per_device_inv_scale.get(device),
                        )

        # Some ranks may hold no gradients at all; make sure a found_inf tensor
        # exists for this rank so that the later all_reduce has something to sync.
        if not per_device_found_inf._per_device_tensors:
            assert self._scale is not None
            per_device_found_inf.get(self._scale.device)
        return per_device_found_inf._per_device_tensors

    def unscale_(self, optimizer: torch.optim.Optimizer) -> None:
        if not self._enabled:
            return

        self._check_scale_growth_tracker("unscale_")

        optimizer_state = self._per_optimizer_states[id(optimizer)]

        if optimizer_state["stage"] is OptState.UNSCALED:
            raise RuntimeError(
                "unscale_() has already been called on this optimizer since the last update()."
            )
        elif optimizer_state["stage"] is OptState.STEPPED:
            raise RuntimeError("unscale_() is being called after step().")

        # FP32 division can be imprecise for certain compile options, so the
        # reciprocal is carried out in FP64.
        assert self._scale is not None
        inv_scale = self._scale.double().reciprocal().float()
        found_inf = torch.full(
            (1,), 0.0, dtype=torch.float32, device=self._scale.device
        )

        optimizer_state["found_inf_per_device"] = self._unscale_grads_(
            optimizer, inv_scale, found_inf, True
        )
        optimizer_state["stage"] = OptState.UNSCALED

        # Synchronize the detected inf/NaN flags across ranks of the process
        # group; CPU tensors are bounced through CUDA for the all_reduce.
        optimizer_state = self._per_optimizer_states[id(optimizer)]
        future_handles = []

        for v in optimizer_state["found_inf_per_device"].values():
            if v.device.type == "cpu":
                v_on_cuda = v.cuda()
                future_handles.append(
                    dist.all_reduce(
                        v_on_cuda, async_op=True, group=self.process_group
                    ).get_future()
                )
                v.copy_(v_on_cuda.cpu())
            else:
                future_handles.append(
                    dist.all_reduce(
                        v, async_op=True, group=self.process_group
                    ).get_future()
                )

        # Make sure that the calls are done before moving out.
        if future_handles:
            torch.futures.wait_all(future_handles)

    def _amp_update_scale_cpu_(self, found_inf: torch.Tensor) -> None:
        """
        If found_inf is 1.0 (True), then scale is multiplied by backoff_factor and growth_tracker is set to zero.
        Otherwise, scale is multiplied by the growth factor when the growth interval is reached.
        """
        assert self._scale is not None and self._growth_tracker is not None

        if found_inf.item() >= 1.0:
            self._scale *= self._backoff_factor
            self._growth_tracker.fill_(0)
        else:
            successful = self._growth_tracker + 1
            if successful == self._growth_interval:
                self._scale *= self._growth_factor
                self._growth_tracker.fill_(0)
            else:
                self._growth_tracker = successful

    def update(self, new_scale: Optional[Union[float, torch.FloatTensor]] = None) -> None:
        """
        Updates the scale factor.
        If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
        to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
        the scale is multiplied by ``growth_factor`` to increase it.
        Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
        used directly, it's used to fill GradScaler's internal scale tensor. So if
        ``new_scale`` was a tensor, later in-place changes to that tensor will not further
        affect the scale GradScaler uses internally.)

        Args:
            new_scale (float or :class:`torch.cuda.FloatTensor`, optional, default=None):  New scale factor.

        .. warning::
            :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
            been invoked for all optimizers used this iteration.
        """
        if not self._enabled:
            return

        _scale, _growth_tracker = self._check_scale_growth_tracker("update")

        if new_scale is not None:
            # Accept a new user-defined scale.
            if isinstance(new_scale, float):
                self._scale.fill_(new_scale)  # type: ignore[union-attr]
            else:
                reason = "new_scale should be a float or a 1-element torch.cuda.FloatTensor with requires_grad=False."
                assert isinstance(new_scale, torch.cuda.FloatTensor), reason  # type: ignore[attr-defined]
                assert new_scale.numel() == 1, reason
                assert new_scale.requires_grad is False, reason
                self._scale.copy_(new_scale)  # type: ignore[union-attr]
        else:
            # Consume the found_inf flags collected from all optimizers this
            # iteration and combine them into a single tensor on the scale's device.
            found_infs = [
                found_inf.to(device=_scale.device, non_blocking=True)
                for state in self._per_optimizer_states.values()
                for found_inf in state["found_inf_per_device"].values()
            ]

            assert len(found_infs) > 0, "No inf checks were recorded prior to update."

            found_inf_combined = found_infs[0]
            if len(found_infs) > 1:
                for i in range(1, len(found_infs)):
                    found_inf_combined += found_infs[i]

            if _scale.device.type == "cpu":
                self._amp_update_scale_cpu_(found_inf_combined)
            else:
                torch._amp_update_scale_(
                    self._scale,  # type: ignore[arg-type]
                    self._growth_tracker,  # type: ignore[arg-type]
                    found_inf_combined,
                    self._growth_factor,
                    self._backoff_factor,
                    self._growth_interval,
                )

        # To prepare for the next iteration, clear the per-optimizer data
        # collected this iteration.
        self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)