
    Ph,              
          d dl Z d dlZd dlmZmZ d dlmZmZmZm	Z	m
Z
mZmZ d dlmc mc mc mZ d dlmZmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lm Z  d d
l!m"Z" d dl#m$Z$ g dZ% e$e&      Z'e G d d             Z( G d d      Z)deee*df   de	e   de*fdZ+de dee
e*   e
e,   f   fdZ-de(deee*df   de	e   dee,ef   fdZ.y)    N)	dataclassfield)AnyCallableDictListOptionalTupleUnion)eventsmetrics)
WorkerSpec)LocalElasticAgent)SignalExceptionStd)ChildFailedError)RendezvousParameters)parse_rendezvous_endpoint)
get_logger)LaunchConfigelastic_launchlaunch_agentc                      e Zd ZU dZeed<   eed<   eed<   dZeed<   dZeed<   dZ	eed	<   d
Z
eed<    ee      Zeeef   ed<   dZeed<   dZeed<   dZeed<   dZeed<   dZee   ed<   dZee   ed<   ej2                  Zeeeeef   f   ed<   ej2                  Zeeeeef   f   ed<    ee      Zeeef   ed<   dZee   ed<   d Zy)r   a	  
    Creates a rendezvous config.

    Args:
        min_nodes: Minimum amount of nodes that the user function will
                        be launched on. Elastic agent ensures that the user
                        function start only when the min_nodes amount enters
                        the rendezvous.
        max_nodes: Maximum amount of nodes that the user function
                        will be launched on.
        nproc_per_node: On each node the elastic agent will launch
                            this amount of workers that will execute user
                            defined function.
        rdzv_backend: rdzv_backend to use in the rendezvous (zeus-adapter, etcd).
        rdzv_endpoint: The endpoint of the rdzv sync. storage.
        rdzv_configs: Key, value pair that specifies rendezvous specific configuration.
        rdzv_timeout: Legacy argument that specifies timeout for the rendezvous. It is going
            to be removed in future versions, see the note below. The default timeout is 900 seconds.
        run_id: The unique run id of the job (if not passed a unique one will be
                deduced from run environment - flow workflow id in flow - or auto generated).
        role: User defined role of the worker (defaults to "trainer").
        max_restarts: The maximum amount of restarts that elastic agent will conduct
                    on workers before failure.
        monitor_interval: The interval in seconds that is used by the elastic_agent
                        as a period of monitoring workers.
        start_method: The method is used by the elastic agent to start the
                    workers (spawn, fork, forkserver).
        log_dir: base log directory where log files are written. If not set,
                one is created in a tmp dir but NOT removed on exit.
        redirects: configuration to redirect stdout/stderr to log files.
                Pass a single ``Std`` enum to redirect all workers,
                or a mapping keyed by local_rank to selectively redirect.
        tee: configuration to "tee" stdout/stderr to console + log file.
        metrics_cfg: configuration to initialize metrics.
        local_addr: address of the local node if any. If not set, a lookup on the local
                machine's FQDN will be performed.
    ..note:
        `rdzv_timeout` is a legacy argument that will be removed in future.
        Set the timeout via `rdzv_configs['timeout']`

    	min_nodes	max_nodesnproc_per_node run_iddefault_rolerolerdzv_endpointetcdrdzv_backend)default_factoryrdzv_configsrdzv_timeout   max_restarts   monitor_intervalspawnstart_methodNlog_dirlog_line_prefix_template	redirectsteemetrics_cfg
local_addrc                     d}| j                   dk7  r| j                   | j                  d<   y d| j                  vr|| j                  d<   y y )Ni  r&   timeout)r'   r%   )selfdefault_timeouts     iC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/distributed/launcher/api.py__post_init__zLaunchConfig.__post_init__[   sN    "+/+<+<Di(d///+:Di( 0    ) __name__
__module____qualname____doc__int__annotations__r   strr    r!   r#   r   dictr%   r   r   r'   r)   r+   floatr-   r.   r	   r/   r   NONEr0   r   r1   r2   r3   r9    r:   r8   r   r      s	   (T NNFCD#M3L##(#>L$sCx.>L#L# e L#!GXc]!.2hsm2,/HHIuS$sCx.()4&)hhCsDcN"	#."'"=Kc3h= $J$;r:   r   c                   2    e Zd ZdZdedeeedf   fdZd Z	y)r   a  
    Launches an torchelastic agent on the container that invoked the entrypoint.

        1. Pass the ``entrypoint`` arguments as non ``kwargs`` (e.g. no named parameters)/
           ``entrypoint`` can be a function or a command.
        2. The return value is a map of each worker's output mapped
           by their respective global rank.

    Usage

    ::

    def worker_fn(foo):
        # ...

    def main():
        # entrypoint is a function.
        outputs = elastic_launch(LaunchConfig, worker_fn)(foo)
        # return rank 0's output
        return outputs[0]

        # entrypoint is a command and ``script.py`` is the python module.
        outputs = elastic_launch(LaunchConfig, "script.py")(args)
        outputs = elastic_launch(LaunchConfig, "python")("script.py")
    config
entrypointNc                      || _         || _        y N)_config_entrypoint)r6   rG   rH   s      r8   __init__zelastic_launch.__init__~   s    
 %r:   c                 V    t        | j                  | j                  t        |            S rJ   )r   rK   rL   list)r6   argss     r8   __call__zelastic_launch.__call__   s    DLL$*:*:DJGGr:   )
r;   r<   r=   r>   r   r   r   rA   rM   rQ   rE   r:   r8   r   r   c   s0    4&& (C-.&Hr:   r   rH   rP   returnc                     t        | t              r| j                  S t        | t              r(| t        j
                  k(  rt        d |D        d      S | S y)a  Retrieve entrypoint name with the rule:
    1. If entrypoint is a function, use ``entrypoint.__qualname__``.
    2. If entrypoint is a string, check its value:
        2.1 if entrypoint equals to ``sys.executable`` (like "python"), use the first element from ``args``
            which does not start with hifen letter (for example, "-u" will be skipped).
        2.2 otherwise, use ``entrypoint`` value.
    3. Otherwise, return empty string.
    c              3   2   K   | ]  }|d    dk7  s|  yw)r   -NrE   ).0args     r8   	<genexpr>z'_get_entrypoint_name.<locals>.<genexpr>   s     >A#s   r   )
isinstancer   r;   rA   sys
executablenext)rH   rP   s     r8   _get_entrypoint_namer]      sL     *h'"""	J	$'>>CCr:   rdzv_parametersc                     | j                   dk7  ry| j                  }|j                         }|st        d      t	        |d      \  }}|dk(  rt        d| d      ||fS )Nstatic)NNzKEndpoint is missing in endpoint. Try to add --master-addr and --master-portr&   )default_portzport is missing in endpoint: z. Try to specify --master-port)backendendpointstrip
ValueErrorr   )r^   rc   master_addrmaster_ports       r8   _get_addr_and_portrh      s     (*''H~~HY
 	
  9PRSKb+H:5ST
 	
 %%r:   rG   c                 x   | j                   sDt        t        j                         j                        }t
        j                  d|       || _         t        ||      }t
        j                  d|| j                  | j                  | j                  | j                   | j                  | j                  | j                  | j                  | j                   | j"                  | j$                  d       t'        d
| j                  | j                  | j                   | j                  | j                  | j(                  d| j                  }t+        |      \  }}t-        | j.                  | j                  |t1        |      t3        j4                  |      | j                  | j                   | j6                  | j8                  ||| j(                        }t;        || j<                  | j"                  | j>                        }	d}
	 tA        jB                  tA        jD                  | j$                               |	jG                         }tI        jJ                  |	jM                                |jO                         rtQ        ||jR                        |jT                  |
r|jV                  jY                          S S # tP        $ r  tZ        $ r' d	}
tI        jJ                  |	j]                                 t^        $ r% tI        jJ                  |	j]                                 w xY w# |
r|jV                  jY                          w w xY w)Nz3config has no run_id, generated a random run_id: %sa  Starting elastic_operator with launch configs:
  entrypoint       : %(entrypoint)s
  min_nodes        : %(min_nodes)s
  max_nodes        : %(max_nodes)s
  nproc_per_node   : %(nproc_per_node)s
  run_id           : %(run_id)s
  rdzv_backend     : %(rdzv_backend)s
  rdzv_endpoint    : %(rdzv_endpoint)s
  rdzv_configs     : %(rdzv_configs)s
  max_restarts     : %(max_restarts)s
  monitor_interval : %(monitor_interval)s
  log_dir          : %(log_dir)s
  metrics_cfg      : %(metrics_cfg)s
)rH   r   r   r   r   r#   r!   r%   r)   r+   r.   r2   )rb   rc   r   r   r   r3   )r    local_world_sizerH   rP   rdzv_handlerr)   r+   r0   r1   rf   rg   r3   )specr-   r.   r/   T)namefailuresFrE   )0r   rA   uuiduuid4r?   loggerwarningr]   infor   r   r   r#   r!   r%   r)   r+   r.   r2   r   r3   rh   r   r    tuplerdzv_registryget_rendezvous_handlerr0   r1   r   r-   r/   r   initialize_metricsMetricsConfigrunr   recordget_event_succeeded	is_failedr   rn   return_valuesrk   shutdownr   get_event_failed	Exception)rG   rH   rP   r   entrypoint_namer^   rf   rg   rl   agentshutdown_rdzvresults               r8   r   r      s   
 ==TZZ\%%&LfU*:t<O
KK	1 *))))$33mm"//#11"//"// & 7 7~~!--	
< + ##%%}}""""$$ 

O  2/BK[[..4["99/J((00""JJ$$D ((!'!@!@	E M )""7#8#89K9K#LMe//12
 #$ 
 ## &&(     e,,./ e,,./ &&( s   =BJ2 2A%LL L9)/rZ   ro   dataclassesr   r   typingr   r   r   r   r	   r
   r   -torch.distributed.elastic.rendezvous.registrydistributedelastic
rendezvousregistryru   torch.distributed.elasticr   r   *torch.distributed.elastic.agent.server.apir   :torch.distributed.elastic.agent.server.local_elastic_agentr   )torch.distributed.elastic.multiprocessingr   r   0torch.distributed.elastic.multiprocessing.errorsr   $torch.distributed.elastic.rendezvousr   *torch.distributed.elastic.rendezvous.utilsr   'torch.distributed.elastic.utils.loggingr   __all__r;   rq   r   r   rA   r]   r?   rh   r   rE   r:   r8   <module>r      s
     ( D D D E E 5 A X J M E P >
<	H	 C; C; C;L$H $HNhT)*26s),&)&
8C=(3-'(&&m)m)hT)*m) s)m) 
#s(^	m)r:   