a
    h,                     @   s`  d dl Z d dlZd dlmZmZ d dlmZmZmZm	Z	 d dl
m  m  m  mZ d dlmZmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lm Z  d dl!m"Z" g dZ#e"e$Z%eG dd dZ&G dd dZ'e	ee(df e)e e(dddZ*ee+ee( ee, f dddZ-e&e	ee(df e)e e.e,ef dddZ/dS )    N)	dataclassfield)AnyCallableOptionalUnion)eventsmetrics)
WorkerSpec)LocalElasticAgent)DefaultLogsSpecs	LogsSpecsSignalException)ChildFailedError)RendezvousParameters)parse_rendezvous_endpoint)
get_logger)LaunchConfigelastic_launchlaunch_agentc                   @   s  e Zd ZU dZeed< eed< eed< dZee ed< dZ	e
ed< d	Ze
ed
< dZe
ed< dZe
ed< eedZee
ef ed< dZeed< dZeed< dZeed< dZe
ed< dZee
 ed< eedZee
e
f ed< dZee
 ed< dZe
ed< dd ZdS )r   a	  
    Creates a rendezvous config.

    Args:
        min_nodes: Minimum amount of nodes that the user function will
                        be launched on. Elastic agent ensures that the user
                        function start only when the min_nodes amount enters
                        the rendezvous.
        max_nodes: Maximum amount of nodes that the user function
                        will be launched on.
        nproc_per_node: On each node the elastic agent will launch
                            this amount of workers that will execute user
                            defined function.
        rdzv_backend: rdzv_backend to use in the rendezvous (zeus-adapter, etcd).
        rdzv_endpoint: The endpoint of the rdzv sync. storage.
        rdzv_configs: Key, value pair that specifies rendezvous specific configuration.
        rdzv_timeout: Legacy argument that specifies timeout for the rendezvous. It is going
            to be removed in future versions, see the note below. The default timeout is 900 seconds.
        run_id: The unique run id of the job (if not passed a unique one will be
                deduced from run environment - flow workflow id in flow - or auto generated).
        role: User defined role of the worker (defaults to "trainer").
        max_restarts: The maximum amount of restarts that elastic agent will conduct
                    on workers before failure.
        monitor_interval: The interval in seconds that is used by the elastic_agent
                        as a period of monitoring workers.
        start_method: The method is used by the elastic agent to start the
                    workers (spawn, fork, forkserver).
        metrics_cfg: configuration to initialize metrics.
        local_addr: address of the local node if any. If not set, a lookup on the local
                machine's FQDN will be performed.
        local_ranks_filter: ranks for which to show logs in console. If not set, show from all.
        event_log_handler: name of the event logging handler as registered in
          `elastic/events/handlers.py <https://docs.pytorch.org/docs/stable/elastic/events.html>`_.


    .. note::
        `rdzv_timeout` is a legacy argument that will be removed in future.
        Set the timeout via `rdzv_configs['timeout']`

    	min_nodes	max_nodesnproc_per_nodeN
logs_specs run_idZdefault_rolerolerdzv_endpointZetcdrdzv_backend)default_factoryrdzv_configsrdzv_timeout   max_restartsg?monitor_intervalspawnstart_methodlog_line_prefix_templatemetrics_cfg
local_addrnullevent_log_handlerc                 C   sF   d}| j dkr| j | jd< nd| jvr0|| jd< | jd u rBt | _d S )Ni  r!   timeout)r"   r    r   r   )selfZdefault_timeout r/   L/var/www/auris/lib/python3.9/site-packages/torch/distributed/launcher/api.py__post_init___   s    



zLaunchConfig.__post_init__)__name__
__module____qualname____doc__int__annotations__r   r   r   r   strr   r   r   r   dictr    r   r"   r$   r%   floatr'   r(   r)   r*   r,   r1   r/   r/   r/   r0   r   "   s&   
)r   c                   @   s2   e Zd ZdZeeeedf dddZdd Z	dS )r   a  
    Launches an torchelastic agent on the container that invoked the entrypoint.

        1. Pass the ``entrypoint`` arguments as non ``kwargs`` (e.g. no named parameters)/
           ``entrypoint`` can be a function or a command.
        2. The return value is a map of each worker's output mapped
           by their respective global rank.

    Usage

    ::

    def worker_fn(foo):
        # ...

    def main():
        # entrypoint is a function.
        outputs = elastic_launch(LaunchConfig, worker_fn)(foo)
        # return rank 0's output
        return outputs[0]

        # entrypoint is a command and ``script.py`` is the python module.
        outputs = elastic_launch(LaunchConfig, "script.py")(args)
        outputs = elastic_launch(LaunchConfig, "python")("script.py")
    N)config
entrypointc                 C   s   || _ || _d S N)_config_entrypoint)r.   r;   r<   r/   r/   r0   __init__   s    zelastic_launch.__init__c                 G   s   t | j| jt|S r=   )r   r>   r?   list)r.   argsr/   r/   r0   __call__   s    zelastic_launch.__call__)
r2   r3   r4   r5   r   r   r   r8   r@   rC   r/   r/   r/   r0   r   k   s
   r   )r<   rB   returnc                 C   sF   t | tr| jS t | tr>| tjkr8tdd |D dS | S ndS dS )a  Retrieve entrypoint name with the rule:
    1. If entrypoint is a function, use ``entrypoint.__qualname__``.
    2. If entrypoint is a string, check its value:
        2.1 if entrypoint equals to ``sys.executable`` (like "python"), use the first element from ``args``
            which does not start with hifen letter (for example, "-u" will be skipped).
        2.2 otherwise, use ``entrypoint`` value.
    3. Otherwise, return empty string.
    c                 s   s   | ]}|d  dkr|V  qdS )r   -Nr/   ).0argr/   r/   r0   	<genexpr>       z'_get_entrypoint_name.<locals>.<genexpr>r   N)
isinstancer   r2   r8   sys
executablenext)r<   rB   r/   r/   r0   _get_entrypoint_name   s    


rN   )rdzv_parametersrD   c                 C   sX   | j dkrdS | j}| }|s(tdt|dd\}}|dkrPtd| d||fS )NZstatic)NNzKEndpoint is missing in endpoint. Try to add --master-addr and --master-portr!   )default_portzport is missing in endpoint: z. Try to specify --master-port)backendendpointstrip
ValueErrorr   )rO   rR   master_addrmaster_portr/   r/   r0   _get_addr_and_port   s    

rW   )r;   r<   rB   rD   c                 C   s  | j s&tt j}td| || _ t||}td|| j	| j
| j| j | j| j| j| j| j| jj| j| jd tf | j| j| j | j	| j
| jd| j}t|\}}t| j| j|t|t|| j| j||| j| jd}t|| j| j| jd}	d}
zz^t !t "| j |	# }t$%|	& | j |' r>t(||j)d|j*W W |
rV|j+,  S  t(yl    Y nR t-y   d	}
t$%|	. | j  Y n( t/y   t$%|	. | j  Y n0 W |
r|j+,  n|
r|j+,  0 d S )
Nz3config has no run_id, generated a random run_id: %sa/  Starting elastic_operator with launch configs:
  entrypoint         : %(entrypoint)s
  min_nodes          : %(min_nodes)s
  max_nodes          : %(max_nodes)s
  nproc_per_node     : %(nproc_per_node)s
  run_id             : %(run_id)s
  rdzv_backend       : %(rdzv_backend)s
  rdzv_endpoint      : %(rdzv_endpoint)s
  rdzv_configs       : %(rdzv_configs)s
  max_restarts       : %(max_restarts)s
  monitor_interval   : %(monitor_interval)s
  log_dir            : %(log_dir)s
  metrics_cfg        : %(metrics_cfg)s
  event_log_handler  : %(event_log_handler)s
)r<   r   r   r   r   r   r   r    r$   r%   Zlog_dirr)   r,   )rQ   rR   r   r   r   r*   )r   Zlocal_world_sizer<   rB   rdzv_handlerr$   r%   rU   rV   r*   r,   )specr   r'   r(   T)namefailuresF)0r   r8   uuiduuid4r6   loggerwarningrN   infor   r   r   r   r   r    r$   r%   r   Zroot_log_dirr)   r,   r   r*   rW   r
   r   tuplerdzv_registryZget_rendezvous_handlerr   r'   r(   r	   Zinitialize_metricsZMetricsConfigrunr   recordZget_event_succeededZ	is_failedr   r[   Zreturn_valuesrX   shutdownr   Zget_event_failed	Exception)r;   r<   rB   r   Zentrypoint_namerO   rU   rV   rY   ZagentZshutdown_rdzvresultr/   r/   r0   r      s    
 



r   )0rK   r\   Zdataclassesr   r   typingr   r   r   r   Z-torch.distributed.elastic.rendezvous.registryZdistributedZelasticZ
rendezvousregistryrb   Ztorch.distributed.elasticr   r	   Z*torch.distributed.elastic.agent.server.apir
   Z:torch.distributed.elastic.agent.server.local_elastic_agentr   Z)torch.distributed.elastic.multiprocessingr   r   r   Z0torch.distributed.elastic.multiprocessing.errorsr   Z$torch.distributed.elastic.rendezvousr   Z*torch.distributed.elastic.rendezvous.utilsr   Z'torch.distributed.elastic.utils.loggingr   __all__r2   r^   r   r   r8   rA   rN   ra   r6   rW   r9   r   r/   r/   r/   r0   <module>	   s8   H(
