
    shQ                        S SK r S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKJ	r	  S SK
Jr  S SKJrJr  S SKJr  S SKJrJrJrJr  S SKJs  Js  Jr  S SKJs  Js  Js  Jr  S SKJ r J!r!J"r"  S SK#J$r$J%r%  S S	K&J'r'J(r(  S S
KJ)r)  S SK*J+r+  / SQr,Sr-Sr.\+" \/5      r0\ " S S5      5       r1 " S S5      r2 " S S\3\5      r4 " S S5      r5 " S S5      r6\ " S S5      5       r7S\34S jr8 " S S\ Rr                  5      r: " S S \:5      r;g)!    N)defaultdict)contextmanager)	dataclassfield)Enum)AnyCallableOptionalUnion)EventEventSourcerecord)prof
put_metric)ProcessFailureSignalException)RendezvousGracefulExitError)
get_logger)
WorkerSpecWorkerWorkerStateWorkerGroup	RunResultElasticAgentSimpleElasticAgentz!torchelastic/agent/terminal_statedefaultc                       \ rS rSr% Sr\\S'   \\S'   \R                  \S'   Sr
\\   \S'   Sr\\\S4   \S'   S	r\\S
'   Sr\\S'   Sr\\S'   Sr\\   \S'   Sr\\   \S'   Sr\\   \S'   S rS rSrg)r   /   a  Blueprint information about a particular type of worker.

For a given role, there must only exist a single worker spec.
Worker spec is expected to be homogeneous across all nodes (machine),
that is each node runs the same number of workers for a particular spec.

Args:
    role: user-defined role for the workers with this spec
    local_world_size: number local workers to run
    fn: (deprecated use entrypoint instead)
    entrypoint: worker function or command
    args: arguments to pass to ``entrypoint``
    rdzv_handler: handles rdzv for this set of workers
    max_restarts: number of max retries for the workers
    monitor_interval: monitor status of workers every ``n`` seconds
    master_port: fixed port to run the c10d store on rank 0
                 if not specified then will chose a random free port
    master_addr: fixed master_addr to run the c10d store on rank 0
                 if not specified then will chose hostname on agent rank 0
    redirects: redirect std streams to a file,
               selectively redirect for a particular
               local rank by passing a map
    tee: tees the specified std stream(s) to console + file,
         selectively tee for a particular local rank by passing a map,
         takes precedence over ``redirects`` settings.

rolelocal_world_sizerdzv_handlerNfn
entrypoint args   max_restartsg?monitor_intervalmaster_portmaster_addr
local_addrc                     U R                   S:  d   eU R                  S:  d   eU R                  (       a*  [        R                  " S[
        S9  U R                  U l        U R                  (       d   eg )Nr   zJWorkerSpec.fn will be deprecated, please use WorkerSpec.entrypoint instead)category)r    r(   r"   warningswarnDeprecationWarningr#   selfs    ~/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/distributed/elastic/agent/server/api.py__post_init__WorkerSpec.__post_init__Z   s^    $$q((($$q(((77MM<+
 #ggDO    c                     [        U R                  [        5      (       a)  [        R                  R                  U R                  5      $ U R                  c   eU R                  R                  $ )zGet the entry point name.

If the entrypoint is a function (e.g. ``Callable``) returns its ``__qualname__``
else if the entrypoint is a binary (e.g. ``str``), returns the binary name.
)
isinstancer#   strospathbasename__qualname__r1   s    r3   get_entrypoint_nameWorkerSpec.get_entrypoint_nameg   sN     doos++77##DOO44??...??///r6   )r#   )__name__
__module__r=   __firstlineno____doc__r9   __annotations__intrdzvRendezvousHandlerr"   r
   r	   r#   r   r%   tupler'   r(   floatr)   r*   r+   r4   r>   __static_attributes__r$   r6   r3   r   r   /   s    8 I(((!B!-1JhT)*1D%L#!e!!%K#%!%K#% $J$
0r6   r   c                   V    \ rS rSrSr/ SQr    SS\S\S\S\S\4
S	 jjrS
 rS r	Sr
g)r   t   a,  A worker instance.

Contrast this with ``WorkerSpec`` that represents the specifications of a
worker. A ``Worker`` is created from a ``WorkerSpec``. A ``Worker`` is to
a ``WorkerSpec`` as an object is to a class.

The ``id`` of the worker is interpreted
by the specific implementation of ``ElasticAgent``. For a local
agent, it could be the ``pid (int)`` of the worker, for a remote
agent it could be encoded as ``host:port (string)``.

Args:
    id (Any): uniquely identifies a worker (interpreted by the agent)
    local_rank (int): local rank of the worker
    global_rank (int): global rank of the worker
    role_rank (int): rank of the worker across all workers that have the same role
    world_size (int): number of workers (globally)
    role_world_size (int): number of workers that have the same role
id
local_rankglobal_rank	role_rank
world_sizerole_world_sizerO   rP   rQ   rR   rS   c                 N    S U l         Xl        X l        X0l        X@l        XPl        g NrM   )r2   rO   rP   rQ   rR   rS   s         r3   __init__Worker.__init__   s0       *
 !,
 (  * %4r6   c           
          SU R                    SU R                   SU R                   SU R                   SU R                   3
$ )Nzlocal_rank=z,global_rank=z,role_rank=z,world_size=z,role_world_size=rO   rP   rQ   rR   rS   r1   s    r3   __str__Worker.__str__   sJ    $//*-8H8H7I$..)doo5F 4 457	
r6   c                     [        U 5      $ rU   )r9   r1   s    r3   __repr__Worker.__repr__   s    4yr6   )rP   rN   rO   rQ   rS   rR   N)r_   r_   r_   )r@   rA   r=   rB   rC   	__slots__rE   rV   rZ   r]   rJ   r$   r6   r3   r   r   t   s\    (I !44 4 	4
 4 4B
r6   r   c                   P    \ rS rSrSrSrSrSrSrSr	Sr
S	r\S
S S\4S j5       rSrg)r      a  A state of the ``WorkerGroup``.

Workers in a worker group change state as a unit. If a single worker
in a worker group fails the entire set is considered failed::

  UNKNOWN - agent lost track of worker group state, unrecoverable
  INIT - worker group object created not yet started
  HEALTHY - workers running and healthy
  UNHEALTHY - workers running and unhealthy
  STOPPED - workers stopped (interrupted) by the agent
  SUCCEEDED - workers finished running (exit 0)
  FAILED - workers failed to successfully finish (exit !0)


A worker group starts from an initial ``INIT`` state,
then progresses to ``HEALTHY`` or ``UNHEALTHY`` states,
and finally reaches a terminal ``SUCCEEDED`` or ``FAILED`` state.

Worker groups can be interrupted and temporarily put into ``STOPPED`` state
by the agent. Workers in ``STOPPED`` state are scheduled to be restarted
in the near future by the agent. Some examples of workers being put into
``STOPPED`` state are:

1. Worker group failure|unhealthy observed
2. Membership change detected

When actions (start, stop, rdzv, retry, etc) on worker group fails
and results in the action being partially applied to the worker group
the state will be ``UNKNOWN``. Typically this happens on uncaught/unhandled
exceptions during state change events on the agent. The agent is not
expected to recover worker groups in ``UNKNOWN`` state and is better off
self terminating and allowing the job manager to retry the node.
UNKNOWNINITHEALTHY	UNHEALTHYSTOPPED	SUCCEEDEDFAILEDstatereturnc                 H    U [         R                  [         R                  1;   $ )zReturn the state of the Worker.

Returns:
     True if the worker state represents workers still running
     (e.g. that the process exists but not necessarily healthy).
)r   re   rf   )rj   s    r3   
is_runningWorkerState.is_running   s      ,,k.C.CDDDr6   r$   N)r@   rA   r=   rB   rC   rc   rd   re   rf   rg   rh   ri   staticmethodboolrm   rJ   r$   r6   r3   r   r      sQ     D GDGIGIFE- ED E Er6   r   c                   .    \ rS rSrSr/ SQrS\4S jrSrg)r      zA set of ``Worker`` instances.

The class defines a set of ``Worker`` instances for the given ``WorkerSpec`` managed by ``ElasticAgent``. Whether the worker
group contains cross instance workers or not depends on the implementation of the agent.
)specworkersstore
group_rankgroup_world_sizerj   r*   r)   rs   c                    Xl         [        U R                   R                  5       Vs/ s H  n[        US9PM     snU l        S U l        S U l        S U l        S U l        S U l	        [        R                  U l        g s  snf )N)rO   )rs   ranger    r   rt   ru   rv   rw   r*   r)   r   rd   rj   )r2   rs   is      r3   rV   WorkerGroup.__init__  sn    	6;DII<V<V6WX6W!,6WX 
 $ %%
 Ys   A<)rv   rw   r*   r)   rs   rj   ru   rt   N)	r@   rA   r=   rB   rC   r`   r   rV   rJ   r$   r6   r3   r   r      s    	I&Z &r6   r   c            	           \ rS rSrSr/ SQrS\S\S\4S jrS\	4S	 jr
\S
\	4S j5       r\S\4S j5       r\S\S\S\\\4   4S j5       rSrg)_RoleInstanceInfoi  zThe class is used by the agent to exchange the information with other agents.

The information is used to determine the rank of the workers that agent
manages in heterogeneous environments, where different agents can have
different number of workers.
r   rankr    r   r   r    c                 (    Xl         X l        X0l        g)zInitialize the agent class instance.

Args:
    role (str): user-defined role for the workers with this spec
    rank (int): the rank of the agent
    local_world_size (int): number of local workers to run
Nr~   )r2   r   r   r    s       r3   rV   _RoleInstanceInfo.__init__  s     		 0r6   rk   c                     U R                   U R                  U R                  S.n[        R                  " U5      R                  SS9$ )Nr~   UTF-8encoding)r   r   r    jsondumpsencode)r2   	dict_datas     r3   	serialize_RoleInstanceInfo.serialize*  s?    IIII $ 5 5
	
 zz)$++W+==r6   datac                 t    [         R                  " U R                  SS95      n[        US   US   US   5      $ )Nr   r   r   r   r    )r   loadsdecoder}   )r   r   s     r3   deserialize_RoleInstanceInfo.deserialize2  s?    JJt{{G{<=	 fy0)<N2O
 	
r6   c                     U R                   UR                   :X  a  U R                  UR                  -
  $ U R                   UR                   :  a  gg)N   r_   )r   r   )obj1obj2s     r3   compare_RoleInstanceInfo.compare9  s;    99		!99tyy((YY"r6   roles_infosc                 p    Su  p#[        U 5       H!  u  pEUR                  U:X  d  M  US:X  a  UnUnM#     X#4$ )N)r_   r_   r_   )	enumerater   )r   r   	start_idxend_idxidx	role_infos         r3   find_role_boundaries&_RoleInstanceInfo.find_role_boundariesB  sC    #	'4NC~~%? #I	 5
 ##r6   )r    r   r   N)r@   rA   r=   rB   rC   r`   r9   rE   rV   bytesr   ro   r   r   listrH   r   rJ   r$   r6   r3   r}   r}     s     5I
1S 
1 
1s 
1>5 > 
% 
 
 s   $$ $c $eCHo $ $r6   r}   c                   r    \ rS rSr% Sr\\S'   \" \S9r	\\
\4   \S'   \" \S9r\\
\4   \S'   S\4S jrS	rg
)r   iM  a  Return results of the worker executions.

Run results follow an "all-or-nothing" policy where the run is successful if and
only if ALL local workers managed by this agent complete successfully.

If the result is successful (e.g. ``is_failed() = False``) then the ``return_values``
field contains the outputs (return values) of the workers managed by THIS agent mapped
by their GLOBAL ranks. That is ``result.return_values[0]`` is the return value of
global rank 0.

.. note:: ``return_values`` are only meaningful for when the worker entrypoint
          is a function. Workers specified as a binary entrypoint do not canonically
          have a return value and the ``return_values`` field is meaningless and
          may be empty.

If ``is_failed()`` returns ``True`` then the ``failures`` field contains the
failure information, again, mapped by the GLOBAL rank of the worker that failed.

The keys in ``return_values`` and ``failures`` are mutually exclusive, that is,
a worker's final state can only be one of: succeeded, failed. Workers intentionally
terminated by the agent according to the agent's restart policy, are not represented
in either ``return_values`` nor ``failures``.
rj   )default_factoryreturn_valuesfailuresrk   c                 <    U R                   [        R                  :H  $ rU   )rj   r   ri   r1   s    r3   	is_failedRunResult.is_failedk  s    zz[////r6   r$   N)r@   rA   r=   rB   rC   r   rD   r   dictr   rE   r   r   r   rp   r   rJ   r$   r6   r3   r   r   M  sK    0 $)$$?M4S>?*/*EHd3&'E04 0r6   r   rk   c                  T    [         R                  " [         R                  " 5       5      $ rU   )socketgetfqdngethostnamer$   r6   r3   _get_fq_hostnamer   o  s    >>&,,.//r6   c                       \ rS rSrSr\R                  \4S\S\	4S jj5       r
\R                  \4S\S\4S jj5       rSrg)	r   is  a  An agent process responsible for managing one or more worker processes.

The worker processes are assumed to be regular distributed PyTorch scripts.
When the worker process is created by the agent, the agent provides the
necessary information for the worker processes to properly initialize
a torch process group.

The exact deployment topology and ratio of agent-to-worker is dependent
on the specific implementation of the agent and the user's job placement
preferences. For instance, to run a distributed training job on GPU with
8 trainers (one per GPU) one can:

1. Use 8 x single GPU instances, place an agent per instance, managing
   1 worker per agent.
2. Use 4 x double GPU instances, place an agent per instance, managing
   2 workers per agent.
3. Use 2 x quad GPU instances, place an agent per instance, managing
   4 workers per agent.
4. Use 1 x 8 GPU instance, place an agent per instance, managing
   8 workers per agent.

Usage
::

 group_result = agent.run()
  if group_result.is_failed():
    # workers failed
    failure = group_result.failures[0]
    logger.exception("worker 0 failed with exit code : %s", failure.exit_code)
  else:
    return group_result.return_values[0] # return rank 0's results

r   rk   c                     [         e)a4  Run the agent.

Supports retrying the worker group on failures up to ``max_restarts``.

Returns:
    The result of the execution, containing the return values or
    failure details for each worker mapped by the worker's global rank.

Raises:
    Exception - any other failures NOT related to worker process
NotImplementedErrorr2   r   s     r3   runElasticAgent.run  s
     "!r6   c                     [         e)zReturn the ``WorkerGroup`` for the given ``role``.

Note that the worker group is a mutable object and hence in a
multi-threaded/process environment it may change state.
Implementors are encouraged (but not required) to return
a defensive read-only copy.
r   r   s     r3   get_worker_groupElasticAgent.get_worker_group  
     "!r6   r$   N)r@   rA   r=   rB   rC   abcabstractmethodDEFAULT_ROLEr9   r   r   r   r   rJ   r$   r6   r3   r   r   s  s^     D 	* " "y " " 	+7 "S "K " "r6   r   c                      \ rS rSrSrS-S\S\4S jjr\4S\	S\
4S jjr\R                  S	\
S\\\4   4S
 j5       r\R                   S.S	\
S\SS4S jj5       r\R                  S	\
S\4S j5       r\R                  \R.                  S4S\R0                  S\SS4S jj5       r\S	\
SS4S j5       r\S\S\S\S\\   4S j5       r\S	\
SS4S j5       r\S	\
SS4S j5       r \\4S\	S\4S jj5       r!S\"4S jr#S\"4S jr$S\SS4S jr%S\S\S\	4S jr&\'S\	4S  j5       r(   S/S\	S!\)S\*\   S"\*\	   S#\*\   S\"4S$ jjr+S%\4S& jr,S' r-S.S(\4S) jjr.\4S\	S\4S* jjr/S+ r0S,r1g)0r   i  zAn ``ElasticAgent`` that manages one particular type of worker role.

An ``ElasticAgent`` that manages workers (``WorkerGroup``) for a single ``WorkerSpec``
such as one particular type of worker role.
rs   exit_barrier_timeoutc                     [        U5      U l        U R                  R                  R                  U l        S U l        X l        SU l        g Nr   )r   _worker_grouprs   r'   _remaining_restarts_store_exit_barrier_timeout_total_execution_time)r2   rs   r   s      r3   rV   SimpleElasticAgent.__init__  s>    (.#'#5#5#:#:#G#G %9"%&"r6   r   rk   c                     U R                   $ rU   )r   r   s     r3   r   #SimpleElasticAgent.get_worker_group  s    !!!r6   worker_groupc                     [         e)zStart ``worker_group.spec.local_world_size`` number of workers.

This is according to worker spec for the worker group .
Returns a map of ``local_rank`` to worker ``id``.
r   r2   r   s     r3   _start_workers!SimpleElasticAgent._start_workers  s
     "!r6   F
is_restartNc                     [         e)zStop all workers in the given worker group.

Implementors must deal with workers in all states defined by
``WorkerState``. That is, it must gracefully handle stopping
non-existent workers, unhealthy (stuck) workers, etc.
r   )r2   r   r   s      r3   _stop_workers SimpleElasticAgent._stop_workers  s
     "!r6   c                     [         e)znCheck on the workers for the ``worker_group``.

This function also returns the new state of the worker group.
r   r   s     r3   _monitor_workers#SimpleElasticAgent._monitor_workers  s
     "!r6   	death_sigc                     [         e)zClean up any resources that were allocated during the agent's work.

Args:
    death_sig: Signal to send to the child process, SIGTERM is default
r   )r2   r   r   s      r3   	_shutdownSimpleElasticAgent._shutdown  r   r6   c                 &   UR                   nU R                  S5         UR                  R                  5       nSSS5        WR                  nUR
                  nUR                  nUR                  =(       d    UR                  R                  nUR                  =(       d    UR                  R                  nX@l
        U R                  S5         U R                  XEXb5      n	SSS5        W	Ul        XAl        XQl        Xal        Xql        Xl	        UR                  U R                   -
  n
["        R%                  SUR&                  U
UUUUU	 Vs/ s H  oR(                  PM     snU	 Vs/ s H  oR*                  PM     snU	 Vs/ s H  oR,                  PM     snU	 Vs/ s H  oR.                  PM     snU	 Vs/ s H  oR                  PM     snS.5        g! , (       d  f       GN= f! , (       d  f       GN= fs  snf s  snf s  snf s  snf s  snf )zRun rendezvous for the workers specified by the worker spec.

Assigns workers a new global rank and world size.
Updates the rendezvous store for the worker group.

RENDEZVOUSNASSIGN_WORKER_RANKSa  [%(role)s] Rendezvous complete for workers. Result:
  restart_count=%(restart_count)s
  master_addr=%(master_addr)s
  master_port=%(master_port)s
  group_rank=%(group_rank)s
  group_world_size=%(group_world_size)s
  local_ranks=%(local_ranks)s
  role_ranks=%(role_ranks)s
  global_ranks=%(global_ranks)s
  role_world_sizes=%(role_world_sizes)s
  global_world_sizes=%(global_world_sizes)s
)r   restart_countr*   r)   rv   rw   local_ranks
role_ranksglobal_ranksrole_world_sizesglobal_world_sizes)rs   record_durationr!   next_rendezvousru   r   rR   r*   bootstrap_store_infor)   r   _assign_worker_ranksrt   rv   rw   r'   r   loggerinfor   rO   rQ   rP   rS   )r2   r   rs   	rdzv_inforu   rv   rw   r*   r)   rt   r   workers               r3   _rendezvousSimpleElasticAgent._rendezvous  s      !!,/))99;I 0^^
$// &&T)*H*H*T*T&&T)*H*H*T*T!!"78//#3G 9  '"",(8%#. #. ))D,D,DD
< 		!.**($4@GHf 1 1H>EFgF//gFBI J'!3!3' JJQ$R'%;%;'$RGN&OwV'8'8w&O	
5 0/ 98@  IF J$R&Os5   GG(G:&G?HH	7H
G%(
G7rv   rw   c           
         [         R                  R                  SS5      S:X  a"  X4R                  -  nX$R                  -  nUnUnGOSn	Sn
[	        UR
                  X$R                  5      nUR                  U	 U 3UR                  5       5        US:X  Ga[  UR                  [        U5       Vs/ s H  nSU 3PM
     sn5      nU Vs/ s H  n[        R                  U5      PM     nn[        S 5      nSnU H3  nUUR
                  ==   UR                  -  ss'   UUR                  -  nM5     Sn[        S 5      n/ n/ n[        U5       H  u  nnUR                  U
 U 35        UR                  [        R                  " UUUUR
                     UUR
                     /5      5        UUR                  -  nUUR
                  ==   UR                  -  ss'   M     UR!                  UU5        [        R"                  " UR                  U
 U 35      5      u  nnnn/ n[        UR                  5       H'  n[%        UUU-   UU-   UUS	9nUR                  U5        M)     U$ s  snf s  snf )
a  Determine proper ranks for worker processes.

Fast Path: when all workers have the same role and world size. We calculate
the global rank to be group_rank * group_world_size + local_rank. And the
`role_world_size` is the same as `global_world_size`. No TCP store is used in
this case. This is only enabled when users set the environment variable
`TORCH_ELASTIC_WORKER_IDENTICAL` to 1.

Time complexity: each worker O(1), overall O(1)

Slow Path: when workers have different roles and world sizes. We use the
the following algorithm:

1. Each agent writes its configuration(group_rank, group_world_size
   , num_workers) to the common store.
2. The rank 0 agent reads all the role_info from the store and
   determines each agents worker ranks.
3. Determine the global rank: the global rank of the workers is computed
   by cumulative sum of the local_world_size for all workers in front of it.
   For efficiency reasons each worker is assigned a base global rank
   such that it's workers are in the range [base_global_rank,
   base_global_rank + local_world_size).
4. Determine the role rank: The role rank is determined using the algorithms
   in the point 3 with the exception that the ranks are calculated with
   respect to the role name.
5. The rank 0 agent writes the assigned ranks to the store.
6. Each agent reads the assigned ranks from the store.

Time complexity: each worker O(1), rank0 O(n), overall O(n)
TORCH_ELASTIC_WORKER_IDENTICAL01ztorchelastic/role_info/ztorchelastic/assigned_ranks/r   c                      gr   r$   r$   r6   r3   <lambda>9SimpleElasticAgent._assign_worker_ranks.<locals>.<lambda>e      r6   c                      gr   r$   r$   r6   r3   r   r   l  r   r6   rY   )r:   environgetr    r}   r   setr   	multi_getry   r   r   r   appendr   r   	multi_setr   r   )r2   ru   rv   rw   rs   global_world_sizebase_global_rankbase_role_rankrS   ROLE_INFO_PREFIXASSIGNED_RANKS_PREFIXagent_role_inforz   role_infos_bytes
info_bytes
role_infos
role_sizesglobal_sizer   r   keysvaluesrt   rO   r   s                            r3   r   'SimpleElasticAgent._assign_worker_ranks*  s   F ::>>:C@CG 03H3H H),A,AA-N/O8$B!/		:'<'<O II)*:,79R9R9TU Q#(??<ABR<ST<Sq.qc2<ST$ 
 '7&6
 &11*=&6  
 )3
!+Iy~~.)2L2LL.9#=#==K ", $% (3
$-j$9LAyKK#8"9! =>MM

 0 + *9>> : *9>> :		 %	(B(BB$y~~.)2L2LL. %:  f- 

599(='>zl%KLM !  5 56J%,z9(:5, /F NN6" 7 m Us   .I#I(c                 R   UR                   R                  n[        R                  SU5        U R	                  U5        [        R                  SU5        U R                  U5      nUR                  5        H  u  pEUR                  U   nXVl        M     [        R                  Ul        g)ax  Start a fresh set of workers for the worker_group.

Essentially, a rendezvous followed by a ``start_workers``.
The caller should first call ``_stop_workers()`` to stop running workers
prior to calling this method.

Optimistically sets the state of the worker group that
just started as ``HEALTHY`` and delegates the actual monitoring
of state to ``_monitor_workers()`` method
z [%s] Rendezvous'ing worker groupz[%s] Starting worker groupN)rs   r   r   r   r   r   itemsrt   rN   r   re   rj   )r2   r   r   
worker_idsrO   w_idr   s          r3   _initialize_workers&SimpleElasticAgent._initialize_workers  s       %%6= 	&0$7((6
 * 0 0 2J!))*5FI !3 )00r6   c                     UR                   R                  n[        R                  SU5        U R	                  USS9  [
        R                  Ul        U R                  U5        g)zCRestart (stops, rendezvous, starts) all local workers in the group.z[%s] Stopping worker groupT)r   N)	rs   r   r   r   r   r   rg   rj   r  )r2   r   r   s      r3   _restart_workers#SimpleElasticAgent._restart_workers  sS       %%0$7<D9(00  .r6   c                 T   [         R                  " 5       nSn U R                  U5      n[        [         R                  " 5       U-
  5      U l        U R                  U5        U R                  U5        UU(       d  U R                  5         [        [         R                  " 5       U-
  5      U l        $ ! [         a   n[        R                  SU5         S nAOSS nAf[         aC  n[        R                  SUR                  5        U R                  UR                  5        Sne S nAff = f U(       d  U R                  5         [        [         R                  " 5       U-
  5      U l        g ! U(       d  U R                  5         [        [         R                  " 5       U-
  5      U l        f = f)NFz Rendezvous gracefully exited: %sz/Received %s death signal, shutting down workersT)time	monotonic_invoke_runrE   r   _record_metrics_record_worker_eventsr   r   r   r   r   warningsigval)r2   r   
start_timeshutdown_calledresultes         r3   r   SimpleElasticAgent.run  s8   ^^%
 %	L%%d+F),T^^-=
-J)KD&  (&&v. # ),T^^-=
-J)KD& + 	?KK:A>> 	NNLahhWNN188$"O		 ? # ),T^^-=
-J)KD& # ),T^^-=
-J)KD&s6   AB1 1
D&;CE( D&#>D!!D&&E( (?F'c                 f    U R                  S[        R                  [        R                  " 5       S9$ )Nri   )rj   source	raw_error)_construct_eventr   AGENT	traceback
format_excr1   s    r3   get_event_failed#SimpleElasticAgent.get_event_failed  s2    $$$$**, % 
 	
r6   c                 >    U R                  S[        R                  S9$ )Nrh   )rj   r!  )r#  r   r$  r1   s    r3   get_event_succeeded&SimpleElasticAgent.get_event_succeeded  s&    $$$$ % 
 	
r6   r  c           	      P   U R                   R                   H  nUR                  R                  UR                  5      nU R                  X!5      nU(       a   [        R                  " UR                  5      OS n[        U R                  U[        R                  X%5      5        M     g rU   )r   rt   r   r   rP   _get_worker_stater   r   error_file_datar   r#  r   WORKER)r2   r  r   failurerj   r"  s         r3   r  (SimpleElasticAgent._record_worker_events  sw    ((00Foo))&*<*<=G//?E?F

7#:#:;DI4((0B0BFVW	 1r6   r   c                 V   UR                   R                  UR                  5      nUR                  [        R
                  [        R                  1;   a  U(       d  gU(       d  UR                  UR                  ;   a  UR                  R                  $ [        SUR                   35      e)N
TERMINATEDzUnknown worker: )
r   r   rP   rj   r   rf   ri   r   value
ValueError)r2   r   r  r0  s       r3   r-  $SimpleElasticAgent._get_worker_state  s~    //%%f&8&89<<K11;3E3EFFw**f.B.BB<<%%%/0B0B/CDEEr6   rj   c           
   #   Z  #    [         R                  " 5       n S v   [         R                  " 5       nX2-
  S-  n[        U R                  U[        R
                  US95        g ! [         R                  " 5       nX2-
  S-  n[        U R                  U[        R
                  US95        f = f7f)Ni  )rj   r!  duration_ms)r  perf_counterr   r#  r   r$  )r2   rj   r  end_timer8  s        r3   r   "SimpleElasticAgent.record_duration  s     &&(
		((*H#0D8K%%(9(9{ &  ((*H#0D8K%%(9(9{ & s   B+A" AB+"AB((B+r!  r"  r8  c                 f   U R                   nUR                  nUR                  UR                  5       S.nU(       aR  UR                  4US'   UR
                  4US'   UR                  4US'   UR                  n	[        UR                  5      n
OS n	S n
[        R                  " U5      nUR                  R                  5       U	UR                  U
UR                  [!        5       UU R"                  UR                  R%                  5       UUUR&                  U R(                  -
  US.n[+        SU 3X,S9$ )N)rw   entry_pointrO   rQ   rS   )run_idrP   rv   	worker_idr   hostnamerj   total_run_timerdzv_backendr"  metadataagent_restartsr8  ztorchelastic.worker.status.)r!  rC  )r   rs   rw   r>   rO   rQ   rS   rP   r9   rN   r   r   r!   
get_run_idrv   r   r   r   get_backendr'   r   r   )r2   rj   r!  r   r"  r8  wgrs   mdrP   r?  md_strrC  s                r3   r#  #SimpleElasticAgent._construct_event  s*    ww " 3 3335
  & 1 13B|%//1B{O%+%;%;$=B ! ,,KFIIIKIB''224&--"II(*"88 --99;""//$2J2JJ&
 )%1&
 	
r6   group_resultsc                    UR                  5       nU R                  U5        U R                  R                  nU R                  UR
                  :g  n[        SUR                   S3S5        U R                  SU(       + =(       a    U5        U R                  SU(       + =(       a    U(       + 5        U R                  SU=(       a    U5        U R                  SU=(       a    U(       + 5        g )Nworkers.z
.run_totalr   run_success_with_retriesrun_success_no_retriesrun_failed_with_retriesrun_failed_no_retries)	r   _record_flakiness_metricr   rs   r   r'   r   r   _record_metric_with_condition)r2   rK  r   rs   restarts_happeneds        r3   r  "SimpleElasticAgent._record_metrics0  s    !++-	%%i0!!&& 448I8IIXdii[
3Q7**&I(K:K	
 	**$)m&M<M8M	
 	**%y'F5F	
 	**#Y%H7H3H	
r6   c                     U R                   R                  nU(       a  [        SUR                   SU 3S5        g [        SUR                   SU 3S5        g )NrM  .r   r   )r   rs   r   r   )r2   metric_name	conditionrs   s       r3   rS  0SimpleElasticAgent._record_metric_with_conditionC  sL    !!&&$))Ak];Q?$))Ak];Q?r6   r   c                     U(       a  SnO;U R                   R                  nSSU R                  S-   -  UR                  S-   -  -
  nU R                   R                  n[	        SUR
                   S3[        U5      5        g )Ng      Y@r   rM  z
.flakiness)r   rs   r   r'   r   r   rE   )r2   r   	flakinessrs   s       r3   rR  +SimpleElasticAgent._record_flakiness_metricJ  sx    I%%**D)A)AA)E F!!A%! I !!&&Xdii[
3S^Dr6   c                    U R                   R                  nUR                  n[        R	                  SXR                  5       5        U R                  U R                   5        UR                  nUR                  n U R                   R                  [        R                  :w  d   e[        R                  " U5        U R                  U R                   5      nUR                  nX`R                   l	        [        SU S3U R                   5        [        SU SUR"                  R%                  5        3S5        U[        R&                  :X  a3  [        R	                  SUU R(                  5        U R+                  5         U$ U[        R,                  [        R.                  1;   a  U R                   S:  ah  [        R	                  SUUR"                  U R                   UR0                  5        U =R                   S-  sl        U R3                  U R                   5        OU R5                  U R                   5        [        R.                  U R                   l	        U$ U[        R6                  :X  a`  UR9                  5       nU R                   R:                  nUS:  a3  [        R	                  S	UUU5        U R3                  U R                   5        O[=        S
U SUR"                   S35      eGMm  )Nz([%s] starting workers for entrypoint: %srM  z.remaining_restartsrW  r   zW[%s] worker group successfully finished. Waiting %s seconds for other agents to finish.r   zD[%s] Worker group %s. %s/%s attempts left; will restart worker groupzH[%s] Detected %s new nodes from group_rank=%s; will restart worker group[z] Worker group in z state)r   rs   r   r   r   r>   r  r(   r!   rj   r   rd   r  sleepr   r   r   namelowerrh   r   _exit_barrierrf   ri   r'   r  r   re   num_nodes_waitingrv   	Exception)	r2   r   rs   r(   r!   
run_resultrj   rd  rv   s	            r3   r  SimpleElasticAgent._invoke_runV  s    !!&&yy6>V>V>X	
 	  !3!3400((%%++{/?/????JJ'(..t/A/ABJ$$E',$$':;T=U=UV$q)9)9);(<=qA---F..	 ""$!!;00+2D2DEE++a/KK5 

00)) ,,1,))$*<*<=&&t'9'9:/:/A/AD&&,%%+---$0$B$B$D!!//::
$q(KK4 )" ))$*<*<=v/

|6B g r6   c                 L   [         R                  SU R                  R                  U R                  5        [
        R
                  " 5       n [        R                  " U R                  U R                  R                  [        U R                  S9  [         R                  S[
        R
                  " 5       U-
  5        g! [         a&  n[         R                  SUR                  5        e SnAf[         a/    [         R                  S[
        R
                  " 5       U-
  5         gf = f)a#  
Define a barrier that keeps the agent process alive until all workers finish.

Wait for ``exit_barrier_timeout`` seconds for all agents to finish
executing their local workers (either successfully or not). This
acts as a safety guard against user scripts that terminate at different
times.
zOLocal worker group finished (%s). Waiting %s seconds for other agents to finish)ru   rR   
key_prefixbarrier_timeoutz2Done waiting for other agents. Elapsed: %s secondszGot termination signal: %sNz2Error waiting on exit barrier. Elapsed: %s seconds)r   r   r   rj   r   r  
store_utilbarrierr   rw   _TERMINAL_STATE_SYNC_IDr   r  r  re  	exception)r2   startr  s      r3   rc   SimpleElasticAgent._exit_barrier  s     	<$$&&		
 			kk-->>2 $ : :	 KKD		e#  	NN7B 	D		e#	s   A/B< <
D#!C''9D#"D#)r   r   r   r   r   )i,  )F)NNN)2r@   rA   r=   rB   rC   r   rI   rV   r   r9   r   r   r   r   r   rE   r   r   rp   r   r   r   signalSIGTERMSignalsr   r   r   r   r   r   r  r  r   r   r'  r*  r  r-  r   r   r   r
   r#  r  rS  rR  r  rc  rJ   r$   r6   r3   r   r     s   'Z 'u ' ,8 "S "K " 	"; "4S> " " 	<A	"'	"59	"		" 	" 	"[ "Y " " 	*0..U""FJ"	" " 
;
 ;
 ;
 
;
~ 
i!$i8;iCMi	fi 
iZ 
1 1 1 
1< 
/[ /T / 
/ 
* L Ly L 
L,
% 

U 
XI X$ XF F	 Fc F S  " $(#''+)
)
 )
  	)

 C=)
 e_)
 
)
V
Y 
&@
E$ 
E '3 C Cy CJ"r6   r   )<r   r   r:   rq  r   r  r%  r.   collectionsr   
contextlibr   dataclassesr   r   enumr   typingr   r	   r
   r   $torch.distributed.elastic.rendezvousdistributedelastic
rendezvousrF   %torch.distributed.elastic.utils.storeutilsru   rk   torch.distributed.elastic.eventsr   r   r   !torch.distributed.elastic.metricsr   r   )torch.distributed.elastic.multiprocessingr   r   r   'torch.distributed.elastic.utils.loggingr   __all__rm  r   r@   r   r   r   r9   r   r   r}   r   r   ABCr   r   r$   r6   r3   <module>r     s     	      # % (  1 1 3 3 : : G G > U L > > 	H	 A0 A0 A0HG GT3E#t 3El& &@6$ 6$r 0 0 0B0# 0;"377 ;"|L Lr6   