
import warnings

import torch
import torch.distributed.algorithms.model_averaging.averagers as averagers


class PostLocalSGDOptimizer(torch.optim.Optimizer):
    r"""
    Wraps an arbitrary :class:`torch.optim.Optimizer` and runs `post-local SGD <https://arxiv.org/abs/1808.07217>`_.
    This optimizer runs the local optimizer at every step.
    After the warm-up stage, it averages parameters periodically after the local optimizer is applied.

    Args:
        optim: The local optimizer.
        averager: A model averager instance to run post-localSGD algorithm.

    Example::

        >>> # xdoctest: +SKIP("undefined variables")
        >>> import torch
        >>> import torch.distributed as dist
        >>> import torch.distributed.algorithms.model_averaging.averagers as averagers
        >>> import torch.nn as nn
        >>> from torch.distributed.optim import PostLocalSGDOptimizer
        >>> from torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook import (
        >>>   PostLocalSGDState,
        >>>   post_localSGD_hook,
        >>> )
        >>>
        >>> model = nn.parallel.DistributedDataParallel(
        >>>    module, device_ids=[rank], output_device=rank
        >>> )
        >>>
        >>> # Register a post-localSGD communication hook.
        >>> state = PostLocalSGDState(process_group=None, subgroup=None, start_localSGD_iter=100)
        >>> model.register_comm_hook(state, post_localSGD_hook)
        >>>
        >>> # Create a post-localSGD optimizer that wraps a local optimizer.
        >>> # Note that ``warmup_steps`` used in ``PostLocalSGDOptimizer`` must be the same as
        >>> # ``start_localSGD_iter`` used in ``PostLocalSGDState``.
        >>> local_optim = torch.optim.SGD(params=model.parameters(), lr=0.01)
        >>> opt = PostLocalSGDOptimizer(
        >>>     optim=local_optim,
        >>>     averager=averagers.PeriodicModelAverager(period=4, warmup_steps=100)
        >>> )
        >>>
        >>> # In the first 100 steps, DDP runs global gradient averaging at every step.
        >>> # After 100 steps, DDP runs gradient averaging within each subgroup (intra-node by default),
        >>> # and post-localSGD optimizer runs global model averaging every 4 steps after applying the local optimizer.
        >>> for step in range(0, 200):
        >>>    opt.zero_grad()
        >>>    loss = loss_fn(output, labels)
        >>>    loss.backward()
        >>>    opt.step()
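        >>>
        >>> # Illustrative sketch only: ``opt.state_dict()`` additionally records the
        >>> # model averager's step (see ``state_dict``/``load_state_dict`` below), so a
        >>> # restored run does not repeat the warm-up phase. ``CHECKPOINT`` is a
        >>> # hypothetical path, like the other undefined names in this example.
        >>> torch.save(opt.state_dict(), CHECKPOINT)
        >>> opt.load_state_dict(torch.load(CHECKPOINT))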
optimaveragerc                 R    Xl         U R                   R                  U l        X l        g N)r   param_groupsr   )selfr   r   s      ڃ/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/distributed/optim/post_localSGD_optimizer.py__init__PostLocalSGDOptimizer.__init__:   s    
 JJ33     c                 .    U R                   R                  $ r	   )r   stater   s    r   r   PostLocalSGDOptimizer.state?   s    zzr   c                 6    U R                   R                  5       $ r	   )r   __repr__r   s    r   r   PostLocalSGDOptimizer.__repr__C   s    zz""$$r   c                 l    U R                   R                  5       nU R                  R                  US'   U$ )z
This is the same as :class:`torch.optim.Optimizer` :meth:`state_dict`,
but adds an extra entry to record model averager's step to the checkpoint
to ensure reload does not cause unnecessary warm up again.
step)r   
state_dictr   r   )r   optim_state_dicts     r   r    PostLocalSGDOptimizer.state_dictF   s2      ::002#'==#5#5 r   c                     U R                   R                  U5        SU;   a  US   U R                  l        g[        R
                  " S5        SU R                  l        g)a'  
        This is the same as :class:`torch.optim.Optimizer` :meth:`load_state_dict`,
        but also restores the model averager's step value to the one
        saved in the provided ``state_dict``.

        If there is no ``"step"`` entry in ``state_dict``,
        it will emit a warning and initialize the model averager's step to 0.
        """
        self.optim.load_state_dict(state_dict)
        if "step" in state_dict:
            self.averager.step = state_dict["step"]
        else:
            warnings.warn(
                "Loaded state dict does not contain a step counter for an averager. "
                "Setting step counter to 0."
            )
            self.averager.step = 0

    def step(self):
        r"""
        Performs a single optimization step (parameter update).
        """
        # Run the local optimizer first; the averager then averages parameters only
        # after the warm-up stage and only on period boundaries.
        self.optim.step()
        self.averager.average_parameters(params=self.param_groups)

    def zero_grad(self, set_to_none: bool = True):
        self.optim.zero_grad(set_to_none=set_to_none)

    def add_param_group(self, param_group):
        self.optim.add_param_group(param_group)