
    sh$?                        S SK r S SKJrJr  S SKJr  S SKJrJrJ	r	J
r
  S SKrS SKJr  S SKJrJrJr  S SKJr  \ R*                  " \5      rS\\\4   4S jrS	\R6                  S\4S
 jr " S S\5      r " S S\5      rg)    N)abcdefaultdict)Iterable)AnyOptionaloverloadUnion)_MultiDeviceReplicator
GradScalerOptState)ProcessGroupreturnc                  (    [         R                  0 S.$ )N)stagefound_inf_per_device)r   READY     ~/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/distributed/fsdp/sharded_grad_scaler.py_refresh_per_optimizer_stater      s    ^^R@@r   tensorc                     U R                   =(       d;    U R                  R                  SSSSS[        R                  R                  5       4;   $ )Nxlacpuhpumtiaxpu)is_cudadevicetypetorch_C_get_privateuse1_backend_name)r   s    r   _is_supported_devicer$      sH    >> V]]//..04  r   c                   >    \ rS rSrSrS\R                  SS4S jrSrg)_GeneralMultiDeviceReplicator   z{
Lazily serves tensor to request device. This class extends
_MultiDeviceReplicator to allow support for "cpu" as a device.
master_tensorr   Nc                 B    [        U5      (       d   eXl        0 U l        g N)r$   master_per_device_tensors)selfr(   s     r   __init__&_GeneralMultiDeviceReplicator.__init__%   s     #M2222#EG r   )r,   r+   )	__name__
__module____qualname____firstlineno____doc__r!   Tensorr.   __static_attributes__r   r   r   r&   r&      s!    
Hell Ht Hr   r&   c                     ^  \ rS rSrSrSSSSSS\R                  R                  4S	\S
\	S\	S\	S\
S\S\\   SS4U 4S jjjr\S\R"                  S\R"                  4S j5       r\S\\R"                     S\\R"                     4S j5       r\S\\R"                  S4   S\\R"                  S4   4S j5       r\S\\R"                     S\\R"                     4S j5       rS\\R"                  \\R"                     4   S\\R"                  \\R"                     4   4S jr S$S\R.                  R0                  S\R"                  S\R"                  S\S\\R4                  \R"                  4   4
S jjrS\R.                  R0                  SS4S jrS\R"                  SS4S  jrS%S!\\\	\R"                  4      SS4S" jjrS#rU =r $ )&ShardedGradScaler+   a  
ShardedGradScaler helps perform gradient scaling in a shard aware manner. It extends
functionality from GradScaler:
* Supports Pytorch DDP and FSDP implementations
* Support CPU offloaded tensors (as used in fully sharded data parallel[FSDP])
* Supports the custom Mixed Precision loss dtype (fp16, bf16) that FSDP returns
* Sync inf/nan for scaled gradient tensors on any torch.device (where tensors are placed) across
nodes

Example::

    # Creates a ShardedGradScaler once at the beginning of training.
    scaler = ShardedGradScaler()

    for epoch in epochs:
        for input, target in data:
            optimizer.zero_grad()
            output = model(input)
            loss = loss_fn(output, target)

            # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
            scaler.scale(loss).backward()

            # scaler.step() first unscales gradients of the optimizer's params.
            # If gradients don't contain infs/NaNs, optimizer.step() is then called,
            # otherwise, optimizer.step() is skipped.
            scaler.step(optimizer)

            # Updates the scale for next iteration.
            scaler.update()

See :class:`GradScaler` for explanation of scaling/unscaling and more use cases.

Args:
    init_scale (float, optional, default=2.**16):  Initial scale factor.
    growth_factor (float, optional, default=2.0):  Factor by which the scale is multiplied during
        :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
    backoff_factor (float, optional, default=0.5):  Factor by which the scale is multiplied during
        :meth:`update` if inf/NaN gradients occur in an iteration.
    growth_interval (int, optional, default=2000):  Number of consecutive iterations without inf/NaN gradients
        that must occur for the scale to be multiplied by ``growth_factor``.
    enabled (bool, optional):  If ``False``, disables gradient scaling. :meth:`step` simply
        invokes the underlying ``optimizer.step()``, and other methods become no-ops.
        Default: ``True``
    process_group (ProcessGroup, optional, default=torch.distributed.group.WORLD):
        process group for sharding
cudag      @g      ?g       @i  Tr   
init_scalebackoff_factorgrowth_factorgrowth_intervalenabledprocess_groupr   Nc           	         > [         TU ]  UUUUUUS9  U R                  (       a  Xpl        [	        [
        5      U l        g g )N)r;   r<   r=   r>   r?   )superr.   _enabledr@   r   r   _per_optimizer_states)	r-   r   r;   r<   r=   r>   r?   r@   	__class__s	           r   r.   ShardedGradScaler.__init__\   sK     	!)'+ 	 	
 ==!.)45Q)RD& r   outputsc                     g r*   r   r-   rG   s     r   scaleShardedGradScaler.scaler   s    <?r   c                     g r*   r   rI   s     r   rJ   rK   u   s    HKr   .c                     g r*   r   rI   s     r   rJ   rK   x   s    TWr   c                     g r*   r   rI   s     r   rJ   rK   {   s    PSr   c                   ^ ^^ T R                   (       d  U$ [        U[        R                  5      (       a  [	        U5      (       d   eT R
                  c  T R                  UR                  5        T R
                  c   eUT R
                  R                  UR                  SS9-  nUR                  UR                  5      $ / mS[        [        R                  [        [        R                     4   4UU U4S jjmT" U5      $ )NTr   non_blockingvalc                 z  > [        U [        R                  5      (       a  [        U 5      (       d   e[	        T5      S:X  a[  TR
                  c  TR                  U R                  5        TR
                  c   eTR                  [        TR
                  5      5        U TS   R                  U R                  5      -  nUR                  U R                  5      $ [        U [        R                  5      (       a:  [        TU 5      n[        U [         ["        45      (       a  [        U 5      " U5      $ U$ [%        S5      e)Nr   z2outputs must be a Tensor or an iterable of Tensors)
isinstancer!   r5   r$   len_scale_lazy_init_scale_growth_trackerr   appendr&   getr    dtyper   r   maplisttuple
ValueError)rR   
scaled_valiteratorapply_scaler-   stashs      r   ra   ,ShardedGradScaler.scale.<locals>.apply_scale   s    #u||,,+C0000u:?{{*<<SZZH;;222LL!>t{{!KL 58<<

#;;
 "syy11#s||,,{C0cD%=119X..QRRr   )rC   rT   r!   r5   r$   rV   rW   r   tor    rZ   r	   r   )r-   rG   scaled_outputra   rb   s   `  @@r   rJ   rK   ~   s     }}Ngu||,,'0000{{"44W^^D;;***#dkknn~~D '5 ' M !%%gmm4457	SU5<<%,,1G#GH 	S 	S( 7##r   	optimizer	inv_scale	found_inf
allow_fp16c           
         [        U5      n[        U5      n[        S 5      n[        R                  " 5          UR                   GHI  nUS    GH;  n	U	R
                  c  M  U(       d3  U	R
                  R                  [        R                  :X  a  [        S5      eU	R
                  R                  (       a  U	R
                  R                  [        R                  L a[  U	R
                  R                  [        R                  5      R                  5       n
U
R                  [        R                  5      U	l        U	R
                  R                  5       nOU	R
                  nX{R                     UR                     R                  U5        GM>     GML     UR!                  5        HR  u  pUR#                  5        H9  n[        R$                  " UUR'                  U5      UR'                  U5      5        M;     MT     S S S 5        UR(                  (       d4  U R*                  c   eUR'                  U R*                  R                  5        UR(                  $ ! , (       d  f       N_= f)Nc                       [        [        5      $ r*   )r   r\   r   r   r   <lambda>3ShardedGradScaler._unscale_grads_.<locals>.<lambda>   s	    T9Jr   paramsz%Attempting to unscale FP16 gradients.)r&   r   r!   no_gradparam_groupsgradrZ   float16r^   	is_sparser    float32coalesce_valuesr   rX   itemsvalues*_amp_foreach_non_finite_check_and_unscale_rY   r,   rV   )r-   rf   rg   rh   ri   per_device_inv_scaleper_device_found_infper_device_and_dtype_gradsgroupparamparam_grad_fp32
to_unscaler   per_dtype_gradsgradss                  r   _unscale_grads_!ShardedGradScaler._unscale_grads_   s     =YG<YG &11J%K"]]_"//"8_Ezz) &EJJ,<,<,M()PQQzz++
 !::++u}}<.3jjooemm.L.U.U.WO)8)=)=emm)LEJ%*ZZ%7%7%9
%*ZZ
./@/@A"((fZ() - 0. ,F+K+K+M',335EDD,008,008 6 ,N1 D $77;;*** $$T[[%7%78#777K _s   GI
I c           	         U R                   (       d  g U R                  S5        U R                  [        U5         nUS   [        R
                  L a  [        S5      eUS   [        R                  L a  [        S5      eU R                  c   eU R                  R                  5       R                  5       R                  5       n[        R                  " SS[        R                  U R                  R                  S9nU R!                  XUS5      US	'   [        R
                  US'   U R                  [        U5         n/ n/ n/ nUS	   R#                  5        H  nU R$                  S
:w  a  UR                  R&                  S
:X  an  UR)                  U5        UR+                  U R$                  5      nUR)                  U5        UR)                  [,        R.                  " USU R0                  S95        M  UR)                  [,        R.                  " USU R0                  S95        M     U H  n	U	R3                  5         M     U(       a  [        R4                  " Xg5        g g )Nunscale_r   zMunscale_() has already been called on this optimizer since the last update().z(unscale_() is being called after step().)   g        )rZ   r   Tr   r   )async_opr}   )rC   _check_scale_growth_trackerrD   idr   UNSCALEDRuntimeErrorSTEPPEDrV   double
reciprocalfloatr!   fullrt   r   r   rx   _devicer    rX   rd   dist
all_reducer@   wait_foreach_copy_)
r-   rf   optimizer_staterg   rh   worksfound_inf_on_cpusfound_inf_on_devicesfound_inf_on_deviceworks
             r   r   ShardedGradScaler.unscale_   s   }}((444R	]C7#x'8'88_  W%)9)99IJJ {{&&&KK&&(335;;=	JJ#U]]4;;3E3E
	 372F2F)T3
./ $,#4#4  44R	]C!()?@GGII||u$)9)9)>)>%)G!((3&/ll4<<&@#$++,?@OO+d$BTBT OOIDDVDVW J DIIK   !2I r   c                    U R                   b  U R                  c   eUR                  5       S:  a;  U =R                   U R                  -  sl         U R                  R	                  S5        gU R                  S-   nX R
                  :X  a;  U =R                   U R                  -  sl         U R                  R	                  S5        gX l        g)z
If found_inf is 1.0 (True), then scale is multiplied by backoff_factor and growth_tracker is set to zero.
Otherwise, scale is multiplied by the growth factor when the growth interval is reached.
Ng      ?r   r   )rV   _growth_trackeritem_backoff_factorfill__growth_interval_growth_factor)r-   rh   
successfuls      r   _amp_update_scale_cpu_(ShardedGradScaler._amp_update_scale_cpu_  s    
 {{&4+?+?+KKK>>s"KK4///K  &&q)--1J222t222$$**1-'1$r   	new_scalec           
      ,   U R                   (       d  gU R                  S5      u  p#Ub  [        U[        5      (       a  U R                  R                  U5        GOSnUR                  R                  U R                  :X  d   U5       eUR                  5       S:X  d   U5       eUR                  SL d   U5       eU R                  R                  U5        GO%U R                  R                  5        VVs/ s H8  nUS   R                  5         H  nUR                  UR                  SS9PM     M:     nnn[        U5      S	:  d   S
5       eUS	   n[        U5      S:  a#  [!        S[        U5      5       H
  n	XU	   -  nM     UR                  R                  S:X  a  U R#                  U5        OM[$        R&                  " U R                  U R(                  UU R*                  U R,                  U R.                  5        [1        [2        5      U l        gs  snnf )a)  
Updates the scale factor.
If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
the scale is multiplied by ``growth_factor`` to increase it.
Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
used directly, it's used to fill GradScaler's internal scale tensor. So if
``new_scale`` was a tensor, later in-place changes to that tensor will not further
affect the scale GradScaler uses internally.)
Args:
    new_scale (float or :class:`torch.Tensor`, optional, default=None):  New scale factor.
.. warning::
    :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
    been invoked for all optimizers used this iteration.
Nupdateznew_scale should be a float or a 1-element torch.cuda.FloatTensor or                     torch.FloatTensor with requires_grad=False.r   Fr   TrP   r   z,No inf checks were recorded prior to update.r   )rC   r   rT   r   rV   r   r   r    r   numelrequires_gradcopy_rD   rx   rd   rU   ranger   r!   _amp_update_scale_r   r   r   r   r   r   )
r-   r   rV   r   reasonstaterh   
found_infsfound_inf_combinedis
             r   r   ShardedGradScaler.update'  s   " }}"&"B"B8"L )U++!!),A  !'',,<DfD< (A-5v5- ..%7??7!!), "77>>@@E!&'=!>!E!E!GI FMME!G F@   z?Q&V(VV&!+A:"q#j/2A&Q-7& 3 }}!!U*++,>?((KK((&''(()) &11M%N"5s   3?H)r   rD   r@   )Tr*   )!r0   r1   r2   r3   r4   r   r}   WORLDstrr   intboolr   r   r.   r   r!   r5   rJ   r\   r]   r   r	   optim	Optimizerdictr   r   r   r   r   r6   __classcell__)rE   s   @r   r8   r8   +   sJ   .d # #"#04

0@0@SS S 	S
 S S S  -S 
S S, ?U\\?ell? ?KT%,,/KD4FK KWU5<<#45W%c@Q:RW WSXell3S8NS S)$U\\8ELL+AAB)$	u||Xell33	4)$`  68;;((68 <<68 <<	68
 68 
ellELL(	)68p2J%++"7"7 2JD 2Jh2 2 2$@Ouell/B)C D @OPT @O @Or   r8   ) loggingcollectionsr   r   collections.abcr   typingr   r   r   r	   r!   torch.distributeddistributedr   torch.amp.grad_scalerr
   r   r   "torch.distributed.distributed_c10dr   	getLoggerr0   loggerr   r   r   r5   r   r$   r&   r8   r   r   r   <module>r      s     ( $ 1 1    N N ; 
		8	$Ad38n A $ 	H$: 	H|O
 |Or   