
    sh                       S SK r S SKrS SKJrJr  S SKJrJrJrJ	r	  S SK
r
S SKJr  S SKJs  Js  Jr  S SKJr  S SKJs  Jr  S SKJr  S SKJr  S SKJr  S SKJrJrJ r J!r!J"r"J#r#J$r$J%r%  S SK&J'r'J(r(J)r)J*r*J+r+  S S	K,J-r-  S S
K.J/r/  S SK0J1r1J2r2J3r3J4r4  S SK5J6r7  \Rp                  " \95      r:Sr; " S S\5      r<S\Rz                  S\>\?\   \?\Rz                     4   4S jr@S\Rz                  S\?\   4S jrAS\S\Rz                  S\B4S jrC\S\S\Rz                  S\4S j5       rDS\S\Rz                  4S jrE\S\S\Rz                  SS4S j5       rF\S\SS4S j5       rG\S\S\(S\
R                  S\
R                  SS4
S j5       rI\S\S\(S \B4S! j5       rJS\	\(   SS4S" jrKS\	\(   SS4S# jrL\S\S\	\(   S$\S\Rz                  S%\>\S&4   S'\M\N\4   S\>\>\S&4   \M\N\4   4   4S( j5       rO\S\S\	\(   SS4S) j5       rP\S\S\	\(   S*\S\Rz                  S+\S,\S\4S- j5       rQ\S\S\(SS4S. j5       rR\S\S\Rz                  SS4S/ j5       rS\S\S\
R&                  Rz                  S\>\\4   4S0 j5       rT\S\S\Rz                  S\(S1\S\4
S2 j5       rU\\
R                  " 5       S\S\(S1\4S3 j5       5       rWS\S\(S1\SS4S4 jrXS\S\(S1\SS4S5 jrY\S\S\(S\B4S6 j5       rZ\S\S\(SS4S7 j5       r[\S\S8\
R                  S\>\
R                  \
R                  4   4S9 j5       r]\S\S\(S:\
R                  S\
R                  4S; j5       r^\S\S\(SS4S< j5       r_\S\S\(S=\
R                  4S> j5       r`\S\S\(S=\
R                  4S? j5       ra\S\(4S@ j5       rbSA\
R                  SB\cSS4SC jrd\S\S:\
R                  SD\'4SE j5       reSF\
R                  SG\
R                  SS4SH jrf\S\S\B4SI j5       rg\\
R                  " 5       S\S\Rz                  4SJ j5       5       rh\S\SS4SK j5       ri\S\SS4SL j5       rj\S\SM\	\(   SN\<SS4SO j5       rk\S\SM\(S\(4SP j5       rlS\(S\*4SQ jrm\S\S\Rz                  SS4SR j5       rn\S\S\Rz                  SS4SS j5       ro\S\S\Rz                  4ST j5       rp\S\S\Rz                  SU\S\(SS4
SV j5       rqS\S\	\(   SS4SW jrrS\S\	\(   S%\>\S&4   S'\M\N\4   SS4
SX jrs\S\S\Rz                  SS4SY j5       rtSZ\
R                  S\
R                  S\
R                  4S[ jruS\\?\(   4S] jrv\S\S\Rz                  S\>\?\
R                     \?\	\
R                        4   4S^ j5       rx\S\S_\?\N   S\?\
R                     4S` j5       rySa\?\
R                     Sb\?\	\
R                        Sc\
R                  SS4Sd jr{g)e    N)autoEnum)AnyCallableno_type_checkOptional)Variable)register_multi_grad_hook)LOW_PRECISION_HOOKS)_assert_in_training_states
_FSDPState_get_module_fsdp_state_is_composable_log_post_backward_hook_no_dispatch_record_streamclean_tensor_nameTrainingState)FlatParameterFlatParamHandleHandleShardingStrategyHandleTrainingState'RESHARD_AFTER_FORWARD_HANDLE_STRATEGIES)HYBRID_SHARDING_STRATEGIES)BackwardPrefetch)_apply_to_tensors_cast_forward_inputs	_p_assert
_to_kwargs)_pytree)_use_orig_paramslimit_all_gathers_use_full_prec_in_evalc                   0    \ rS rSr\" 5       r\" 5       rSrg)_PrefetchMode5    N)__name__
__module____qualname____firstlineno__r   BACKWARDFORWARD__static_attributes__r&       y/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/distributed/fsdp/_runtime_utils.pyr$   r$   5   s    vHfGr.   r$   modulereturnc                 
   / n/ n[        5       nU R                  5        H_  n[        U5      nUc  M  XS;  d  M  [        XT5      (       d  M,  UR	                  U5        UR                  U5        UR                  U5        Ma     X4$ )a  
Returns a tuple containing:
1. A list of the root ``_FSDPState`` instances in the module tree rooted at
``module`` without any duplicates and following the ``module.modules()``
traversal order (which is assumed to be depth-first).
2. A corresponding list of the root modules owning the states in the first
list.

This is similar to :func:`_get_fsdp_states_with_modules` except that we
must call :func:`_is_fsdp_root` to force a lazy initialization to determine
the FSDP root in case lazy initialization has not yet happened.
)setmodulesr   _is_fsdp_rootaddappend)r0   fsdp_root_statesfsdp_root_modulesvisited_fsdp_states	submoduleoptional_states         r/   "_get_fsdp_root_states_with_modulesr=   :   s     *,)++.5^^%	/	:&9n88##N3##N3$$Y/ & ..r.   c                      [        U 5      u  pU$ )z/See :func:`_get_fsdp_root_states_with_modules`.)r=   )r0   r8   _s      r/   _get_fsdp_root_statesr@   Z   s    <VDr.   statec                 N    [        X5        U R                  c   eU R                  $ )z
Returns if ``state`` corresponds to that of an FSDP root.

For the wrapper code path, ``state`` and ``module`` should be the same. For
the non-wrapper code path, ``state`` should be ``module`` 's state.
)
_lazy_init_is_rootrA   r0   s     r/   r5   r5   `   s&     u>>%%%>>r.   root_modulec                    U R                   b  gU R                  R                  5       (       d  [        S5      eSU l         [	        U [
        R                  /5        [        X5        [        R                  " U5      U l
        [        U 5        [        X5      u  p#[        X#U R                  5        U R                  R!                  XU R"                  5        [%        X5        U $ )a  
Performs initialization lazily, typically right before the first forward
pass. The laziness is needed to ensure that the parameter device/dtype and
the FSDP hierarchy have finalized. This method's actual logic only runs on
the root FSDP instance, which performs initialization for all non-root FSDP
instances to avoid partial initialization.

For the non-composable code path, ``state`` and ``root_module`` should be
the same, namely the FSDP instance itself.
Nz(FSDP does not support CPU only executionT)rD   _device_handleis_availableRuntimeErrorr   r   IDLE%_check_flat_params_on_expected_devicetraversal_utils_get_fsdp_states_all_fsdp_states_init_streams'_get_buffers_and_dtypes_for_computation!_cast_buffers_to_dtype_and_devicecompute_device_exec_order_datainitprocess_group"_share_state_and_init_handle_attrs)rA   rF   buffersbuffer_dtypess       r/   rC   rC   m   s     ~~!,,.. EFF ENu}'9'9&:;)%=,==kJE%DUXG%ge>R>RS	E4G4GH&u:Lr.   c                    [         R                  " S5      n[        R                  " U5       H  nUR                  (       dT  UR
                  R                  U R                  :w  a0  [        SUR
                  R                   SU R                   S35      eUR                  (       d  M{  UR
                  R                  U:w  d  M  [        SUR
                  R                   S35      e   g)z
Checks that all ``FlatParameter``s in ``module`` 's tree managed by
``state`` are on the expected device for *lazy initialization*.
cpuz6An FSDP-managed module unexpectedly has parameters on z". Make sure to move the module to z before training.zOAn FSDP-managed module with parameter CPU offloading enabled has parameters on zG. Make sure to not move the module from CPU when offloading parameters.N)torchdevicerM   _get_fsdp_handles_offload_params
flat_paramrS   rJ   )rA   r0   
cpu_devicehandles       r/   rL   rL      s    
 e$J!33F;&&!!((E,@,@@H$$++,,N''((9; 
 ###(9(9(@(@J(N%%+%6%6%=%=$> ?KL  <r.   
root_statec                    U R                   nU(       a  UR                  5         0 n[         H  n[        5       X4'   M     U R                  R
                  U l        U R                   H  nUR                  n[        US5      (       a  [        S5      eUR                  SL=(       a    [        S UR                   5       5      Ul        UR                  (       d  Mt  [        R                  R                  S5        M     U R                    GHG  n[         H7  n[#        [        Xd5      SU 35        X4   R%                  ['        Xd5      5        M9     X`L a  MK  [#        UR(                  SL =(       d    UR(                  (       + S5        SUl        U R*                  Ul        U R,                  Ul        U R.                  Ul        U R0                  Ul        U R2                  Ul        U R                  Ul        U R4                  Ul        UR6                  b  U R2                  UR6                  l        UR                   nU(       d  GM7  UR                  5         GMJ     UR;                  5        H%  u  pG[=        U5      S	:w  d  M  [?        S
U SU 35      e   g)z
Shares data structure state from the ``root_state`` to all FSDP states in
``root_module`` 's module tree, and initializes handle attributes. These
are done together to require a single loop over the states.
_in_backward_optimizerszDFSDP optimizer in backward only supported with use_orig_params=True!Nc              3   :   #    U  H  n[        US 5      v   M     g7f)re   N)hasattr).0params     r/   	<genexpr>5_share_state_and_init_handle_attrs.<locals>.<genexpr>   s       O
CU%GE455CUs   zfsdp.optimizer_in_backwardzFSDP state missing attribute zcNon-root FSDP instance's `_is_root` should not have been set yet or should have been set to `False`F   z"Expects one homogeneous value for z	 but got ) _handleinit_flat_param_attributesHOMOGENEOUS_ATTR_NAMESr3   rT   all_handles_all_handlesr`   rg   rJ   _paramsany_has_optim_in_backwardr\   _C_log_api_usage_oncerO   r   r6   getattrrD   _unshard_stream_post_backward_stream_pre_unshard_stream_all_reduce_stream_default_stream_free_event_queue_fsdp_extensioncompute_streamitemslen
ValueError)rc   rF   rb   attr_name_to_values	attr_namer`   
fsdp_stateattr_valuess           r/   rW   rW      sL    F))+/1+	),& ,(99EEJ))&&
:899V  )3(:(:$(F )
3 O
CMCUCUO
 L
% (((HH(()EF * !11
/I
./	{;  *..wz/MN 0 # 	4'Bz/B/B+B9	

 $
%/%?%?
"+5+K+K
()3)G)G
&(2(E(E
%%/%?%?
"&0&A&A
#'1'C'C
$%%18B8R8RJ&&5##6--/; 2< #6";";"=	{q 4YKyV  #>r.   c                    U R                   (       d   eU R                  R                  5       (       d   e[        S U R                   5       5      nU R
                  (       a	  U(       a  SOSnU R                  R                  5       U l        U R                  b  U R                  U R                  l	        U R                  R                  US9U l        U R                  R                  US9U l        U R                  R                  US9U l        U(       a   U R                  R                  5       U l        gU R                  U l        g)z
Initializes CUDA streams for overlapping communication, computation, and
data transfers. The streams should be shared across FSDP instances.
c              3   H   #    U  H  nUR                   [        ;   v   M     g 7fN)sharding_strategyr   )rh   r   s     r/   rj    _init_streams.<locals>.<genexpr>   s#      0J 	$$(BB0s    "r   N)priority)rD   rH   rI   rs   rO   r!   current_streamr|   r~   r   Streamrx   ry   rz   r{   )rA   uses_hybrid_shardinghigh_prioritys      r/   rP   rP      s    >>>,,.... 00   116JBPQM!00??AE(/4/D/D, "00777OE #("6"6"="=}"="UE !& 4 4 ; ;] ; SE *>##% 
CHCXCX 
r.   rb   unshard_streampre_unshard_streamc                 n   U(       d  gU R                   R                  U5         UR                  5       nSSS5        W(       a  UR                  U5        U R                  (       aY  U R
                  R                  5       nU(       a8  [        R                  R                  S5         UR                  5         SSS5        U R                   R                  U5         UR                  5         UR                  5         SSS5        g! , (       d  f       N= f! , (       d  f       Nd= f! , (       d  f       g= f)a  
Unshards the handles in ``handles``. If the handles are in
:meth:`summon_full_params` and are using mixed precision, then they are
forced to full precision.

Postcondition: handle's ``FlatParameter`` 's data is the padded
unsharded flat parameter on the compute device.
Nz%FullyShardedDataParallel.rate_limiter)rH   streampre_unshardwait_streamr!   r}   dequeue_if_neededr\   profilerrecord_functionsynchronizeunshardpost_unshard)rA   rb   r   r   ran_pre_unshardevents         r/   _unshardr     s     				$	$%7	8 ,,. 
9""#56''99;//7 !!# 
			$	$^	4 
5	4 
9	8  
5	4s#   D&D!D&
D
D#&
D4free_unsharded_flat_paramc                 h   UR                  U5        U R                  (       ay  U(       ar  [        R                  R                  R                  5       (       dE  U R                  R                  5       nUR                  5         U R                  R                  U5        UR                  5         SUl        g)z|
Reshards the handle. ``free_unsharded_flat_param`` indicates whether to
free the handle's padded unsharded flat parameter.
FN)reshardr!   r\   distributed_functional_collectivesis_torchdynamo_compilingrH   Eventrecordr}   enqueuepost_reshard_prefetched)rA   rb   r   
free_events       r/   _reshardr   1  s     NN,-#<  88QQSS --335J##++J7
 Fr.   c                 4    U (       a  U R                  5         g g r   )unshard_gradrb   s    r/   _unshard_gradsr   I        r.   c                 4    U (       a  U R                  5         g g r   )reshard_gradr   s    r/   _reshard_gradsr   P  r   r.   
unshard_fnargs.kwargsc                    [         R                  R                  S5         U(       a*  UR                  [        R
                  :X  a  XE4sSSS5        $ [        R                  U l        U R                  R                  XR                  5        U(       a  [        R                  Ul        Ub  U" X5        [        X5        U(       a  UR                  (       aq  UR                  R                   cZ  [         R"                  " UR                  R$                  [         R&                  " S5      S9R)                  5       UR                  l        U R*                  =(       a    U R*                  R,                  (       + nU(       aB  U R.                  R0                  (       a'  U R.                  R2                  n[5        U/UQ70 UD6u  pE[7        XXE5        XE4sSSS5        $ ! , (       d  f       g= f)a  
Runs the pre-forward logic. This includes an opportunity to unshard
currently sharded parameters such as those for the current forward and
registering post-backward hooks for these current parameters. This function
also converts forward ``args`` and ``kwargs`` to the given precision.

Args:
    handles (List[FlatParamHandle]): Handles giving the parameters used in
        the current forward.
    unshard_fn (Optional[Callable]): A callable to unshard any currently
        sharded parameters or ``None`` to not do any unsharding.
    module (nn.Module): Module whose forward this method runs right before;
        expected by the hook signature.
    args (Tuple[Any, ...]): Module forward ``args``.
    kwargs (Dict[str, Any]): Module forward ``kwargs``.
z%FullyShardedDataParallel._pre_forwardNr[   r]   )r\   r   r   _training_stater   BACKWARD_PREr   FORWARD_BACKWARDtraining_staterT   record_pre_forwardtrainingr,   _register_post_backward_hookr_   r`   	_cpu_grad
zeros_like_local_shardr]   
pin_memoryrm   _force_full_precisionmixed_precisioncast_forward_inputsparam_dtyper   )_register_post_backward_reshard_only_hook)rA   rb   r   r0   r   r   should_cast_forward_inputsinput_dtypes           r/   _pre_forwardr   W  sc   2 
	'	'(O	P f,,0C0P0PP
 < 
Q	P  -==11&//J%8%@%@F"!u% 	%U3 f,,1B1B1L1L1T*/*:*:!!..u||E7J+jl '
 MME%--"E"EE 	# &%*?*?*S*S161F1F1R1RK/MdMfMLD1%N|G 
Q	P	Ps   (GE:G
G$c                 P   U(       d  gUR                   (       d!  [        XU R                  U R                  5        SUl        [
        R                  R                  R                  5       (       de  U R                  R                  5       nU R                  b#  UR                  U R                  5        SU l        OUR                  U R                  5        [
        R                  R                  S5         [!        X["        R$                  5        SSS5        g! , (       d  f       g= f)z'Unshards parameters in the pre-forward.NFz.FullyShardedDataParallel._pre_forward_prefetch)r   r   rx   rz   _needs_pre_forward_unshardr\   r   r   r   rH   r   _unshard_event
wait_eventr   r   r   _prefetch_handler$   r,   )rA   rb   r   s      r/   _pre_forward_unshardr     s       5 5u7P7PQ(-F%44MMOO--<<>+%%e&:&:;#'E &&u'<'<=		'	'8
 	(=(=>
 
 
s   3D
D%
reshard_fninputoutputc                    [         R                  R                  S5         U(       a)  UR                  [        R
                  :X  a  UsSSS5        $ U R                  R                  U5        Ub  U" X5        [        XXQ5      n[        R                  U l        U(       a  [        R                  Ul        UsSSS5        $ ! , (       d  f       g= f)a  
Runs the post-forward logic. This includes an opportunity to reshard
currently unsharded parameters such as those used in the current forward
and registering pre-backward hooks on the forward outputs.

Args:
    handles (List[FlatParamHandle]): Handles giving the parameters used in
        the current forward.
    reshard_fn (Optional[Callable]): A callable to reshard any currently
        unsharded parameters (e.g. from the current forward) or ``None`` to
        not do any resharding.
    module (nn.Module): Module whose forward just ran, which should be a
        fully sharded module (see [Note: Fully Sharded Module]); expected
        by the hook signature.
    input (Any): Unused; expected by the hook signature.
    output (Any): Forward pass output; pre-backward hooks are registered on
        the tensors that require gradients in this output.

Postcondition: Each ``FlatParameter`` 's data points to the sharded flat
parameter.
z&FullyShardedDataParallel._post_forwardN)r\   r   r   r   r   r   rT   record_post_forward_register_pre_backward_hooksr   rK   r   )rA   rb   r   r0   r   r   s         r/   _post_forwardr     s    < 
	'	'(P	Q f,,0C0P0PP	 
R	Q 	226:!u% .eVL,11%8%=%=F" 
R	Q	Qs   'B?A$B??
Cc                     U(       d  gU R                   (       + =(       a    UR                  [        ;   n[        XU5        g)z(Reshards parameters in the post-forward.N)rD   _sharding_strategyr   r   )rA   rb   r   s      r/   _post_forward_reshardr     s=     
 NN 	Q%%)PP  U56r.   c                    [         R                  R                  S5         [        X5        [	        U R
                  SLS5        U R
                  (       d1  [        U 5      (       a  [        XX#5      sSSS5        $ X#4sSSS5        $ U R                  nU(       a  UR                  nO([        R                  " U5      n[        S U 5       5      nU(       ac  [        [        UR                  5       5      R!                  5       [#        U R$                  R!                  5       5      U R&                  S9  SU l        O{[+        U SS5      (       ai  [-        X5      u  nn[/        U5      S	:  aE  [/        U5      S	:  a6  [        S
 [1        Xx5       5       5      (       a  [        XxU R&                  5        SU l        U R2                  (       aP  U R4                   V	s/ s H"  n	U	R                  (       d  M  U	R                  PM$     nn	U H  nSUl        SUl        M     [;        U R<                  R?                  5       U R@                  U RB                  5        [E        U RF                  5        [         R                  R                  S5         [I        X#U R&                  S5      u  pSSS5        W
(       a  U
S	   O	[K        5       nW(       a  US	   O0 n[        XX#5      sSSS5        $ s  sn	f ! , (       d  f       NL= f! , (       d  f       g= f)a  
Runs pre-forward logic specific to the root FSDP instance, which should run
before any individual module's pre-forward. This starts with an attempt at
lazy initialization (which only runs non-vacuously once). Otherwise, if
this is called on a non-root FSDP instance, then it returns directly.

Args:
    module (nn.Module): Module for which this logic tries to run. It may or
        may not be the root. If not, then this method does not do anything.
z*FullyShardedDataParallel._root_pre_forwardNz$Expects a root FSDP to have been setc              3   8   #    U  H  oR                   v   M     g 7fr   )r   )rh   rb   s     r/   rj   $_root_pre_forward.<locals>.<genexpr>  s      3;B,,7s   )rX   rY   r]   T!_needs_buffer_dtype_restore_checkFr   c              3   D   #    U  H  u  pUR                   U:g  v   M     g 7fr   dtype)rh   bufferbuffer_dtype_for_computations      r/   rj   r   3  s*      A< LL$@@As    z#FullyShardedDataParallel._to_kwargs)&r\   r   r   rC   r   rD   r   _root_cast_forward_inputrm   r   rM   r^   rs   rR   dictnamed_buffersvalueslist_buffer_name_to_orig_dtyperS   r   rw   rQ   r   zipforward_prefetchrO   r   r   _wait_for_computation_streamrH   r   rx   rz   %_reset_flat_param_grad_info_if_neededrq   r   tuple)rA   r0   r   r   rb    should_cast_buffers_to_full_prechandlesrX   buffer_dtypes_for_computationr   
args_tuplekwargs_tuples               r/   _root_pre_forwardr     s   " 
	'	'(T	U5!%..,.TU~~ e$$/tL 
V	U < 
V	U" /5/K/K,
 &77?G/2 3;B3 0, ,-V1134;;="5#C#C#J#J#LM++ 7;E3U?GG 8F-7|aC(E$F$J @CA   6@T@T 7<E3!! #("8"8"8J%% #
"""8  
 "481%*" " 	%  //1!!%%	

 	.e.@.@A
 ^^++,QR'1e22E($J S !+z!}$0ab'tDm 
V	Uz$ SR_ 
V	UsJ   AK2:K2EK2K+K;BK2K!7K2K2!
K/	+K22
L c                 V   U R                   (       a  U R                   R                  (       + nOSnUR                  =(       d    U R                  (       + =(       a    U=(       a    U R                  R
                  nU(       a'  U R                  R                  n[        U/UQ70 UD6u  p#X#4$ NT)rm   r   r   r"   r   cast_root_forward_inputsr   r   )rA   r0   r   r   force_full_precisionr   r   s          r/   r   r   ]  s     }}#(==#F#FF# 
	< < <<VBV"9



8
8  "-2-B-B-N-N+KI$I&I<r.   unusedc                    U(       a$  [        US5      (       a  UR                  (       a  U$ [        R                  R	                  S5         U R
                  (       a2  U R                  (       d!  [        X5        [        U R                  5        ORU(       aK  [        R                  /n[        U 5      (       a  UR                  [        R                  5        [        X5        [        R                  U l        U(       d  UsSSS5        $ ["        R$                  Ul        UR(                  (       a  UR*                  (       d"  [-        U UU R.                  U R0                  5        [        R2                  R4                  R7                  5       (       d3  U R8                  R;                  5       R=                  U R.                  5        SUl        [        R                  R	                  S5         [?        X[@        RB                  5        SSS5        URE                  5         SUl        UsSSS5        $ ! , (       d  f       N0= f! , (       d  f       g= f)z
Prepares ``_handle`` 's ``FlatParameter`` s for gradient computation.

Args:
    module (nn.Module): Fully sharded module (see [Note: Fully Sharded
        Module]).
_ran_pre_backward_hookz+FullyShardedDataParallel._pre_backward_hookNFz/FullyShardedDataParallel._pre_backward_prefetchT)#rg   r   r\   r   r   rD   _post_backward_callback_queued&_register_post_backward_final_callbackr   rq   r   rK   r   r7   r   r   r   r   r   r   _needs_pre_backward_unshardr   r   rx   rz   r   r   r   rH   r   r   r   r$   r+   prepare_gradient_for_backward)rA   r0   rb   gradr   allowed_statess         r/   _pre_backward_hookr  q  s   $ 	F455))		'	'(U	V >>%"F"F25A1%2D2DE+001Ne$$%%m&D&DE&u=,== # 
W	V$ "5!A!A-- %%))--	 $$<<UUWW$$335AA%BWBWX .3*^^++=
 UM,B,BC
 	,,.(,%W 
W	VJ
 
K 
W	Vs,   B3H=CH='H, H=,
H:	6H==
Ic                    [        X[        5        UR                  nSUl        [        R
                  R                  R                  S5         [        U [        R                  /5        [        UR                  [        R                  [        R                  4;   SUR                   35        [        R                  Ul        UR                   c
   SSS5        gUR                   R"                  (       a  [%        S5      e['        X5        U R(                  (       d+  UR*                  (       a  UR-                  5          SSS5        g[        R.                  R0                  R3                  5       (       d3  U R4                  R7                  U R8                  R;                  5       5        U R8                  R=                  U R4                  5         UR                   R>                  n[A        U 5      (       di  UR                   RB                  URD                  :w  aE  URF                  (       d4  UR                   RI                  URD                  5      UR                   l        URJ                  (       a  [M        X5        O[O        X5        [Q        X@R4                  5        SSS5        SSS5        g! , (       d  f       N= f! , (       d  f       g= f)a  
Reduce-scatters the gradient of ``handle`` 's ``FlatParameter``.

Precondition: The ``FlatParameter`` 's ``.grad`` attribute contains the
unsharded gradient for the local batch.

Postcondition:
- If using ``NO_SHARD``, then the ``.grad`` attribute is the reduced
unsharded gradient.
- Otherwise, the ``_saved_grad_shard`` attribute is the reduced sharded
gradient (accumulating with any existing gradient).
Tz,FullyShardedDataParallel._post_backward_hookz8Expects `BACKWARD_PRE` or `BACKWARD_POST` state but got Nz,FSDP does not support gradients of gradients))r   loggerr`   _post_backward_calledr\   autogradr   r   r   r   r   r   r   r   r   BACKWARD_POSTr  requires_gradrJ   _post_backward_reshard_sync_gradientsr    _use_unsharded_grad_viewsr   r   r   ry   r   rH   r   r   data_low_precision_hook_enabledr   _reduce_dtyper   touses_sharded_strategy_reduce_grad_reduce_grad_no_shardr   )rA   rb   r`   r   autograd_computed_grads        r/   _post_backward_hookr    s   ( E62""J'+J$		 	 	0	06
 	#5=+I+I*JK
 	""#002E2S2STUFvG]G]F^_	

 "5!B!B??"!
 
" ??((MNNu-$$&&0021
 
8   88QQSS''33$$335 !!(()D)DE%/__%9%9"/66OO))V-A-AA 44'1'9'9&:N:N'O
$++U+%e4 '&(C(C# FC
 
B FEC
 
s4   BJ:A$J:BJ:CJ)J:)
J7	3J::
Kc                     [         R                  R                  S5         [        R                  U l        [        R                  Ul        [        X5        S S S 5        g ! , (       d  f       g = f)Nz9FullyShardedDataParallel._post_backward_hook_reshard_only)
r\   r   r   r   r   r   r   r
  r   r  )rA   rb   r   s      r/    _post_backward_reshard_only_hookr    sO    
 
	'	'C
  -==!4!B!Bu-
 
 
s   6A
A-c                     [        X5      n[        XU5        [        R                  R	                  S5         [        X[        R                  5        S S S 5        g ! , (       d  f       g = f)Nz0FullyShardedDataParallel._post_backward_prefetch)_should_free_in_backwardr   r\   r   r   r   r$   r+   )rA   rb   r   r   s       r/   r  r    sS    
 !9 GU56
 
	'	':
 	(>(>?
 
 
s   A
A)c                 r    UR                   (       d  gU R                  =(       d    UR                  [        ;   $ )z\
Returns whether FSDP should free the unsharded flat parameter in the
post-backward or not.
F)r  r  r   r   )rA   rb   s     r/   r  r  &  s5     ''
 	 	P$$(OOr.   c                    UR                   nUR                  [        R                  [        R                  4;   nUR
                  R                  nSUl        [        X5      u  pVU R                  GcS  [        XPR                  5        UR                  (       a  UR                  OU R                  n[        R                  " UUUS9  U(       a  [         R"                  R$                  R'                  5       (       d%  U R(                  R+                  U R,                  5        U R.                  R1                  U R(                  5         [3        X`R(                  5        [        R4                  " X`R6                  S9  [        X`R8                  5        [;        XU5      n[=        XU5         SSS5        g[        X`R8                  5        OU R                  U R>                  XV5        [;        XU5      n[=        XU5        g! , (       d  f       NY= f)z
For sharded strategies, this runs gradient reduction, sharded gradient
accumulation if needed, and the post-reduction callback.
Ngroup) r`   r   r   HYBRID_SHARD_HYBRID_SHARD_ZERO2r  r  _get_reduce_scatter_tensors
_comm_hook_div_if_needed_gradient_predivide_factor_use_fake_reduce_fake_process_grouprV   distreduce_scatter_tensorr\   r   r   r   r{   r   ry   rH   r   r   
all_reduce_inter_node_pg_gradient_postdivide_factor_accumulate_sharded_grad_post_reduce_grad_callback_comm_hook_state)	rA   rb   r`   uses_hybrid_sharded_strategyunsharded_gradpadded_unsharded_gradnew_sharded_gradpggrad_to_offloads	            r/   r  r  :  s    ""J#)#<#<++22A $   __))NJO.I/+ ,.N.NO && &&$$ 	
 	""!	

 ($$<<UUWW((44U5P5PQ%%,,U-E-EF ++;=U=UV 08L8LM/1R1RS":#3# +5/J GF 	')J)JK""$9	
 /u>NOOuo>' GFs   A"G::
Hr1  c                 &   [        UR                  U R                  5      5      nU R                  US   R                  5       -  UR                  5       -
  nUS:  a  [        R
                  " USU/5      OUn[        R                  " US   5      nXE4$ )zG
Returns the input and output tensors to reduce-scatter, respectively.
r   )r   chunk
world_sizenumelFpadr\   
empty_like)rA   r1  chunksnumel_to_padr2  r3  s         r/   r"  r"  u  s     .&&u'7'789F##fQioo&77.:N:N:PPL4@14Dnq,/0.  ''q	2 22r.   sharded_gradc                     UR                   n[        XU5        [        US5      nU(       a+  [        X#R                  5        U=R                  U-  sl        OX#l        UR                  nU$ )z
Accumulates the reduce-scattered sharded gradient with any existing sharded
gradient if needed, returning the gradient to offload (if CPU offloading is
enabled).
_saved_grad_shard)r`   _cast_grad_to_param_dtyperg   _check_grad_to_accumulaterA  )rA   rb   r?  r`   accumulate_gradr5  s         r/   r-  r-    sa     ""Je:> j*=>O!,0L0LM$$4$'3$ 22Or.   c                    UR                   nU R                  cj  [        UR                  U R                  5        [
        R                  " UR                  U R                  S9  [        UR                  U R                  5        O&U R                  U R                  UR                  5        UR                  (       d  [        XR                  U5        UR                  R                  n[        XU5        g)z
For no-shard, this runs gradient reduction (which directly covers any
gradient accumulation implicitly) and the post-reduction callback.
Nr  )r`   r#  r$  r  r%  r(  r*  rV   r,  r/  _keep_low_precision_gradsrB  r  r.  )rA   rb   r`   r5  s       r/   r  r    s     ""Jz(H(HI
u/B/BCz(I(IJ//A ++!%*E oo**Ouo>r.   r5  c                 2    [        XU5        [        U5        g)z
This callback captures any logic to run after the gradient reduction
finishes. Currently, this offloads the gradient to CPU if CPU offloading is
enabled and uses sharded gradient views if ``use_orig_params=True``.
N)_offload_grad%_post_backward_use_sharded_grad_views)rA   rb   r5  s      r/   r.  r.    s     %1)&1r.   c                    UR                   (       d  g UR                  =(       a    UR                  (       + nUR                  R                  R                  UR                  5       US9  [        UR                  U R                  5        g )N)non_blocking)
r_   r  rt   r`   r   copy_detachr   r  ry   )rA   rb   r5  rK  s       r/   rH  rH    sp     !! //U8U8U4UL
%% | &  33U5P5PQr.   c                    U R                   (       d  g U R                  5         U R                  5         U R                  (       a  U R	                  5         U R
                  R                   HW  nUR                  c  M  [        US5      (       d  M%  UR                   H  nUR                  5         M     WR                  SS9  MY     U R                  5         U R                  (       a  S U R
                  l        g g g )Nre   T)set_to_none)r    _reset_is_grad_none_use_sharded_grad_viewsrt   prepare_gradient_for_optimr`   rr   r  rg   re   step	zero_gradr   r_   r   )rb   
orig_paramoptims      r/   rI  rI    s    ""   ""$$$))+ ++33J*w50 0 (??EJJL @ D1 4 	446!!*.F' "! %r.   tensor
div_factorc                 4    US:  a  U R                  U5        g g )Nrl   )div_)rW  rX  s     r/   r$  r$    s    A~J r.   ri   c                 @   [        U [        R                  /5        [        U 5      (       ds  UR                  UR                  :w  aX  UR
                  nUR
                  R                  UR                  S9Ul        [        X0R                  R                  5       5        ggg)a  
Casts ``sharded_grad`` back to the full parameter dtype so that the
optimizer step runs with that dtype. This performs an actual cast if
1. parameters were in reduced precision during the forward since then
gradients would be in that reduced precision, or
2. parameters were not in reduced precision but gradients were in
reduced precision for communication.
However, if a low precision communication hook is registered, then this
dtype cast happens in the hook instead.
r   N)
r   r   r   r  r   r  r  r   rH   r   )rA   r?  ri   low_prec_grad_datas       r/   rB  rB    s      u}'E'E&FG&u--,2D2D2S)..(--00u{{0C 	# 4 4 C C E	
 3T-r.   r3  accumulated_gradc                     [        UR                  U R                  :H  SUR                   SU R                   35        [        UR                  U R                  :H  SUR                   SU R                   35        g )NzDShape mismatch when accumulating gradients: existing gradient shape=z new gradient shape=zFDevice mismatch when accumulating gradients: existing gradient device=z new gradient device=)r   shaper]   )r3  r]  s     r/   rC  rC    s     "2"8"88	##3#9#9": ;.445	7 #3#:#::	$$4$;$;#< =/667	9r.   c                 (    U R                   [        ;   $ r   )r#  r   )rA   s    r/   r  r  1  s    222r.   c                    [        U R                  S5        U nUR                  (       a  U R                  R	                  5       nUR                  UR                  5        UR                  ULa  UR                  UR                  5        UR                  R                  (       a(  U R                  R	                  5       R                  5         UR                  R                  5         U R                   Ht  n[        U5        [        U5        [         R"                  Ul        UR&                  nU(       d  MC  SUl        SUl        SUl        [.        R"                  Ul        SUl        Mv     SUl        g)z
This waits for the post-backward to finish and performs some final cleanup.
This runs at the end of the entire backward pass and should only be called
on the root FSDP instance.
zJThe post-backward callback should only be called on the root FSDP instanceFN)r   rD   r  rH   r   r   ry   r{   cpu_offloadoffload_paramsr   rT   	next_iterrO   _catch_all_reshard_finalize_paramsr   rK   r   rm   r   r  _post_forward_indexr   r   r   r   )rA   r0   rc   r   r   rb   s         r/   _post_backward_final_callbackrh  6  s&    T J!!--<<> 	"":#C#CD((>&&z'D'DE!!00   //1==?))+,,
:&$$1$6$6
!##6,1F)16F.)-F&%8%=%=F"!&F - 16J-r.   c           
          U R                   (       a  U R                   R                  R                  5       U R                   R                  R                  R                  5       :H  =(       a    U R                   R                  (       + nU(       a  g[        X R                   5      n[        X R                   U5        gg! [         a!  n[        SSU  S[        U5       3SS9  UeSnAff = f)ac  
Reshards the parameters that may not have been resharded in the
post-backward hook. This can happen when a module's output is used in the
forward pass, meaning that its pre-backward hook runs (unsharding the
parameter), but the post-backward hook does not run because the output was
not jused in the loss computation corresponding to this backward pass.
NFz+Got exception in the catch-all reshard for : )raise_assertion_error)
rm   r`   data_ptrr   _skipped_use_sharded_viewsr  r   	Exceptionr   str)rA   already_reshardedr   es       r/   re  re  e  s    == ((113==++88AACD A
 @@@  !(@(V%UMM+DE   9%3q6(K"'	

 s   BB= +B= =
C(C##C(c                    U R                   nU(       d  gUR                  n[        R                  R                  R                  5       (       a0  [        US5      (       a  UR                  nUR                  5         U?O[        US5      (       aq  [        UR                  5      n[        UR                  5      S-   n[        XE:H  SUR                   35        UR                  S   R                  5         [        US5        UR                  (       aQ  U R                  (       d  gUR                   (       d  UR#                  5         [        [        US5      S5        S	Ul        gg)
z3Finalizes the parameters before the next iteration.N_post_backward_hook_handle_post_backward_hook_staterl   z(Invalid: ``_post_backward_hook_state``: r   r  z@Expects `_post_backward_called` to be set on the `FlatParameter`F)rm   r`   r\   r   r   r   rg   rs  remover   rt  intr  r   delattrr  rt   rR  r  )rA   rb   r`   pbhs_handlepost_backward_hook_state_len%expected_post_backward_hook_state_lens         r/   rf  rf    s/   
 ]]F""J00IIKK:;<<$??K 5::;;+.z/S/S+T(47
8P8P4QTU4U1,U::;_;_:`a 004;;=J ;<$$
 ,,--/J 78N	
 ,1
(  r.   current_handleprefetch_modec                    U(       d  g[        X5      nU(       d  gUR                  nU[        R                  :X  a  [        R
                  Ul        OEU[        R                  :X  a  [        R                  Ul        O[        SU R                   SU 35      e[        XU R                  U R                  5        XCl        SUl        g)zh
Prefetches the next handles if needed (without synchronization). An empty
handles key cannot prefetch.
NzInvalid prefetch mode on rank rj  T)_get_handle_to_prefetchr   r$   r+   r   r   r,   r   rankr   rx   rz   r   )rA   r{  r|  rb   prev_training_states        r/   r   r     s     $U;F !00...!4!A!A	-//	/!4!<!<9%**RWXX UE1153L3LM0Fr.   c                    [        U5      n[        R                  [        R                  [        R                  4n[        X#;   SU SU 35        U R                  nSnU[        R                  :X  a  U R                  [        R                  :X  d2  U[        R                  :X  aa  U R                  [        R                  :X  aC  UR                  U5      nU(       a&  UR                  (       a  UR                  (       d  UnU$ Sn U$ U[        R                  :X  aQ  U R                  (       a@  UR                  U5      nU(       a&  UR                  (       a  UR                  (       d  UnU$ SnU$ )a;  
Returns a :class:`list` of the handles keys to prefetch for the next
module(s), where ``current_handle`` represents the current module.

"Prefetching" refers to running the unshard logic early (without
synchronization), and the "next" modules depend on the recorded execution
order and the current training state.
z!Prefetching is only supported in z but currently in N)_get_training_stater   r   r
  r,   r   rT   backward_prefetchr   get_handle_to_backward_prefetchr  r   r   get_handle_to_forward_prefetchr   )rA   r{  r   valid_training_stateseodtarget_handletarget_handle_candidates          r/   r~  r~    sP    )8N(())##
 /
+,A+B C&'	)
 
 
 C/3M-:::##'7'D'DD-;;;##'7'E'EE"%"E"En"U#'CC+773M  !M  
.66	65;Q;Q"%"D"D^"T#'BB+773M  !Mr.   c                 2    [        U S5        U R                  $ )z8Returns the training state of the handles in ``handle``.zExpects a non-empty handle)r   r   r   s    r/   r  r  
  s     f23!!!r.   c                 J   U R                    H  nUR                  5         M     U R                   R                  5         U R                  R	                  US5      n[
        R                  " [        X[        5      nU R                   R                  UR                  USSS95        g)z-
Registers a pre-forward hook on ``module``.
NTprependwith_kwargs)_pre_forward_handlesru  clear_fully_sharded_module_to_handleget	functoolspartialr   r   r7   register_forward_pre_hookrA   r0   forward_handlemodule_param_handlehooks        r/   _register_pre_forward_hookr    s      44 5	$$&??CCFDQe2FD 
%%((t(Nr.   c                 L   U R                    H  nUR                  5         M     U R                   R                  5         U R                  R	                  US5      n[
        R                  " [        U U[        5      nU R                   R                  UR                  U5      5        g)z
Registers a post-forward hook on ``module``. Even if the module has no
handles, we should register the hook since it will register the module's
pre-backward hook.
N)_post_forward_handlesru  r  r  r  r  r  r   r   r7   register_forward_hookr  s        r/   _register_post_forward_hookr  &  s      55 6	%%'??CCFDQ	D 
&&v'C'CD'IJr.   c                    U R                    H  nUR                  5         M     U R                   R                  5         [        R                  " [
        U 5      nU R                   R                  UR                  USSS95        g)a  
Registers root pre-forward hook on ``module``, which should be the local
FSDP root.

NOTE: For the current composable FSDP design, we have each application of
``fully_shard()`` to a module to indicate that that module is the local
FSDP root. We may remove this assumption in the future, in which case we
will need to register this root pre-forward hook on any candidate module
that may be the local FSDP root.
Tr  N)_root_pre_forward_handlesru  r  r  r  r   r7   r  )rA   r0   r  r  s       r/   _register_root_pre_forward_hookr  =  sn      99 :	##))+.6D	##**((t(Nr.   outputsc                   ^ ^^ [         R                  " 5       (       d  U$ T R                  (       a  ST l        T(       a  STl        STl        S[         R                  S[         R                  4UUU 4S jjn[        XB5      $ )a|  
Registers pre-backward hooks on the tensors that require gradients in the
forward pass outputs ``outputs``, which were computed using the
``FlatParameter`` s of ``handles``.

Args:
    module (nn.Module): Fully sharded module (see [Note: Fully Sharded
        Module]).

Returns:
    Forward pass outputs with pre-backward hooks registered to tensors that
    require gradients.
Ftr1   c           
         > U R                   (       aa  U R                  [        R                  R                  R                  [        R                  " [        TTT5      5      5        T(       a  STl	        U $ r   )
r  register_hookr\   utilshooksunserializable_hookr  r  r  r  )r  rb   r0   rA   s    r/   _register_hook4_register_pre_backward_hooks.<locals>._register_hookv  sU    ??OO!!55%%&8%P
 592r.   )r\   is_grad_enabledrD   r   r  r   Tensorr   )rA   r0   r  rb   r  s   `` ` r/   r   r   U  sj    ,   ""~~/4,-2* ).%	%,, 	5<< 	 	 ^55r.   c                    [         R                  " 5       (       d  gU(       d  gUR                  n[         R                  R                  R                  5       (       aX  [        US5      nU(       d  UR                  (       d  g[        R                  " [        X5      nUR                  U5      nXRl        g[        US5      nU(       d  UR                  (       d  gUR                  U5      n[        UR                  SLS5        UR                  R                   S   S   nUc   eUR#                  [        R                  " [        X5      5      nXu4Ul        g)aQ  
Registers post-backward hooks on the ``FlatParameter`` s'
``AccumulateGrad`` objects to reshard and to reduce-scatter gradients.

The ``AccumulateGrad`` object represents the last function that finalizes
the ``FlatParameter`` 's gradient, so it only runs after its entire
gradient computation has finished.

We register the post-backward hook only once in the *first* forward that a
``FlatParameter`` participates in. This relies on the ``AccumulateGrad``
object being preserved through multiple forwards.

NOTE: We follow this heuristic to prefer the *first* forward to target the
parameter mixed precision case, where there are *separate*
``AccumulateGrad`` objects across the different forwards. (Without
parameter mixed precision, the ``AccumulateGrad`` objects are the same.) If
we instead prefer the *last* forward, then the hook runs early.
Nrs  rt  zZThe `grad_fn` is needed to access the `AccumulateGrad` and register the post-backward hookr   )r\   r  r`   r   r   r   rg   r  r  r  r  "register_post_accumulate_grad_hookrs  	expand_asr   grad_fnnext_functionsr  rt  )rA   rb   r`   already_registeredr  hook_handletemp_flat_paramacc_grads           r/   r   r     s%   0   """"J00IIKK$Z1MNZ%=%=  !4eD CCDI0;-$Z1LMZ%=%=$..z:##4/.	

 #**99!<Q?###,,15A
 19/F
,r.   c                    [         R                  " 5       (       d  gSnU(       d  gUR                  n[         R                  R                  R                  5       (       a  [        US5      nO[        US5      nU(       d  UR                  (       a  gUcX  [        R                  " U0 UD6nU Vs/ s H5  n[         R                  " U5      (       d  M   UR                  (       d  M3  UPM7     nnUc   e[        U[        R                  " [        X5      5      n	[         R                  R                  R                  5       (       a  Xl        gU	4Ul        gs  snf )a  
Registers post-backward hooks to reshard flat parameters that do not
require gradient. We register these using multi-post-grad hooks on the
input activations to ensure that all gradients that may depend on the
parameters have been computed before resharding.
Nrs  rt  )r\   r  r`   r   r   r   rg   r  pytreearg_tree_leaves	is_tensorr
   r  r  r  rs  rt  )
rA   rb   r   r   inp_tensorsr`   r  	args_flatobjr  s
             r/   r   r     s      "" 15K""J00IIKK$Z1MN$Z1LMZ55**D;F;	$
$C(<CARARC9 	 
 """*Y&&'GWK 00IIKK0;-0;~
,
s   .EE$Ec                 d   [        U R                  S5        U R                  (       a  g[        U [        R
                  /5        [        R                  R                  R                  5       (       d@  SU l        [        R                  R                  [        R                  " [        X5      5        gg)z
Registers the post-backward final callback that runs at the end of the
backward pass. This should be called from the root FSDP instance at the
beginning of the pre-backward.
zFOnly the root FSDP instance should register the post-backward callbackNT)r   rD   r   r   r   rK   r\   r   r   r   r	   _execution_enginequeue_callbackr  r  rh  rE   s     r/   r   r     s     P ++u}'9'9&:;44MMOO/3,""11;UK	
 Pr.   computation_streamc                     [         R                  R                  R                  5       (       a  gUR	                  U 5        UR	                  U 5        g)z
Has the unshard and pre-unshard streams wait for the computation stream.
For example, this should be called in the FSDP root's pre-forward to
respect optimizer step computation.
N)r\   r   r   r   r   )r  r   r   s      r/   r   r     sB     00IIKK12 ""#56r.   r   c                     [        U [        5      (       d  U /n U  H&  nUR                  (       d  M  UR                  5         M(     g)z
Clears the original parameters' gradients if needed. This method's CPU
overhead is minimal, so we may call it throughout FSDP methods, which serve
as callsites to free the gradient memory earlier.
N)
isinstancer   r    r   )r   rb   s     r/   r   r     s:     gt$$)"""88: r.   c                 >   [        U R                  S5        / n/ n[        5       n[        R                  " U5      u  pV[        [        U5      [        U5      5       H  u  pxUR                  5        Hn  u  pX;   a  M  UR                  U
5        [        U	5      UR                  ;   a  M8  UR                  U
5        UR                  UR                  R                  5        Mp     M     [        U5      [        U5      :X  d   [        U5       S[        U5       35       eX#4$ )z
Returns all buffers in the module tree rooted at ``root_module`` and a
corresponding list of the buffer dtypes for computation. Each buffer dtype
is either ``None`` if buffer mixed precision is not enabled or the buffer
low precision dtype otherwise.
z Expects the root to cast buffers )r   rD   r3   rM   _get_fsdp_states_with_modulesr   reversedr   r6   r   _ignored_buffer_namesr7   r   buffer_dtyper   )rA   rF   rX   rY   visited_buffersfsdp_statesfsdp_modulesr   fsdp_modulebuffer_namer   s              r/   rQ   rQ   $  s    enn@A"$G13M),O !0 M M!K $'x'<h|>T#U
#.#<#<#>K(' -1Q1QQNN6"  !;!;!H!HI $? $V w<3}--U#g,q]AS@T/UU-!!r.   buffer_namesc           
          / nU Hc  n[        X0R                  ;   U SU R                   SU R                  R                  5        35        UR	                  U R                  U   5        Me     U$ )z>
Returns the original buffer types of the given buffer names.
z+ is missing from pre-computed dict on rank z, which only has keys )r   r   r  keysr7   )rA   r  rY   r  s       r/   _get_orig_buffer_dtypesr  E  sx     (*M#;;;mFzzl0//44679	
 	U==kJK $ r.   rX   rY   r]   c           	      D   [        USL =(       d    [        U 5      [        U5      :H  S[        U 5       S[        U5       35        [        X5       HM  u  p4[        R                  " U5      (       a  Uc  UR                  US9Ul        M9  UR                  X$S9Ul        MO     g)z
Casts ``buffers`` to the dtypes given by ``buffer_dtypes`` and moves them
to ``device``. If an element in ``buffer_dtypes`` is ``None``, then the
corresponding buffer is only moved to ``device``.
NzfExpects `buffers` and `buffer_dtypes` to have the same length if `buffer_dtypes` is specified but got z and r   )r]   r   )r   r   r   r\   is_floating_pointr  r  )rX   rY   r]   r   r  s        r/   rR   rR   Y  s     CW]1C!C003G~U}
	  !$G ;&&v..,2F ))6)2FK ))6)FFK	 !<r.   )|r  loggingenumr   r   typingr   r   r   r   r\   torch.distributedr   r(  'torch.distributed.fsdp._traversal_utilsfsdp_traversal_utilsrM   torch.nnnntorch.nn.functional
functionalr:  torch.autogradr	   torch.autograd.graphr
   (torch.distributed.algorithms._comm_hooksr   $torch.distributed.fsdp._common_utilsr   r   r   r   r   r   r   r   "torch.distributed.fsdp._flat_paramr   r   r   r   r   "torch.distributed.fsdp._init_utilsr   torch.distributed.fsdp.apir   torch.distributed.utilsr   r   r   r   torch.utilsr   r  	getLoggerr'   r  ro   r$   Moduler   r   r=   r@   boolr5   rC   rL   rW   rP   r   r   r   r   r   r   ro  r   r   r   r   r   r   r  no_gradr  r  r  r  r  r  r"  r-  r  r.  rH  rI  floatr$  rB  rC  r  rh  re  rf  r   r~  r  r  r  r  r   r   r   r   r   r   r   rQ   r  r]   rR   r&   r.   r/   <module>r     s
      9 9    A A    # 9 H	 	 	  J 7  * 
		8	$ D 
/II/
4
T"))_,-/@")) Z0@ 
 
RYY 
4 
   D RYY 0 >>> 
> >B ""	" "J  LL 	
 
 @   $ ._%	_%	 ;;_%; ; II	;
 S/; cN; 5c?DcN*+; ;| ??_%? 
? ?4 ,,_%, , II	,
 , , 	, ,^ 777 
7 7" fEfEIIfE
 
fE fER $xx
38_ & BBIIB B
 B 	B BJ III 	I  IX... . 
	. @@@ @ 
	@"  
 & 7?
 7?O 7? 7? 7?t 33',||3
5<<%&3 3  ,, \\	 2 ? ?_ ? ? ?( 222 \\	2 2 RRR \\R R2 // / /: 5<<  U  t  
 

,,
 
 
:llll 
$ 3z 3d 3 3 *6*6II*6  *6Z ""	" "J $1$1	$1 $1N _- ! 
	 < 44#4 4 4n""" II 
 & KKIIK 
K K, II . +6+6II+6 +6 	+6
 
+6 +6\5G5G_%5G 
5Gp*>*>_%*> S/*> cN	*>
 
*>Z 

!yy
	
 
.77LL7 7(;/"; """ 4tHU[[$9::;" "@ s) 
%++ &G%,,G-.G LLG 
	Gr.   