
    sh              	          S SK r S SKrS SKrS SKrS SKrS SKrS SKJrJr  S SK J	r	  S SK
Jr  S SKJrJr  S SKJr  S SKJrJrJrJrJrJr  S SKJr  S SKrS SKJr  S SKJr  S SK Js  J!r"  S S	K#J$r$  S S
K%J&r&  S SK'J(r(J)r)J*r+  S SK,J-r-  S SK.J/r/J0r0  S SK1J2r2  S SK3J4r4J5r5J6r6  S SK7J8r8  S SK9J:r:J;r;J<r<  S SK=J>r>J?r?J@r@  S SKAJBrBJCrCJDrDJErE  S SKJFrFJGrG  S SKHJIrJ  S SKKJLrLJMrMJNrNJOrO  S SKPJQrQJRrRJSrSJTrTJUrU  S SKVJWrW  SrX\S(       a  SrYSrZ\R                  R                  5       rXO8\T(       a  SrYSrZO,\U(       a  SrYSrZ\R                  R                  5       rXOS rYS!rZS"rX " S# S$\5      r^ " S% S&\5      r_ " S' S(\R                  \5      raS)\R                  S*\R                  S+\4S, jrcS- rd  StS)\R                  S.\e4S/ jjrfSuS0 jrgS1 rhS2 riSvS)\R                  S3\e4S4 jjrjS)\R                  S5\e4S6 jrkS)\R                  S7\e4S8 jrl " S9 S:5      rm " S; S<\a5      rn " S= S>\a5      ro " S? S@\o5      rp " SA SB\o5      rq " SC SD\a5      rr " SE SF\r5      rs " SG SH\R                  5      rt " SI SJ\o5      ru " SK SL\R                  5      rv " SM SN\R                  5      rx " SO SP\R                  5      ry\ R                  SQ\4SR j5       r{\ R                  SS\4ST j5       r|\ R                  SU\4SV j5       r}\\ R                  SW\4SX j5       5       r~\\ R                  SY\4SZ j5       5       r\\ R                  S[\4S\ j5       5       r\\ R                  S]\4S^ j5       5       rS_\S+\S`\Sa\4Sb jr SwSc\R                  Sd\R                  Se\\Sf4   4Sg jjr " Sh Si\M5      r " Sj Sk\L5      rSxSl\\   4Sm jjr " Sn So\R                  5      r " Sp Sq\R                  5      r " Sr Ss\R                  5      rg)y    N)ABCabstractmethod)nullcontext)deepcopy)autoEnumwraps)AnyCallablecastno_type_checkOptionalUnion)mock)
checkpoint)
DeviceMesh)
CPUOffloadfully_shardFullyShardedDataParallel)TrainingState)FSDPParamGroupRegisterPostBackwardFunction)#NO_RESHARD_AFTER_FORWARD_STRATEGIES)BackwardPrefetchMixedPrecisionShardingStrategy)ShardedGradScaler)always_wrap_policyModuleWrapPolicywrap)distribute_tensorDTensorShard)ColwiseParallelparallelize_moduleRowwiseParallelSequenceParallel)TransformerDecoderLayerTransformerEncoderLayer)DistributedDataParallel)MultiProcessTestCaseMultiThreadedTestCaserun_subtests
TEST_SKIPS)FILE_SCHEMAget_cycles_per_ms	TEST_CUDATEST_HPUTEST_XPU)
has_triton   cudancclzhpu:0hcclxpuxcclcpugloo   c                   0    \ rS rSr\" 5       r\" 5       rSrg)FSDPInitModeV    N)__name__
__module____qualname____firstlineno__r   NO_FSDP	RECURSIVE__static_attributes__rB       w/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/testing/_internal/common_fsdp.pyr@   r@   V   s    fGIrJ   r@   c                   >    \ rS rSr\" 5       r\" 5       r\" 5       rSrg)DEVICEInitMode_   rB   N)	rC   rD   rE   rF   r   DEVICE_BEFOREDEVICE_AFTERDEVICE_NEVERrI   rB   rJ   rK   rM   rM   _   s    FM6L6LrJ   rM   c                       \ rS rSrSr\S\\R                  S4   4S j5       r	\S\R                  4S j5       r
\SS j5       r\\S	\S
\S\R                  4S j5       5       rSrg)FSDPTestModelh   zVThis defines the interface expected from all models used commonly for
FSDP unit tests.return.c                     g)z+Returns an input for the model as as tuple.NrB   selfdevices     rK   	get_inputFSDPTestModel.get_inputl        	rJ   c                     g)z,Returns the loss given the input and output.NrB   )rX   inputoutputs      rK   get_lossFSDPTestModel.get_lossq   r\   rJ   Nc                     g)z<Runs the backward pass (e.g. including ``loss.backward()``).NrB   rX   losss     rK   run_backwardFSDPTestModel.run_backwardv   r\   rJ   argskwargsc                      g)z&Initializes an instance of this model.NrB   )rg   rh   s     rK   initFSDPTestModel.init{   s     	rJ   rB   rU   N)rC   rD   rE   rF   __doc__r   tupletorchTensorrZ   r`   re   staticmethodr   nnModulerj   rI   rB   rJ   rK   rS   rS   h   s     5s):#;        C 3 299   rJ   rS   modelprocess_group	assert_fnc                 6   U R                  5        VVs/ s H$  u  p4X4R                  5       R                  5       4PM&     nnnUU R                  5        VVs/ s H$  u  pgXgR                  5       R                  5       4PM&     snn-  n[        R
                  " U5      n[        U5       V	s/ s H  n	SPM     n
n	[        R                  " XUS9  U
S   nUc   eU
SS  H)  nUc   e[        X5       H  u  u  pu  pU" X5        M     M+     gs  snnf s  snnf s  sn	f )z
All-gathers module states across ranks and calls ``assert_fn`` on each pair
of corresponding states from rank 0 and a nonzero rank. For example, if
``assert_fn`` is ``self.assertEqual()``, then this checks that all module
states are equal across ranks.
Ngroupr   r>   )	named_parametersdetachr<   named_buffersdistget_world_sizerangeall_gather_objectzip)rt   ru   rv   
param_nameparamnamed_module_statesbuffer_namebuffer
world_size_olistrank0_statesstatep1p2s                  rK   _assert_module_statesr      s%    "'!7!7!9!9J 
\\^'')*!9   #(#6#6#8#8K 
mmo))+,#8  $$]3J ,-,aT,E-5]K8L###qr    #L 8GQWab !9 
 .s   +D
+D*Dc                  6    [         R                  " [        5      $ N)ro   rY   DEVICE_TYPErB   rJ   rK   get_devtyper      s    <<$$rJ   zero_buffersc                    U(       a  [         R                  " U 5      O	[        5       nU   U R                  5        H1  n[        R
                  " 5          UR                  5         SSS5        M3     U(       aE  U R                  5        H1  n[        R
                  " 5          UR                  5         SSS5        M3     SSS5        g! , (       d  f       M  = f! , (       d  f       M_  = f! , (       d  f       g= f)zBZeros the parameters and optionally buffers of ``model`` in place.N)FSDPsummon_full_paramsr   
parametersro   no_gradzero_buffers)rt   r   summon_fullctxr   r   s         rK   _zero_modelr      s     -8$
!
!%
([]C	%%'E ! ( --/]]_LLN %_ * 
  %_ 
s;   )C)C$;C)C0C)
CC)
C&!C))
C7c                     U(       d  U R                  [        5      n U(       a  U R                  5         U R                  5       $ r   )tor   half
state_dict)rt   cpu_offloadr   s      rK   _get_state_dictr      s.    %

rJ   c           	      p    SR                  U Vs/ s H  o"b  U [        U5         OSPM     sn5      $ s  snf )Nr   none)joinstr)test_name_mappingrg   ss      rK   subtest_namer      s9    88IMNAm	3q6	"	?N Ns   3c                    UR                  5        H=  u  p#UR                  [        R                  " S5      :w  d  M+  UR                  5       X'   M?     U S:X  a  UOS /n[        R
                  " U5        [        [        [        [        R                  4   US   5      nUR                  5        H  nX   R                  [        5      X'   M     U$ )Nr<   r   )itemsrY   ro   r<   r}   broadcast_object_listr   dictr   rp   keysr   r   )rankr   r   r   r   s        rK   _broadcast_state_dictr      s     (--/
<<5<<..%*YY[J" 0  19Z$/Eu%d3,-uQx8J oo'
!+!7!:!:;!G
 (rJ   recursec                     [         R                  " XS9   [        [        U R	                  5       5      5      sSSS5        $ ! , (       d  f       g= f)a?  
Returns the full unsharded parameters of ``model``. Any FSDP-managed
parameters offloaded to CPU are moved to GPU in the returned list.

Args:
    recurse (bool): If ``False``, only unshards the parameters immediate to
        ``model``; if ``True``, recurses through the module hierarchy
        rooted at ``model``.
)r   N)r   r   r   listr   )rt   r   s     rK   get_full_paramsr      s4     
	 	 	8U--/01 
9	8	8s   "A
Amove_to_devicec                 >    U(       a  U R                  [        5      $ U $ r   )r   r   )rt   r   s     rK   _move_to_devicer      s    $2588K ==rJ   	wrap_fsdpc                 2    U(       d  U $ [        U /UQ70 UD6$ r   r   )rt   r   rg   rh   s       rK   _maybe_wrap_fsdpr      s    !5CtE'CD'CF'CCrJ   c                   H    \ rS rSrS\S\4S jrS\4S jrS\4S jrS rS	r	g
)DummyProcessGroup   r   sizec                     Xl         X l        g r   _rank_size)rX   r   r   s      rK   __init__DummyProcessGroup.__init__   s    

rJ   rU   c                     U R                   $ r   )r   rX   s    rK   r   DummyProcessGroup.rank       zzrJ   c                     U R                   $ r   )r   r   s    rK   r   DummyProcessGroup.size   r   rJ   c                 B    [         R                  " 5       nS nXCl        U$ )Nc                  d    [         R                  R                  5       n U R                  S5        U $ )Nr>   )ro   futuresFuture
set_result)futures    rK   
get_future/DummyProcessGroup.allreduce.<locals>.get_future   s'    +0==+?+?+AFa MrJ   )r   Mockr   )rX   rg   rh   	dist_waitr   s        rK   	allreduceDummyProcessGroup.allreduce   s     IIK		
  *rJ   r   N)
rC   rD   rE   rF   intr   r   r   r   rI   rB   rJ   rK   r   r      s2    S  c c 	rJ   r   c                      ^  \ rS rSrS\R
                  S\S\S\4U 4S jjrS r	S r
S	 rS
 r\   SS\R
                  S\S\S\\\\4      S\S\S\\R*                  \4   4S jj5       rS rSrU =r$ )TransformerWithSharedParamsi  ry   device_init_modeadd_bndeterministicc                   > [         TU ]  5         UR                  5       U l        UR                  5       U l        U(       a  [
        R                  " S5        SnSn[        R                  " XV5      U l	        [        R                  " USSSSS9U l        [        R                  " Xe5      U l        U R                  R                  U R                  l        U R                  SU R                  R                  R!                  U45      5        U R                  S	[
        R"                  " U R$                  [
        R&                  S
95        SU l        U(       a)  [
        R                  R+                  U R(                  5      O[
        R                  R-                  5       U l        U[0        R2                  :X  a  U R5                  [6        5      n U(       a  U R9                  5         g g )Nr               g?)d_modelnum_encoder_layersnum_decoder_layersdim_feedforwarddropout
vocab_biaslong_buffer)dtype)superr   r   r   r   ro   manual_seedrr   	Embeddingembed_tokensTransformertransformerLinearoutput_projweightregister_buffernew_ones
zeros_liker   longbsBatchNorm1dIdentitybnrM   rO   r   r   eval)rX   ry   r   r   r   d_vocabr   	__class__s          rK   r   $TransformerWithSharedParams.__init__  s]    	JJL	**,a LL:>>  
 99W6 #'"3"3":":$++22;;WJG	
 	T__EJJ?	

 39%((&&tww/uxx?P?P?R~;;;77;'DIIK rJ   c                 $   [         R                  " SU R                  -   5        [         R                  " SUS9R	                  SU R
                  5      n[         R                  " U R
                  S-  US9R	                  SU R
                  5      nX#4$ )Nr>      rY      r6   )ro   r   r   arangeviewr   )rX   rY   srctgts       rK   rZ   %TransformerWithSharedParams.get_input.  sj    !dii-(ll2f-221dgg>ll477Q;v6;;AtwwGzrJ   c                     U R                  U5      nX0R                  -   U R                  R                  U5      -   nU R                  U5      nU R	                  U5      nU R                  X45      nU R                  U5      $ r   )r   r   r   type_asr   r   r   )rX   src_idstgt_idsr  r  xs         rK   forward#TransformerWithSharedParams.forward4  sr    (OO#d&6&6&>&>s&CC(ggclS&""rJ   c                     Uu  p4[         R                  R                  UR                  SUR	                  S5      5      UR                  S5      SS9$ )Nsum)	reduction)rr   
functionalcross_entropyr  r   )rX   r^   r_   r   r  s        rK   r`   $TransformerWithSharedParams.get_loss<  sG    }}**KKFKKO,chhrle + 
 	
rJ   c                 $    UR                  5         g r   backwardrc   s     rK   re   (TransformerWithSharedParams.run_backwardB      rJ   fsdp_init_modefsdp_kwargsrU   c                 x   Uc  0 nU[         R                  :X  a)  [        U [        5      (       a  U S   nOU n[	        XbXT5      $ U[         R
                  :X  a  SU;  a  [        [        [        15      nOUR                  S5      nSU;   a?  US   [        R                  [        R                  1;   a  [        U [        5      (       d  SnOU n[        U [        5      (       a  U S   n	OU n	[	        XXT5      n
[        U
U4SU0UD6nU[        R                  :X  a  UR!                  ["        5      nU$ [%        SU 35      e)a  
Initializes a :class:`TransformerWithSharedParams` instance.

Args:
    fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
        any modules with FSDP. If ``RECURSIVE``, then wraps with
        top-level FSDP. By default, the top-level FSDP uses the
        ``ModuleWrapPolicy`` for encoder and decoder layers, but a
        different auto wrap policy may be specified via
        ``fsdp_kwargs``.
    device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
    fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
        forwarded to the FSDP constructor.
    deterministic (bool): Whether to make the model deterministic
        across constructions.
    add_bn (bool): Whether to include batch norm in the model.
Nr   auto_wrap_policysharding_strategyUnsupported FSDP init mode: )r@   rG   
isinstancern   r   rH   r    r*   r)   popr   HYBRID_SHARD_HYBRID_SHARD_ZERO2r   rM   rP   r   r   
ValueError)ry   r  r   r  r   r   pgr   fsdp_pg
tformer_pgm
fsdp_models               rK   rj    TransformerWithSharedParams.initE  sW   6 K\111%''1X.f  |555!4#3//$  $/??3E#F  ${2 34$113C3W3WXY"5%00%''"1X
"
+fA  "2 	J  >#>#>>']];7
77GHIIrJ   c                     U R                   /$ r   )r   r   s    rK   get_ignored_modules/TransformerWithSharedParams.get_ignored_modules  s      !!rJ   )r   r   r   r   r   r   r   )NFT)rC   rD   rE   rF   r}   ProcessGrouprM   boolr   rZ   r  r`   re   rq   r@   r   r   r   r   r   rr   rs   r   rj   r/  rI   __classcell__r   s   @rK   r   r     s    (  ( )( 	(
 (T#
 
 15#KJ  KJ$KJ )KJ d38n-	KJ
 KJ KJ 
ryy$	KJ KJZ" "rJ   r   c                      ^  \ rS rSrS\R
                  S\S\S\4U 4S jjrS r	S r
S	 rS
 r\  SS\R
                  S\S\S\\\\4      S\S\R(                  4S jj5       rSrU =r$ )NestedWrappedModulei  ry   r   r   r   c                   >^^^ [         TU ]  5         TR                  5       U l        TR                  5       U l        U[
        R                  :H  nUUU4S jnU(       a  [        R                  " S5        [        R                  " [        [        R                  " SS5      U5      U" [        R                  " U" [        [        R                  " SS5      U5      5      [        [        R                  " SS5      U5      5      5      U" [        [        R                  " SS5      U5      5      [        [        R                  " SS5      U5      5      U l        g )Nc                 0   > T(       a  [        U T40 TD6$ U $ r   r   layerr  ry   r   s    rK   _maybe_wrap1NestedWrappedModule.__init__.<locals>._maybe_wrap      E58K88LrJ   r   r   r6   r   )r   r   r   r   r   rM   rO   ro   r   rr   
Sequentialr   r   module	rX   ry   r   r   r   r  r   r;  r   s	    ``  `  rK   r   NestedWrappedModule.__init__  s     	JJL	**,)^-I-II	
 a mmBIIaO^<		!R0@. QR#BIIb"$5~F 		"a(8.IJBIIaO^<

rJ   c                 v    [         R                  " SU R                  -   5        [         R                  " SSUS94$ )Nr>   r6   r   r  )ro   r   r   randrW   s     rK   rZ   NestedWrappedModule.get_input  s.    !dii-(

1a/11rJ   c                 $    U R                  U5      $ r   r?  rX   r  s     rK   r  NestedWrappedModule.forward      {{1~rJ   c                 &    UR                  5       nU$ r   )r  rX   r^   r_   rd   s       rK   r`   NestedWrappedModule.get_loss  s    zz|rJ   c                 $    UR                  5         g r   r  rc   s     rK   re    NestedWrappedModule.run_backward  r  rJ   r  r  rU   c                    Uc  0 nU[         R                  :X  a  [        U SUUS9$ U[         R                  :X  a;  [        U 4SUUS.UD6nU[        R
                  :X  a  UR                  [        5      nU$ [        SU 35      e)a  
Initializes a :class:`NestedWrappedModule` instance.

Args:
    fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
        any modules with FSDP. If ``RECURSIVE``, then wraps some nested
        modules with FSDP but not the top-level module. The model may
        later be wrapped with a top-level FSDP external to this method
        if desired.
    device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
    fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
        forwarded to the FSDP constructor.
    deterministic (bool): Whether to make the model deterministic
        across constructions.
Fr   r   r   Tr"  )	r@   rG   r6  rH   rM   rP   r   r   r'  )ry   r  r   r  r   r,  s         rK   rj   NestedWrappedModule.init  s    . K\111&!1+	  |555,!1+	
 J  >#>#>>']];7
77GHIIrJ   r?  r   r   NF)rC   rD   rE   rF   r}   r1  r2  rM   r   rZ   r  r`   re   rq   r@   r   r   r   r   rr   rs   rj   rI   r3  r4  s   @rK   r6  r6    s    
  
 
 )	

 
@2 
 15#+J  +J$+J )+J d38n-	+J
 +J 
+J +JrJ   r6  c                   v   ^  \ rS rSr\  S	S\R                  S\S\S\	\
\\4      S\4
U 4S jjj5       rSrU =r$ )
AlwaysWrapNestedWrappedModulei  ry   r  r   r  r   c                 :  > [         [        [        ]   U [        R                  UUUS9nU[        R                  :X  a  U$ U[        R
                  :X  aH  U=(       d    0 n[        U4S[        0UD6nU[        R                  :X  a  UR                  [        5      nU$ g)z
Initializes a :class:`NestedWrappedModule` instance, but unlike
:meth:`NestedWrappedModule.init`, for the ``RECURSIVE`` init mode, this
wraps with top-level FSDP and the ``always_wrap_policy()`` auto wrap
policy.
)ry   r  r   r  r   r   N)r   rU  rj   r@   rG   rH   r   r   rM   rP   r   r   )ry   r  r   r  r   rt   r,  r   s          rK   rj   "AlwaysWrapNestedWrappedModule.init  s     )+H
'//-#'  
 	 \111L|555%+KeX6HXKXJ>#>#>>']];7
 6rJ   rB   rS  )rC   rD   rE   rF   rq   r}   r1  r@   rM   r   r   r   r   r2  rj   rI   r3  r4  s   @rK   rU  rU    s^    
 15#  $ ) d38n-	
  rJ   rU  c                      ^  \ rS rSrS\R
                  S\S\S\4U 4S jjr\	SS j5       r
\	  SS\R
                  S\S\S	\\\\4      S\4
S
 jj5       rSrU =r$ )NonUniformReqGradNWMi  ry   r   r   r   c                   >^^^ [         [        U ]  5         TR                  5       U l        TR	                  5       U l        U[        R                  :H  nUUU4S jnU(       a  [        R                  " S5        [        R                  " [        [        R                  " SS5      U5      U" [        R                  " U" [        [        R                  " SS5      U5      5      [        [        R                  " SS5      U5      5      5      U" [        R                  " [        [        R                  " SS5      U5      [        [        R                  " SS5      U5      5      5      5      U l        g )Nc                 0   > T(       a  [        U T40 TD6$ U $ r   r   r9  s    rK   r;  2NonUniformReqGradNWM.__init__.<locals>._maybe_wrap+  r=  rJ   r   r   r6   r   )r   r6  r   r   r   r   rM   rO   ro   r   rr   r>  r   r   r?  r@  s	    ``  `  rK   r   NonUniformReqGradNWM.__init__  s     	!413 JJL	**,)^-I-II	
 a mmBIIaO^<		!R0@. QR#BIIb"$5~F #BIIb!$4nE#BIIaO^D
rJ   c                     U R                  5        H3  u  p#[        R                  " X5      (       a  M"  UR                  S5        M5     g rS  )rz   rematchrequires_grad_)rt   req_grad_masknps       rK   _set_nonuniform_req_grad-NonUniformReqGradNWM._set_nonuniform_req_gradB  s4    **,DA88M--  ' -rJ   r  r  c                    [         R                  " S5      nU[        R                  :X  a#  [	        U SUUS9n[        R                  Xe5        U$ U[        R                  :X  aU  Uc  0 n[	        U 4SUUS.UD6nU[        R                  :X  a  UR                  [        5      n[        R                  Xu5        U$ [        SU 35      e)a  
Initializes a :class:`NestedWrappedModule` instance, but unlike
:meth:`NestedWrappedModule.init`, it wraps a second :class:`torch.nn.Sequential`
container to enable the desired non-uniform ``requires_grad``
``use_orig_params=True`` tests. For both ``RECURSIVE`` and ``NO_FSDP``
init modes, freezes all parameters except the last two to validate
``ShardedGradScaler`` support for ranks with no (non-zero sized) local shards in
FSDP ``use_orig_params=True`` mode.
zmodule\.2.*\.1.*FrP  Tr"  )r_  compiler@   rG   rY  re  rH   rM   rP   r   r   r'  )ry   r  r   r  r   req_grad_pattern	ddp_modelr,  s           rK   rj   NonUniformReqGradNWM.initH  s    ( ::&9:\111,!1+	I !99)V|555" -!1+	
 J  >#>#>>']];7
 99*W77GHIIrJ   rR  rl   rS  )rC   rD   rE   rF   r}   r1  r2  rM   r   rq   re  r@   r   r   r   r   rj   rI   r3  r4  s   @rK   rY  rY    s    (
  (
 (
 )	(

 (
T ( (
 
 15#+J  +J$+J )+J d38n-	+J
 +J +JrJ   rY  c                      ^  \ rS rSrSrS\R                  S\S\4U 4S jjrS r	S r
S	 rS
 r\S\\   S\S\S\S\4
S j5       rSrU =r$ )ModuleWithDelayiw  zThis class wraps a :class:`FSDPTestModel` to optionally add a delay
after computing the loss and/or before the gradient reduction.r?  delay_after_loss_msdelay_before_reduction_msc                 F   > [         TU ]  5         X l        X0l        Xl        g r   )r   r   rn  ro  r?  )rX   r?  rn  ro  r   s       rK   r   ModuleWithDelay.__init__{  s!     	#6 )B&rJ   c                 8    U R                   R                  U5      $ r   )r?  rZ   rW   s     rK   rZ   ModuleWithDelay.get_input  s    {{$$V,,rJ   c                 $    U R                  U5      $ r   rF  rG  s     rK   r  ModuleWithDelay.forward  rI  rJ   c                 b   U R                   R                  X5      nU R                  S:  a  [        (       d  [        (       a%  [
        R                  " U R                  S-  5        U$ [        (       a=  [        R                  R                  [        U R                  [        5       -  5      5        U$ Nr     )r?  r`   rn  r3   r4   timesleepr2   ro   r7   _sleepr   r1   rK  s       rK   r`   ModuleWithDelay.get_loss  s}    {{##E2##a'x88

433d:;  

!!#d&>&>ARAT&T"UVrJ   c                    ^ ^ [         R                  R                  mUU 4S jn[        R                  " SU5         T R
                  R                  U5        S S S 5        g ! , (       d  f       g = f)Nc                  8  > TR                   S:  a  [        (       a>  [        R                  R	                  [        TR                   [        5       -  5      5        O9[        (       d  [        (       a#  [        R                  " TR                   S-  5        T" U 0 UD6$ rw  )ro  r2   ro   r7   r{  r   r1   r3   r4   ry  rz  )rg   rh   orig_reduce_scatterrX   s     rK   _delayed_reduce_scatter=ModuleWithDelay.run_backward.<locals>._delayed_reduce_scatter  sq    --19JJ%%D::=N=PPQ XJJt==DE&777rJ   z'torch.distributed.reduce_scatter_tensor)ro   distributedreduce_scatter_tensorr   patchr?  re   )rX   rd   r  r  s   `  @rK   re   ModuleWithDelay.run_backward  sR    #//EE	8 ZZ57N
 KK$$T*
 
 
s   A  
A.module_class
model_argsmodel_kwargsc                <    [        U R                  " U0 UD6UU5      $ )a  
Args:
    module_class (Type[FSDPTestModel]): Wrapped module class to which
        to add delays.
    model_args: Positional arguments forwarded to the ``module_class``
        ``init()``.
    delay_after_loss_ms (int): Delay after computing the loss/before
        the optimizer step (in ms).
    delay_before_reduction_ms (int): Delay before reduce-scattering
        gradients (in ms).
    model_kwargs: Keyword arguments forwarded to the ``module_class``
        ``init()``.
)rm  rj   )r  rn  ro  r  r  s        rK   rj   ModuleWithDelay.init  s*    * z:\:%
 	
rJ   )rn  ro  r?  )rC   rD   rE   rF   rm   rr   rs   r   r   rZ   r  r`   re   rq   typerS   r   rj   rI   r3  r4  s   @rK   rm  rm  w  s    F				 !	 $'		-+$ 
=)

 !
 $'	

 
 
rJ   rm  c                       \ rS rSr\\R                  SSSS4S\R                  S\	S\S\
\\\4      S	\S
\S\4S jj5       rSrg)NestedWrappedModuleWithDelayi  NFr   ry   r  r   r  r   rn  ro  c                 >    [         R                  [        U UUUUUUS9$ )Nry   r  r   r  r   rn  ro  )rm  rj   r6  r  s          rK   rj   !NestedWrappedModuleWithDelay.init  s4     ##)-#' 3&? $ 	
 		
rJ   rB   )rC   rD   rE   rF   rq   rM   rP   r}   r1  r@   r   r   r   r   r2  r   rj   rI   rB   rJ   rK   r  r    s     ,:+F+F04##$)*
  
$
 )
 d38n-	

 
 !
 $'
 
rJ   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )DummyDDPi  c                 .   > [         TU ]  5         Xl        g r   )r   r   r?  )rX   r?  r   s     rK   r   DummyDDP.__init__  s    rJ   c                 &    U R                   " U0 UD6$ r   rF  rX   rg   rh   s      rK   r  DummyDDP.forward  s    {{D+F++rJ   rF  rC   rD   rE   rF   r   r  rI   r3  r4  s   @rK   r  r    s    , ,rJ   r  c                      ^  \ rS rSrS\R
                  S\S\S\S\4
U 4S jjr	S r
S	 r\   SS\R
                  S
\S\S\\\\4      S\S\4S jj5       rSrU =r$ )MixtureOfExpertsi  ry   r   r   delay_before_free_msr   c                   > [         TU ]  UUUUS9  Xl        X@l        X l        U[
        R                  :H  U l        U(       a#  [        R                  " SU R                  -   5        SnSnSn	[        [        R                  " Xx5      U R                  5      n
[        S U
R                  5        5       5      U l        U
R                  5        H
  nSUl        M     U(       a  [        R                  " S5        [        [        R                  " X5      U R                  5      nU(       aF  [        R$                  R'                  UR                  5       /5      n[)        X40 UD6n
[)        X40 UD6n[        R*                  " [        [        R                  " X5      U R                  5      UU
[        [        R                  " X5      U R                  5      5      U l        g )	N)ry   r   r   r   *   r   r  r   c              3   @   #    U  H  oR                  5       v   M     g 7fr   )numel).0rd  s     rK   	<genexpr>,MixtureOfExperts.__init__.<locals>.<genexpr>   s     $L8K1WWYY8K   Tr   )r   r   ry   r  r   rM   rO   r   ro   r   r   r   rr   r   r  r   num_expert_paramsexpertr  	new_groupr   r>  r?  )rX   ry   r   r   r  r   r  d_expertd_sharedd_inputr  rd  sharedexpert_groupr   s                 rK   r   MixtureOfExperts.__init__  s{    	-'	 	 	
 
$8!"..2N2NNb499n- 8!>@S@ST!$$L8I8I8K$L!L""$AAH % a  8!>@S@ST ,,66L &>+>F&7;7FmmBIIg8$:M:MNBIIh8$:M:MN	
rJ   c                   ^ ^ T R                   S:  a  T R                  S   n[        U[        5      (       ag  [        R
                  R                  R                  R                  mUU 4S jn[        R                  " SU5         T R                  U5      sS S S 5        $ T R                  U5      $ ! , (       d  f       N= f)Nr   r   c                    > [         (       a>  [        R                  R                  [	        TR
                  [        5       -  5      5        O9[        (       d  [        (       a#  [        R                  " TR
                  S-  5        T" U 0 UD6$ )Nrx  )r2   ro   r7   r{  r   r  r1   r3   r4   ry  rz  )rg   rh   orig_reshardrX   s     rK   _delayed_reshard2MixtureOfExperts.forward.<locals>._delayed_reshard  sc     y

)) 9 9<M<O OP "XX

4#<#<t#CD'888rJ   z.torch.distributed.fsdp._runtime_utils._reshard)r  r?  r#  r   ro   r  fsdp_runtime_utils_reshardr   r  )rX   r  r  r  r  s   `   @rK   r  MixtureOfExperts.forward  s    $$q([[^F&$''$0055DDMM9 ZZDFV  ;;q> 
 {{1~ s   B//
B=c                    UR                  5         U R                  (       d  [        R                  " 5          U R	                  5        H|  n[        US5      (       a  M  UR                  c  M%  UR                  R                  U R                  5        [        R                  R                  UR                  U R                  S9  M~     S S S 5        g g ! , (       d  f       g = f)Nr  rx   )r  r   ro   r   r   hasattrgraddiv_r   r  
all_reducery   )rX   rd   rd  s      rK   re   MixtureOfExperts.run_backward1  s    ~~*Aq(++ vv)DOO4))44QVV4::4N + !  s   2C-AC
C r  r  c                 
   Uc  0 nU[         R                  :X  a  [        U SUUUS9$ U[         R                  :X  a<  [        U 4SUUUS.UD6nU[        R
                  :X  a  UR                  [        5      nU$ [        SU 35      e)a  
Initializes a :class:`MixtureOfExperts` instance.

Args:
    fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
        any modules with FSDP. If ``RECURSIVE``, then wraps some nested
        modules with FSDP, including the expert and shared layers, but
        not the top-level module. The model may later be wrapped with a
        top-level FSDP external to this method if desired.
    device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
    fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
        forwarded to the FSDP constructor.
    deterministic (bool): Whether to make the model deterministic
        across constructions.
    delay_before_free_ms (int): Delay before resharding expert
        parameters in the forward pass (in ms).
F)r   r   r  r   Tr"  )	r@   rG   r  rH   rM   rP   r   r   r'  )ry   r  r   r  r   r  r,  s          rK   rj   MixtureOfExperts.init=  s    4 K\111#!1%9+  |555)!1%9+ J  >#>#>>']];7
77GHIIrJ   )r  ry   r?  r   r  r   )NFr   )rC   rD   rE   rF   r}   r1  r2  rM   r   r   r  re   rq   r@   r   r   r   r   rj   rI   r3  r4  s   @rK   r  r    s    2
  2
 2
 )	2

 "2
 2
h0
O 
 15#$%0J  0J$0J )0J d38n-	0J
 0J "0J 0JrJ   r  c                      ^  \ rS rSr SSSSS.S\S\\R                     S\S	\S
\4
U 4S jjjjr	S\R                  S\R                  4S jrS rSrU =r$ )MLPiq  TFr6   )biaswith_bufferdim_multiplierdimrY   r  r  r  c                   > [         TU ]  5         [        R                  " XU-  X#S9U l        [        R                  " XQ-  XUS9U l        U(       a'  U R                  S[        R                  " U4US95        g S U l	        g )N)rY   r  r   r  )
r   r   rr   r   in_projout_projr   ro   randnr   )rX   r  rY   r  r  r  r   s         rK   r   MLP.__init__r  sf     	yys&:6U		."6QUV  5;;vf+MNDKrJ   r  rU   c                     U R                  U5      n[        R                  " U5      nU R                  U5      n[        R                  " U5      nU R                  b  X R                  -   nU$ r   )r  Frelur  r   )rX   r  zs      rK   r  MLP.forward  sQ    LLOFF1IMM!FF1I;;"KKArJ   c                     U R                   b4  [        R                  R                  R	                  U R                   5        g g r   )r   ro   rr   rj   normal_r   s    rK   reset_parametersMLP.reset_parameters  s+    ;;"HHMM!!$++. #rJ   )r   r  r  r   )rC   rD   rE   rF   r   r   ro   rY   r2  r   rp   r  r  rI   r3  r4  s   @rK   r  r  q  s     *.
 ! &
    " %,, / /rJ   r  c                   V   ^  \ rS rSrSS.S\S\4U 4S jjjrS\S\S	\S
S 4S jrSr	U =r
$ )MLPStacki  F)with_seq_parallelmlp_dimr  c                   > [        USS9[        U5      [        USS9/nU(       a$  UR                  [        R                  " USS95        [        TU ]  " U6   X l        g )N   )r  Fr  )r  appendrr   	LayerNormr   r   r  )rX   r  r  modulesr   s       rK   r   MLPStack.__init__  sV     *L*	$
 NN2<<e<='"!2rJ   tp_meshdp_meshuse_activation_checkpointingrU   c           
         [        SS9[        SS9[        SS9[        SS9[        SS9U R                  (       a  [        [        S5      S9O	[        5       S.nU R                  (       a  [	        SS9US'   [        XUS9  U  HD  n[        U[        R                  5      (       a  M$  U(       a  [        U5        [        U4S	U0UD6  MF     [        U 4S	U0UD6  U $ )
NF)use_local_outputr>   )output_layouts)z	0.in_projz
0.out_projz	1.in_projz
1.out_projz	2.in_projz
2.out_proj)sequence_dim3)device_meshparallelize_planmesh)r%   r'   r  r$   r(   r&   r#  rr   r  r   r   )rX   r  r  r  r  r  r?  s          rK   parallelizeMLPStack.parallelize  s     )%@)5A(%@)5A(%@%% *qB "
 !!$4!$DS!4GWXF&",,//+6"<W<<  	D6w6+6rJ   )rC   rD   rE   rF   r   r2  r   r   r  rI   r3  r4  s   @rK   r  r    sN    BG 
3 
34 
3 
3  '+	 
 rJ   r  c                      ^  \ rS rSrSrS
S\S\4U 4S jjjrS\R                  S\
\\R                  \R                  4   \R                  4   4S jrS	rU =r$ )DoubleLineari  z
This can be used for returning multiple outputs from a module
(``use_second_linear=True``) or for having an unused module (``False``).
r  use_second_linearc                    > [         TU ]  5         [        R                  " X5      U l        [        R                  " X5      U l        [        R                  " 5       U l        X l        g r   )	r   r   rr   r   lin1lin2ReLUr  r  )rX   r  r  r   s      rK   r   DoubleLinear.__init__  sA    IIc'	IIc'	GGI	!2rJ   r  rU   c                     U R                   (       a@  U R                  U R                  U5      5      U R                  U R                  U5      5      4$ U R                  U R                  U5      5      $ r   )r  r  r  r  rG  s     rK   r  DoubleLinear.forward  sQ     !!99TYYq\*DIIdiil,CCCyy1&&rJ   )r  r  r  r  T)rC   rD   rE   rF   rm   r   r2  r   ro   rp   r   rn   r  rI   r3  r4  s   @rK   r  r    s^    
3C 3D 3 3''	uU\\5<</0%,,>	?' 'rJ   r  new_all_gather_into_tensorc              #      #    [         R                  n[         R                  " 5         U [         l         S v   [         R                  " 5         U[         l        g ! [         R                  " 5         U[         l        f = f7fr   )r}   all_gather_into_tensorbarrier)r  orig_all_gathers     rK   patch_all_gatherr    sN     11OLLN"<D6&5# 	&5#   1A>A !A>"A;;A>new_reduce_scatter_tensorc              #      #    [         R                  n[         R                  " 5         U [         l         S v   [         R                  " 5         U[         l        g ! [         R                  " 5         U[         l        f = f7fr   )r}   r  r  )r  r  s     rK   patch_reduce_scatterr    sO     44LLN!:D9%8" 	%8"r  new_all_reducec              #      #    [         R                  n[         R                  " 5         U [         l         S v   [         R                  " 5         U[         l        g ! [         R                  " 5         U[         l        f = f7fr   )r}   r  r  )r  orig_all_reduces     rK   patch_all_reducer    sI     ooOLLN$DO*) 	)r  new_unshardc              #      #    [         R                  n[        R                  " 5         U [         l         S v   [        R                  " 5         U[         l        g ! [        R                  " 5         U[         l        f = f7fr   )r   unshardr}   r  )r  orig_unshards     rK   patch_unshardr    P      "))LLLN(N.!- 	!-r  new_reshardc              #      #    [         R                  n[        R                  " 5         U [         l         S v   [        R                  " 5         U[         l        g ! [        R                  " 5         U[         l        f = f7fr   )r   reshardr}   r  )r	  r  s     rK   patch_reshardr  
  r  r  new_post_backwardc              #      #    [         R                  n[        R                  " 5         U [         l         S v   [        R                  " 5         U[         l        g ! [        R                  " 5         U[         l        f = f7fr   )r   post_backwardr}   r  )r  orig_post_backwards     rK   patch_post_backwardr    sQ      (55LLN#4N :'9$ 	'9$r  new_backwardc              #      #    [         R                  n[        R                  " 5         U [         l         S v   [        R                  " 5         U[         l        g ! [        R                  " 5         U[         l        f = f7fr   )r   r  r}   r  )r  orig_backwards     rK   *patch_register_post_backward_hook_backwardr  $  sS      199MLLN,8 )>0=$- 	0=$-r  r  rg   rh   c                     [        U5      S:  a  US   nOSU;   a  US   nO[        SU SU 35      eU" U5        U" U0 UD6$ )Nr   r_   z,Cannot get reduce-scatter output from
args: z	
kwargs: )lenAssertionError)clsr  rv   rg   rh   r_   s         rK   reduce_scatter_with_assertr  1  sa     4y1}a	V	!;D6F8T
 	
 f///rJ   replicated_modulesharded_moduleprefixes_to_ignore.c                    [        UR                  5       UR                  5       5       GH  u  u  pEu  pgUnU H  n	UR                  U	S5      nM     U R                  XH5        U R	                  U[
        5        [        U[
        5      (       d   eUR                  UR                  p[        U5      [        S5      [        S5      4:X  a  [        S5      e[        XZU5      nU R                  UR                  5       UR                  5       5        UR                  c  U R                  UR                  5        GM  U R!                  UR                  5        [        UR                  X5      nU R	                  UR                  [
        5        [        UR                  [
        5      (       d   eU R                  UR                  R                  5       UR                  5       5        GM     g )N r   zmFSDP's (Shard(0), Shard(0)) layout differs from distribute_tensor(), so we cannot check for equality using it)r   rz   replaceassertEqualassertIsInstancer#   r#  r  
placementsrn   r$   r  r"   to_localr  assertIsNoneassertIsNotNone)r  r  r  r  replicated_namereplicated_paramsharded_namesharded_paramclean_sharded_nameprefixr  r#  sharded_ref_paramsharded_ref_grads                 rK   check_sharded_parityr/  D  s    OR**,n.M.M.OOJ+-Jl *(F!3!;!;FB!G )<]G4-1111(44m6N6Njq58 44 ;  ..>jQ..02C2L2L2NO  (]//0M../,-=-B-BDU]//9-,,g6666**3357G7P7P7RS1OrJ   c                   J   ^  \ rS rSr\S 5       rU 4S jrS rS rS r	Sr
U =r$ )FSDPTestMultiThreadie  c                     [         $ r   DEVICE_COUNTr   s    rK   r   FSDPTestMultiThread.world_sizef      rJ   c                 B   > [         TU ]  5         U R                  5         g r   )r   setUp_spawn_threadsrX   r   s    rK   r8  FSDPTestMultiThread.setUpj  s    rJ   c                      [        U /UQ70 UD6$ r   r.   r  s      rK   r.    FSDPTestMultiThread.run_subtestsn      D242622rJ   c                 @    [         R                  R                  5         g r   ro   _dynamoresetr   s    rK   perThreadSetUp"FSDPTestMultiThread.perThreadSetUpq      rJ   c                 @    [         R                  R                  5         g r   rA  r   s    rK   perThreadTearDown%FSDPTestMultiThread.perThreadTearDownt  rF  rJ   rB   )rC   rD   rE   rF   propertyr   r8  r.   rD  rH  rI   r3  r4  s   @rK   r1  r1  e  s.     3 rJ   r1  c            $         ^  \ rS rSrU 4S jr\S 5       r\S 5       r\S\4S j5       r	\S 5       r
S rS	 rS
 rS r\S 5       r       S)S\R$                  S\S\S\S\\   S\S\\   S\S\S\\\\4      4S jjrSSS\" 5       SSSSSSSSS4S\\   S\S\S\\    S \S\S!\S"\\!   S#\\"   S\\   S$\S%\S\S\S&\\\\4      S\\\\4      4 S' jjr#S(r$U =r%$ )*FSDPTestix  c                 h   > [         TU ]  5         S[        R                  S'   U R	                  5         g )N0TORCH_NCCL_DESYNC_DEBUG)r   r8  osenviron_spawn_processesr:  s    rK   r8  FSDPTest.setUpy  s)     14

,-rJ   c                     [         $ r   r3  r   s    rK   r   FSDPTest.world_size  r6  rJ   c                 >    [         R                  R                  5       $ r   )r}   distributed_c10d_get_default_groupr   s    rK   ru   FSDPTest.process_group  s    $$7799rJ   rU   c                     grS  rB   r   s    rK   destroy_pg_upon_exitFSDPTest.destroy_pg_upon_exit  s     rJ   c                 *    [          U R                   3$ r   )r0   	file_namer   s    rK   init_methodFSDPTest.init_method  s    t~~.//rJ   c                 :    U R                  X!R                  5        g r   )r!  r   )rX   r,  r   s      rK   _check_cpu_offloadFSDPTest._check_cpu_offload  s    &<&<=rJ   c                 :    U R                  X!R                  5        g r   )r!  backward_prefetch)rX   r,  re  s      rK   _check_backward_prefetch!FSDPTest._check_backward_prefetch  s    *,H,HIrJ   c                 :    U R                  X!R                  5        g r   )r!  forward_prefetch)rX   r,  ri  s      rK   _check_forward_prefetch FSDPTest._check_forward_prefetch  s    )+F+FGrJ   c                      [        U /UQ70 UD6$ r   r=  r  s      rK   r.   FSDPTest.run_subtests  r?  rJ   c                 4   U " U5      nXl         X6l        UR                  SS5      n[        SUR                    SUR                   35         U(       a^  [
        R                  R                  R                  R                  R                  5       n[        R                  " SUR                  UUS9  OC[        R                  " UR                  [        [        UR                  5      UR                   S9   S n
UR                   [,        -  n[.        (       d  [0        (       a  [
        R2                  R5                  U5        U/n
[        R6                  " U
S9  [
        R8                  R;                  5         UR=                  X$5        [
        R8                  R;                  5         [        R6                  " U
S9  [        R>                  " 5         g ! [          a@  n	SU	R"                  S	   ;   a'  [$        R&                  " [(        S
   R*                  5        e S n	A	ff = f)Nfake_pgFzdist init r=z, world=fake)backendr   r   store)r_  rq  r   r   	recompiler   backend_unavailable)
device_ids) r   r^  getprintr   ro   testing	_internalr  ro  	FakeStorer}   init_process_groupr_  DISTRIBUTED_BACKENDr   RuntimeErrorrg   sysexitr/   	exit_coder4  r2   r4   acceleratorset_device_indexr  rB  rC  run_testdestroy_process_group)r  r   	test_namer^  piperh   rX   ro  rr  eru  	device_ids               rK   _runFSDPTest._run  s   9~	"**Y.TYYKx/@AB	//;;CCMMO''"#	 '' $ 0 0/"4??3	 
II,	9..y9[

 	
+i&
+""$/  	affQi'$9:DDE		s    A$G 2AG 
H;HHNFrt   	num_stepsautocastlrfsdp_cpu_offload
save_modelmixed_precisionenable_sharded_grad_scaleruse_pure_fp16sharded_grad_scaler_kwargsc           	         U=(       a    UR                   n[        UR                  5       5      R                  nU
c  0 n
[	        SSU0U
D6n[
        R                  R                  UR                  5       USS9n[        U5       GHg  nUR                  5         [
        R                  R                  [        US9   UR                  R                  [
        R                  " [        5      5      nU	(       d  U(       aW  [        U[         5      (       dB  [        U[
        R"                  5      (       a  UR%                  5       nO['        S U 5       5      nU" U6 nU(       ap  [        U[         5      (       a[  UR(                  [*        ;  aG  UR                  5        H3  nU R-                  UR                  [
        R                  " S5      5        M5     UR                  R/                  UU5      R1                  U5      nS S S 5        UR3                  W5      nU(       d-  U	(       d&  UR4                  [
        R6                  :X  d   S5       eOU	(       a+  U R-                  UR4                  [
        R8                  5        Ok[        U[         5      (       a,  Uc   eU R-                  UR4                  UR:                  5        O*U R-                  UR4                  [
        R6                  5        UR                  R=                  U5        U(       a\  [        U[         5      (       aG  UR                  5        H3  nU R-                  UR                  [
        R                  " S5      5        M5     UR?                  U5        URA                  5         U(       d  GM	  URC                  5       RE                  5        VVs0 s H  u  nnUURG                  5       _M     nnn[I        U5        URK                  U5        GMj     [        U[         5      (       a  URM                  [N        RP                  5        WRS                  5       $ ! , (       d  f       GNA= fs  snnf )	Nenabledg?)r  momentum)r  c              3   @   #    U  H  oR                  5       v   M     g 7fr   )r   )r  r  s     rK   r  4FSDPTest._train_for_several_steps.<locals>.<genexpr>  s     %>1ffhhr  r<   zeloss data type should be float32, as the original                     parameter data type is float32.rB   )*offload_paramsnextr   rY   r   ro   optimSGDr   	zero_gradampr  r   r?  rZ   r#  r   rp   r   rn   r!  r   r!  r`   r   scaler   float32float16param_dtypere   stepupdater   r   cloner   load_state_dict_assert_stater   IDLEr{   )rX   rt   r  r  r  r  r  r  r  r  r  cpu_offload_paramsmodel_devicesharded_grad_scalerr  r   r^   r_   rd  rd   kvr   s                          rK   _train_for_several_steps!FSDPTest._train_for_several_steps  s/    .Q2B2Q2QE,,./66%-)+&/ 
.
2L

  0 0 2rCHy!AOO##K#B..u||K/HI _Zt=T=T!%66 %

 %%>%> > '"5$// //>? #--/((5<<3FG 0 ||,,UF;>>|L- C. ',,T2D"=JJ%--/555/ !$$TZZ?t,,*666$$TZZ1L1LM$$TZZ?LL%%d+!j&=&=))+A$$QXXu||E/BC ,  $$U+&&(z7<7G7G7I7O7O7QR7Qtq!al7Q
R E"%%j1s "v eT"" 2 23{{}w CBf Ss   9E P3-Q3
Q	r   Tmodel_classr  r   ref_init_fn	num_itersr   re  r!  ri  use_orig_paramsinit_kwargsc                    U[         R                  :w  d   S5       eUc  0 nSnU R                  R                  5       nUR                  " U R                  [         R                  [
        R                  4SS0UD6nUc-  [        (       a  [        U[        /[        S9nO[        UU/US9nOU" U5      nU(       a  UR                  5       nU R                  UUU
SLUUU
UUUS9	n[        UR                  5       5      nUR                  UUU	U
UUS.5         UR                  " U R                  UUU4SS0UD6n[%        U[&        5      (       d  ['        UU R                  40 UD6nU(       a  UR                  5       nU[
        R(                  :X  a  UR+                  [        5      nUSL=(       a    UR,                  nU=(       a    U[
        R(                  :H  nU=(       a    U[
        R(                  :g  nU(       aI  [.        R0                  " S5      nUR                  5        H  nU R3                  UR0                  U5        M!     U(       a  U R5                  [6        S[         35      O	[9        5       nU   U R                  UUSUUUU
UUUS9
n SSS5        U(       a  gU(       a^  [.        R0                  " S5      nUR                  5        H  nU R3                  UR0                  U5        M!     W R+                  [        5      n [;        U5      n![.        R<                  R?                  UW SS9  U
c  U(       d  U R3                  UU!SSS9  ggg! [         a   n[!        S	U S
[#        U5       35      UeSnAff = f! , (       d  f       N= f)a  
Tests FSDP training against a reference, which defaults to DDP but
may be customized with ``ref_init_fn``.

Args:
    model_class (Type[FSDPTestModel]): A model class that inherits from
        ``FSDPTestModel``, which defines the expected interface.
    fsdp_init_mode (FSDPInitMode): The mode to initialize the
        FSDP-wrapped model. This should not be ``NO_FSDP``.
    ref_init_fn (Optional[Callable]): A callable to invoke that wraps a
        non-wrapped model to construct the reference model, where this
        wrapper should provide data parallel semantics. If ``None``,
        then the callable defaults to the DDP constructor.
z.Expects an FSDP init mode that wraps with FSDPN{Gz?r   T)ru  output_device)r  r  r  r  r  r  r  )r   re  r!  r  ri  r  zInitializing z raised error r<   zOAn FSDP-managed module with parameter CPU offloading enabled has parameters on F)r  r  r  r  r  r  r  r  )check_dtypezFSDP did not match DDP)exact_devicemsg) r@   rG   ru   r   rj   rM   rO   r3   DDPr   r   r  r   r   r  	Exceptionr'  r   r#  r   rP   r   r  ro   rY   r!  assertRaisesRegexr}  r   r   rx  assert_close)"rX   r  r  r   r  r  r  r   re  r!  r  ri  r  r  r  r  r  r  r  r   rt   	ref_modelref_loss
ddp_paramsr,  r  r  expects_device_errorexpects_cpu_device
cpu_devicer   context	fsdp_lossfsdp_unsharded_paramss"                                     rK   _test_fsdp_parityFSDPTest._test_fsdp_parity)  s   F l222	<;	<2K!!&&(    ((
 	

 
 x{m;	  4&M	#E*I!(I00$D0(+'A''A 1 

 )..01
*%6%6#2$4#2		

	Y$))"" 	
 # J *d++ j$*<*<LLJ#*J~:::#{3J$D0O[5O5O
 N/>3N3NN 	 N/>3N3NN 	 e,J#..0  z: 1 $ ""%%0M3  	 55!,% /+E++E 6 I    e,J#..0  z: 1![1I /
 ; 	""8YE"J "=%!,	   ,9"K  	Y}[MAxPQWXX	YF Ws$   "M 1M5
M2M--M25
N)r^  r   )r  NFNFFN)&rC   rD   rE   rF   r8  rJ  r   ru   r2  r[  r_  rb  rf  rj  r.   classmethodr  rr   rs   r   floatr   r   r   r   r   r   r  r  rS   r@   rM   r   r   r   r  rI   r3  r4  s   @rK   rL  rL  x  sI       : : d   0 0>JH3 1% 1%p 15 48+0#?CUyyU U 	U
 U #:.U U ".1U %)U U %-T#s(^$<Ux +/",,8<8<48!& %+0#04?C#g-(g %g )	g
 h'g g g  g $$45g $$45g ".1g g g %)g g  d38n-!g" %-T#s(^$<#g grJ   rL  compile_compute_on_modulec                 D   ^ ^^ U 4S jm " S S[         5      mUU4S jnU$ )Nc                     > [         R                  R                  R                  " U 0 UD6  Tb  [	        U S   T5      (       a  U S   R                  5         g g )Nr   )ro   r  r  r   r#  rh  )rg   rh   r  s     rK   !fully_shard_with_compiled_compute=compiled_fsdp_test.<locals>.fully_shard_with_compiled_compute  sS    **D;F;$,
G.1
 1
 GOO1
rJ   c                   0    \ rS rSr\" 5       r\" 5       rSrg)*compiled_fsdp_test.<locals>.FullyShardModei  rB   N)rC   rD   rE   rF   r   EAGERCOMPILED_COMPUTErI   rB   rJ   rK   FullyShardModer    s    6rJ   r  c                 6   >^  [        T 5      UUU 4S j5       nU$ )Nc                    > [         R                  R                  R                  nT GH  nUTR                  :w  a'  [        5       (       d  [        R                  " S5        M;  [         R                  R                  R                  n[         R                  R                  R                  n[         R                  R                  5         UTR                  :X  a  UnO_UTR                  :X  aA  S[         R                  R                  l
        S[         R                  R                  l        TnO[        SU 35      eUT	R                   UR"                  '   T	" U 0 UD6  [         R                  R                  5         UT	R                   UR"                  '   U[         R                  R                  l
        U[         R                  R                  l        GM     g )Nz0Inductor on GPU needs Triton and recent GPU archTr>   z!Need to implement FullyShardMode=)ro   r  r  r   r  r5   warningswarnrB  configskip_fsdp_hooks	_inductorcompile_threadsr  r  NotImplementedError__globals__rC   )
rg   rh   original_fully_shardmodeoriginal_skip_fsdp_hooksoriginal_compile_threadsfully_shard_patchr  r  funcs
          rK   wrapper6compiled_fsdp_test.<locals>.decorator.<locals>.wrapper  s_   (-(9(9(>(>(J(J &>///
MM"TU+0==+?+?+O+O(+0??+A+A+Q+Q(!!))+>///(<%^<<<;?EMM((8=>EOO**:(I%-;D6B  CT  !5!>!>?d%f%!!))+BV  !5!>!>?7O$$49Q&&69 'rJ   r	   )r  r  r  r  s   ` rK   	decorator%compiled_fsdp_test.<locals>.decorator  s#    	t	R 
	R@ rJ   )r   )r  r  r  r  s   ` @@rK   compiled_fsdp_testr    s"    " ""H rJ   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )
SkipModulei  c                 X   > [         TU ]  5         [        R                  " SSSS9U l        g N
   Fr  )r   r   rr   r   linr:  s    rK   r   SkipModule.__init__  s"    99R%0rJ   c                 $    U R                  U5      $ r   r  rG  s     rK   r  SkipModule.forward  s    xx{rJ   r  rl   r  r4  s   @rK   r  r    s    1 rJ   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )NestedLineari  c                    > [         TU ]  5         U(       a8  [        [        R                  " SSSS9R                  [        5      5      U l        g [        R                  " SSSS9R                  [        5      U l        g r  )r   r   r!   rr   r   r   r   nested_linear)rX   	fsdp_wrapr   s     rK   r   NestedLinear.__init__  sV    !%biiBU&C&F&F{&S!TD!#2r!>!A!A+!NDrJ   c                 $    U R                  U5      $ r   r  rG  s     rK   r  NestedLinear.forward  s    !!!$$rJ   r  r  r4  s   @rK   r  r    s    O% %rJ   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )	SkipModeli  c                    > [         TU ]  5         [        R                  " SSSS9R	                  [
        5      U l        [        5       R	                  [
        5      U l        [        [        US9[
        S9U l        g )Nr  Fr  )r  )r  )r   r   rr   r   r   r   linearr  linear_skipr!   r  r  )rX   double_nestr   s     rK   r   SkipModel.__init__  sW    iiBU366{C%<??;7!;/;
rJ   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   r  r  r  rG  s     rK   r  SkipModel.forward$  s4    KKNQq!rJ   r  r  r4  s   @rK   r  r    s    
 rJ   r  )FT)FFr  )rB   r   )
contextlibrP  r_  r~  ry  r  abcr   r   r   copyr   enumr   r   	functoolsr
   typingr   r   r   r   r   r   unittestr   ro   torch.distributedr  r}   torch.nnrr   torch.nn.functionalr  r  torch.distributed._composabler   torch.distributed.device_meshr   torch.distributed.fsdpr   r   r   r   $torch.distributed.fsdp._common_utilsr   5torch.distributed.fsdp._fully_shard._fsdp_param_groupr   r   "torch.distributed.fsdp._init_utilsr   2torch.distributed.fsdp.fully_sharded_data_parallelr   r   r   *torch.distributed.fsdp.sharded_grad_scalerr   torch.distributed.fsdp.wrapr   r    r!   torch.distributed.tensorr"   r#   r$   !torch.distributed.tensor.parallelr%   r&   r'   r(   r)   r*   torch.nn.parallel.distributedr+   r  *torch.testing._internal.common_distributedr,   r-   r.   r/   $torch.testing._internal.common_utilsr0   r1   r2   r3   r4   torch.utils._tritonr5   r4  r   r|  r7   device_countr:   r@   rM   rs   rS   r1  r   r   r2  r   r   r   r   r   r   r   r   r   r6  rU  rY  rm  r  r  r  r  r>  r  r  contextmanagerr  r  r  r  r  r  r  r  rn   r   r/  r1  rL  r  r  r  r  r  rB   rJ   rK   <module>r#     sa    	 	 
   # "    F F        4 4 
 ? S 
 I R R F F  F H   + K ::**,LK K 99))+LK L4 T BIIs 499$$ >% #99##""2299 2t 2>299 >d >DBII D$ D .Q"- Q"h[J- [J|$7 D]J. ]J@J
m J
Z
? 
.,ryy ,JJ* JJZ/")) /@*r}} *Z'299 '6 6 6 6 9H 9 9 *X * * .x .  . .x .  . :8 :  : >X >  >0!0 0 	0
 0. +-	TyyT IIT c3h	TB/ &X# Xv
0(4. 0f 	%299 	%		 rJ   