
    sh                        S SK r S SKrS SKrS SKrS SKJrJrJr  S SKJ	r	J
r
JrJrJrJr  S SKrS SKJr  S SKJs  Js  Jr  S SKJs  Js  Jr  S SKJs  Js  Jr  S SKJr  S SKJ r   S SK!J"r"J#r#  S SK$J%r%  S SK&J'r'J(r(J)r)J*r*J+r+J,r,J-r-  S SK.J/r/J0r0J1r1J2r2  S S	K3J4r4  S S
K5J6r6J7r7J8r8J9r9J:r:J;r;J<r<J=r=  S SK>J?r?  S SK@JArA  S SKBJCrC  S SKDJErE  \(       a  S SKFJGrG  SrH S SKIJJrJJKrK  \M" S5      rNSrO\P\R                  \R                  4   rR\\\R                  \R4      rS\;R                  \2R                  \;R                  \2R                  \;R                  \2R                  \;R                  \2R                  \;R                  \2R                  0rY\;R                  \;R                  /rZ\;R                  \;R                  4r[\ S_S\(S\SS\;S\\?   S\\#   S\(4S jj5       r\\S\(S\SS\#S\(4S j5       r]\S\	S\^4S j5       r_\S\#S\^4S j5       r`\S\MS\R                  4S  j5       ra\S!\R                  S\MS\R                  4S" j5       rbS!\R                  S\MS\P\R                  \R                  4   4S# jrc\ S_S\(S$\R                  S%\\\R<                  R                        S&\\\\R<                  R                        \\\R<                  R                        4   S\(4
S' jj5       rfS&\g\	   S(\^SS4S) jrh\S\(S$\R                  S*\i\R                     S+\\\M\R                  4      S\(4
S, j5       rk\S\(S$\R                  S\(4S- j5       rl\S\(S\\;   S.\\:   S/\\7   S0\^S1\^S2\MS3\MS\(4S4 j5       rm\S\(S\(4S5 j5       rn\S\(S6\6S7\^S\(4S8 j5       ro\S_S\(S\#S\(4S9 jj5       rp\S\(S\(4S: j5       rqS$\R                  S;\g\R                     SS4S< jrr\S\(S=\R                  S+\\\M\R                  4      S>\\
\R                  /S4      S?\^S\(4S@ j5       rs\S\(S;\g\R                     S=\R                  4SA j5       rtSB\R                  SC\\\R<                  R                        S\i\R                     4SD jru S_SB\R<                  R                  S%\i\R<                  R                     SE\\\R<                  R                        S\i\R<                  R                     4SF jjrvSB\R<                  R                  S%\i\R<                  R                     S\i\w   4SG jrxSB\R                  S\i\w   4SH jryS$\R                  S*\i\R                     S+\\\M\R                  4      SS4SI jrzS+\\\M\R                  4      SJ\MSK\'S\\R                     4SL jr{S$\R                  S*\i\R                     S%\i\R                     S\P\^\^4   4SM jr|SB\R                  S>\
\R                  /S4   S%\i\R                     SS4SN jr}SB\R                  SO\\R                     S%\i\R                     SK\'4SP jr~SB\R                  S%\i\R                     S\g\R                     4SQ jrS$\R                  S*\i\R                     SR\i\GR                      SO\\R                     SS4
SS jrS;\g\R                     ST\g\GR                      SO\\R                     SS4SU jrSV rS$\R                  S*\i\R                     SO\\R                     SJ\MSK\'S\R                  4SW jrS$\R                  S;\g\R                     S\R                  SS4SX jrSY\g\GR                      SS4SZ jrS$\R                  S*\i\R                     S\\R                     4S[ jrS*\i\R                     SS4S\ jrS\;4S] jrS\R                  S\ GR                  4S^ jrg! \L a    SrH GNf = f)`    N)	GeneratorIterableIterator)AnyCallableno_type_checkOptionalTYPE_CHECKINGUnion)default_hooks)_mesh_resources
DeviceMesh)_get_default_group)_FSDPDeviceHandle
_FSDPState_get_module_fsdp_state_is_fsdp_flattened!_named_parameters_with_duplicatesclean_tensor_nameTrainingState)_FSDP_USE_FULL_PREC_IN_EVALFlatParameterFlatParamHandleHandleShardingStrategy)_FreeEventQueue)BackwardPrefetch
CPUOffloadFullOptimStateDictConfigFullStateDictConfigMixedPrecisionShardingStrategyStateDictConfigStateDictType)_Policy)DTensorExtensions)_sync_params_and_buffers)is_traceable_wrapper_subclass)RemovableHandleT)deferred_initfakeFi  _fsdp_syncedstateprocess_groupsharding_strategypolicydevice_meshreturnc                 F   Ub  Ub  [        S5      eU[        ;   nU(       a%  Uc  Uc  Uc  [        SU S35      e[        XU5      n O6U(       a  X@l        UR	                  SS9U l        OUb  UO	[        5       U l        U R
                  R                  5       U l        U R
                  R                  5       U l	        U R                  nU(       a  X`R                  R                  5       -  n[        R                  R                  U5      U l        X`R                  -  U l        U $ )NzcCannot pass both process_group and device_mesh at the same time. Please just pass only one of them.zManual wrapping with zA requires explicit specification of process group or device_mesh.r   mesh_dim)
ValueErrorHYBRID_SHARDING_STRATEGIES*_init_process_group_state_for_hybrid_shard_device_mesh	get_groupr-   r   ranksize
world_size_inter_node_pgr   DefaultState_get_gradient_predivide_factor_gradient_predivide_factor_gradient_postdivide_factor)r,   r-   r.   r/   r0   is_hybrid_strategydata_parallel_world_sizes          v/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/distributed/fsdp/_init_utils.py_init_process_group_staterE   Y   sA     [%<<
 	
 +.HH V^8K '(9': ;S S 
 ?kE !,"-"7"7"7"CE "/!:@R@T  $$))+EJ**//1E$// $8$8$=$=$?? ""AA$	
 
$ 	!#C#CC 
% L    c                    U(       aW  [        U5      (       a/  X l        UR                  SS9U l        UR                  SS9U l        O[        SUR                   35      eUc<  [        5       n[        X0R                  R                  5       5      u  pEX@l        XPl        O7[        U5      (       a  Uu  U l        U l        O[        S[        U5       35      e[        U R                  S9U l        U $ )Nr   r3      z,Expected device_mesh to have ndim=2 but got zmExpected process_group to be passed in as either None or Tuple[dist.ProcessGroup, dist.ProcessGroup] but got r-   )"_is_valid_hybrid_shard_device_meshr8   r9   r=   r-   r5   ndimr   !_init_intra_and_inter_node_groups_device_handledevice_count_is_valid_hybrid_shard_pg_typetype_get_default_comm_hook_state_inter_node_state)r,   r-   r0   default_groupintra_node_groupinter_node_groups         rD   r7   r7      s    -k::!, $/#8#8!#8#DE "-"7"7"7"CE>{?O?O>PQ  
	*,-N//<<>.
* // *-88 9F5E!5GGKMGZF[] 
 ;**E LrF   c                 ~    [        U [        5      =(       a'    [        U 5      S:H  =(       a    [        S U  5       5      $ )N   c              3   V   #    U  H  n[        U[        R                  5      v   M!     g 7fN)
isinstancedistProcessGroup).0pgs     rD   	<genexpr>1_is_valid_hybrid_shard_pg_type.<locals>.<genexpr>   s      JMb
2t0011M   '))rZ   tuplelenallrI   s    rD   rO   rO      s:     	=%( 	K!#	KJMJJrF   c                 N    [        U [        5      =(       a    U R                  S:H  $ )NrW   )rZ   r   rK   )r0   s    rD   rJ   rJ      s    k:.H;3C3Cq3HHrF   num_devices_per_nodec                 6    [         R                  " U 5      u  pU$ )a5  
Return a process group across the current node.

For example, given each row is a distinct node:
0  1  2  3  4  5  6  7
8  9 10 11 12 13 14 15
This API would return an intra-node subgroup across
[0, 1, ..., 7] or [8, 9, ..., 15] depending on the process's rank.
For example, rank 3 would get [0, 1, ..., 7].
)r[   new_subgroups)rf   intra_node_subgroup_s      rD   _init_intra_node_process_grouprk      s     "//0DErF   global_process_groupc                 \   Sn[         R                  " U 5      n[         R                  " U 5      nXA-  n[         R                  " U 5      U-  n[	        U5       H?  n[	        U5       Vs/ s H	  oX-  -   PM     n	n[         R
                  " XS9n
Xv:X  d  M=  U
nMA     Uc
   U S35       eU$ s  snf )al  
Return an inter-node process group where each contained rank has the same local rank.

For example, given each row is a distinct node:
0  1  2  3  4  5  6  7
8  9 10 11 12 13 14 15
This API would return inter-node process group [0, 8], [1, 9], [2, 10], and so forth
depending on the process's rank. For example, rank 1 would get [1, 9], rank 5
would get [5, 13].
N)ranksbackendz. expected to assign inter-node pg, but did not)r[   get_backendget_world_sizeget_rankrange	new_group)rl   rf   inter_node_pgsharding_backendr<   	num_nodesmy_local_rank
local_rankiranks_for_inter_groupgrps              rD   _init_inter_node_process_groupr}      s      M''(<=$$%9:J2IMM"67:NNM01
=B9=M!
=M!23=M 	 !
 nn#8S&M 2 $ /GH$ !
s   'B)c                 .    [        U5      [        X5      4$ )a  
Initialize intra and inter-node process groups and return the ones corresponding to this process's rank.

This function can be used to initialize process groups for ``HYBRID_SHARD`` or
``_HYBRID_SHARD_ZERO2`` in FSDP.
This function assumes each node has an equal number of CUDA-enabled devices.
Returns:
    Tuple[dist.ProcessGroup, dist.ProcessGroup]: Intra and inter-node process group.
)rk   r}   )rl   rf   s     rD   rL   rL      s      	'';<&';R rF   moduleignored_modulesignored_statesc                    Ub  Ub  [        S5      eS nUS LnU(       a  [        U5      n[        US5        O/ n[        Ub  [        U5      O/ S5        [        U5      S:  a'  [	        US   [
        R                  5      (       a  UnOUn[        X5      U l        [        UU R                  U5      U l
        [        UU R                  5      U l        U $ )NzfCannot pass both ignored_modules and ignored_states at the same time. Please just pass ignored_states.TFr   )r5   list_check_ignored_statesrc   rZ   nn	Parameter_get_ignored_modules_ignored_modules_get_ignored_params_ignored_params_get_ignored_buffer_names_ignored_buffer_names)r,   r   r   r   ignored_parameterspassed_as_ignored_statesignored_states_lists          rD   _init_ignored_module_statesr     s     "~'A:
 	
 -T9">2148 %4%@D!b%	
 !#)!,bll;;!41O1&JE/E
 #<#E LrF   r   c                    [        U 5      S:X  a  gU(       aj  [        S U  5       5      n[        S U  5       5      nU(       d>  U(       d6  [        U  Vs1 s H  n[        U5      iM     sn[        S9n[        SU 35      egg[        S U  5       5      (       d6  [        U  Vs1 s H  n[        U5      iM     sn[        S9n[        SU 35      egs  snf s  snf )	z
Check that the ignored states are uniformly parameters or uniformly modules.

We may remove this check in the future if we permit mixing.
r   Nc              3   V   #    U  H  n[        U[        R                  5      v   M!     g 7frY   )rZ   r   r   r]   r,   s     rD   r_   (_check_ignored_states.<locals>.<genexpr>I  s     UnUE2<<88nra   c              3   V   #    U  H  n[        U[        R                  5      v   M!     g 7frY   rZ   r   Moduler   s     rD   r_   r   J  s     SN5*UBII66Nra   )keyzUignored_states expects all nn.Parameter or all nn.Module list elements but got types c              3   V   #    U  H  n[        U[        R                  5      v   M!     g 7frY   r   r   s     rD   r_   r   S  s     L^E:eRYY//^ra   z>ignored_modules expects nn.Module list elements but got types )rc   rd   sortedrP   reprr5   )r   r   
all_paramsall_modulesr,   sorted_typess         rD   r   r   >  s     >aUnUU
SNSS+!N"KN54;N"KQUVL**69  #.z L^LLL!N"KN54;N"KQUVL%(  M #L #Ls   C"Cignored_params	device_idc                 b   SnUb7  [        U[        R                  5      (       a  UO[        R                  " U5      nUc  [        X5       H  nUR                  R                  S;   a  M  Uc  UR                  nM0  UR                  R                  UR                  :w  d  MV  [        SUR                   SUR                  R                   35      e   U=(       d    [        R                  R                  5       nUR                  S:X  a  [        S5      e[        R                  " U5      U l
        U $ )a  
Determine device handle used for initializing FSDP.

If a device is specified by ``device_id``,
then returns device handle corresponds to that device type. Otherwise, If the
module is already on a non-CPU device, then the device type is that non-CPU device type.
If the module is on CPU or meta, then the device type is the current accelerator device.
See the :ref:`Accelerators<accelerators>` for details.


This method will be called once ignored paramters was determined, as the device handle maybe needed
for other initialization.
N>   cpumetazLFSDP does not support modules with different device types but got params on z and r   zOFSDP needs a non-CPU accelerator device, but no accelerator device is detected.)rZ   torchdevice_get_orig_paramsrP   RuntimeError_C_get_acceleratorr   from_devicerM   )r,   r   r   r   determined_deviceparams         rD   _init_device_handler   [  s   (  )U\\22 i( 	
  %f=E||  O3 ($)LL!<<$$(9(>(>>&-->-C-C,DE%,,J[J[I\^  > .L1J1J1L!!U*a  -889JKELrF   c                     [        U5      U l        0 nUR                  5        H  u  p4[        U5      nUR                  X#'   M      X l        U $ rY   )_get_buffer_names_buffer_namesnamed_buffersr   dtype_buffer_name_to_orig_dtype)r,   r   r   buffer_namebuffers        rD   _init_buffer_stater     sT    
 ,F3E
 :<%335'428,,"/  6 (B$LrF   mixed_precisioncpu_offloadlimit_all_gathersuse_orig_paramsbackward_prefetch_limitforward_prefetch_limitc                    U R                   S:X  aV  U[        R                  :w  a1  [        R                  " SU=(       d    [        R
                   S35        [        R                  nO.U[        R                  :X  a  [        R                  " S[        SS9  U=(       d    [        R
                  U l        U=(       d
    [        5       U l	        Ub5  [        R                  R                  S[        U R                  5       35        [        R                  R!                  ["        S5      S	:H  U l        U=(       d
    ['        5       U l        X@l        XPl        [.        R0                  U l        S U l        [7        5       U l        [:        R<                  " 5       U l        [@        RB                  " U R>                  UU5      U l"        S U l#        0 nXl$        S n	Xl%        / n
Xl&        U $ )
NrH   z/FSDP is switching to use `NO_SHARD` instead of z since the world size is 1.zoThe `NO_SHARD` sharding strategy is deprecated. If having issues, please use `DistributedDataParallel` instead.   )
stacklevelz'torch.distributed.fsdp.mixed_precision. 1)'r<   r!   NO_SHARDwarningswarn
FULL_SHARDFutureWarningr.   r    r   r   r   _log_api_usage_oncestrosenvirongetr   _use_full_prec_in_evalr   r   r   _use_orig_paramsr   IDLEtraining_state_is_rootr   _free_event_queuer[   get_debug_level_debug_levelexec_order_utils_ExecOrderData_exec_order_data_unshard_event_fully_sharded_module_to_handle_handleparams)r,   r.   r   r   r   r   r   r   r   r   r   s              rD   _init_core_stater     s    1 0 9 99MMA$C(8(C(CD E''
 -55	.77	7< 	
 0N3C3N3NE+?~/?E"$$5c%:O:O6P5QR	
 	

2B73> 
  $3z|E/,(--EEN-/E--/E-<<E
  E IK#,K) *.GM"$FLLrF   c                 `    / nXl         / nX l        / nX0l        SU l        S U l        S U l        U $ )NT)_root_pre_forward_handles_pre_forward_handles_post_forward_handles_sync_gradients
_comm_hook_comm_hook_state)r,   r   r   r   s       rD   _init_runtime_stater     sE     8:&?#24!535"7 EE!ELrF   backward_prefetchforward_prefetchc                     Xl         X l        U $ rY   )r   r   )r,   r   r   s      rD   _init_prefetching_stater     s     0- LrF   c                     [         R                  " U5      nU(       a+  X R                  :w  a  [        U R                  5      U l        U $ S U l        U $ rY   )r   get_root_meshr8   r%   rM   _fsdp_extension)r,   r0   	root_meshs      rD   _init_extensionr     sM      --k:I y$6$66 1%2F2F G
 L !%LrF   c                 ~    [         R                  U l        [        5       n[	        5       U l        Xl        0 nX l        U $ rY   )r#   FULL_STATE_DICT_state_dict_typer   r   _optim_state_dict_config_state_dict_config_unshard_params_ctx)r,   state_dict_configunshard_params_ctxs      rD   _init_state_dict_stater     s;    *::E)<)>%=%?E"057 2LrF   r   c                     U HW  n[        UR                  5      S:X  d  M  SnU R                  5        H  u  pEX%L d  M  Un  O   U(       d   e[        SU S35      e   g)z
Verify if the parameters are accepted by FSDP. The only restriction now
is that the parameter cannot be a scalar tensor (param.shape == []).
r   r   z/FSDP doesn't support scalar parameters. Change z& to a 1D tensor with numel equal to 1.N)rc   shapenamed_parametersr5   )r   r   r   
param_namenameparam_s         rD   _verify_managed_paramsr     sp    
 u{{q J & 7 7 9?!%J !: :$%KM  rF   fully_sharded_moduleparam_init_fnsync_module_statesc                   ^  [        UT R                  U5        [        UT R                  T R                  5      n[        UT R                  T R                  5      u  pgU(       d  U(       a  Ub  [        XT R                  5        OJU(       a#  [        UUT R                  T R                  5        O U(       a  [        R                  " UU 4S jS9  T R                   VV	s1 s H  nUR                  5         H  n	U	iM     M     n
nn	[        UT R                  U
U5        [        UT R                  UT R                  T R                  5      T l        [        [!        UT R                  5      5      n[#        X5        U(       a@  [%        XT R&                  5        T R(                  [*        ;   a  [%        XT R,                  5        [/        T X5        T $ s  sn	nf )zHInitialize a ``FlatParamHandle`` from a module ``fully_sharded_module``.c                 J   > [        U 5      S L =(       a    U TR                  ;  $ rY   )r   r   )	submoduler,   s    rD   <lambda>0_init_param_handle_from_module.<locals>.<lambda>L  s(    '=i'HD'P (8!7!77(8rF   )check_fn)_check_single_device_moduler   _get_device_from_device_idr:   rM   _need_to_materialize_moduler   _materialize_with_param_init_fn_materialize_meta_moduler)   materialize_modulebuffers_move_module_to_device_get_compute_devicecompute_devicer   r   r   _sync_module_params_and_buffersr-   r.   r6   r=   _init_param_handle_from_params)r,   r   r   r   r   device_from_device_idis_meta_moduleis_torchdistX_deferred_initignored_moduler   ignored_buffersmanaged_paramss   `           rD   _init_param_handle_from_moduler  -  s      4e6K6KYW65::u33 3Ne33U5K5K3/N 	5=;T' 1G1G	
 
  ""  		
 
%(( 8	
 $444N$,,.F 	. 	4   	 /

E *+?AVAVWXN/@' %2E2E	
 ""&@@+$e6J6J #5.OL?s   $Gc                    [        U5      S:X  a  g [        UUU R                  [        U R                     U R
                  R                  U R                  R                  U R                  R                  U R                  R                  U R                  U R                  U R                  S9nUR                  5         U R                  (       a   eU R                   R#                  UR$                  5        X0l        X0R&                  UR(                  '   [*        R,                  " S5      nU R
                  R                  (       a-  UR$                  R,                  U:w  a  UR/                  U5        g g g )Nr   )fsdp_extensionr   )rc   r   r  SHARDING_STRATEGY_MAPr.   r   offload_paramsr   param_dtypereduce_dtypekeep_low_precision_gradsr-   r   r   shardr   r   append
flat_paramr   _fully_sharded_moduler   r   flat_param_to)r,   r   r   handle
cpu_devices        rD   r  r  r  s!    6{ae556(())**66,,F LLN}}	LL))*MJP))&*F*FGe$J''F,=,=,D,D
,RZ( -S'rF   root_moduler   c           	      z   Sn Ub  [        U5      O	[        5       nU Ha  n[        U[        R
                  R                  5      (       d  [        US[        U5       3-   5      e[        U5      (       d  MX  [        S5      e   U R                  5        H1  n[        R                  " U5      (       a  M   UR                  U5        M3     U VVs1 s H>  nUR                  5         H&  n[        U[        R                  5      (       a  M$  UiM(     M@     nnnX;   a  [        R                   " SW 35        U R                  5        HA  n[        U5      n	U	c  M  [#        U	S5      (       d   eUR%                  U	R&                  5        MC     U$ ! [         a   n[        US[        U5       3-   5      UeSnAff = fs  snnf )aP  
Check that ``_ignored_modules`` is an iterable of ``nn.Module`` s without any FSDP instances.

Return the modules contained in their module
subtrees as a :class:`set`. Nested FSDP instances are excluded, but their
already-computed ignored modules are included.

``_ignored_modules`` represents the argument passed by the user to FSDP.
z>`ignored_modules` should be an iterable of `torch.nn.Module`s Nzbut got zbut got an iterable with z1`ignored_modules` should not include FSDP moduleszTrying to ignore the top-level module passed into the FSDP constructor itself will result in all parameters being ignored and is not well-supported: r   )set	TypeErrorrP   rZ   r   r   r   r   r5   modulestraversal_utils_composableadd	fsdp_fileFullyShardedDataParallelr   r   hasattrupdater   )
r'  r   
msg_prefixignored_root_moduleser   childr   r  optional_fsdp_states
             rD   r   r     s    RJQ%5%AC !su 	
 '&%((//22J+DT&\N)SSTT!&)) PQQ ' %%'**622 $$V, ( +*F^^%E%!C!CD 	% 	*   %228;	
 !((*	4Y?*.0BCCCC""#6#G#GH	 +
 I  Q
x5E0F/G%HHIqPQ$s#   F
 7F7
F7

F4F//F4r   c                    [        5       nU VVs1 s H.  oDR                  5         H  n[        U5      (       a  M  UiM     M0     nnnUR                  U5        Ub5  U Vs1 s H  n[        U5      (       a  M  UiM     nnUR                  U5        U R	                  5        HA  n[        U5      n	U	c  M  [        U	S5      (       d   eUR                  U	R                  5        MC     U$ s  snnf s  snf )z
Return the parameters of the modules in ``ignored_modules`` and the parameters in ``ignored_parameters``.

:class:`FlatParameter` s are excluded from the result.
r   )r)  
parametersr   r2  r+  r   r1  r   )
r'  r   r   all_ignored_paramsmpparams_in_ignored_modulesparams_in_ignored_parametersr  r7  s
             rD   r   r     s     36% #!"aLLNqBTUVBWN?  ! 78%)(
)!1CA1FA) 	% (
 	!!">? !((*	4Y?*.0ABBBB%%&9&I&IJ	 + '!(
s   'C(
C( C.8C.c           	         [        5       nU VVs1 s H  o3R                  5         H  oDiM     M     nnnUR                  U R                  5        VVs1 s H  u  pdXE;   d  M  [	        U5      iM     snn5        U R                  5        HA  n[        U5      nUc  M  [        US5      (       d   eUR                  UR                  5        MC     U$ s  snnf s  snnf )z6Return the cleaned buffer FQNs in ``ignored_modules``.r   )	r)  r  r2  r   r   r+  r   r1  r   )	r'  r   all_ignored_buffer_namesr;  r   buffers_in_ignored_modulesr   r  r7  s	            rD   r   r     s    
 *- ("'1iikFk  " ## (3'@'@'B	
'B#3 +k*'B	
 !((*	4Y?*.0GHHHH$++,?,U,UV	 + $#'"
	
s   "CC
"C
c                 j    U R                  5        VVs1 s H  u  p[        U5      iM     snn$ s  snnf )zrReturn the fully prefixed names of all buffers in the module hierarchy rooted at ``root_module`` as a class:`set`.)r   r   )r'  r   rj   s      rD   r   r   	  s8     >I=V=V=X=X>;+&=X  s   /c                     [        X5       Vs1 s H  o3R                  iM     nn[        U5      S:X  a)  [        R                  " S5      U;   a  Uc  [	        S5      eg[        U5      S:  a  [	        SU 35      egs  snf )z
Raise an error if ``module`` has original parameters on multiple devices, ignoring the parameters in ``ignored_params``.

Thus, after this method, the
module must be either fully on the CPU or fully on a non-CPU device.
rW   r   NzTTo support a module with both CPU and GPU params, please pass in device_id argument.rH   z;FSDP only supports single device modules but got params on )r   r   rc   r   r   )r   r   r   r   devicess        rD   r  r    s     *:&)QR)Q||)QGR 7|qU\\%0G;5  
 
W	I'S
 	
 
 Ss   A;r:   device_handlec                 t   U c  g[        U [        R                  5      (       a  U O[        R                  " U 5      nUR                  S:w  al  UR                  c_  [
        R                  " SU  SU SUR                  5        SUR                   S3	5        [        R                  " UR                  5       5      nU$ )z
Return a ``torch.device`` for the specified ``device_id``.

Processes ``device_id`` and returns either the corresponding device or
``None`` if ``device_id`` is ``None``.
Nr   z"FSDP got the argument `device_id` z	 on rank zJ, which does not have an explicit index. FSDP will use the current device z6. If this is incorrect, please explicitly call `torch.zk.set_device()` before FSDP initialization or pass in the explicit device index as the `device_id` argument.)rZ   r   r   rP   indexr   r   current_device)r   r:   rE  r   s       rD   r  r  .  s     	5<<88	ell9>U  {{e 409f 00=0L0L0N/O PCCI;;- P11	
 m::<=MrF   c                 2   [        [        X5      5      n[        S U 5       5      nU R                  5        H.  nXR;   a  M
  UR	                  SS9 H  nXFR
                  -  nM     M0     U(       + =(       a    [        =(       a    [        S U 5       5      nXG4$ )z
Return if ``module`` has parameters on meta device and if ``module`` is using torchdistX deferred initialization.

At most of the returned bools can
be ``True``. If either is ``True``, then ``module`` needs to be
materialized.
c              3   8   #    U  H  oR                   v   M     g 7frY   )is_metar]   r   s     rD   r_   ._need_to_materialize_module.<locals>.<genexpr>X  s     CN5Ns   Frecursec              3   N   #    U  H  n[         R                  " U5      v   M     g 7frY   )r*   is_fakerL  s     rD   r_   rM  d  s     @U##s   #%)r   r   anyr+  r  rK  _TORCHDISTX_AVAIL)r   r   r   r  r  r  bufr  s           rD   r  r  K  s     *6BCNCNCCN ^^%	'$$U$3Ckk)N 4 &  	A	A@@@  
 66rF   c                     [        U5      (       d  [        SU S[        U5       35      e[        X5      nU H  nU" U5        M     g )Nz	Expected z to be callable but got )callabler5   rP   _get_modules_to_materialize)r'  r   r   modules_to_materializer   s        rD   r	  r	  i  sR    
 M""&>tM?R>ST
 	
 9V(f )rF   r  c           	      R   U=(       d$    [         R                  " UR                  5       5      n[        X5      nS n [         R                  " 5          U Ht  n[
        R                  " UR                  SS9UR                  SS95      n[        [        U5      5      S:  nU(       d  MT  UR                  USS9  UR                  5         Mv     S S S 5        g ! , (       d  f       g = f! [         a6  n	[        R                  " S[!        U	5       S[#        U5       S35        U	eS n	A	ff = f)NFrN  r   )r   rO  zIUnable to call `reset_parameters()` for module on meta device with error z(. Please ensure that your module oftype z* implements a `reset_parameters()` method.)r   r   rH  rW  no_grad	itertoolschainr9  r  rc   r   to_emptyreset_parametersBaseExceptionr   r   r   rP   )
r'  r  r   rE  materialization_devicerX  r   module_state_iterhas_module_statesr5  s
             rD   r
  r
  w  s    3 ell$$&7 9VF ]]_0 %.OO%%e%4fnnUn6S%! %(->(?$@1$D!$$OO+A5OQ++- 1 __  !!$Q )L>!KM	

 s<   C& AC($CC& 
C#C& #C& &
D&01D!!D&c                 J   / n[         R                  " U /5      nU 1nU(       a  UR                  5       nUR                  U5        UR	                  5        HA  nXd;  d  M
  [        U5      b  M  Xa;  d  M  UR                  U5        UR                  U5        MC     U(       a  M  U$ rY   )collectionsdequepopleftr!  childrenr   r.  )r'  r   rX  queuevisited_modulesr   child_modules          rD   rW  rW    s    
 /1{m,E'2mO
%%f-"OO-L3*<8@ 7##L1\* . % "!rF   r  c                   ^ [         R                  " S5      mUGb"  [        R                  " 5       nUR	                  U 5        / n/ nU(       a  UR                  5       nUR                  U4S jUR                  SS9 5       5        UR                  U4S jUR                  SS9 5       5        UR                  5        H5  n[        U[        R                  5      (       a  M$  UR	                  U5        M7     U(       a  M  U V	s/ s H  oU;  d  M
  U	PM     n
n	U V	s/ s H  oU;  d  M
  U	PM     nn	[        XU5        g[        [        X5      S5      nUb  UR                  T:X  a  [!        5         gggs  sn	f s  sn	f )a  
Move ``module`` depending on ``device_from_device_id`` and its current device.

This includes moving ignored modules' parameters.

- If ``device_from_device_id`` is not ``None``, then this moves
``module`` to the device.
- If ``device_from_device_id`` is ``None``, then this does not move
``module`` but warns the user if it is on CPU.

Precondition: ``_check_single_device_module()``.
r   Nc              3   L   >#    U  H  nUR                   T:X  d  M  Uv   M     g 7frY   r   )r]   r   r&  s     rD   r_   )_move_module_to_device.<locals>.<genexpr>  s&      BE<<:- B   $	$FrN  c              3   L   >#    U  H  nUR                   T:X  d  M  Uv   M     g 7frY   rm  )r]   r   r&  s     rD   r_   rn    s&      @F==J. @ro  )r   r   rd  re  r!  rf  extendr9  r  rg  rZ   r/  r0  _move_states_to_devicenextr   _warn_cpu_init)r   r   r  r  rh  r   r  curr_moduler  r<  params_to_movebufs_to_mover   r&  s                @rD   r  r    sV   $ e$J( /:.?.?.AV%'&(--/K
 MM (33E3B 
 NN )11%1@ 
 )113	!)Y-O-OPPLL+ 4! e& &,GV/F!VG#*G7a.F7G~=RS!&94@EU\\Z7 8 HGs   	E3E3	E8)E8r  c                 \   [        U 5      S:X  a  [        U5      S:X  a  g[        U 5      S:  a  U S   R                  nO[        U5      S:  a  US   R                  n[        R                  " S5      nUb  U  Hn  n[        R                  " 5          UR	                  U5      Ul        UR                  b*  UR                  R	                  U5      UR                  l        SSS5        Mp     U H  nUR	                  U5      Ul        M     gWU:X  a  [        5         gg! , (       d  f       M  = f)z
Move states to the specified device.

Precondition: ``_check_single_device_module()`` and module's parameters and
buffers have been materialized if needed.
r   Nr   )rc   r   r   rZ  todatagradrt  )r   r  r  rH  r&  r   r   s          rD   rr  rr    s     6{aCLA-
6{Q))	W	 **e$J( E"XX&;<
::)&+jjmm4I&JEJJO ! 
 F ))$9:FK 	:	% 
& !s   AD
D+	c                  0    [         R                  " S5        g )Nam  The passed-in `module` is on CPU and will thus have FSDP's sharding initialization run on CPU, which may be slower than on GPU. We recommend passing in the `device_id` argument for FSDP to move `module` to GPU for the sharding initialization. `module` must also be on GPU device to work with the `sync_module_states=True` flag since that requires GPU communication.)r   r    rF   rD   rt  rt    s    MM	1rF   c                    [        [        X5      S5      nUb'  UR                  R                  S:w  a  UR                  nO$[        R                  " UR                  5       5      nUb  Xb:w  a  [        SU SU SU 35      eU$ )a  
Determine and return this FSDP instance's compute device.

If the module is already on a non-CPU device, then the compute device is that non-CPU
device. If the module is on CPU, then the compute device is the current
device.

Since this method should be called after materializing the module, any
non-CPU device should not be meta device. For now, the compute device is
always a CUDA or CUDA-like device with its explicit index.

Precondition: ``_check_single_device_module()`` and
``_move_module_to_device()``.
Nr   z4Inconsistent compute device and `device_id` on rank z: z vs )rs  r   r   rP   r   rH  r5   )r   r   r  r:   rE  r   r  s          rD   r  r    s    * !&94@EU\\..%7m&B&B&DE(^-TB4&d#8"9;
 	
 rF   c                    / nU R                  5        H  n[        U[        S5      (       a  M  [        U[        S5        UR	                  5       n[        U5      (       a@  UR                  5       u  pgU Vs/ s H  n[        XX5      PM     n	nUR                  U	5        M  UR                  U5        M     U Ht  n
U
R	                  5       n[        U5      (       a@  UR                  5       u  pgU Vs/ s H  n[        X5      PM     nnUR                  U5        Mc  UR                  U5        Mv     [        U5        [        UU[        SS9  gs  snf s  snf )z
Synchronize module states (i.e. parameters ``params`` and all not-yet-synced buffers) by broadcasting from rank 0 to all ranks.

Precondition: ``sync_module_states == True`` and ``self.process_group`` has
been set.
FTr   )srcN)r  getattrFSDP_SYNCEDsetattrdetachr'   __tensor_flatten__rq  r!  +_check_module_states_for_sync_module_statesr&   PARAM_BROADCAST_BUCKET_SIZE)r   r   r-   module_statesr   detached_bufferattrsrj   attrinner_buffersr   detached_paraminner_paramss                rD   r  r  4  s#    )+M.."v{E22FK.$mmoO,_== +==?LQ RED!?E R$$]3$$_5 # (88%88:HEFKLedGN9eLL  .  0  0>#	 !S Ms   5E
/Er  c                 X    U (       a#  [        S U  5       5      (       a  [        S5      eg g )Nc              3   f   #    U  H'  oR                   [        R                   " S 5      :H  v   M)     g7f)r   N)r   r   )r]   tensors     rD   r_   >_check_module_states_for_sync_module_states.<locals>.<genexpr>c  s#      ;He,,=s   /1zThe module has CPU parameters or buffers when `sync_module_states=True`, which requires them to be on GPU. Please specify the `device_id` argument or move the module to GPU before passing it to FSDP.)rR  r5   )r  s    rD   r  r  `  s<      ;H   C
 	
}rF   c              #      #    U R                  5       n  [        U5      nX1;  a  [        U5      (       d  Uv   M&  ! [         a     gf = f7f)a,  
Return an iterator over the original parameters in ``module``.

The iterator does not return
the parameters in ``ignored_params``, any ``FlatParameter`` s (which may be
present due to nested FSDP wrapping), or any original parameters already
flattened (only relevant when ``use_orig_params=True``).
N)r9  rs  r   StopIteration)r   r   	param_genr   s       rD   r   r   m  sT      !!#IOE*3Ee3L3L   s    A'; 
AAAAc           	          [        U 5       HH  u  p#X1;  d  M  [        U5      (       a  M  [        SU SUR                  5        SUR                   35      e   g)a  
Check that original parameters in ``fsdp_module`` have been flattened.

The flattened parameters are made
invisible to ``named_parameters()`` for the module hierarchy rooted at
``fsdp_module``. This should be called as a sanity check after flattening
the wrapped module's parameters.
z Found an unflattened parameter: z;  N)r   r   r   r;   	__class__)fsdp_moduler   r   r   s       rD   _check_orig_params_flattenedr    sY     ?{K
&/A%/H/H2:,b::<.%//!24  LrF   c                 j    U [         R                  :X  a  [        R                  $ [        R                  $ rY   )r!   r   r   allreduce_hookreduce_scatter_hook)r.   s    rD   _get_default_comm_hookr    s3      0 9 99 	$$ ..rF   c                 *    [         R                  " U S9$ )NrI   )r   r>   rI   s    rD   rQ   rQ     s     %%MBBrF   rY   )rd  r[  r   r   collections.abcr   r   r   typingr   r   r   r	   r
   r   r   torch.distributeddistributedr[   (torch.distributed.fsdp._exec_order_utilsfsdp_exec_order_utilsr   'torch.distributed.fsdp._traversal_utils_traversal_utilsr,  2torch.distributed.fsdp.fully_sharded_data_parallelfully_sharded_data_parallelr/  torch.nnr   (torch.distributed.algorithms._comm_hooksr   torch.distributed.device_meshr   r   "torch.distributed.distributed_c10dr   $torch.distributed.fsdp._common_utilsr   r   r   r   r   r   r   "torch.distributed.fsdp._flat_paramr   r   r   r   %torch.distributed.fsdp._limiter_utilsr   torch.distributed.fsdp.apir   r   r   r   r    r!   r"   r#   torch.distributed.fsdp.wrapr$   &torch.distributed.tensor.parallel.fsdpr%   torch.distributed.utilsr&   torch.utils._python_dispatchr'   torch.utils.hooksr(   rS  
torchdistxr)   r*   ImportErrorintr  r  rb   r\   HybridShardProcessGroupTypeProcessGroupTyper   r   SHARD_GRAD_OPHYBRID_SHARD_HYBRID_SHARD_ZERO2r  r6   #NO_RESHARD_AFTER_FORWARD_STRATEGIESrE   r7   boolrO   rJ   rk   r}   rL   r   r   r   r   r   r)  r   r   r   r   r   r   r   r   r   r  r  r   r   r   r   r   r  r  r  r	  r
  rW  Tensorr  rr  rt  r  r  r  r   r  r  r>   rQ   r}  rF   rD   <module>r     s
     	  9 9 O O    C C A A F F  B E A    B	 	 	 0 D < F 1 . ""34 #D$5$5t7H7H$HI E$"3"35P"PQR 
 5>>!7!B!B""$:$H$H!!#9#F#F((*@*T*T  !!(( 
 ""((' #  )-00#0 (0 W	0
 *%0 0 0f ((#( ( 	( (V # $   IJ I4 I I  ARAR   !++!! 
! !H++ 4d///0&  	++II+ huxx78+ %((,,-.%((//9R0SS	+ + +\I9=	: --II- %- c5<</01	-
 - -` II  " >> 01> n-> *%	>
 > > !>  > > >B    		'	 	 		 	 : J *   *   299 d2<<6H T & AA))A c5<</01A Hbii[$%678	A
 A A AH ))) ))) )<66x896 	^6x BF) !%((*<*<!=> 				D$$)$ 	X$:299 S 
II
%
 c5<</01
 
	
<c5<</01
 % ell	:7II7%7 ^7 4:	7<RYYK-. ^ 
	  #ELL1  ^  %	 F""-0^"	"))_",3II3%3 &3 $ELL1	3
 
3l%,, $ELL1 
	@II% $ELL1 	
 % \\F)II)) $$) 
	)X

%

	

II% bll,% 
(.> C$$CCO#  s   _ __