
    shS                       S SK r S SKrS SKrS SKrS SKJrJrJr  S SKJ	r	  S SK
JrJr  S SKJr  S SKJrJrJrJrJrJrJr  S SKrS SKJr  S SKJs  Js  Jr  S SKJ r   S SK!J"r"  S SK#J$r$  S S	K%J&r&J'r'J(r(J)r)J*r*J+r+J,r,  S S
K-J.r.  S SK/J0r0J1r1  S SK2J3r3J4r4  S SK5J6r6J7r7  S SK8J9r9J:r:J;r;  S SK<J=r=J>r>  S SK?J@r@  \(       a  S SKAJBrB  \R                  " \D5      rE\ " S S5      5       rFS\G\H\4   S\\I\H\4      4S jrJ\ " S S5      5       rK " S S\5      rL " S S\5      rMS\FS\G\H\4   S\NS \NS!\NS\O\G\H\4      4S" jrPS#\S\N4S$ jrQS\FS\G\H\4   S\K4S% jrRS\FS&\KS \NS\O\G\H\4      4S' jrSS(\'S)\G\H\4   S*\\R                     S\G\H\4   4S+ jrUS(\'S&\S*\\R                     S\4S, jrVS\FS-\HS)\G\H\4   S\G\H\4   4S. jrW    ShS/\G\H\4   S0\ R                  S1\NS2\\R                  R                     S3\NS*\\R                     S\G\H\4   4S4 jjr[S\FS5\G\H\G\H\4   4   S6\O\H   S\G\H\4   4S7 jr\S8\HS9\O\R                     S6\O\H   S:\\R                     S;\1S\R                  4S< jr_S8\HS=\O\R                     S6\O\H   S\R                  4S> jr`S8\HS?\O\   S6\O\H   S\4S@ jra SiSA\G\H\4   S0\ R                  S2\R                  R                  SB\\\O\G\H\4      \\ R                     4      SC\NSD\NS\G\H\4   4SE jjrc SjS0\ R                  SB\\\O\G\H\4      \\ R                     4      S\G\d\ R                  4   4SF jjreS0\R@                  R                  S\G\0\H4   4SG jrf    SkS2\R                  R                  S0\\ R                     SD\NSH\\G\ R                  \O\H   4      SI\\G\0\H4      S\G\\d\H4   \ R                  4   4SJ jjrg    SkS2\R                  R                  S0\\ R                     SD\NSH\\G\ R                  \O\H   4      SI\\G\0\H4      S\G\ R                  \\d\H4   4   4SK jjrh SjS0\ R                  SB\\\O\G\H\4      \\ R                     4      S\G\ R                  \d4   4SL jjriSM\O\M   SN\G\M\\H\d4   4   SO\G\\H\d4   \ R                  4   S*\\R                     SS4
SP jrj SiS/\G\H\4   S*\\R                     SO\G\\d\H4   \ R                  4   SH\G\ R                  \O\H   4   SQ\G\H\F4   SR\NS\I\O\M   \G\M\\d\H4   4   4   4SS jjrkST\G\H\4   SO\G\\d\H4   \ R                  4   SH\G\ R                  \O\H   4   S\O\G\H\4      4SU jrlS/\G\H\4   S\N4SV jrm\ " SW SX5      5       rnS(\'SY\G\H\4   S\O\G\H\n4      4SZ jroS\FS[\O\G\H\n4      SY\G\H\4   S\\G\H\G\H\4   4   S\I\\R                     \G\H\O\\R                        4   4   4
S] jrqS\FS\\G\H\G\H\4   4   S8\HS \NS\NS!\NSS4S^ jrrS\FS[\O\G\H\n4      SY\G\H\4   S \NS\NS!\NS\G\H\G\H\4   4   4S_ jrsS\FSY\G\H\4   S \NS\NS!\NS\G\H\4   4S` jrt SlSa\O\M   SN\G\M\\d\H4   4   SQ\G\H\F4   S/\G\\H\d4   \4   S\NS \NS!\NS\G\H\4   4Sb jjru SlSa\O\M   SN\G\M\\d\H4   4   SQ\G\H\F4   S/\G\\H\d4   \4   S\NS \NS!\NS\G\H\4   4Sc jjrv\R                  " 5         SmS0\ R                  S2\R                  R                  S/\G\H\4   SB\\\O\G\H\4      \\ R                     4      S3\NS \NS*\\R                     SC\NS1\NS!\NS\G\H\4   4Sd jj5       rxS0\ R                  S\G\H\F4   4Se jry\S(\'Sf\:SS4Sg j5       rzg)n    N)IterableIteratorSequence)	ExitStack)	dataclassfield)chain)Anycast
NamedTupleno_type_checkOptionalTYPE_CHECKINGUnion)_gather_state_dict)_get_pg_default_device)_apply_to_modules
_FSDPState._get_module_fsdp_state_if_fully_sharded_module_get_param_to_fqns_module_handle!_named_parameters_with_duplicatesclean_tensor_name)SimpleProfiler)FlatParameterFlatParamHandle)_ext_chunk_dtensor_ext_chunk_tensor)
_lazy_init%_reset_flat_param_grad_info_if_needed)ShardingStrategyStateDictSettingsStateDictType)DTensor	Replicate)tree_map_only)ShardedTensorc                   N    \ rS rSr% \\S'   \\S'   \\\	4   \S'   \
\   \S'   Srg)FSDPParamInfo5   statehandleparam_indicesparam_requires_grad N)__name__
__module____qualname____firstlineno__r   __annotations__r   dictstrintlistbool__static_attributes__r/       w/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/distributed/fsdp/_optim_utils.pyr)   r)   5   s&    S>!d#r;   r)   
dictionaryreturnc              #   `   #    [        U R                  5       5      nU H  nX U   4v   M     g 7fN)sortedkeys)r=   rB   ks      r<   sorted_itemsrD   =   s-     *//#$DA s   ,.c                       \ rS rSr% Sr\" \S9r\\\	R                  4   \S'   \" \S9r\\\	R                  4   \S'   \" \S9r\\\4   \S'   Srg)	_ConsolidatedOptimStateC   a4  
This holds the consolidated optimizer state on the target rank. Positive-
dimension tensor state is communicated across ranks, while zero-dimension
tensor state and non-tensor state is taken directly from the target rank.

PyTorch version 1.12 moved to using zero-dimension tensors for scalar
values, but user implemented optimizers may still use float (i.e. a
non-tensor). Thus, we support both and handle them identically.

Attributes:
    tensor_state (Dict[str, torch.Tensor]): Mapping from positive-dimension
        tensor state name to the unsharded flat tensor representing the
        state.
    zero_dim_tensor_state (Dict[str, torch.Tensor]): Mapping from zero-
        dimension tensor state name to its value.
    non_tensor_state (Dict[str, Any]): Mapping from non-tensor state
        name to its value.
)default_factorytensor_statezero_dim_tensor_statenon_tensor_stater/   N)r0   r1   r2   r3   __doc__r   r5   rI   r6   torchTensorr4   rJ   rK   r
   r:   r/   r;   r<   rF   rF   C   s\    & -2$,GL$sELL()G5:45P4U\\ 12P',T'Bd38nBr;   rF   c                   V    \ rS rSr% Sr\R                  \S'   \R                  \S'   Sr	g)_PosDimTensorInfo]   ac  
Metadata for positive-dimension tensors used internally for
:meth:`scatter_full_optim_state_dict`.

Attributes:
    shape (torch.Size): Sharded tensor shape (which is equal to the
        unsharded tensor shape if the tensor is optimizer state for a
        non-FSDP parameter and is hence not sharded).
    dtype (torch.dtype): Data type of the tensor.
shapedtyper/   N)
r0   r1   r2   r3   rL   rM   Sizer4   rS   r:   r/   r;   r<   rP   rP   ]   s    	 ::;;r;   rP   c                   8    \ rS rSr% Sr\\S4   \S'   \\S'   Sr	g)_OptimStateKeym   z
This represents an optimizer state key that may be used commonly across
ranks. It is based on the unflattened parameter names rather than parameter
IDs to make it independent of each rank's own optimizer construction.
.unflat_param_namesis_fsdp_managedr/   N)
r0   r1   r2   r3   rL   tupler6   r4   r9   r:   r/   r;   r<   rV   rV   m   s     c3h'r;   rV   fsdp_param_infoflat_param_stateto_saveshard_statecpu_offloadc                 F   U(       a  U(       d   S5       e[        U U5      nU(       ax  [        U UU5      nU Hc  nU(       d  M  [        UR                  5       5       H:  nXx   n	[	        U	[
        R                  5      (       d  M(  U	R                  5       Xx'   M<     Me     U$ / $ )a  
Unflattens the optimizer state, consisting of the "state" part and the
"param_groups" part. Unflattening the "state" part involves consolidating
the state on the target rank and remapping from flattened to unflattened
parameter IDs, and the "param_groups" part only involves remapping from
flattened to unflattened parameter IDs.

Args:
    fsdp_param_info (FSDPParamInfo): The FSDP state, the handle, and a
        mapping from FQN to original parameter index.
    flat_param_state (Dict[str, Any]): Entry for the flat parameter in the
        "state" part of the optimizer state dict.
    to_save (bool): Whether to save the state on this rank.

Returns:
    List[Dict[str, Any]]: A :class:`list` holding the entries in the
    "state" part of the optimizer state dict corresponding to the
    unflattened parameters comprising the flat parameter if on the target
    rank or an empty :class:`list` otherwise. The final optimizer state
    dict will need to map these entries using the proper unflattened
    parameter IDs.
z7If ``shard_state`` is True, ``to_save`` has to be True.)_communicate_optim_state#_unflatten_communicated_optim_stater8   rB   
isinstancerM   rN   cpu)
r[   r\   r]   r^   r_   consolidated_stateunflat_param_stateoptim_statekeyr+   s
             r<   _unflatten_optim_stateri   x   s    : g A% 2 @

 .K{ 0 0 23C',E%eU\\:: ',yy{K$	 4 . "!	r;   xc                 b    [         R                  " U 5      =(       a    U R                  5       S:H  $ Nr   )rM   	is_tensordim)rj   s    r<   _is_zero_dim_tensorro      s     ??1.!%%'Q,.r;   c                    U R                   nU R                  R                  n[        5       nUR                  UR
                  UR                  pvn[        U5       GH  u  p[        R                  " U	5      (       GaD  U	R                  5       S:  Ga/  UR                  S:X  d  UR                  [        R                  :X  a  XU'   Mk  UR                  c   S5       eU	R                   R"                  UR                  R"                  :w  a  U	R%                  UR                  5      n	UR&                  R)                  5       n
U	R*                  " U
6 n[,        R.                  " XUR0                  S9  UR2                  R5                  5         [7        [8        R:                  UR<                  5      R?                  5       nUSU XX'   GMf  [A        U	5      (       a#  U	RC                  5       RE                  5       Xh'   GM  XU'   GM     U$ )a  
Communicates the optimizer state for a flat parameter across ranks. All
ranks will hold the entire non-sharded optimizer state on GPU.

If ``N`` is the number of tensor optimizer states in the optimizer state
dict, then the communication complexity is 0 if ``N = 0`` and ``N + 1``
otherwise (where the plus 1 comes from all-gathering the padding per rank).

Args:
    fsdp_param_info (FSDPParamInfo): The FSDP state, the handle, and a
        mapping from FQN to original parameter index.
    flat_param_state (Dict[str, Any]): The entry in the "state" part of the
        optimizer state dict corresponding to the flat parameter.

Returns:
    ConsolidatedOptimState: Consolidated optimizer state for the target
    flat parameter.
r      Nz'compute_device has not been initializedgroup)#r+   r,   
flat_paramrF   rI   rJ   rK   rD   rM   rm   rn   
world_sizesharding_strategyr!   NO_SHARDcompute_devicedevicetypeto_full_param_paddedsize	new_zerosdistall_gather_into_tensorprocess_group_device_handlesynchronizer   nn	Parameter_unpadded_unsharded_sizenumelro   detachclone)r[   r\   
fsdp_statert   r+   rI   rJ   rK   
state_namevaluebuffer_sizetensor_bufferunpadded_numels                r<   ra   ra      s   , !&&J ''22J#%E## *:L **:;
??5!!eiikAo
 %%*//3C3L3LL+0Z(,,8 98 ||  J$=$=$B$BB!:!:; %77<<>K!OO[9M''J,D,D %%113!jAAeg  (5_n'EL$ #5))49LLN4H4H4J%1/4,G <H Lr;   r+   c           
      v   U R                   nU R                  nUR                  n/ n0 nUR                  nUR                  UR
                  UR                  pn	[        U5       GHQ  n0 n[        U	5       H  u  pX;   nU(       d  UR                  U5      nUX~'   OX~   n[        U5      nU(       a  UR                  n[        USS5      (       a<  UR                  c   e[        UUR                  UR                  UR                   5      nO_UR"                  c   e[%        UUR                  UR&                  UR(                  R+                  5       UR"                  UR                   5      nUX'   M     [        U
5       H
  u  nnUX'   M     [        U5       H
  u  nnUX'   M     UR-                  U5        GMT     U$ )a  
Unflattens the communicated optimizer state (given by ``tensor_state``,
``non_tensor_state``, and ``zero_dim_tensor_state``) for a single flat
parameter. This should only be called on the target rank.

Args:
    fsdp_param_info (FSDPParamInfo): The FSDP state, the handle, and a
        mapping from FQN to original parameter index.
    state (_ConsolidatedOptimState): Consolidated optimizer state.

Returns:
    List[Dict[str, Any]]: A :class:`list` holding the entries in the
    "state" part of the optimizer state dict corresponding to the
    unflattened parameters comprising the flat parameter. The final
    optimizer state dict will need to map these entries using the proper
    unflattened parameter IDs.
_use_dtensorF)r+   r,   rt   _num_paramsrI   rJ   rK   rangerD   _get_unflat_viewsnext_optim_state_dict_configgetattr_device_meshr   rank_fsdp_extensionr   r   ru   r   device_countappend)r[   r+   r^   r   r,   rt   rf   flat_param_viewsnum_unflat_paramsrI   rJ   rK   _unflat_state_paramr   flat_tensorviews_generatedviewsrg   
osd_configzero_dim_tensor
non_tensors                         r<   rb   rb      s   , !&&J##F""J/1,."..## *:L $%'3L'A#J(<O"00=/4 ,(4GKE{K'@@
:~u==%22>>>"4#""//"22	#K &33???"3#""--"11>>@"00"22#K .9*9 (B> ,88M+N'J-<* ,O '33C&D"J
-7* 'E!!"45O &P r;   r   rg   rs   c                     S /n[         R                  " U5      S:X  a  [        [        R                  S U5      US'   [         R
                  " USUS9  [         R                  " U5      S:X  a  U$ US   $ )Nr   c                     U R                  5       S:X  a  U R                  5       $ [        U R                  U R                  5      $ rl   )rn   rd   rP   rR   rS   )vs    r<   <lambda>,_broadcast_processed_state.<locals>.<lambda>O  s.    AaeegV3DQWWagg3VVr;   srcrs   )r   get_rankr&   rM   rN   broadcast_object_list)r   rg   rs   objectss       r<   _broadcast_processed_stater   F  sl    
 G}}Uq "LLV


 	wAU;}}Uq qzr;   c                    [         R                  " U5      S:X  aQ  [        U[        R                  5      (       a  UR                  5       S:X  a  U$ UR                  U R                  5      nO[        U[        R                  5      (       a  UR                  5       S:X  d   S5       eU$ [        U[        5      (       d  U$ [        R                  " UR                  UR                  U R                  S9n[         R                  " USUS9  U$ )Nr   zlFor non-zero ranks, a tensor state should have zero dimension, but got the state with shape {state.shape()}.rS   ry   r   )r   r   rc   rM   rN   rn   r{   rx   rP   zerosrR   rS   	broadcast)r   r+   rs   tensors       r<   _broadcast_stater   Y  s     }}Uq %..%))+2BL*334eU\\**99;!# @# LE#455LKKu{{:3L3L
 	NN6q.Mr;   fqnc                 H   U(       d  0 $ U R                   nU R                  R                  nU R                  U   nUR                  U   n[        X#R                  UR                  S9nUR                  (       d  0 $ 0 nUR                  nUR                  n	UR                  5        Hy  u  p[        R                  " U5      (       aU  UR                  5       S:  aA  UR                  [         R"                  :w  a#  UR%                  5       XS-    R'                  5       nXU
'   M{     U$ )z
Shard the optimizer state for the original parameter with the name ``fqn``.
This API should only be used when ``use_orig_params`` is True.
pgry   r   rq   )r+   r,   rt   r-   _shard_param_infosr   r   rx   in_shardintra_param_start_idxintra_param_end_idxitemsrM   rm   rn   rv   r!   rw   flattenr   )r[   r   rg   r   rt   	param_idxshard_param_infonew_optim_stater   r   r   r   s               r<   _shard_orig_param_stater   p  s    	 &&J ''22J--c2I!44Y?$009R9RK $$	&(O,BB*>>(..0
OOE""		a,,0@0I0IIMMO%) eg  ',
# 1 r;   optim_state_dictmodeluse_orig_paramsoptim
rank0_onlyc           	      F   [         R                  " 5         U nSU;  a  U(       d  [        S5      e[        U5      n[	        U5      n[        [        UR                  5       5      5      R                  n	U(       a
  [        XUS9n0 n
US   n[        UR                  5       5      nUR                  5        GH<  u  pUS   nX;  a  M  UR                  U5        U(       aG  U H<  nX   (       d  M  X   R                  5        H  n[        XU   U   US9X   U'   M     M>     US   nX;   Ga  X   nU(       aG  [         R                  " [         R                   R"                  5         [%        UUX   5      nSSS5        O['        UUU5      n[)        [+        U5      S5      nW(       a  UU
U'   OU(       a~  [-        U5      S:X  d   SU S	35       eUb`  UR                  R/                  US5      nUb  [0        R2                  " U5      U
U'   Ox[4        R6                  " S
U SU	R8                   S	35        OP[;        SU S35      e[-        U5      S:X  d   e[)        [+        U5      S5      n[0        R0                  " X   5      U
U'   U(       d  GM  U H_  nX   (       d  M  [=        X   R                  5       5       H2  u  nnU	R8                  S:  a  X   U	 M  UR?                  5       X   U'   M4     Ma     GM?     U HY  nUU   n[A        U[B        RD                  5      (       a  U(       a  U(       a  [        U	UUS9n[0        R0                  " U5      U
U'   M[     [         RF                  " S5        SU;   a  [0        R2                  " US   5      nU
US.$ SU
0$ ! , (       d  f       GN= f)a~  
Flattens the full optimizer state dict, still keying by unflattened parameter
names.

If ``use_orig_params`` is True, each rank will have all FSDP-managed
parameters but some of these parameters may be empty due to the sharding.
For a regular optim.Optimizer, states for those empty parameters will
not be initialized. So, when aggregating the FQNs across ranks, no assert
will be raised on a rank even if it does not have all the states -- it is
valid and FSDP know how to aggregate them. However, FSDP has to ignore
handling those parameters that are not managed by FSDP and do not exist on
the local rank -- it is managed by other parallelism and FSDP does not
know ho to handle/aggregate them.

Note that ``_flatten_tensor_optim_state`` does not need ``optim`` to
flatten/shard the state. However, NamedOptimizer and KeyedOptimizer require
all the states even if the corresponding parameters are empty. To this end,
``optim`` will be used to to get the initial state of the empty parameters.
``optim`` should only be non-None if the ``optim` is KeyedOptimizer or
NamedOptimizer.

Returns:
    Dict[str, Any]: The flattened optimizer state dict.
r+   zO`optim_state_dict` must have the keys "state"to be a valid optimizer state dictrr   r   NTrq   z5use_orig_params is True but there are multiple FQNs, .zoptim_state[z] is not on rankzThe state of z8 is empty. This should happen when use_orig_params=True.Fz,FSDP _flatten_optim_state_dict() profiling: param_groupsr+   r   )$r   reset
ValueErrorr   _get_fqn_to_fsdp_param_infor   itervaluesr+   r   setrB   r   difference_updater   profileType
RESHARDINGr   _flatten_optim_staterV   rZ   lengetcopydeepcopywarningswarnr   RuntimeErrorr8   rd   rc   rM   rN   dump_and_reset)r   r   r   r   r   rs   
unflat_osdparam_to_fqnsfqn_to_fsdp_param_infor   flat_osd_stateunflat_osd_stateall_state_keysparamfqnsr   r   r[   
flat_staterh   r+   param_state
user_stateflat_osd_param_groupss                           r<   _flatten_optim_state_dictr     s   @ !Jj 1
 	
 'u-M8?d188:;<BBJ /
eT
 =?N!'*)..01N$**,1g&((.',"2"7"<"<">J8H"S$9*$EU9$)*5 #?  q'C(49O#++N,?,?,J,JK!8'(-"J LK 2#$

 !td3C &0s# 4yA~ KD6QRS~ $!KKOOE48E(.2mmE.Bs+ *3%/?
?PPQR
 ##C5 ), , 
 t9>!> te4C"&)),<,A"BN3:',/34D4I4O4O4Q/R+J!*,1*= =HOO<M(-j9 0S y -R %c*
j%,,//J?)*jNJ"ii
3s	  !!"PQ # $j.H I'9NOO((K LKs   #N
N 	r   rX   c           	         U R                   nU R                  nUR                  n[        U5      nUS:  d   S5       eUR                  n[        U5      nXh:X  d   SU SU 35       eU V	s/ s H  n	[        X;   5      PM     n
n	[        U
5      (       d  0 $ U V	s/ s H+  n	X;   a!  [        X   UR                  UR                  S9OSPM-     nn	SnU HQ  nUc  M  Uc  [        UR                  5       5      nM&  U[        UR                  5       5      :w  d  ME  [        SU 35      e   Uc   e0 nU GH  nU Vs/ s H  nUb  X   OSPM     nnU Vs/ s H
  nUc  M  UPM     nnU(       d  SX'   MA  S=n=nnU Hb  nU[        R                  " U5      =(       a    UR                  5       S:  -  nU[!        U5      -  nU[        R                  " U5      (       + -  nMd     U Vs1 s H  n[#        U5      iM     nn[        U5      S	:w  d  U(       d"  U(       d  U(       d  [        S
U SU SU 35      eU(       av  [%        UUUUU5      nUR&                  S	:w  aN  UR(                  [*        R,                  :w  a0  [.        R0                  " UUR2                  UR&                  5      u  nnOUnUX'   GM  U(       a  [5        UUU5      X'   GM  U(       d   e[7        UUU5      X'   GM     U$ s  sn	f s  sn	f s  snf s  snf s  snf )a3  
Flattens the optimizer state in ``full_optim_state_dict`` for a single
flat parameter in ``fsdp_param_info`` corresponding to the unflattened
parameter names in ``unflat_param_names``.

Args:
    fsdp_param_info (FSDPParamInfo): The FSDP state, the handle, and a
        mapping from FQN to original parameter index.
    unflat_osd_state (Dict[str, Dict[str, Any]]): The "state" part of the
        optimizer state dict corresponding to the unflattened parameters.
    unflat_param_names (List[str]): A :class:`list` of unflattened
        parameter names corresponding to the flat parameter ``flat_param``.

Returns:
    Dict[str, Any]: A :class:`dict` mapping state names to their values for
    a particular flat parameter. The sharded optimizer state dict's "state"
    part will map a key to this returned value.
r   zNExpects at least one unflattened parameter corresponding to the flat parameterzExpects z shapes but got r   Nz@Differing optimizer state names for the unflattened parameters: Trq   z*Differing optimizer state types for state z	, values z", and unflattened parameter names )r+   r,   rt   r   _shapesr9   anyr   r   rx   r   rB   r   rM   rm   rn   ro   rz   _flatten_tensor_optim_stateru   rv   r!   rw   r   
_get_shardr   $_flatten_zero_dim_tensor_optim_state_flatten_non_tensor_optim_state)r[   r   rX   r   r,   rt   r   unflat_param_shapesnum_unflat_param_shapesunflat_param_name	has_stateunflat_param_statesstate_namesrf   r   r   state_valuesr   non_none_state_valuesare_pos_dim_tensorsare_zero_dim_tensorsare_non_tensorstypesr   sharded_flat_tensorr   s                             r<   r   r   %  s   . !&&J##F""J./q  X  %,,!"567 
$%%56M5NO7 "4!3 	23!3   y>>	 "4	 "4 0 	/'',,	
 	 "4  	 K1%05578Kc"4"9"9";<< ##5"68  2 """ 57J!
 ':
&9" /A.L*RVV&9 	 
 -9 JLqAL J$%)J"GKKK2_&A5??1#5#E!%%'A+E $7$:: 5??1#555O ' #88"7Qa"78u:?#7?<ZL I/0 1+,. 
 5"#K %%*004D4M4MM)8)C)COO))*&#Q '2#%8J"!%I"&J" #"?%D"&J"m "x C	8
 !K 9s$   *K&2K+7K0K5K5"K:r   pos_dim_tensorsr   r,   c                 v   UR                   nU Vs/ s H	  ofc  M  UPM     nnU Vs1 s H  ofR                  iM     nn[        U5      S:w  a  [        SU SU  SU 35      e[	        [        U5      5      n	[        X5       HR  u  pU
c  [        U5      S:X  a  [        S5      eU
c  M'  U
R                  U:w  d  M9  [        SU
R                   SU 35      e   [        R                  " S	5      n[        X5       VVs/ s HW  u  pUb%  [        R                  " UR                  U5      5      O)[        R                  " [        R                  " UU	US
95      PMY     nnnUR                  XR                  5      nUR                  nUR                  U:X  d   SUR                   SU 35       eU$ s  snf s  snf s  snnf )a  
Flattens the positive-dimension tensor optimizer state given by the values
``tensors`` for the state ``state_name`` for a single flat parameter
from ``handle`` corresponding to the unflattened parameter names
``unflat_param_names`` and unflatted parameter shapes
``unflat_param_shapes``. This flattens each unflattened parameter's tensor
state into one tensor.

NOTE: We use zero tensors for any unflattened parameters without state
since some value is required to fill those entries. This assumes that the
zero tensor is mathematically equivalent to having no state, which is true
for Adam's "exp_avg" and "exp_avg_sq" but may not be true for all
optimizers.

Args:
    state_name (str): Optimizer state name.
    pos_dim_tensors (List[torch.Tensor]): Positive-dimension tensor
        optimizer state values for the unflattened parameters corresponding
        to the single flat parameter.
    unflat_param_names (List[str]): A :class:`list` of unflattened
        parameter names corresponding to the single flat parameter.
    unflat_param_shapes (List[torch.Size]): Unflattened parameter shapes
        corresponding to the single flat parameter.
    handle (FlatParamHandle): The flat parameter's handle.

Returns:
    torch.Tensor: A flat tensor containing the optimizer state
    corresponding to ``state_name`` constructed by concatenating the
    unflattened parameter tensor states in ``pos_dim_tensors`` (using zero
    tensors for any unflattened parameters without the state).
rq   zAll unflattened parameters comprising a single flat parameter must have positive-dimension tensor state with the same dtype but got dtypes  for state ! and unflattened parameter names r   z6Flattening a zero-dimension parameter is not supportedzBTensor optimizer state does not have same shape as its parameter:  rd   )r}   rS   ry   ztensor optim state: z flat parameter: )rt   rS   r   r   r   r   ziprR   rM   ry   r   r{   r   flatten_tensors_aligned_numelr   )r   r  rX   r   r,   rt   tnon_none_tensorsdtypesrS   r   rR   
cpu_devicestate_valuetensors_to_flattenr   flat_param_shapes                    r<   r   r     s   L ""J#2D?a?D/0/!gg/F0
6{a))/J< H++=*>@
 	
 fE_B>c%jAoUVVFLLE$9$ll^1UG5 	 C e$J #&o"K #LK " 	knnZ01]]KK!
	
 #L   (();=R=RSK!:: 00 
{0011BCSBTU0 U E0.s   F+F+F0:AF5zero_dim_tensorsc           
         U Vs/ s H	  o3c  M  UPM     nnU Vs1 s H  o3b  UR                  5       OSiM     nnU Vs1 s H  o3b  UR                  OSiM     nn[        U5      [        U5      :w  d  [        U5      S:w  d  [        U5      S:w  a  [        SU SU SU  SU 35      e[	        [        U5      5      n[	        [        U5      5      n[        R                  " Xx[        R                  " S5      S9$ s  snf s  snf s  snf )	a,  
Flattens the zero-dimension tensor optimizer state given by the values
``zero_dim_tensors`` for the state ``state_name`` for a single flat
parameter corresponding to the unflattened parameter names
``unflat_param_names`` by enforcing that all tensors are the same and using
that common value.

NOTE: The requirement that the tensors are the same across all unflattened
parameters comprising the flat parameter is needed to maintain the
invariant that FSDP performs the same computation as its non-sharded
equivalent. This means that none of the unflattened parameters can be
missing this state since imposing a value may differ from having no value.
For example, for Adam's "step", no value means maximum bias correction,
while having some positive value means less bias correction.

Args:
    state_name (str): Optimizer state name.
    zero_dim_tensors (List[torch.Tensor]): Zero-dimension optimizer state
        for the unflattened parameters corresponding to the single
        flat parameter.
    unflat_param_names (List[str]): A :class:`list` of unflattened
        parameter names corresponding to the single flat parameter.

Returns:
    torch.Tensor: A zero-dimensional tensor giving the value of the state
    ``state_name`` for all unflattened parameters corresponding to the
    names ``unflat_param_names``.
Nrq   All unflattened parameters comprising a single flat parameter must have scalar state with the same value and dtype but got values z and dtypes r  r  rd   r   )	itemrS   r   r   r   r   rM   r   ry   )	r   r  rX   r  r  
values_setr  r   rS   s	            r<   r   r     s   B $4E#3a#3E?OP?O!m!&&(5?OJP:JK:JQaggD0:JFK%5!66z?av;!(\fX[l;!"	$
 	
 j!"EfE<<5<<3FGG% FPKs   C9C9C>Dnon_tensorsc                     U Vs/ s H	  o3c  M  UPM     nn[        U5      n[        U5      [        U5      :w  d  [        U5      S:w  a  [        SU SU  SU 35      e[        [	        U5      5      nU$ s  snf )a@  
Flattens the non-tensor optimizer state given by the values ``non_tensors``
for the state ``state_name`` for a single flat parameter corresponding
to the unflattened parameter names ``unflat_param_names`` by enforcing that
all values are the same and using that common value.

See the note in :func:`_flatten_zero_dim_tensor_optim_state`.

Args:
    state_name (str): Optimizer state name.
    non_tensors (List[Any]): Non-tensor optimizer state for the unflattened
        parameters corresponding to the single flat parameter.
    unflat_param_names (List[str]): A :class:`list` of unflattened
        parameter names corresponding to the single flat parameter.

Returns:
    Any: A non-tensor giving the value of the state ``state_name`` for all
    unflattened parameters corresponding to the names
    ``unflat_param_names``.
rq   r  r  z" and  unflattened parameter names )r   r   r   r   r   )r   r  rX   ntnon_none_non_tensorsnon_tensor_setr   s          r<   r   r   8  s    2 *5G2BG%N
 C$44N8Kq8P,-[ E++=*>@
 	
 d>*+J Hs
   A3A3sharded_osdoptim_inputusing_optim_inputis_named_optimizerc           
      6   [        U5      n[        U5      n[        [        [        R
                  [        [        [        4   4   U(       a  [        X5      O[        X!XVU5      5      n[        U5      [        U5      ::  d   e0 n	0 n
UR                  5        H*  u  pX;  a  M  X   nX[        U5      '   U H  nXU'   M	     M,     U S   n0 nUR                  5        HM  u  nn[        U[        5      (       a  UUU'   M"  U	R                  UR                   UR                   5      nUUU'   MO     SU ;   a`  / nU S    HP  n["        R$                  " U5      n['        US    Vs1 s H  nX   iM	     sn5      nUUS'   UR)                  U5        MR     UUS.$ SU0$ s  snf )a  
Rekeys the optimizer state dict from unflattened parameter names to flat
parameter IDs according to the calling rank's ``optim``, which may be
different across ranks. In particular, the unflattened parameter names are
represented as :class:`_OptimStateKey` s.
r+   r   paramsr   )r   _get_flat_param_to_fqnr   r5   r   r   r   r7   r6   '_get_param_to_param_id_from_optim_input_get_param_to_param_keyr   r   rZ   rc   r   rX   r   r   rA   r   )r  r   r   r  r  r  r   flat_param_to_fqnparam_to_param_key$unflat_param_names_to_flat_param_key#unflat_param_name_to_flat_param_keyr   rX   flat_param_keyr   sharded_osd_staterekeyed_osd_staterh   r   rekeyed_osd_param_groupsunflat_param_groupflat_param_groupflat_param_keyss                          r<   _rekey_sharded_optim_state_dictr0  _  s   & 'u-M.u5>BR\\5c?*+ ! 4EG(0AR	? !"c-&8888 	 )
 	 ( &3%8%8%:!*+2JXU3E-FG!3ES0AB "4 &; $G,46-335[c3%0c"=AA""C$:$:
 -8.) 6 $9; "-n"=#}}-?@$ .@-I-I) 8J-IO *9X&$++,<= #> +<TUU*++s   F
c                    Uc"  [        [        U R                  5       5      5      $  [        [        [
        R                     [	        U5      5      n[        U5      S:X  a  [        S5      eSnSnU H3  nU[        U[        R                  5      -  nU[        U[         5      -  nM5     U(       d  U(       d  [        S5      eU(       a  [        [        U5      5      $ U(       d   e/ nU H*  nSU;   n	U	(       d   S5       eUR                  US   5        M,     [        [        U5      5      $ ! [         a  n[        SU 35      UeSnAff = f)	a  
Constructs a mapping from parameter IDs to parameters. This may be used
both for models with ``FlatParameter`` s and without.

NOTE: This method is only preserved for backward compatibility. The method
:meth:`_get_param_key_to_param` is the preferred code path that does not
rely on ``optim_input``.

NOTE: We critically assume that, whether the optimizer input is a list of
parameters or a list of parameter groups, :class:`torch.optim.Optimizer`
enumerates the parameter IDs in order. In other words, for a parameter list
input, the parameter IDs should be in that list order, and for a parameter
groups input, the parameter IDs should be in order within each parameter
group and in order across parameter groups.

Args:
    model (nn.Module): Model whose parameters are passed into the
        optimizer.
    optim_input (Optional[Union[List[Dict[str, Any]],
    Iterable[nn.Parameter]]]): Input passed into the optimizer
        representing either a :class:`list` of parameter groups or an
        iterable of parameters; if ``None``, then this method assumes the
        input was ``model.parameters()``. (Default: ``None``)

Returns:
    List[nn.Parameter]: Mapping from parameter IDs to parameters,
    where the parameter ID is implicitly the index in the :class:`list`.
NzCOptimizer input should be an iterable of Tensors or dicts, but got r   z#Optimizer input should not be emptyTz9Optimizer input should be an iterable of Tensors or dictsr!  zNA parameter group should map "params" to a list of the parameters in the group)r5   	enumerate
parametersr   r8   r   r   	TypeErrorr   r   rc   rM   rN   extend)
r   r  r!  eall_tensors	all_dictsr   param_id_to_paramparam_grouphas_params_keys
             r<   '_get_param_id_to_param_from_optim_inputr<    sP   N Ie..0122d2<<(${*;< 6{a>?? KIz%66Zt,,	  ySTTIf%&&9,.![0 	
&	
~ 	  X!67  	+,--;  "m%
 	s   *D+ +
E	5EE	c                 x    S nS n0 n[        U UU[        U 5       VVs/ s H  u  pEUPM	     snnU5      $ s  snnf )a  
Constructs a mapping from ``FlatParameter`` to a cleaned (devoid of prefixes
from wrappers) fully qualified name (FQN). Note that this FQN is "non-canonical"
because ``FlatParameter``  s do not come from the original module but are
registered only after FSDP has been applied. This function returns the FSDP-given
name for the ``FlatParameter`` (usually module._flat_param) as opposed to the
canonical FQNs returned for ``FlatParameter`` s in ``_common_utils._get_param_to_fqns(...)``).

Consequently, this function will only return a non-empty mapping if FSDP was
applied with ``use_orig_params=False`` as, otherwise, the original parameters
are used within the module and there would be no ``FlatParameter`` s in the module.

c                 z    [        U SS9 H-  u  pE[        U[        5      (       d  M  [        X-   5      nXcU'   M/     g )NF)recurse)r   rc   r   r   )moduleprefix
tree_levelr%  
param_namer   r   s          r<   	module_fn)_get_flat_param_to_fqn.<locals>.module_fn	  s@    !BE"
J e]33#F$78C'*e$"
r;   c                     U $ r@   r/   )r%  s    r<   	return_fn)_get_flat_param_to_fqn.<locals>.return_fn        r;   r   r   )r   rD  rG  flat_param_to_fqn_retr   r   s         r<   r"  r"    sN    +! 79<UCDCCD  	E   6r   r%  c                    0 nU(       a3  Ub  Uc   S5       eUc   e[        U5       H  u  pgXe[        U5      '   M     0 nSn	U R                   Hd  n
U(       aE  U
S    H:  nUc   eX;   a  XK   nOUc   e[        X;   5      S:X  d   eX;   S   n XV   nXU'   M<     MO  U
S    H  nXU	'   U	S-  n	M     Mf     U$ ! [         a/  n[	        SU S[        UR                  5       5       S35      UeSnAff = f)	z
Constructs a mapping from parameter keys to parameters. For the regular
optimizers, the keys are parameter IDs. For NamedOptimizer, the keys
are FQNs. This API may be used both for models with ``FlatParameter`` s and
without.
NzDThe optimizer is a NamedOptimizer, `param_to_fqns` must not be None.r   r!  rq   zCan't find z from r   )r   r   r   r   KeyErrorr8   rB   )r   r   r  r   r%  clean_fqn_to_curr_fqnrh   r   param_key_to_parampidr:  r   r6  s                r<   _get_param_key_to_paramrR    sU    -/(->-J 	
R	
J    7>FC<?"3C"89 ? ?A
C))$X.(444-+2C(444}34999'.q1C/4C
 +03'! /$ %X.*/3'q /) *0    "%cU&6K6P6P6R1S0TTUVs   B77
C0*C++C0c                 r    [        XX#U5      nUR                  5        VVs0 s H  u  pgXv_M	     snn$ s  snnf )z
Constructs the inverse mapping of :func:`_get_param_key_to_param`. This API
only supports the case where `optim` is a regular optimizer, not NamedOptimizer.
So the parameter keys will be parameter ids.
)rR  r   )r   r   r  r   r%  r9  param_idr   s           r<   r$  r$  R  sB     0(9J 4E3J3J3LM3LEO3LMMMs   3c                 n    [        X5      nUR                  5        VVs0 s H  u  p4XC_M	     snn$ s  snnf )zRConstructs the inverse mapping of :func:`_get_param_id_to_param_from_optim_input`.)r<  r   )r   r  r9  rT  r   s        r<   r#  r#  d  s7     @S3D3J3J3LM3LEO3LMMMs   1r0_optim_state_keysoptim_state_key_to_param_keyrP  c           	         / nU  HR  nXQ;  a  UR                  U5        M  X   n[        U[        5      (       d  M6  US:  a  U[        U5      :  a  MM   S5       e   [	        U5      n[
        R                  " [        U5      /[
        R                  US9n[        R                  " XS9  UR                  5       S:  a  [        [        R                  " U5      5       V	s/ s H  n	S PM     n
n	[        R                  " XUS9  Sn[        U
5       HR  u  p[        [         ["           U5      n[        U5      S:  d  M-  USU SU Vs/ s H  oR$                  PM     sn 3-  nMT     ['        U5      eg s  sn	f s  snf )Nr   z+Check the `param_key_to_param` constructionr   rr   zFSDP currently requires each rank to have at least the optimizer states needed by rank 0's optimizer but some ranks are missing some of those statesz
Rank z' is missing states for the parameters: )r   rc   r7   r   r   rM   r   int32r   
all_reducer  r   get_world_sizeall_gather_objectr2  r   r8   rV   rX   r   )rV  rW  rP  rs   missing_keysr0_optim_state_key	param_keyry   num_missingr   obj_list	error_msgr   rB   rh   s                  r<   _check_missing_keys_on_rankrc  r  sk    *,L1A  230D	i%%>i#6H2I&I =I 2 $E*F,,L 12%++fUKOOK-A"'(;(;E(B"CD"CQD"CDxUC/ 	
 $H-JD^,d3D4y1}dV#J:>?$3..$?@B	 . 9%% D @s   E/E4r   
merge_keysc                    [         R                  " U5      n0 n/ nUR                  5        H  u  pXS   ;  a  M  X:   n[        U
[        5      nU(       a,  US   U;   d#   US   [        UR                  5       5      45       eUS   U;   n[        [        U5      US9nUS:X  d  U(       a  UR                  U5        XU'   M     U(       as  [        [         R                  " U5      5       Vs/ s H  n/ PM     nn[         R                  " XUS9  / [        R                  " U5      Qn[        [!        U5      5      nX4$ US:X  a  U/OS/n[         R"                  " USUS9  US   c   eUS   n[%        UUUU5        X4$ s  snf )a,  
Construct the local mapping between the ``_OptimStateKey`` and parameter keys
and all the ``_OptimStateKey`` across ranks. If ``merge_keys`` is False, rank0
must contain all the ``_OptimStateKey``, an exception will be raised otherwise.
Note that ``merge_keys`` should equal to ``use_orig_params``.
r+   r   )rX   rY   rr   Nr   )r   r   r   rc   r   r8   rB   rV   rZ   r   r   r[  r\  r	   from_iterablerA   r   r   rc  )r   rs   rP  r   r   rd  r   rW  all_optim_state_keysr_  r   r   rY   optim_state_keyr   all_keysmerge_all_optim_state_keyskey_obj_lists                     r<   _map_param_key_to_optim_keysrl    s    ==DJL 13.446	 W55#$UM:744 Q+00237 4 q'%;;($T{+
 19
 ''88A_5' 7* d11%890
91B9 	 0
 	xUK%Eu':':8'D%E"%c*D&EF  == '+ai!"dV 	 	""<QeDA***+A# (		
  ==)0
s   *E<
state_dictc                    / nU S    Hp  n[         R                  " U5      nUS    Vs/ s H  nX   PM	     nnU Vs/ s H  oU   PM	     n	n/ [        R                  " U	5      QUS'   UR	                  U5        Mr     U$ s  snf s  snf )Nr   r!  )r   r   r	   rf  r   )
rm  rP  r   r   r.  r-  r)  param_group_paramsr   nested_unflat_param_namess
             r<   _unflatten_param_groupsrq    s    
 *,L&~6!]]+;< #38"<
"< ."< 	 

 /A%
.@U% .@ 	" %
(
  !:;(
8$ 	./ 7 
%
s
   A>Bc                     U R                  SS5      nU(       d  g [        [        UR                  5       5      5      n[        U[        5      $ ! [         a  n[	        U 5      UeSnAff = f)z
Returns whether the state_dict is from a NamedOptimizer.
This function checks that the keys in the state_dict['state'] are strings
(which usually are FQNs) versus integers (which usually refer to param_ids
from a vanilla torch.optim.Optimizer).
r+   NF)r   r   r   rB   	Exceptionrc   r6   )r   r+   rh   r6  s       r<   _is_named_optimizerrt    sg       $/E 14

%& c3  1()q01s   "A 
A)A$$A)c                   f    \ rS rSr% \\\4   \S'   \\\R                  4   \S'   \\\
4   \S'   Srg)	StateInfoi  tensorsscalar_tensorsr  r/   N)r0   r1   r2   r3   r5   r6   rP   r4   rM   rN   r
   r:   r/   r;   r<   rv  rv    s7     #(())ell*++c3hr;   rv  input_statesc                 0   0 n[        U R                  5       Vs/ s H  n0 PM     nnUR                  5        H  u  pV[        0 0 0 5      n[	        U5       H  u  p[
        R                  " U	5      (       ab  U	R                  5       S:X  a  U	R                  5       UR                  U'   MS  [        U	R                  U	R                  5      UR                  U'   M  XR                  U'   M     XrU'   M     [        R                   " UUU R"                  S9  U$ s  snf )z
Given the ``input_states``, allgather StateInfo for each state. The function
uses all_gather_object to gather StateInfo so no GPU tensors are sent.
r   rr   )r   ru   r   rv  rD   rM   rm   rn   rd   rx  rP   rR   rS   rw  r  r   r\  r   )
r   ry  processed_state_dictr   gathered_state_infor   rg   processed_stater   r   s
             r<   _allgather_state_infor~    s    24*//070q0  7 )..0#BB/!-k!:Ju%%99;!#AFO22:>:KU[[;O++J7 ;@++J7 "; %4S! 1 	&&
 17s   Dr|  output_statesc                 r   0 nUR                  5        GH  u  pVU Vs/ s H  owU   PM	     nn[        U V	V
s1 s H%  oR                  R                  5         H  oiM     M'     sn
n	5      n[	        5       nSnU GH9  n/ n[	        5       n[        U5       H  u  nnUR                  S5        UR                  R                  US5      nUbC  UR                  R                  5       US'   U(       d  UR                  nOUUR                  :X  d   eUS   S:X  d  M  UR                  U5        M     U(       a  UU:X  d   eUnX;  a  U R                   Vs/ s H  nSPM     snXN'   X%   R                  US5      nUb%  UR                  U R                  R                  5      nUXN   U R                  U   '   GM<     [        U5       H  u  nnUU;   a  M  UR                   R                  5        H=  u  nnUR                  US5      nUb  UU:X  d   SU SU SU S3SU 3-   5       eUUU'   M?     UR"                  R                  5        HS  u  nnUR                  US5      nUb3  [$        R&                  " UU5      (       d   SU SU SU S3SU 3-   5       eUUU'   MU     M     GM     WU4$ s  snf s  sn
n	f s  snf )	a  
Given the ``gathered_state_info`` and ``input_states``, the API converted
the StateInfo into the original state if the state is not a non-scalar
tensor. For a multi-dimensional tensor, the local state will be stored in
``state_buffer`` in a correct order for later allgather purpose.
Nr   zRank z has different values for z: r   z Other ranks: )r   rA   rw  rB   r   r2  r   r   rR   r   rS   addr-   r{   r+   rx   r  rx  rM   equal)r[   r|  ry  r  state_buffersr   gathered_states
state_infor+   nall_tensor_statesempty_ranksrS   r   numels_empty_ranksr   object_stateinfor   local_statenamenon_tensor_valuecurr_non_tensor_valuescalar_tensor_valuecurr_scalar_tensor_values                              r<   _convert_all_state_infor  1  s    >@M,224&9:&9f&9
:"&EJ50B0B0D1Q0DQJE
 !$'+ ,JF%(UL&/
&;"la #++//
DA#!%!1!1!3F2J  $

$

222":? $$T* '< #k\&AAA&K."1"?"?-"?QD"?-) '+//
DAK &)nn_-B-B-Q-QRLWM%o&C&CC&HI7 ,@ #,J"7D,{"*6*B*B*H*H*J&&(6(:(:4(F%)1,0@@ D6!;D6DTCUUVW&'<&=>?	A (8t$ +K .:-H-H-N-N-P))+9+=+=dD+I(/75;;')A< <  D6!;D6DWCXXYZ&'?&@AB  (;t$ .Q #8S  5B -A ;E0-s   J),J.J4c           	         U(       d  gU R                   R                  nU R                  nUR                  5        GHh  u  pX   n
U R                  U   n[        U
[        5      (       a  U
R                  S   nU[        5       :w  a  UR                  nU
R                  [        5       4S9  [        UR                  U   5      nX==   U
R                  R                  S5      -  ss'   [        R                   " U5      nU
R#                  U5      n
O=U
R#                  UR                  U   5      n
OU
R#                  UR                  U   5      n
U(       a  UR$                  n['        USS5      (       a<  UR(                  c   e[+        U
UR,                  UR(                  UR.                  5      n
OUR0                  c   e[3        U
UR,                  UR4                  UR6                  R9                  5       UR0                  UR.                  5      n
ODU(       d=  [:        R<                  " S5         U
R?                  5       RA                  5       n
SSS5        U(       aG  [:        R<                  " [:        RB                  RD                  5         U
RG                  5       n
SSS5        XU'   GMk     g! , (       d  f       Nf= f! , (       d  f       N)= f)a  
Given a output state dict, ``output_states``, which the keys are FQNs to the
original parameters (not FlatParameters nor parmeter ID), and the values
are gathered states, unflatten the states to the original dimensions.

This function performs the unflattening process in-place.
Nr   )
placementsr   Fr   )$r,   rt   r+   r   r-   rc   r$   r  r%   rn   redistributer8   r   device_meshr}   rM   rT   reshaper   r   r   r   r   r   r   r   ru   r   r   r   r   r   r   r   D2Hrd   )r[   r  r   r^   r]   r_   rt   r   r   r  r   r   	placementplacement_dimreshape_sizer   s                   r<   _unflatten_orig_param_statesr    sF     ''22J &&J,224*#11#6	 eW%%((+I IK' )""y{n"=#J$6$6y$AB+u/@/@/E/Ea/HH+$zz,7l3 j&8&8&CD MM*"4"4Y"?@E#<<Jz>599!..:::*OO++..	 "//;;;)OO))--::<,,.. ''0,,. 1 ''(;(;(?(?@		 A%*z"e  5X 10 A@s   *J)J:)
J7	:
K	c                 
   U R                   nUR                  S:X  a_  [        R                  " 5       [        R                  R
                  :X  a.  [        R                  SUR                  R                  5       5        UR                  5        Vs0 s H  ow0 _M     nn[        XX(5      u  p[        U
5      S:X  a  U$ U R                  R                  5        VVs/ s H  u  p{Xx;   a  SOSPM     nnnU R                  R                   n["        R$                  " [&        R(                  XR*                  S9nU" UR,                  5      nUR                  R/                  5         U
R                  5        GH  u  nn/ nUR                  UR0                  R3                  5       -  nUUR0                  R3                  5       -   S-
  nSu  nn[5        UR6                  UR8                  5       GH  u  nnU(       + =(       a%    U R:                  U   (       + =(       a
    UU   (       + nU(       d  U(       a  UUU-   S-
  nnUUs=::  a  U::  a  O  OUU:  a  UU-
  S-   OUU-
  S-   nOCUUs=::  a  U::  a  O  OUU::  a  UU-
  S-   OUU-
  S-   nOUUs=:  a  Us=::  a  U:  a  O  OUnOSnU(       a  UR=                  U" U5      5        U(       d7  UU   b,  UR=                  [?        [&        R@                  UU   5      5        US-  nUU-  nGM      UR0                  R3                  5       [C        S	 U 5       5      -
  nURD                  U:X  dF   S
URD                   SU SUR0                  R3                  5        SUR6                   SU SU S35       eUS:  a  UR=                  U" U5      5        [&        RF                  " U5      nUR3                  5       URH                  -  UR3                  5       :X  d   S5       eUR                  R/                  5         [J        RL                  " [J        RN                  RP                  5         [        RR                  " UUURT                  S9  UR                  R/                  5         SSS5        USURV                  R3                  5        nU R                  n U RY                  U5      n![        U!5      [        U R                  5      :X  d   S5       eU R                  R                  5        H*  u  p{U R:                  U   (       d  Xx;   d  M   U!U   X   U'   M,     [[        U UUUUU5        GM     AU$ s  snf s  snnf ! , (       d  f       N= f)z
Given the ``gathered_state_info`` and ``input_states``, the API allgathers
all tensor states and restore non-tensor states from ``gathered_state_info``.
r   z@Memory Summary before calling to _allgather_orig_param_states %sTFr   rq   )r   r   Nc              3   @   #    U  H  oR                  5       v   M     g 7fr@   )r   ).0r  s     r<   	<genexpr>/_allgather_orig_param_states.<locals>.<genexpr>>  s     1=a		=s   zLManually calculated _sharded_numel_padded is incorrect. _shard_numel_padded=z, shard_numel_padded=z, _sharded_size.numel=z, _numels_with_padding=z, begin=z, end=,zThe size of local shard times the world size should equal to the gathered tensor size. The inconsistency may be from a bug of FlatParameter's metadata or the reconstruction logic in optimizer state dict.rr   zThe number of parameters from FlatParameter is not consistent to the number of states used by optimizer state dict reconstruction logic.).r+   r   r   get_debug_level
DebugLevelDETAILloggerr  r   memory_summaryrB   r  r   r-   r   r,   rt   	functoolspartialrM   emptyrx   _padded_unsharded_sizer   _sharded_sizer   r  _numels_with_padding_is_padding_maskr.   r   r   rN   sum_shard_numel_paddedcatru   r   r   r   	ALLGATHERr   r   r   _get_unflat_views_alignedr  )"r[   r|  ry  r^   r]   r_   r   r   r  rS   r  idxhas_state_paramsrt   
empty_funcgathered_tensorr   bufferslocal_buffersbeginend
mem_offsetr   r   
is_paddingfrozen_and_no_statepadding_beginpadding_endpadding_lenshard_numel_paddedlocal_shardunpadded_tensorflat_param_handleorig_statess"                                     r<   _allgather_orig_param_statesr    sG    !&&J! 4 4 6$//:P:P PN%%446	

 DPCTCTCV/WCVCRCVM/W2lE =Q (55;;=$=HC $%/=  $ !''22J""5)B)BJ !!B!BCO))+,224
G,.*":":"@"@"BBj..4466: $
I!$++Z-H-H"
E: '1. ##77	BB 4(33  
 0
 .8e9Ka9O{ E8[8
 +- $e+a/ 5[1_  
 #c8[8
 !M1 m+a/ 5[1_  
 ]?k?C? #(K"#K!((K)@A 9%1!((ellGI<N)OPQ	%Jq"
t (55;;=1=11
 --1CC 	
##-#A#A"B C""4!5 6##-#;#;#A#A#C"D E$$.$C$C#D EG6#a)	
C !  ,>!?@ii.  "Z%:%::o>S>S>UU 	
	
U 	!!--/##N$7$7$A$AB''J4L4L %%113 C **WJ,O,O,U,U,WX+22'AA/R;3'D'D#EE 	
	
E
 (55;;=HC22373;O1<S1A":. > 	%	
W  5h 	e 0X$X CBs   U U:;U
U	c                    U R                   nUR                  S:X  d  UR                  [        R                  :X  a  U(       a  U$ 0 $ [
        R                  " [
        R                  R                  5         [
        R                  " [
        R                  R                  5         [        XQ5      nSSS5        [        U WUUUU5      nSSS5        U(       a  U R                  R                  5        Hm  u  pUW;   a  M  U R                  U	   (       d  M#  [        U S[!        U R                  R#                  5       5       S[!        UR#                  5       5       S35      e   W$ 0 $ ! , (       d  f       N= f! , (       d  f       N= f)a"  
Given a optimizer state dict, ``input_states``, which the keys are FQNs to the
original parameters (not FlatParameters nor parmeter ID), gather all the
states and unflatten them to the original dimensions. Note that all the
params referred by the ``input_states`` must be managed by FSDP.
rq   NzB is not in the output state. The FSDPParamInfo has the param keys z, while the output_states has the param keys r   )r+   ru   rv   r!   rw   r   r   r   r   ALLGATHER_OBJr~  r  r-   r   r.   r   rA   rB   )
r[   ry  r^   r]   r_   r   r|  r  rh   r  s
             r<   _gather_all_orig_param_stater  t  sU    !&&J"''+;+D+DD&|.B.			 3 3 > >	?##N$7$7$E$EF"7
"Q G4
 
@ '55;;=HCm#"66s;% 8/77<<>?@ A8-,,./0	3  > 	5 GF 
@	?s$   4/E6#E%/E6%
E3	/E66
Frg  c                 8   0 n0 nU  GHt  n	UR                  U	S 5      n
U
c  U	R                  (       d  M,  U	R                  (       a^  U	R                  S   nUR                  US 5      nUc  Mc  U
c  0 OX:   n[        U5      U;  a  0 U[        U5      '   X[        U5         U'   M  U(       d  M  [	        U	R                  5      S:X  d   eU	R                  S   n[
        R                  " S5         [        [        [        [        4   U
5      n
[        R                  " X:   5      X~'   U(       aI  [        X~   5       H8  u  nn[        R                  " U5      (       d  M#  UR                  5       X~   U'   M:     S S S 5        GMw     UR!                  5        H  n[#        [%        UR'                  5       5      5      nX+   n[	        UR(                  5      S:  d   S5       eUR*                  R-                  5        Hn  u  nnUU;   a  M  UR(                  U   (       d  M$  [/        U S[1        UR*                  R'                  5       5       S[1        UR'                  5       5       S35      e   UR3                  [5        UUUUU5      5        M     U$ ! , (       d  f       GM  = f)Nr   rq   none_fsdp_managed_copyzgWith use_orig_params, FSDPParamInfo should have requires_grad information. However, the length is zero.zE is not in the optimizer state. The FSDPParamInfo has the param keys z( while the optimizer has the param keys r   )r   rY   rX   idr   r   r   r   r   r6   r7   r   rD   rM   rm   rd   r   r   r   rB   r.   r-   r   r   rA   updater  )rg  rW  r   r   r]   r^   r_   fsdp_osd_state
all_statesrh  r_  r   r[   r+   r   r   r   _all_statesrh   r  s                       r<   _convert_state_with_orig_paramsr    s    &(N
 -/J 0+G+K+KT,
	 _%D%D**!44Q7C488dCO& #+B1A1LE/"*424
2o./38r/*+C0W99:a??? / B B1 E''(@A sCx)<	48II$/51 -9&9.)
E  %u55$HM		9*E. BA7 0T "((*4((*+,05?667!; 	
8	
; (55;;=HCk!"66s;% 8/77<<>?@ A4+**,-.a	1  > 	(	
' +: Y BAs   +BJ		
J	c                    0 nU  GHM  nUR                  US 5      n	U	c   SU SU	 35       eUR                  (       av  UR                  S   n
X*   n[        UX9   UUU5      nU(       aI  [	        U5      [	        UR                  5      :X  d   e[        UR                  U5       H	  u  pXU'   M     M  M  U(       d  M  [	        UR                  5      S:X  d   eUR                  S   n[        R                  " X9   5      X}'   U(       d  GM  [        X}   5       H8  u  nn[        R                  " U5      (       d  M#  UR                  5       X}   U'   M:     GMP     U$ )NzQIf use_orig_params is False, we must be able to find the corresponding param id. r  r   rq   )r   rY   rX   ri   r   r  r   rD   rM   rm   rd   )rg  rW  r   r   r]   r^   r_   r  rh  r_  r   r[   unflat_stater   rf   r   r   s                    r<   _convert_state_with_flat_paramsr    s    &(N 0+G+K+KT,
	 $ 	
''6&7qE	
$
 ** "44Q7C49O1 +L <(C0R0R,SSSS=@#66 >9% 9K#45	>  W99:a??? / B B1 E04		:J:U0VN-{)5"5*%J !??511 DIIIKN5jA*G 0T r;   c
                 P   [         R                  " 5         [        5       n
U
R                  [         R                  " [         R
                  R                  5      5        [        [        R                  " U 5      5        U(       + =(       d"    [        R                  " U5      S:H  =(       d    Un[         R                  " S5         [        U 5      n[        U 5      n[        U5      n[        [         ["        [$        [&        4   [(        R*                  4   U(       a  [-        X5      O[/        XXU5      5      n[1        U 5      nSSS5        [         R                  " S5         [3        UUWWWUS9u  nnSSS5        [         R                  " S5         U(       a  [4        O[6        nU" WWWUS   UUU	5      nSSS5        U(       d  0 $ SW0n[9        WR;                  5       5      nUS   R=                  5        H=  u  nnUU;   a  M  UU;   a  M  UW;   a  M  [>        R@                  " SU S	35        UUU'   M?     S
U;   a  [C        UWW5      US
'   U
RE                  5         [         RF                  " S5        U$ ! , (       d  f       GN:= f! , (       d  f       GN= f! , (       d  f       N= f)av	  
Consolidates the optimizer state and returns it as a :class:`dict`
following the convention of :meth:`torch.optim.Optimizer.state_dict`,
i.e. with keys ``"state"`` and ``"param_groups"``.
The flat parameters in ``FSDP`` modules contained in ``model`` are mapped
back to their unflattened parameters.

Parameter keys are not well-defined. For a regular optimizer, the optimizer
state_dict contains a mapping from parameter IDs to parameter states.
Parameter IDs are the order of parameters in ``optim.param_groups()`` across
all the groups. This API also allows user to pass ``optim_input`` for the
mapping between parameters and parameter IDs. Using ``optim_input`` is being
deprecated.

If the optimizer is a ``NamedOptimizer``, the optimizer state_dict does not
contain parameter IDs mapping but a mapping from parameter FQNs to parameter
states. This API finds the mapping from FQNs to parameters if the optimizer
is a ``NamedOptimizer``.

If ``use_orig_params`` is True, each rank will have all FSDP-managed
parameters but some of these parameters may be empty due to the sharding.
For a regular optim.Optimizer, states for those empty parameters will
not be initialized. So, when aggregating the FQNs across ranks, no assert
will be raised on a rank even if it does not have all the states -- it is
valid and FSDP knows how to aggregate them. However, FSDP has to ignore
handling those parameters that are not managed by FSDP and do not exist on
the local rank -- those are managed by other parallelisms and FSDP does not
know how to handle/aggregate them.

Args:
    model (nn.Module): Root module (which may or may not be a
        :class:`FullyShardedDataParallel` instance) whose parameters
        were passed into the optimizer ``optim``.
    optim (torch.optim.Optimizer): Optimizer for ``model`` 's
        parameters.
    rank0_only (bool): If ``True``, saves the populated :class:`dict`
        only on rank 0; if ``False``, saves it on all ranks. (Default:
        ``True``)
    shard_state (bool): If ``True``, shard and distribute all
        non-zero-dimension states.

Returns:
    Dict[str, Any]: A :class:`dict` containing the optimizer state for
    ``model`` 's original unflattened parameters and including keys
    "state" and "param_groups" following the convention of
    :meth:`torch.optim.Optimizer.state_dict`. If ``rank0_only=False``,
    then nonzero ranks return an empty :class:`dict`.
r   preprocessingNpreprocessing_with_comm)rd  state_convertingr+   zFound a optim state, aK  , that FSDP cannot process. FSDP will directly copy everything to the returned state_dict. In most cases, this is a user-defined state that is not associated with any particular parameter. Another possible case is this state is managed by TorchRec. Otherwise, there may  be a mismatched assumption of optim_state_dict of this mode.r   z$FSDP _optim_state_dict() profiling: )$r   r   r   enter_contextr   r   ALLr    traversal_utils_get_fsdp_handlesr   r   r   r"  rt  r   r5   r   r7   r6   r   r   r<  rR  r   rl  r  r  r   r   r   r   r   rq  closer   )r   r   r   r  r   r^   rs   r  r   r_   cmr]   r   r%  r  rP  r   rg  rW  
convert_fnr  fsdp_osdflat_param_fqnsrh   r   s                            r<   _optim_state_dictr  :  so   D 	B^++N,?,?,C,CDE)/*K*KE*RSnHe 4 9H[G				0*5125901AB!sCx",,./ % 8K,"4EV	
 "=U!C 
1" 
		 9	: )"&
	
 ( 
; 
		 2	3  ,0 	
 $ ("W%
 
4$ 	 '8H+2245O&w/557
U. /!$$ 	#C5 )L L	
 $s% 8( ))#:0-$
  HHJ!!"HIOe 
1	0" 
;	: 
4	3s%   A>I3J%J3
J
J
J%c                 x    S nS n0 n[        U UU[        U 5       VVs/ s H  u  pEUPM	     snnU5      $ s  snnf )a  
Construct the mapping from a param's fqn to its corresponding ``FSDPParamInfo``
if the param is managed by FSDP. Shared parameters, or original parameters that
are shared across multiple nn.Modules, are required to belong to one and only
one FSDP instance and thus correspond to one ``FlatParameter``. Within the one
``FlatParameter``, ``FlatParameter._fqns`` only stores the first FQN of a shared
parameter. Thus, the keys in the mapping are guaranteed to map to unique parameters.
c                    [        U 5      nUc  g [        X@5        [        X@5      nU(       d  g UR                  n[	        XE0 / 5      n[        UR                  5       H  u  p[        X-   5      n
X;   a"  X:   R                  R                  UL d   U
5       eXsU
'   XR                  U
'   UR                  c  MZ  UR                  R                  UR                  U   R                  5        M     g r@   )r   r   r   rt   r)   r2  _fqnsr   r,   r-   _paramsr.   r   requires_grad)r@  rA  rB  fqn_to_param_infor   r,   rt   r[   r  	local_fqnr   s              r<   rD  ._get_fqn_to_fsdp_param_info.<locals>.module_fn  s    CFK
:&
3&&
'
BC (
(8(89NC#F$67C'(-44??:MRsRM%4c"14))#.!!-33::&&s+99 :r;   c                     U $ r@   r/   )r  s    r<   rG  ._get_fqn_to_fsdp_param_info.<locals>.return_fn  rI  r;   rJ  )r   rD  rG  r  r   r   s         r<   r   r     sP    .! 35 <UCDCCD  	ErL  state_dict_settingsc                     [        U SS 5      (       a?  UR                  nU[        R                  :X  a  [	        SSS5      eSUR
                  l        g g )Nr   z'Found state_dict_type LOCAL_STATE_DICT.z3DeviceMesh is not compatible with LOCAL_STATE_DICT.zKPlease set state_dict_type to SHARDED_STATE_DICT to get DTensor state_dict.T)r   state_dict_typer#   LOCAL_STATE_DICTr   optim_state_dict_configr   )r   r  r  s      r<   _set_optim_use_dtensorr    sZ     z>400-==m<<<9E]  HL77D 1r;   )FNFN)Fr@   )NFNN)T)FT){r   r  loggingr   collections.abcr   r   r   
contextlibr   dataclassesr   r   	itertoolsr	   typingr
   r   r   r   r   r   r   rM   torch.distributeddistributedr   'torch.distributed.fsdp._traversal_utilsfsdp_traversal_utilsr  torch.nnr   #torch.distributed._state_dict_utilsr   "torch.distributed.distributed_c10dr   $torch.distributed.fsdp._common_utilsr   r   r   r   r   r   r   #torch.distributed.fsdp._debug_utilsr   "torch.distributed.fsdp._flat_paramr   r   'torch.distributed.fsdp._fsdp_extensionsr   r   %torch.distributed.fsdp._runtime_utilsr   r    torch.distributed.fsdp.apir!   r"   r#   torch.distributed.tensorr$   r%   torch.utils._pytreer&   'torch.distributed._shard.sharded_tensorr'   	getLoggerr0   r  r)   r5   r6   rZ   rD   rF   rP   rV   r9   r8   ri   ro   ra   rb   ProcessGroupr   r   r   Moduler   	Optimizerr   r   rN   rT   r   r   r   r   r0  r7   r<  r"  rR  r$  r#  rc  rl  rq  rt  rv  r~  rS   r  r  r  r  r  r  no_gradr  r   r  r/   r;   r<   <module>r     s       8 8   (  W W W    A A  B E   ? M 
 8 - E 
		8	$ $ $ $T#s(^ sCx0I  C C C2
  Z 4"438n4 4 	4
 4 
$sCx.4n/3 /4 /C"C38nC CLJ"J"J J 
$sCx.	JZc3h D%%& 
#s(^	&#&/78I8I/J.#"#	# c3h# 
#s(^	#R "-1)-L)38nL)99L) L) EKK))*	L)
 L) D%%&L) 
#s(^L)^F"F3S#X./F S	F 
#s(^	FRQQ%,,'Q S	Q "%**-	Q
 Q \\Qh3H3H5<<(3H S	3H \\	3Hl$$c$ S	$ 		$d  %M,c3hM,99M, ;;  M, c3h R\\"$	
	M, M, M, 
#s(^M,n 	H.99H.c3h R\\"$	
H. 
#r||
H.V"%((// "d=#;M6N "N "&$=A<@0;;  0BII0 0 DtCy!89:	0
  ]C%7 890 
%S/2<<
'(0j "&$=A<@N;;  NBIIN N DtCy!89:	N
  ]C%7 89N 
",,c3h
'(N2 	N99Nc3h R\\"$	
N 
",,
N'&n-'&"&~uS#X'F"G'& U38_bll:;'& D%%&	'&
 
'&` <>38n<>D%%&<> U38_bll:;<> d3i/0	<>
 !m!34<> <> 4neCHo&E!FFG<>~S#XU38_bll:; d3i/0 
$sCx.	, $sCx.  T  &      ""sCx." 
$sI~
"JP "P d3	>23P  sCx.P  T#s(^+,	P 
 8EKK $sD%,,1G,H'H"IIJP fE+"E+T#s(^+,E+ E+ 	E+
 E+ E+ 
E+Pe"ed3	>23e sCx.e 	e
 e e 
#tCH~
eP/"/sCx./ / 	/
 / 
#s(^/r X~.X"&~uS#X'F"GX !m!34X 5c?C/0	X
 X X X 
#s(^XD 6~.6"&~uS#X'F"G6 !m!346 5c?C/0	6
 6 6 6 
#s(^6r  "Y99Y;;  Y 38nY c3h R\\"$	
	Y Y Y D%%&Y Y Y Y  
#s(^!Y Yx.ryy .T#}:L5M .b LL*L 
L Lr;   