
    shd5                        S SK r S SKJrJrJr  S SKrS SKJr  S SK	Js  J
s  Jr  S SKJs  Jr  S SKJrJrJrJr  S SK	Jr  S SKJr  S SKJr  S SKJr  S SKJr  S S	KJr  S S
K J!r!  S SK"J#r#J$r$J%r%Jr&  S SK'J(r(J)r)  S/r*S\$S\+\RX                  \RX                  4   4S jr-S\$S\.S\+\RX                  \RX                  4   4S jr/S\$S\+\RX                  \RX                  4   4S jr0S\$S\.S\4S jr1S\$S\Rd                  S\4S jr3S\$S\Rd                  4S jr4S\Rj                  S\Rl                  S\.S\Rj                  4S jr7S\Rl                  S\.S\.S\.S\Rd                  S\Rl                  4S  jr8S\Rl                  S\.S!\#S\$4S" jr9S\Rl                  S\+\Rl                  \:\   4   4S# jr;S\$S$\\#   S\Rl                  4S% jr< " S& S\5      r=g)'    N)AnycastOptional)ShardShardedTensorShardedTensorMetadataTensorProperties)ShardMetadata)ChunkShardingSpec)_mesh_resources)_set_fsdp_flattened)FSDPExtensions)_create_chunk_sharded_tensor)_remote_device)
DeviceMeshDTensor	Replicater   )_flatten_tensor_unflatten_tensorDTensorExtensionstensorreturnc                    U R                   nUR                  S:X  d   S5       eU R                  S   nS/[        U R	                  5       5      -  nUR	                  SS9nU R                  S   R                  5       (       a2  [        [        U5      R                  nU R	                  U5      U-  nXcU'   [        R                  " U5      U R                  R	                  5       4$ )N   &Only 1D DeviceMeshes currently handledr   )mesh_dim)device_meshndim
placementslensizeis_shardr   DSharddimtorchSize_local_tensor)r   r   	placementoffsets
num_chunks	shard_dim
chunk_sizes          z/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/distributed/tensor/parallel/fsdp.py_get_boxr.       s    $$Kq J"JJ !!!$IcC&&G!!1!-J$$&&+//	[[+z9
'	JJw!5!5!:!:!<==    idxc                 |    [        U 5      u  p#[        R                  " U Vs/ s H  oDU-  PM	     sn5      U4$ s  snf N)r.   r%   r&   )r   r0   r)   r!   vals        r-   _get_box_forr4   0   s6    V$MGJJW5Wcc	W56==5s   9c                 `    U R                   nUR                  5       nUc   e[        XS   5      $ )Nr   )r   get_coordinater4   )r   r   coords      r-   _get_local_boxr8   5   s6    $$K&&(Ea))r/   dtcurrent_rankc                     U R                   nUR                  S:X  d   S5       e[        U 5      u  p4[        [	        U5      [	        U5      SU SU R
                  R                   3S9$ )Nr   r   rank:/shard_offsetsshard_sizesr(   )r   r   r8   r
   listr'   device)r9   r:   meshr)   sizess        r-   _create_shard_md_from_dtrE   <   se    >>D99>CCC>#B'NG7mK,q)9)9)@)@(AB r/   dt_pgc                 
   / n[         R                  " U5      nUS:  a  SOSnU R                  S   R                  5       (       a  UR	                  5       nOSn[        U5       H^  n[        X5      u  pxUR                  [        [        U5      [        U5      SUS:  a  UOU SU R                  R                   3S95        M`     [        UU R	                  5       [        U R                  U R                  U R                   S9S9$ )Nr   r   r<   r=   r>   )dtypelayoutrequires_grad)shards_metadatar!   tensor_properties)distget_rankr   r"   r!   ranger4   appendr
   rA   r'   rB   r   r	   rH   rI   rJ   )	r9   rF   	shards_mdmy_rankscapegoat_rankshard_countir)   rD   s	            r-   !_create_sharded_tensor_md_from_dtrV   H   s     ImmE"G!A+Q1N	}}Q  ""jjl;%b,"7m Ka!eNA2CSCSCZCZB[\		
   !!WWY*((99**
	 	r/   c                 h    U R                   nUR                  S:X  d   S5       eUR                  5       $ )Nr   r   )r   r   	get_group)r9   rC   s     r-   
_get_dt_pgrY   o   s.    >>D99>CCC>>>r/   specrankc                 @   [        U [        5      (       d  U $ SnU R                   HK  n[        [        U5      nUR                  5       U:X  d  M)  UR                  5       UR                  :w  d  MI  Sn  O   U(       a  [        R                  " U 5      n [        U R                  5       Hs  u  pV[        [        U5      nUR                  5       U:X  d  M+  UR                  5       UR                  :w  d  MK  [	        SU SUR                   35      U R                  U'   Mu     U $ )z
Rewrite ``spec`` to match the device of ``tensor``.

FSDP.sharded_optim_state_dict sneakly ships optimizer state to CPU so if the original ShardingSpec
produces CUDA metadata, ST construction bombs.
FTr<   r=   )

isinstancer   r   r   r   r[   rB   copydeepcopy	enumerate)rZ   r   r[   rewriteprU   r(   s          r-   _rewrite_spec_if_neededrc   u   s     d-.. G__#668t
fmm ;G	 
 }}T"%doo6LA^Y7I~~4'I,<,<,>&--,O%3eD66==/4R%S" 7
 Kr/   
world_sizenum_devices_per_nodepgc           	         [        U 5      [        L a  [        U R                  5       5      S:X  d   eU R	                  5       n[        UUUUU5      nU R                  5       S   n[        U[        R                  " UR                  5      5      /n[        R                  " U R                  5       5      n	SU	R                  l        [        R                  " UU	U R                  SS9n
U
$ [        U 5      [        L a  U R                  nUR                   S:X  d   S5       eU R"                  n[        UUU[$        R&                  R)                  5       U5      n[+        U 5      n[        U[-        U [.        R0                  " U5      5      5      /n[3        X5      n	SU	R                  l        [        R                  " UU	USS9n
U
$ [        U UUUU5      $ )Nr   r   F)sharded_tensor_metadataprocess_group
init_rrefsr   )typer   r    local_shardslocal_tensorr   r   r^   r_   metadatarL   rJ   +_init_from_local_shards_and_global_metadata_process_groupr   r   r   r'   r%   acceleratordevice_countrY   rE   rM   rN   rV   )r   r[   rd   re   rf   inner_paraminner_stouter_local_shardshardsst_metast_outerr   rF   s                r-   _chunk_tensorry      s    F|}$6&&()Q...))+/ 
 #//1!4(DMM*;*D*DEF
 -- 1227!!/ LL$+ //	
 	f	 ((1$N&NN$**/**,
 6" (4VT]]5=QRS
 4FB27!!/ LL$+	
 + 
 	
r/   r   c                    [         R                  " U5      nUc  [        S5      eUR                  S:  a  [        SUR                   S3S5      eU R	                  5       R                  5       n [        U [        R                  5      (       a  [        U [        5      (       d  [        UR                  5       Vs/ s H  n[        5       PM     nn[        UR                  5       Vs/ s H  n[        5       PM     nn[        S5      US'   [        R                  " XUSS9R                  UUS	9$ U R                  nUS   nU R!                  5       n [        UR                  5       Vs/ s H  n[        5       PM     nnXS
'   [        UR                  5       V	s/ s H  n	[        5       PM     nn	[        S5      US'   XS
'   [        R                  " XUSS9R                  UUS	9$ s  snf s  snf s  snf s  sn	f )z
Shard a tensor to chunks along the first dimension.

The local rank will gets its corresponding chunk as the local tensor to create a DTensor.
z4No parent device_mesh is found for FSDP device_mesh.   z!Found parent device_mesh of ndim=,zbut meshes must be at least 2D.r   F)	run_checkr   r   )r   get_root_meshRuntimeErrorr   detachcloner]   r%   Tensorr   rO   r   r#   
from_localredistributer   to_local)
r   r[   r   	root_mesh_replicate_placementsshard_placementstp_placementstp_placementrU   s
             r-   _chunk_dtensorr      s     --k:IQRR~~/	/?qA-
 	
 ]]_""$F
 &%,,''
670K0K 6;9>>5JK5J	5JK16y~~1FG1FAIK1FG$Qi!!3u

,!'  
	
 ))$Q'" 6;9>>5JK5J	5JK#/R 16y~~1FG1FAIK1FG%ay+!!3u

,!'  
	
9  LG*  LGs   8G"%G'%G,G1c                    [        [        U 5      R                  5       n[        U5      S:X  a@  [	        US   R
                  5      [        L a!  US   R
                  nUR                  5       nUn U [        U5      S:  a  U4$ / 4$ )Nr   r   )r   r   rl   r    rk   r   )r   rv   inner_tensors      r-   _pre_load_state_dictr     sz     -(557F
6{aD!1!12mCay''**,c&kAoF66266r/   parent_meshc                     XR                   :X  d   e[        [        R                  " U R                  5      5      n[        S[        U5      S-
  5       H  n[        5       X#'   M     U R                  U R                   US9n U R                  5       $ )zGAll gather a DTensor in its FSDP dimension and return the local tensor.r   r   r~   )
r   rA   r^   r_   r   rO   r    r   r   r   )r   r   r   rU   s       r-   _all_gather_dtensorr   )  s    
 ,,,,,dmmF$5$567J 1c*o)*!
 +  && ! F
 ??r/   c                     ^  \ rS rSrSrSU 4S jjrS\R                  S\\R                  \	\
   4   4S jrS\R                  S\
S\R                  4S jr SS\R                  S	\S
\S\S\R                  S\	\R                      S\R                  4S jjrS\R                  S	\S\S\R                  4S jrS\R                  S\\R                  \\   4   4S jrS\S\	\   S\R                  4S jrSrU =r$ )r   i=  z
DTensorExtension is the TensorFlattener extension needed for 2D FSDP + TP.

This is the implementation for FSDPExtensions defined in
https://github.com/pytorch/pytorch/blob/main/torch/distributed/fsdp/_fsdp_extensions.py
r   c                    > [         TU ]  5         S U l        Xl        [        R
                  R                  U R                  5      U l        g r2   )super__init__compute_streamdevice_handler%   _dynamodisablepost_unflatten_transform)selfr   	__class__s     r-   r   DTensorExtensions.__init__E  s>    "* ).(=(=)))
%r/   r   c                     [        U5      $ r2   )r   r   r   s     r-   pre_flatten_transform'DTensorExtensions.pre_flatten_transformO  s     v&&r/   param_extensionc                 "   U R                   =(       d    U R                  R                  5       nU R                  R                  U5         [	        UUU R                  U R                   S9n[        U5        UsS S S 5        $ ! , (       d  f       g = f)N)r   r   )r   r   current_streamstreamr   r   )r   r   r   r   results        r-   r   *DTensorExtensions.post_unflatten_transformU  st     $$K(:(:(I(I(K&&v. '"00#22	F  ' /..s   	-B  
Br[   rd   re   rf   rB   c                     [        XX4U5      $ r2   )ry   )r   r   r[   rd   re   rf   rB   s          r-   chunk_tensorDTensorExtensions.chunk_tensorh  s     V:RPPr/   r   c                     [        XU5      $ r2   )r   )r   r   r[   r   s       r-   chunk_dtensorDTensorExtensions.chunk_dtensors  s     fK88r/   c                     [        U5      $ r2   )r   r   s     r-   pre_load_state_dict_transform/DTensorExtensions.pre_load_state_dict_transform{  s     $F++r/   r   c                     [        X5      $ r2   )r   )r   r   r   s      r-   all_gather_dtensor$DTensorExtensions.all_gather_dtensor  s    
 #677r/   )r   r   r   )r   Nr2   )__name__
__module____qualname____firstlineno____doc__r   r%   r   tupler   r   r   r   intrM   ProcessGrouprB   r   r   r   rA   r   r   r   r   __static_attributes____classcell__)r   s   @r-   r   r   =  s[   
'' 
u||Xc]*	+'ll58	4 *.	Q	Q 	Q 		Q
 "	Q 	Q &	Q 
	Q99 9  	9
 
9,, 
u||T%[(	),88 j)8 
	8 8r/   )>r^   typingr   r   r   r%   torch.distributeddistributedrM   &torch.distributed._shard.sharding_spec_shardsharding_spec
shard_spec"torch.distributed.distributed_c10ddistributed_c10dc10d'torch.distributed._shard.sharded_tensorr   r   r   r	   r
   :torch.distributed._shard.sharding_spec.chunk_sharding_specr   torch.distributed.device_meshr   $torch.distributed.fsdp._common_utilsr   'torch.distributed.fsdp._fsdp_extensionsr   #torch.distributed.fsdp._shard_utilsr   torch.distributed.remote_devicer   torch.distributed.tensorr   r   r   r#   6torch.distributed.tensor.parallel._data_parallel_utilsr   r   __all__r   r&   r.   r   r4   r8   rE   r   rV   rY   ShardingSpecr   rc   ry   r   rA   r   r   r    r/   r-   <module>r      s;    & &    ; ; 1 1  A X 9 D B L : T T 
>W >uzz5::'=!> > > >s >uUZZ5K/L >
*7 *uUZZ-C'D *	 	 	 	$$))$$N7 t00 

!
!+0<<?B:G
LLG

G
 G
 	G

 	G
 \\G
T>
LL>

>
 >
 	>
B	7LL	7
5<<e$%	7*% \\(I8 I8r/   