
    sh2                        S SK Jr  S SKJr  S SKrS SKJs  Js  Jr	  S SK
Jr  S SKJr  S SKJr  S SKJrJrJrJrJr  S\S	\S
\\   S\\\S4   \\S4   4   4S jrS\R2                  S	\S
\\   S\\\   \\   4   4S jrS\R8                  R:                  S\\   S\4S jrS\S	\S
\\   S\\S4   4S jr S\RB                  4S jr"g)    )Sequence)castN)	ShapeType)
DeviceMesh)DTensorSpec)_StridedShardPartial	Placement	ReplicateShardglobal_shapemesh
placementsreturn.c                    UR                  5       nUc  g[        U 5      nS/[        U 5      -  n[        [        U 5      5       Vs/ s H  nS/UR                  -  PM     nnS/[        U 5      -  n[        U5       H  u  pUR                  U	5      n[        U
[        5      (       d  M-  U
R                  nS/[        U 5      -  nU[        U5      :  d   SU S[        U5       35       eU
R                  XL   UX9   SS9u  pXU'   XU'   X\   X   ::  a  X   X\'   OX\==   X   -  ss'   X==   U-  ss'   M     [        S U 5       5      nU(       GaR  S	/[        U 5      -  nS	/[        U 5      -  n[        U5       H  u  pUR                  U	5      n[        U
[        5      (       d  M-  U
R                  nUU   (       a  [        S
U
 SU	 SU S35      eUU   (       a  SUU'   [        U
[        5      (       a   SUU'   X   U
R                  U-  -  X|   U	'   M  X==   U-  ss'   X   X|   U	'   M     [        U5       VVVVs/ s H4  u  nn[        [!        UU5       VVs/ s H  u  nnUU-  PM     snn5      PM6     nnnnn[!        UU5       VVs/ s H  u  nnUU-  PM     nnn[#        U5      [#        U5      4$ s  snf s  snnf s  snnnnf s  snnf )a*  
    Compute the local tensor shape and the global offsets into the original tensor
    of a DTensor on its current global rank. This is useful for checkpointing purposes.

    Example (2 hosts with 4 GPUs each):
    # Below is a DeviceMesh with mesh_shape of (2, 4)
    mesh = DeviceMesh(
        device_type="cuda",
        mesh=[
            [0, 1, 2, 3],
            [4, 5, 6, 7],
        ],
    )

    Let's say we distribute a global_tensor of shape (8, 4) over the above DeviceMesh
    with placements of [Shard(0), Shard(0)].
    The local shape and global offset will be as follows:
    rank0 -- local_shape:[1, 4], global_offset:[0, 0]
    rank1 -- local_shape:[1, 4], global_offset:[1, 0]
    rank2 -- local_shape:[1, 4], global_offset:[2, 0]
    rank3 -- local_shape:[1, 4], global_offset:[3, 0]
    rank4 -- local_shape:[1, 4], global_offset:[4, 0]
    rank5 -- local_shape:[1, 4], global_offset:[5, 0]
    rank6 -- local_shape:[1, 4], global_offset:[6, 0]
    rank7 -- local_shape:[1, 4], global_offset:[7, 0]

    Let's say we distribute a global_tensor of shape (2,) over the above DeviceMesh
    with placements of [Shard(0)]. Not all ranks will have a non-empty local tensor.
    The local shape and global offset will be as follows:
    rank0 -- local_shape:[1,], global_offset:[0,]
    rank1 -- local_shape:[1,], global_offset:[1,]
    rank2 -- local_shape:[0,], global_offset:[2,]
    rank3 -- local_shape:[0,], global_offset:[2,]
    rank4 -- local_shape:[0,], global_offset:[2,]
    rank5 -- local_shape:[0,], global_offset:[2,]
    rank6 -- local_shape:[0,], global_offset:[2,]
    rank7 -- local_shape:[0,], global_offset:[2,]
    """
    my_coordinate = mesh.get_coordinate()

    if my_coordinate is None:
        # if this rank is not participating in the mesh, return empty shape/offset
        return ((0,), ())

    local_shape = list(global_shape)
    global_offset = [0] * len(global_shape)
    # indexed by [shard_dim][mesh_dim]; records the stride of the shard index
    # contributed by each mesh dim when strided sharding is present
    shard_idx_stride_by_mesh_dim = [
        [0] * mesh.ndim for _ in range(len(global_shape))
    ]
    num_shards_by_tensor_dim = [1] * len(global_shape)

    for idx, placement in enumerate(placements):
        mesh_dim_size = mesh.size(idx)
        if isinstance(placement, Shard):
            shard_dim = placement.dim
            local_offset = [0] * len(global_shape)
            assert shard_dim < len(local_shape), (
                f"Sharding dim {shard_dim} greater than tensor ndim {len(local_shape)}"
            )
            shard_size, shard_offset = placement._local_shard_size_on_dim(
                local_shape[shard_dim],
                mesh_dim_size,
                my_coordinate[idx],
                return_offset=True,
            )

            local_shape[shard_dim] = shard_size
            local_offset[shard_dim] = shard_offset

            # On a given dimension, if the local offset is smaller than the
            # current global offset, the dimension was already sharded on an
            # earlier mesh dim, so the new offset accumulates on top of it;
            # otherwise the local offset becomes the global offset.
            if global_offset[shard_dim] <= local_offset[shard_dim]:
                global_offset[shard_dim] = local_offset[shard_dim]
            else:
                global_offset[shard_dim] += local_offset[shard_dim]

            num_shards_by_tensor_dim[shard_dim] *= mesh_dim_size

    # NOTE: the offset computation above assumes contiguous sharding. When
    # _StridedShard appears in the placements, the shard index of this rank on
    # each tensor dim must instead be reconstructed from the mesh coordinate,
    # using ``split_factor`` to recover the ordering. This is only correct if
    # no further sharding occurs after the strided part has ended.
    strided_sharding = any(isinstance(p, _StridedShard) for p in placements)
    if strided_sharding:
        strided_part_seen = [False] * len(global_shape)
        strided_part_end = [False] * len(global_shape)
        for idx, placement in enumerate(placements):
            mesh_dim_size = mesh.size(idx)
            if isinstance(placement, Shard):
                shard_dim = placement.dim
                if strided_part_end[shard_dim]:
                    raise NotImplementedError(
                        f"Strided sharding does not allow Shard() to appear after "
                        f"the strided part has ended. {placement} at idx {idx} in "
                        f"{placements} violates this assumption."
                    )
                if strided_part_seen[shard_dim]:
                    strided_part_end[shard_dim] = True

                if isinstance(placement, _StridedShard):
                    strided_part_seen[shard_dim] = True
                    shard_idx_stride_by_mesh_dim[shard_dim][idx] = (
                        num_shards_by_tensor_dim[shard_dim]
                        // (placement.split_factor * mesh_dim_size)
                    )
                else:
                    num_shards_by_tensor_dim[shard_dim] //= mesh_dim_size
                    shard_idx_stride_by_mesh_dim[shard_dim][idx] = (
                        num_shards_by_tensor_dim[shard_dim]
                    )

        shard_idx = [
            sum(x * y for x, y in zip(shard_idx_stride, my_coordinate))
            for shard_idx_stride in shard_idx_stride_by_mesh_dim
        ]
        global_offset = [x * y for x, y in zip(local_shape, shard_idx)]

    return tuple(local_shape), tuple(global_offset)


def compute_global_tensor_info(
    tensor: torch.Tensor, mesh: DeviceMesh, placements: Sequence[Placement]
) -> tuple[list[int], list[int]]:
    """
    Compute the global size and stride of a DTensor from the given local tensor.
    The local size is multiplied by `world_size` per sharding dim.
    The local stride is multiplied by `world_size` per sharding dim, as long as
    the dimension is outside the sharding dim.

    For example, if we have a local tensor with size (4, 8, 2) and stride (16, 1, 8),
    and the DTensor placements are [Shard(2)] with world_size 2, then the global
    size is (4, 8, 4) and the stride is (16 * 2, 1, 8).

    Args:
        tensor (:class:`torch.Tensor`):
            Local tensor which DTensor will be constructed from.
        mesh (:class:`DeviceMesh`):
            Object which describes the mesh topology
            of devices for the DTensor.
        placements (Sequence[:class:`Placement`]):
            The attribute of the DTensor that describes its layout
            on the mesh topology.

    Return:
        tensor_shape: A list of ints which specifies the size of the DTensor built
            on top of the local tensor.
        tensor_stride: A list of ints which specifies the stride of the DTensor.
    """
    tensor_shape = list(tensor.size())
    tensor_stride = list(tensor.stride())
    for idx, placement in enumerate(placements):
        mesh_dim_size = mesh.size(idx)
        if placement.is_shard():
            shard_placement = cast(Shard, placement)
            if shard_placement.dim < 0:
                raise AssertionError(
                    "Shard placements should have negative dims normalized in "
                    f"the user-facing APIs: {shard_placement}"
                )
            shard_dim = shard_placement.dim

            assert shard_dim < tensor.ndim, (
                f"Sharding dim {shard_dim} greater than tensor ndim {tensor.ndim} "
                f"for placement number {idx}."
            )

            local_dim_size = tensor_shape[shard_dim]
            tensor_shape[shard_dim] = local_dim_size * mesh_dim_size

            # recover the global stride by scaling up every stride that is
            # greater than or equal to the stride of the sharded dimension
            for i in range(len(tensor_stride)):
                if i != shard_dim and tensor_stride[i] >= tensor_stride[shard_dim]:
                    # rescale the stride by the shard size
                    tensor_stride[i] = tensor_stride[i] * mesh_dim_size
        elif not isinstance(placement, (Replicate, Partial)):
            raise RuntimeError(f"placement type {type(placement)} not supported!")
    return tensor_shape, tensor_stride
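

# A minimal sketch of the docstring example, assuming a 1-D mesh of 2 ranks
# (e.g. `torchrun --nproc-per-node 2`). The `_demo_global_tensor_info` helper
# is hypothetical, not part of this module's API.
def _demo_global_tensor_info() -> None:
    mesh = DeviceMesh("cpu", [0, 1])
    # a (4, 8, 2) local tensor laid out with strides (16, 1, 8)
    local = torch.empty_strided((4, 8, 2), (16, 1, 8))
    shape, stride = compute_global_tensor_info(local, mesh, [Shard(2)])
    assert shape == [4, 8, 4] and stride == [32, 1, 8]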
        [        45      (       d  MS  [        U5      S:  d  Md  [        US   [        R                  [        45      (       d  M  US   R                  s  $    [        SU  S35      e)zy
    Find the device mesh object from args.
    It raises a ValueError if no mesh is found.
    NOTE: we can optimize this search if needed
    """
    for arg in args:
        if isinstance(arg, (dtensor.DTensor, DTensorSpec)):
            return arg.device_mesh
        elif (
            isinstance(arg, (list, tuple))
            and len(arg) > 0
            and isinstance(arg[0], (dtensor.DTensor, DTensorSpec))
        ):
            return arg[0].device_mesh

    raise ValueError(f"Cannot find device mesh from args for op : {op_call}.")


def compute_local_stride(
    global_stride: ShapeType, mesh: DeviceMesh, placements: Sequence[Placement]
) -> tuple[int, ...]:
    """
    Compute the stride of a local tensor shard, given the global stride of the DTensor.
    NOTE: Currently this function assumes the DTensor is evenly shardable.
    """
    stride_divisors = [1] * len(global_stride)
    for mesh_idx, p in enumerate(placements):
        if p.is_shard():
            i = cast(Shard, p).dim
            # tensor dimension i is sharded on mesh dimension mesh_idx, so we
            # need to divide all the strides larger than stride[i] by the size
            # of that mesh dimension
            for j in range(len(global_stride)):
                if global_stride[j] > global_stride[i]:
                    stride_divisors[j] *= mesh.size(mesh_idx)
    return tuple(
        global_stride[i] // stride_divisors[i] for i in range(len(global_stride))
    )
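

# A small sketch, assuming a 1-D mesh of 2 ranks: the global strides (32, 1, 8)
# with [Shard(2)] map back to the local strides (16, 1, 8), inverting the
# example above. The `_demo_local_stride` helper is hypothetical and needs an
# initialized process group (e.g. `torchrun --nproc-per-node 2`).
def _demo_local_stride() -> None:
    mesh = DeviceMesh("cpu", [0, 1])
    assert compute_local_stride((32, 1, 8), mesh, [Shard(2)]) == (16, 1, 8)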
        5      (       a  [        U S   5      nO[        U 5      n[        R                  " U5      $ )z
    Unify variable types of the size argument to torch.Size.
    Acceptable types include:
        int, Sequence[int], Tuple[int], Tuple[Sequence[int]],
        or torch.Size
    """
    if isinstance(size, torch.Size):
        return size

    if isinstance(size, int):
        torch_size = [size]
    elif len(size) == 1 and isinstance(size[0], Sequence):
        # unwrap a singleton tuple that wraps the full sequence of dims
        torch_size = list(size[0])
    else:
        torch_size = list(size)
    return torch.Size(torch_size)