
    shy                        S SK r S SKrS SKJr  S SKJrJrJrJr  S SK	J
r
  S SKrS SKJs  Js  Jr  S SKJs  Js  Jr  S SKJr  S SKJrJr  S SKJrJr  S SKJrJr  S SKJ r J!r!  S S	K"J#r#J$r$J%r%  S S
K&J'r'J(r(J)r)J*r*  / SQr+\RX                  RZ                  r- " S S\R\                  R^                  5      r0 " S S\R\                  R^                  5      r1 " S S\Rd                  5      r3  S/S S.S\Rd                  S\\   S\\\(      S\\4   S\34
S jjjr5\
" S5       S0S\Rd                  S\\*   S\\   SS4S jj5       r6    S1S\Rn                  S\\   S\\\8\Rn                  \/S4      S\\\Rn                  \\/S4      S\\\Rn                  \\/S4      S\Rn                  4S  jjr9  S/S!\Rt                  S\\   S\\\(      S\34S" jjr;S\Rx                  S#SSS$.S%\\Rz                     S&\R|                  S'\?S\\   S\\\(      S\34S( jjr@S\Rx                  S#SSS$.S%\\Rz                     S&\R|                  S'\?S\\   S\\\(      S\34S) jjrAS\Rx                  S#SSS$.S%\\Rz                     S&\R|                  S'\?S\\   S\\\(      S\34S* jjrBS#S\Rx                  SSS+.S'\?S%\\Rz                     S&\R|                  S\\   S\\\(      S\34S, jjrCS#S\Rx                  SSS+.S'\?S%\\Rz                     S&\R|                  S\\   S\\\(      S\34S- jjrDS#S\Rx                  SSS+.S'\?S%\\Rz                     S&\R|                  S\\   S\\\(      S\34S. jjrEg)2    N)Sequence)AnyCallablecastOptional)
deprecated)_mesh_resources
DeviceMesh)check_tensor_metamesh_broadcast)DTensorSpec
TensorMeta)Redistributeredistribute_local_tensor)compute_global_tensor_info%compute_local_shape_and_global_offsetnormalize_to_torch_size)Partial	Placement	ReplicateShard)	DTensordistribute_tensordistribute_moduleonesemptyfullrandrandnzerosc                   h    \ rS rSr\SSS\\\      4S j5       r\S\	R                  4S j5       rSrg	)
_ToTorchTensorE   inputr   grad_placementsc                 j    UR                   U l        X l        UR                  nUR	                  U5      $ N)_specdtensor_specr%   _local_tensorview_as)ctxr$   r%   local_tensors       q/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/distributed/tensor/_api.pyforward_ToTorchTensor.forwardF   s4     !;;-**
 ##L11    grad_outputc           
      P   U R                   nUR                  nU R                  nUR                  n[	        XUR
                  5      u  pg[        U5      nU=(       d    UR
                  n[        UU[        UR                  UUR                  S9S9n[        UUUR                  S9S 4$ )Nshapestridedtypetensor_metarequires_grad)r)   meshr%   r9   r   
placementstupler   r   r5   r7   r   r;   )	r,   r2   r)   r<   r%   dtensor_meta_tensor_stride	grad_specs	            r.   backward_ToTorchTensor.backwardU   s    ''  --#//5|66
 m,)D\-D-D""(($"((
	 )77
 
 	
r1    N)__name__
__module____qualname____firstlineno__staticmethodr   r   r   r/   torchTensorrC   __static_attributes__rE   r1   r.   r"   r"   E   sO    22 "(9"562 2 
5<< 
 
r1   r"   c                       \ rS rSr\  SS\R                  S\S\\	S4   S\
S\\R                     S	\\\S4      S
S4S jj5       r\SS j5       rSrg)_FromTorchTensoru   Nr$   device_meshr=   .	run_checkr5   r6   returnr   c           	         X0l         X l        U(       a
  U(       a  XVpOPU(       d6  U(       d/  [        XU5      u  p[        R                  " U	5      [        U
5      pO[        SU SU S3S5      eUR                  5       c  UR                  SUR                  S9nOjU(       ac  U(       + =(       a    U(       + n[        XS9  [        U5       H6  u  pUR                  5       (       d  M  UR                  5       n[        XUS9  M8     [        UU[!        UUUR"                  5      S	9n[%        UR'                  U5      UUR                  S9nU$ )
NzFound shape:z	, stride:.z3Please pass both shape and stride at the same time.r   r:   )check_shape_stride)mesh_dimr8   )previous_placementprevious_device_meshr   rK   Sizer>   RuntimeErrorget_coordinate	new_emptyr;   r   	enumerateis_replicate
contiguousr   r   r   r7   r   r+   )r,   r$   rQ   r=   rR   r5   r6   tensor_shaperA   global_shapeglobal_striderV   idx	placement	dist_specdist_tensors                   r.   r/   _FromTorchTensor.forwardv   sE    ",#. V*/-v +EJ+'L +0**\*BE-DX-ugYvha8E 
 %%'/ OOAU5H5HOIE &+!96zeK #,J"7))++ ",,.E"5D #8  "
	 MM%   --
 r1   c                    U R                   nU R                  nUR                  U:w  aJ  UR                  n[	        UUUR                  R
                  S9nUR                  n[        XdUSS9nUS S S S S 4$ UR                  5       S S S S S 4$ )Nr8   T)is_backward)	rX   rY   r=   r(   r   r9   r*   r   to_local)r,   r2   rX   rY   current_spectarget_specr-   outputs           r.   rC   _FromTorchTensor.backward   s     33"77
 !!%77&,,L%$"'--99K
 '44L.KTF
 4tT477 ##%tT4tCCr1   rE   NN)r2   r   )rF   rG   rH   rI   rJ   rK   rL   r
   r>   r   boolr   rZ   intr/   rC   rM   rE   r1   r.   rO   rO   u   s     '+,0@||@  @ )S.)	@
 @ 

#@ sCx)@ 
@ @D D Dr1   rO   c                      \ rS rSr% Sr\R                  \S'   \\S'   SS/r	\
R                  " 5       r\
R                  \S'   \\R                  S\R                  S\S\S	S 4S
 j5       5       rS rS r\S 5       rS rS*S jr\\R                  S+S j5       5       r\  S,SSSS.S\R                  S\\   S\\\      S\S\\R8                     S\\\S4      S	S 4S jjj5       rSS.S\\\      S	\R                  4S jjr   S,SS.S\\   S\\\      S \S	S 4S! jjjr!SS.S\\\      S	\R                  4S" jjr"\#S	\4S# j5       r$\#S	\\S4   4S$ j5       r%S%\&S&\'4S' jr(S( r)S) r*Sr+g)-r      ay  
``DTensor`` (Distributed Tensor) is a subclass of ``torch.Tensor`` that provides single-device like
abstraction to program with multi-device ``torch.Tensor``. It describes the distributed tensor sharding
layout (DTensor Layout) through the :class:`DeviceMesh` and following types of :class:`Placement`:

* :class:`Shard`: Tensor sharded on the tensor dimension ``dim`` on the devices of the ``DeviceMesh`` dimension
* :class:`Replicate`: Tensor replicated on the devices of the ``DeviceMesh`` dimension
* :class:`Partial`: Tensor is pending reduction on the devices of the ``DeviceMesh`` dimension

When calling PyTorch operators, ``DTensor`` overrides the PyTorch operators to perform sharded computation and issue
communications whenever necessary. Along with the operator computation, ``DTensor`` will transform or propagate the
placements (DTensor Layout) properly (based on the operator semantic itself) and generate new ``DTensor`` outputs.

To ensure numerical correctness of the ``DTensor`` sharded computation when calling PyTorch operators, ``DTensor``
requires every Tensor argument of the operator be DTensor.

.. note:: Directly using the Tensor subclass constructor here is not the recommended way to create a ``DTensor``
    (i.e. it does not handle autograd correctly hence is not the public API). Please refer to the `create_dtensor`_
    section to see how to create a ``DTensor``.
r*   r(   _op_dispatcherr-   specr;   rS   c          
      t   UR                   (       a  U(       d  [        R                  " S5        UR                  c   S5       e[        R
                  R                  U UR                  R                  UR                  R                  UR                  UR                  UR                  US9nX$l        Xl        U$ )a  
Construct a DTensor from a local tensor, device mesh, and placement and
other tensor properties (i.e. shape, requires_grad, strides, etc).

.. note:: This is not a public API and it's only supposed to be used by the
    operator implementations and internals. If you want to construct a
    DTensor from a local tensor, consider using ``DTensor.from_local``, if
    you want to construct a DTensor from a "global" tensor (where you
    already have tensor initialized and want to shard this tensor),
    consider using ``distribute_tensor``.
zxTo construct DTensor from torch.Tensor, it's recommended to use local_tensor.detach() and make requires_grad consistent.zTensorMeta should not be None!)stridesr7   devicelayoutr;   )r;   warningswarnr9   rK   rL   _make_wrapper_subclassr5   r6   r7   ry   rz   r(   r*   )clsr-   rv   r;   rs        r.   __new__DTensor.__new__   s    ( %%mMMO +M-MM+LL//""$$++$$&&&&' 0 
 &r1   c                 ~    SU R                    SU R                  R                   SU R                  R                   S3$ )NzDTensor(local_tensor=z, device_mesh=z, placements=))r*   r(   r<   r=   selfs    r.   __repr__DTensor.__repr__  s=    &t'9'9&:.HYYfgkgqgqg|g|f}}~r1   c                 8    S/U R                   U R                  44$ )zM
protocol to inform how to flatten a DTensor to local tensor
for PT2 tracing
r*   )r(   r;   r   s    r.   __tensor_flatten__DTensor.__tensor_flatten__#  s!    
   4::t/A/A"BBBr1   c                     Uc   S5       eU S   nUu  pV[        UUUR                  R                  S9n[        UR                  UR
                  US9n[        UUUS9$ )NzEExpecting spec to be not None from `__tensor_flatten__` return value!r*   r4   r8   r:   )r   r9   r7   r   r<   r=   r   )	inner_tensorsflatten_spec
outer_sizeouter_strider-   rv   r;   unflatten_tensor_metaunflatten_specs	            r.   __tensor_unflatten__DTensor.__tensor_unflatten__*  s    ' 	
S	
' %_5* *""((!

 %IIOO-

 '
 	
r1   c                     [        S U R                   5       5      (       d  U $ U R                   Vs/ s H$  n[        U[        5      (       a
  [	        5       OUPM&     nnU R                  U R                  US9$ s  snf )Nc              3   B   #    U  H  n[        U[        5      v   M     g 7fr'   )
isinstancer   ).0ps     r.   	<genexpr>6DTensor.__coerce_tangent_metadata__.<locals>.<genexpr>B  s     C?a:a))?s   rQ   r=   )anyr=   r   r   r   redistributerQ   )r   r   r=   s      r.   __coerce_tangent_metadata__#DTensor.__coerce_tangent_metadata__A  sr    C4??CCCKBF//
BQQ:a11IKq8/ 	 
   T-=-=* UU
s   +A9Nc                 Z    Ub  g Uu  p4U R                  U R                  UR                  S9$ )Nr   )r   rQ   r=   )r   r   expected_typerv   r@   s        r.   #__coerce_same_metadata_as_tangent__+DTensor.__coerce_same_metadata_as_tangent__I  s:    $ 	  (( ! 
 	
r1   rE   c                 V    [         R                  R                  UUU=(       d    0 5      $ r'   )r   ru   dispatch)r~   functypesargskwargss        r.   __torch_dispatch__DTensor.__torch_dispatch__S  s*    
 %%..Lb
 	
r1   F)rR   r5   r6   rQ   r=   rR   r5   r6   .c                v   U=(       d    [         R                  " 5       nUR                  nX`R                  R                  :w  a"  U R
                  (       d  U R                  U5      n Uc.  [        UR                  5       Vs/ s H  n[        5       PM     nnO|[        U5      n[        U5       Hb  u  pU	R                  5       (       d  M  [        [        U	5      n	U	R                  S:  d  M>  [        U	R                  U R                  -   5      X('   Md     [         R#                  U U[%        U5      UUU5      $ s  snf )a  
Create a :class:`DTensor` from a local torch.Tensor on each rank
according to the ``device_mesh`` and ``placements`` specified.

Args:
    local_tensor (torch.Tensor): local torch.Tensor on each rank.
    device_mesh (:class:`DeviceMesh`, optional): DeviceMesh to place the
        tensor, if not specified, must be called under a DeviceMesh
        context manager, default: None
    placements (List[:class:`Placement`], optional): the placements that
        describes how to place the local torch.Tensor on DeviceMesh, must
        have the same number of elements as ``device_mesh.ndim``.

Keyword args:
    run_check (bool, optional): at a cost of extra communications, perform
        sanity check across ranks to check each local tensor's meta information
        to ensure correctness. If have :class:`Replicate` in ``placements``, the
        data on first rank of the device mesh dimension will be broadcasted
        to other ranks. default: False
    shape (torch.Size, optional): A List of int which specifies the size of
        DTensor which build on top of `local_tensor`. Note this needs to be
        provided if the shape of ``local_tensor`` are different across the ranks.
        If not provided, ``shape`` will be computed assuming the given distributed
        tensor is evenly sharded across ranks. default: None
    stride (tuple, optional): A List of int which specifies the stride of DTensor.
        If not provided, ``stride`` will be computed assuming the given distributed
        tensor is evenly sharded across ranks. default: None

Returns:
    A :class:`DTensor` object

.. note:: When ``run_check=False``, it is the user's responsibility to ensure the
    local tensor passed in is correct across ranks (i.e. the tensor is sharded for
    the ``Shard(dim)`` placement or replicated for the ``Replicate()`` placement).
    If not, the behavior of the created DTensor is undefined.

.. note:: ``from_local`` is differentiable, the `requires_grad` of the created
    `DTensor` object will depend on if `local_tensor` requires_grad or not.
r   )r	   get_current_meshdevice_typery   typeis_metatorangendimr   listr^   is_shardr   r   dimrO   applyr>   )
r-   rQ   r=   rR   r5   r6   r   r@   rd   re   s
             r.   
from_localDTensor.from_local^  s	   l "G_%E%E%G!-- --222<;O;O'??;7L /4[5E5E/FG/F!)+/FJGJj)J"+J"7%%'' $UI 6I }}q(*/	@Q@Q0Q*R
 #8  %%*
 	
 Hs    D6)r%   r%   c                    [         R                  " 5       (       d  U R                  $ Ub   [        U[        5      (       d  [	        U5      n[
        R                  X5      $ )aT  
Get the local tensor of this DTensor on its current rank. For sharding it returns
a local shard of the logical tensor view, for replication it returns the replica on
its current rank.

Keyword args:
    grad_placements (List[:class:`Placement`], optional): the placements describes
        the future layout of any gradient layout of the Tensor returned from this
        function.
        `to_local` converts DTensor to local tensor and the returned local tensor
        might not be used as the original DTensor layout later in the code. This
        argument is the hint that user can give to autograd in case the gradient
        layout of the returned tensor does not match the original DTensor layout.
        If not specified, we will assume the gradient layout remains the same
        as the original DTensor and use that for gradient computation.

Returns:
    A :class:`torch.Tensor` or ``AsyncCollectiveTensor`` object. it represents the
    local tensor on its current rank. When an ``AsyncCollectiveTensor`` object is returned,
    it means the local tensor is not ready yet (i.e. communication is not finished). In this
    case, user needs to call ``wait`` to wait the local tensor to be ready.

.. note:: ``to_local`` is differentiable, the ``requires_grad`` of the local tensor returned
    will depend on if the `DTensor` requires_grad or not.
)rK   is_grad_enabledr*   r   r>   r"   r   )r   r%   s     r.   rk   DTensor.to_local  sQ    8 $$&&%%%&z/5/Q/Q#O4O##
 	
r1   )async_opr   c                   U=(       d    U R                   nUc  [        S5      e[        U5      n[        U5       Hr  u  pEUR	                  5       (       a  [        S5      e[        U[        5      (       d  M<  UR                  S:  d  MN  [        UR                  U R                  -   5      X$'   Mt     [        U5      n[        R                  " XX#5      $ )a{  
``redistribute`` performs necessary collective operations that redistribute the current
DTensor from its current placements to a new placements, or from is current DeviceMesh
to a new DeviceMesh. i.e. we can turn a Sharded DTensor to a Replicated DTensor by
specifying a Replicate placement for each dimension of the DeviceMesh.

When redistributing from current to the new placements on one device mesh dimension, we
will perform the following operations including communication collective or local operation:

1. ``Shard(dim)`` -> ``Replicate()``: ``all_gather``
2. ``Shard(src_dim)`` -> ``Shard(dst_dim)``: ``all_to_all``
3. ``Replicate()`` -> ``Shard(dim)``: local chunking (i.e. ``torch.chunk``)
4. ``Partial()`` -> ``Replicate()``: ``all_reduce``
5. ``Partial()`` -> ``Shard(dim)``: ``reduce_scatter``


``redistribute`` would correctly figure out the necessary redistribute steps for DTensors
that are created either on 1-D or N-D DeviceMesh.

Args:
    device_mesh (:class:`DeviceMesh`, optional): DeviceMesh to place the
        DTensor. If not specified, it would use the current DTensor's DeviceMesh.
        default: None
    placements (List[:class:`Placement`], optional): the new placements that
        describes how to place the DTensor into the DeviceMesh, must
        have the same number of elements as ``device_mesh.ndim``.
        default: replicate on all mesh dimensions

Keyword args:
    async_op (bool, optional): whether to perform the DTensor redistribute operation
        asynchronously or not. Default: False

Returns:
    A :class:`DTensor` object

.. note:: ``redistribute`` is differentiable, which means user do not need to worry about
    the backward formula of the redistribute operation.

.. note:: ``redistribute`` currently only supports redistributing DTensor on the same DeviceMesh,
    Please file an issue if you need to redistribute DTensor to different DeviceMesh.
z&placements is needed for redistribute!zTCan not redistribute to Partial, redistributing to Partial is for internal use only!r   )rQ   r[   r   r^   
is_partialr   r   r   r   r>   r   r   )r   rQ   r=   r   ire   s         r.   r   DTensor.redistribute  s    j "5T%5%5GHH*%
%j1LA##%%"j  Iu--)--!2C %immdii&? @
 2 :&
 !!$ZJJr1   c                    U R                  [        5       /U R                  R                  -  SS9n[        R                  X!5      $ )aA  
Return the full tensor of this DTensor. It will perform necessary collectives
to gather the local tensors from other ranks in its DeviceMesh and concatenate
them together. It's a syntatic sugar of the following code:

``dtensor.redistribute(placements=[Replicate()] * mesh.ndim).to_local()``

Keyword args:
    grad_placements (List[:class:`Placement`], optional): the placements describes
        the future layout of any gradient layout of the full Tensor returned from this
        function.
        `full_tensor` converts DTensor to a full torch.Tensor and the returned torch.tensor
        might not be used as the original replicated DTensor layout later in the code. This
        argument is the hint that user can give to autograd in case the gradient
        layout of the returned tensor does not match the original replicated DTensor layout.
        If not specified, we will assume the gradient layout of the full tensor be replicated.

Returns:
    A :class:`torch.Tensor` object that represents the full tensor of this DTensor.

.. note:: ``full_tensor`` is differentiable.
F)r=   r   )r   r   rQ   r   r"   r   )r   r%   
redist_ress      r.   full_tensorDTensor.full_tensor   sF    4 &&!}t'7'7'<'<<u ' 

 ##J@@r1   c                 .    U R                   R                  $ )z
The :class:`DeviceMesh` attribute that associates with this DTensor object.

.. note:: ``device_mesh`` is a read-only property, it can not be set.
)r(   r<   r   s    r.   rQ   DTensor.device_mesh?  s     zzr1   c                 .    U R                   R                  $ )z
The placements attribute of this DTensor that describes the layout of this
DTensor on the its DeviceMesh.

.. note:: ``placements`` is a read-only property, it can not be set.
)r(   r=   r   s    r.   r=   DTensor.placementsH  s     zz$$$r1   fqnobjectc                     SSK Jn  [        U R                  S5      (       a  U R                  R	                  X5      $ [        U R                  [        R                  5      (       a	  U" X5      /$ [        S5      e)Nr   )_create_write_items_for_dtensor__create_write_items__Unsupported tensor type!)	,torch.distributed.checkpoint.planner_helpersr   hasattrr*   r   r   rK   rL   r[   )r   r   r   r   s       r.   r   DTensor.__create_write_items__R  se    	
 4%%'?@@%%<<SII**ELL993C@AA9::r1   c                     SSK Jn  [        U R                  S5      (       a  U R                  R	                  5       $ [        U R                  [        R                  5      (       a	  U" U 5      /$ [        S5      e)a  
Return a list of ChunkStorageMetadata, which is a dataclass that describes the size/offset of the local shard/replica
on current rank. For DTensor, each rank will have a single local shard/replica, so the returned list usually only
has one element.

This dunder method is primariy used for distributed checkpoint purpose.

Returns:
    A List[:class:`ChunkStorageMetadata`] object that represents the shard size/offset on the current rank.
r   )_create_chunk_from_dtensor__create_chunk_list__r   )	r   r   r   r*   r   r   rK   rL   r[   )r   r   s     r.   r   DTensor.__create_chunk_list__^  sc    	
 4%%'>??%%;;==**ELL99.t4559::r1   c                     [        U R                  S5      (       a  U R                  R                  U5      $ [        U R                  [        R
                  5      (       a  U R                  5       $ [        S5      e)N__get_tensor_shard__r   )r   r*   r   r   rK   rL   rk   r[   )r   indexs     r.   r   DTensor.__get_tensor_shard__t  s\    4%%'=>>%%::5AA**ELL99==?"9::r1   r'   )rE   Nrp   ),rF   rG   rH   rI   __doc__rK   rL   __annotations__r   	__slots__op_dispatchOpDispatcherru   rJ   _disable_dynamorq   r   r   r   r   r   r   classmethodr   r   r
   r   r   rZ   r>   rr   r   rk   r   r   propertyrQ   r=   strr   r   r   r   rM   rE   r1   r.   r   r      s   * << '*I 0;/G/G/INK,,I
'll' '
 ' 
'  'V@C 
 
,V
 

  
  -148R

  &*,0R
llR
j)R
 Xi01R

 R
 

#R
 sCx)R
 
R
 R
j CG#
"*8I+>"?#
	#
N -148FK
 FKj)FK Xi01FK
 FK 
FKR CGA"*8I+>"?A	A> Z   %E)S.1 % %
;# 
;s 
;;,;r1   r   src_data_ranktensorrQ   r=   r   rS   c          
      L   [         R                  R                  S5        U=(       d    [        R                  " 5       nUR
                  nUS:X  a   SSKJn  U" XU5      $ U R                  (       d  [        S5      eX@R                  R                  :w  a"  U R                  (       d  U R                  U5      n Uc-  [        UR                   5       Vs/ s H  n[#        5       PM     nn[%        U5      UR                   :w  a%  ['        S[%        U5       S	UR                    S
35      e[)        U [*        5      (       ac  U R,                  U:w  a  ['        SU R,                   SU S
35      eU R.                  [1        U5      :w  a  ['        SU R.                   SU S35      eU $ U R3                  5       n	[5        U5      n[7        U5       H  u  pUR9                  5       (       aZ  [;        [<        U5      nUR>                  S:  a&  [=        UR>                  U R                   -   5      nXU
'   URA                  XX5      n	Mt  URC                  5       (       a$  [;        ["        U5      nURE                  XX5      n	M  [        SU SU
 S35      e   [1        U5      nU	c   S5       e[G        UU[I        U RK                  5       U RM                  5       U RN                  S9S9n[+        U	RQ                  U RR                  5      UU RR                  S9$ ! [         a  nSn[        U5      UeSnAff = fs  snf )a,	  
Distribute a leaf ``torch.Tensor`` (i.e. nn.Parameter/buffers) to the ``device_mesh`` according
to the ``placements`` specified. The rank of ``device_mesh`` and ``placements`` must be the
same. The ``tensor`` to distribute is the logical or "global" tensor, and the API would use
the ``tensor`` from first rank of the DeviceMesh dimension as the source of truth to preserve
the single-device semantic. If you want to construct a DTensor in the middle of the Autograd
computation, please use :meth:`DTensor.from_local` instead.

Args:
    tensor (torch.Tensor): torch.Tensor to be distributed. Note that if you
        want to shard a tensor on a dimension that is not evenly divisible by
        the number of devices in that mesh dimension, we use ``torch.chunk``
        semantic to shard the tensor and scatter the shards. The uneven sharding
        behavior is experimental and subject to change.
    device_mesh (:class:`DeviceMesh`, optional): DeviceMesh to distribute the
        tensor, if not specified, must be called under a DeviceMesh context
        manager, default: None
    placements (List[:class:`Placement`], optional): the placements that
        describes how to place the tensor on DeviceMesh, must have the same
        number of elements as ``device_mesh.ndim``. If not specified, we will
        by default replicate the tensor across the ``device_mesh`` from the
        first rank of each dimension of the `device_mesh`.

Keyword args:
    src_data_rank (int, optional): the rank of the source data for the logical/global tensor, it is
        used by :meth:`distribute_tensor` to scatter/broadcast the shards/replicas to other ranks.
        By default, we use ``group_rank=0`` on each DeviceMesh dimension as the source data to preserve
        the single-device semantic. If passing ``None`` explicitly, :meth:`distribute_tensor` simply uses
        its local data instead of trying to preserve the single-device semantic via scatter/broadcast.
        Default: 0

Returns:
    A :class:`DTensor` or ``XLAShardedTensor`` object.

.. note::
    When initialize the DeviceMesh with the ``xla`` device_type, ``distribute_tensor``
    return `XLAShardedTensor` instead. see `this issue <https://github.com/pytorch/pytorch/issues/92909>`__
    for more details. The XLA integration is experimental and subject to change.
ztorch.dtensor.distribute_tensorxlar   )xla_distribute_tensorDTo use DTensor API with xla, you must install the torch_xla package!NzY`distribute_tensor` should be used to distribute leaf tensors! but found non-leaf tensor!zW`placements` must have the same length as `device_mesh.ndim`! Found placements length: z, and device_mesh.ndim: rU   z-Cannot distribute a DTensor with device mesh z to a different device mesh z,Cannot distribute a DTensor with placements z to a different placements z-. do you want to call `redistribute` instead?z8Trying to distribute tensor with unsupported placements z on device mesh dimension !z(distributing a tensor should not be Noner4   )r<   r=   r9   r:   )*rK   _C_log_api_usage_oncer	   r   r   torch_xla.distributed.spmdr   ImportErroris_leafr[   ry   r   r   r   r   r   r   len
ValueErrorr   r   rQ   r=   r>   detachr   r^   r   r   r   r   _shard_tensorr_   _replicate_tensorr   r   sizer6   r7   requires_grad_r;   )r   rQ   r=   r   r   r   emsgr@   r-   rd   re   rv   s                r.   r   r   }  s$   ^ 
HH  !BC C!A!A!CK))Ke
	* )jII
 >>g
 	

 mm(((;' +01A1A+BC+Baik+B
C
:+***((+J'88PQ\QaQaPbbce
 	
 &'""
 ,?@R@R?S T..9]!=  j 11>v?P?P>Q R--7L 9*+ 
 ==?L j!J#J/UI.I}}q !)--&++"=>	"+3$223L ##%%Y	2I$663L J9+Uopsottuv ! 0& z"J#O%OO# ++-==?,,
D ##F$8$89** ]  	*XCc")	* Ds   L L!
LLLzAPlease use `distribute_tensor` with `src_data_rank=None` instead.r   c                     [        XUSS9$ )a4  
Locally shards a full tensor based on indicated sharding arrangement, and
returns a DTensor containing the local shard.

.. warning:: This is a private API that is subject to change. It skips the
    communication otherwise required by `distribute_tensor`. It is only
    applicable to cases where all ranks have the same `full_tensor`. For
    example, in distributed inference all ranks load from the same
    checkpoint. This API will not check for data equality between ranks, it
    is thus user's responsibility to ensure the `full_tensor` is the same
    across ranks.

Args:
    full_tensor (torch.Tensor): the full tensor to be sharded.
    placements (Sequence[:class:`Shard`]): the placements that
        describes how to place the local tensor on DeviceMesh.
    device_mesh (:class:`DeviceMesh`, optional): DeviceMesh to place the
        DTensor.  Must have same dimension as the number of placements.
        If not specified, would be retrieve from current context.

Returns:
    A :class:`DTensor` object with the shard as its local tensor.

Examples:
    >>> # xdoctest: +SKIP("need world_size and rank")
    >>> device_mesh = dist.init_device_mesh("cuda", (world_size,))
    >>> full_tensor = torch.arange(world_size, device=f"cuda:{rank}")
    >>> dtensor = _shard_tensor(full_tensor, [Shard(1)], device_mesh)
Nr   )r   )r   r=   rQ   s      r.   r   r     s    F [zQUVVr1   modulepartition_fninput_fn	output_fnc                 t  ^^^ [         R                  R                  S5        [        U SS5      nU(       a  [	        S5      eT=(       d    [
        R                  " 5       mTR                  nUS:X  a   SSKJ	n  U" U TUTT5      $ S
[        R                  S[        SS	4S jn
Uc#  U R                  5        H  u  pU
" UT5        M     O+U R                  5        H  u  pU" XT5        U
" UT5        M     Tb  [        [         R"                  " T5      R$                  5      nUS:X  a1  [&        R(                  " S[*        SS9  U R-                  UU4S j5        O,US:X  a  U R-                  UU4S j5        O[/        SU S35      eTb  [        [         R"                  " T5      R$                  5      nUS:X  a1  [&        R(                  " S[*        SS9  U R1                  UU4S j5        O,US:X  a  U R1                  UU4S j5        O[/        SU S35      eSU l        U $ ! [         a  nSn	[        U	5      UeS	nAff = f)a  
This function expose three functions to control the parameters/inputs/outputs of the module:

1. To perform sharding on the module before runtime execution by specifying the
``partition_fn`` (i.e. allow user to convert Module parameters to :class:`DTensor`
parameters according to the `partition_fn` specified).
2. To control the inputs or outputs of the module during runtime execution by
specifying the ``input_fn`` and ``output_fn``. (i.e. convert the input to
:class:`DTensor`, convert the output back to ``torch.Tensor``)

Args:
    module (:class:`nn.Module`): user module to be partitioned.
    device_mesh (:class:`DeviceMesh`): the device mesh to place the module.
    partition_fn (Callable): the function to partition parameters (i.e. shard certain
        parameters across the ``device_mesh``). If ``partition_fn`` is not specified,
        by default we replicate all module parameters of ``module`` across the mesh.
    input_fn (Callable): specify the input distribution, i.e. could control how the
        input of the module is sharded. ``input_fn`` will be installed as a module
        ``forward_pre_hook`` (pre forward hook).
    output_fn (Callable): specify the output distribution, i.e. could control how the
        output is sharded, or convert it back to torch.Tensor. ``output_fn`` will be
        installed as a module ``forward_hook`` (post forward hook).

Returns:
    A module that contains parameters/buffers that are all ``DTensor`` s.

.. note::
    When initialize the DeviceMesh with the ``xla`` device_type, ``distribute_module``
    return nn.Module with PyTorch/XLA SPMD annotated parameters. See
    `this issue <https://github.com/pytorch/pytorch/issues/92909>`__
    for more details. The XLA integration is experimental and subject to change.

ztorch.dtensor.distribute_module_distribute_module_appliedFzhdistribute_module should only be called once on a module, but it has already been called on this module!r   r   )xla_distribute_moduler   Nmr<   rS   c                    [        5       /UR                  -  nU R                  R                  5        H[  u  p4Uc  M
  [	        U[
        5      (       a  M!  U R                  U[        R                  " [        UR                  X5      5      5        M]     U R                  R                  5        H:  u  p5Uc  M
  [	        U[
        5      (       a  M!  [        XQU5      U R                  U'   M<     g r'   )r   r   _parametersitemsr   r   register_parameternn	Parameterr   data_buffers)r  r<   full_replicatekeyparambuffers         r.   replicate_module_params_buffers:distribute_module.<locals>.replicate_module_params_buffersy  s     $+2----/JC E7)C)C$$LL!25::t!TU 0 ::++-KC!*VW*E*E"3F."Q

3 .r1      zDeprecating input_fn that takes two arguments (inputs, device_mesh), please use input_fn that takes in (module, inputs, device_mesh) instead!)
stacklevelc                    > T" UT5      $ r'   rE   )r@   inputsrQ   r   s     r.   <lambda>#distribute_module.<locals>.<lambda>  s    (6;"?r1      c                    > T" XT5      $ r'   rE   )modr  rQ   r   s     r.   r  r    s    HS+$Fr1   z-input_fn should take in 3 arguments, but got z arguments!zDeprecating output_fn that takes two arguments (inputs, device_mesh), please use output_fn that takes in (module, inputs, device_mesh) instead!c                    > T" UT5      $ r'   rE   r  r  outputsrQ   r   s      r.   r  r    s    Yw-Lr1   c                    > T" XT5      $ r'   rE   r  s      r.   r  r    s    Ys[-Qr1   z.output_fn should take in 3 arguments, but got T)rK   r   r   getattrr[   r	   r   r   r   r  r   r  Moduler
   named_modulesr   inspect	signature
parametersr{   r|   FutureWarningregister_forward_pre_hookr   register_forward_hookr   )r   rQ   r   r   r   already_distributedr   r  r   r   r  namesubmodnum_argss    ` ``         r.   r   r   5  sH   R 
HH  !BC!&*FN=
 	

 C!A!A!CK))Ke	* )\8Y R299 RJ R4 R&  #002LD+FK@ 3 #002LD{3+FK@ 3
 w((2==>q=MM[	 ,,? ],,F ?zU  w((3>>?q=MM\	 ((L ]((Q @
+V  )-F%Me  	*XCc")	*s   3H 
H7$H22H7r   c           	         U=(       d    [         R                  " 5       nUR                  US'   U=(       d%    [        S [	        UR
                  5       5       5      nUR
                  [        U5      :X  d   S5       eUS   [        R                  :X  d   S5       e[        R                  R                  U5      n[        XU5      u  pgU [        R                  :X  a  UR                  SS5      nU " Xh40 UD6n	GOU [        R                  :X  d  U [        R                  :X  a  UR!                  S[        R"                  " 5       5      n
[%        US	U
5      n['        U[        U5      US
9n[(        R*                  " U5      (       a4  [(        R,                  (       d  [(        R.                  " U5      [(        l        [(        R,                  c   e[(        R,                  R1                  U5         U " U40 UD6n	S S S 5        O	U " U40 UD6n	['        U[        U5      [%        UUW	R2                  5      S
9n[5        U	UUS   S9$ ! , (       d  f       NE= f)Nry   c              3   6   #    U  H  n[        5       v   M     g 7fr'   )r   )r   r@   s     r.   r   '_dtensor_init_helper.<locals>.<genexpr>  s     $R:QQY[[:Qs   z6mesh dimension does not match the length of placementsrz   zlayout value not supported!
fill_valuer   r7   )r   r8   r;   r:   )r	   r   r   r>   r   r   r   rK   strided_prims_commonmake_contiguous_strides_forr   r   popr   r   getget_default_dtyper   r   randomis_rng_supported_mesh_rng_trackerOffsetBasedRNGTracker_distribute_regionr7   r   )init_opr   rQ   r=   r   torch_stridelocal_shaper@   r.  r-   r7   r9   rv   s                r.   _dtensor_init_helperr=    s    C!A!A!CK"..F8 Ru$R%@P@P:Q$RRJ s:. @. (u}},K.KK,&&BB4HL ;:NK
 %**ZZa0
{A&A	EJJ	'U[["8

7E$;$;$=> tU3;j(9{S''44V=P=P"(">">{"KF""...  33D9";9&9L :9 {5f5j
D _-  :9s   '
H::
IFr7   rz   r;   rQ   r=   r7   rz   r;   c           
      R    [        U5      n[        [        R                  UU UUUUS9$ )a  
Returns a :class:`DTensor` filled with the scalar value 1, with the shape defined
by the variable argument ``size``.

Args:
    size (int...): a sequence of integers defining the shape of the output :class:`DTensor`.
        Can be a variable number of arguments or a collection like a list or tuple.
        E.g.: ones(1,2,3..) or ones([1,2,3..]) or ones((1,2,3..))

Keyword args:
    dtype (:class:`torch.dtype`, optional): the desired data type of returned :class:`DTensor`.
        Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
    layout (:class:`torch.layout`, optional): the desired layout of returned DTensor.
        Default: ``torch.strided``.
    requires_grad (bool, optional): If autograd should record operations on the
        returned :class:`DTensor`. Default: ``False``.
    device_mesh: :class:`DeviceMesh` type, contains the mesh info of ranks
    placements: a sequence of :class:`Placement` type: ``Shard``, ``Replicate``

Returns:
    A :class:`DTensor` object on each rank
r>  )r   r=  rK   r   r7   rz   r;   rQ   r=   r   
torch_sizes          r.   r   r     s4    < ).J

# r1   c           
      R    [        U5      n[        [        R                  UU UUUUS9$ )a0  
Returns a :class:`DTensor` filled with uninitialized data. The shape of the :class:`DTensor`
is defined by the variable argument ``size``.

Args:
    size (int...): a sequence of integers defining the shape of the output :class:`DTensor`.
        Can be a variable number of arguments or a collection like a list or tuple.
        E.g.: empty(1,2,3..) or empty([1,2,3..]) or empty((1,2,3..))

Keyword args:
    dtype (:class:`torch.dtype`, optional): the desired data type of returned :class:`DTensor`.
        Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).        layout (:class:`torch.layout`, optional): the desired layout of returned :class:`DTensor`.
        Default: ``torch.strided``.
    requires_grad (bool, optional): If autograd should record operations on the
        returned :class:`DTensor`. Default: ``False``.
    device_mesh: :class:`DeviceMesh` type, contains the mesh info of ranks
    placements: a sequence of :class:`Placement` type: ``Shard``, ``Replicate``

Returns:
    A :class:`DTensor` object on each rank
r>  )r   r=  rK   r   r@  s          r.   r   r   =  s4    < ).J# r1   c                T    [        U 5      n[        [        R                  UUUUUUUS9$ )aq  
Returns a :class:`DTensor` filled with ``fill_value`` according to ``device_mesh`` and
``placements``, with the shape defined by the argument ``size``.

Args:
    size (int...): a sequence of integers defining the shape of the output :class:`DTensor`.
        Can be a variable number of arguments or a collection like a list or tuple.
        E.g.: ones(1,2,3..) or ones([1,2,3..]) or ones((1,2,3..))
    fill_value(Scalar): the value to fill the output tensor with.

Keyword args:
    dtype (:class:`torch.dtype`, optional): the desired data type of returned :class:`DTensor`.
        Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
    layout (:class:`torch.layout`, optional): the desired layout of returned DTensor.
        Default: ``torch.strided``.
    requires_grad (bool, optional): If autograd should record operations on the
        returned :class:`DTensor`. Default: ``False``.
    device_mesh: :class:`DeviceMesh` type, contains the mesh info of ranks.
    placements: a sequence of :class:`Placement` type: ``Shard``, ``Replicate``

Returns:
    A :class:`DTensor` object on each rank
)r.  r7   rz   r;   rQ   r=   )r   r=  rK   r   )r   r.  r7   rz   r;   rQ   r=   rA  s           r.   r   r   h  s8    B ).J

#	 	r1   )r;   r7   rz   rQ   r=   c           
      R    [        U5      n[        [        R                  UUUU UUS9$ )aK  
Returns a :class:`DTensor` filled with random numbers from a uniform distribution
on the interval ``[0, 1)``. The shape of the tensor is defined by the variable
argument ``size``.

Args:
    size (int...): a sequence of integers defining the shape of the output :class:`DTensor`.
        Can be a variable number of arguments or a collection like a list or tuple.
        E.g.: ones(1,2,3..) or ones([1,2,3..]) or ones((1,2,3..))

Keyword args:
    dtype (:class:`torch.dtype`, optional): the desired data type of returned :class:`DTensor`.
        Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
    layout (:class:`torch.layout`, optional): the desired layout of returned DTensor.
        Default: ``torch.strided``.
    requires_grad (bool, optional): If autograd should record operations on the
        returned :class:`DTensor`. Default: ``False``.
    device_mesh: :class:`DeviceMesh` type, contains the mesh info of ranks.
    placements: a sequence of :class:`Placement` type: ``Shard``, ``Replicate``

Returns:
    A :class:`DTensor` object on each rank
r>  )r   r=  rK   r   r;   r7   rz   rQ   r=   r   rA  s          r.   r   r     s4    > ).J

# r1   c           
      R    [        U5      n[        [        R                  UUUU UUS9$ )aJ  
Returns a :class:`DTensor` filled with random numbers from a normal distribution
with mean 0 and variance 1. The shape of the tensor is defined by the variable
argument ``size``.

Args:
    size (int...): a sequence of integers defining the shape of the output :class:`DTensor`.
        Can be a variable number of arguments or a collection like a list or tuple.
        E.g.: ones(1,2,3..) or ones([1,2,3..]) or ones((1,2,3..))

Keyword args:
    dtype (:class:`torch.dtype`, optional): the desired data type of returned :class:`DTensor`.
        Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
    layout (:class:`torch.layout`, optional): the desired layout of returned DTensor.
        Default: ``torch.strided``.
    requires_grad (bool, optional): If autograd should record operations on the
        returned :class:`DTensor`. Default: ``False``.
    device_mesh: :class:`DeviceMesh` type, contains the mesh info of ranks.
    placements: a sequence of :class:`Placement` type: ``Shard``, ``Replicate``

Returns:
    A :class:`DTensor` object on each rank
r>  )r   r=  rK   r   rE  s          r.   r   r     s4    > ).J# r1   c           
      R    [        U5      n[        [        R                  UUUU UUS9$ )a  
Returns a :class:`DTensor` filled with the scalar value 0.

Args:
    size (int...): a sequence of integers defining the shape of the output :class:`DTensor`.
        Can be a variable number of arguments or a collection like a list or tuple.
        E.g.: zeros(1,2,3..) or zeros([1,2,3..]) or zeros((1,2,3..))
Keyword args:
    requires_grad (bool, optional): If autograd should record operations on the
        returned :class:`DTensor`. Default: ``False``.
    dtype (:class:`torch.dtype`, optional): the desired data type of returned :class:`DTensor`.
        Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
    layout (:class:`torch.layout`, optional): the desired layout of returned :class:`DTensor`.
        Default: ``torch.strided``.
    device_mesh: :class:`DeviceMesh` type, contains the mesh info of ranks
    placements: a sequence of :class:`Placement` type: ``Shard``, ``Replicate``

Returns:
    A :class:`DTensor` object on each rank
r>  )r   r=  rK   r    rE  s          r.   r    r      s4    8 ).J# r1   rp   r'   )NNNN)Fr!  r{   collections.abcr   typingr   r   r   r   typing_extensionsr   rK   "torch.distributed.tensor._dispatchdistributedr   	_dispatchr    torch.distributed.tensor._random_randomr5  torch.nnr  torch.distributed.device_meshr	   r
   *torch.distributed.tensor._collective_utilsr   r   &torch.distributed.tensor._dtensor_specr   r   &torch.distributed.tensor._redistributer   r   torch.distributed.tensor._utilsr   r   r   (torch.distributed.tensor.placement_typesr   r   r   r   __all__opsatenautogradFunctionr"   rO   rL   r   rr   r   r   r  r   r   rZ   r=  r/  r7   rz   rq   r   r   r   r   r   r    rE   r1   r.   <module>r\     s      $ 0 0 (  8 8 1 1  E X J 
 
 yy~~.-
U^^,, -
`]Du~~.. ]D@e;ell e;T )-04O
 $%OLLO*%O ),-O
 C=O Od OP )-"W"W"W *%"W 	"W Q"WN )-KOGKHLRIIR*%R 8S"))Z$@$$FGHR xC <d BCD	R
 "))S*!=t!CDER YYR| )-04	?
**? *%? ),-	? ?H $( ==(,04(EKK ( LL( 	(
 *%( ),-( (Z $( ==(,04(EKK ( LL( 	(
 *%( ),-( (^ $( ==(,04, EKK 	,
 LL, , *%, ),-, ,b  #' ==(,04)) EKK ) LL	)
 *%) ),-) )\  #' ==(,04)) EKK ) LL	)
 *%) ),-) )\  #' ==(,04&& EKK & LL	&
 *%& ),-& &r1   