
    shE                        S SK r S SKJrJr  S SKrS SKJr  S SKJ	s  J
r  S SKJ	s  Jr  S SKJr  S SKJr  S SKJrJrJr  S SKJrJr  S SKJr  S SKJrJrJr  S S	KJ r   \RB                  RD                  r"S
/r#\ RH                  S 5       r%S\&\ S4   S\'S\'4S jr(S\&\ S4   S\S\4S jr)S\RT                  RV                  S\&\,S4   S\-\.\,4   S\4S jr/S r0S\RT                  RV                  S\&\,S4   S\-\.\,4   S\,4S jr1S\RT                  RV                  S\&\,S4   S\-\.\,4   S\,4S jr2S\S\S\\   S\\   S\'S\'S \Rf                  S!\'S\S"\'S\&\\4   4S# jr4S\RT                  RV                  S\&\,S4   S\-\.\,4   S\,4S$ jr5S%\S\S\S\\   S\'S\'S&\S \Rf                  S!\'S\S"\'S\4S' jr6S\RT                  RV                  S\&\,S4   S\-\.\,4   S\,4S( jr7\"R`                  Rp                  \1\"Rr                  Rp                  \2\"Rt                  Rp                  \5\"Rv                  Rp                  \5\"Rx                  Rp                  \7\"Rz                  Rp                  \70r>S) r?S* r@g)+    N)castOptional)Tensor)
DeviceMesh)DTensor	ReplicateShard)DTensorSpec
TensorMeta)_MaskPartial)	_skip_dim	Reductionreplicate_reduction_dims)	Placementloss_parallelc               #   <   #    [        5         Sv   [        5         g7f)a  
A context manager that enables loss parallelism, where efficient parallelized loss computation
can be performed when the input is sharded on the class dimension. Currently only the cross-entropy
loss is supported.

Within this context manager, one can use :func:`~torch.nn.functional.cross_entropy` or
:class:`~torch.nn.CrossEntropyLoss` as usual, with the following assumptions on the input parameters.
The corresponding ``backward()`` call, if any, also needs to happen under this context manager.

Args:
    input (:class:`DTensor`):
        Input logits. Assumed to be sharded on the class dimension.
    target (Union[:class:`torch.Tensor`, :class:`DTensor`]):
        Must be ground truth class indices (class probabilities currently not supported).
        Assumed to be replicated across the ``DeviceMesh``.
    weight (Union[:class:`torch.Tensor`, :class:`DTensor`], optional):
        If given, assumed to be replicated across the ``DeviceMesh``.
    label_smoothing:
        Currently not supported.

Returns:
    A replicated :class:`DTensor`.

Example:
    A sharded DTensor is manually created here to showcase the usage.
    In practice, it is usually the output of a TP module.

    >>> # xdoctest: +SKIP("distributed")
    >>> from torch.distributed.tensor.parallel import loss_parallel
    >>> from torch.distributed.device_mesh import init_device_mesh
    >>> ...
    >>> device_mesh = init_device_mesh("cuda", (8,))
    >>> input = torch.randn(4, 16, device="cuda", requires_grad=True)
    >>> dist_input = distribute_tensor(input, device_mesh, placements=[Shard(1)])
    >>> target = torch.randint(16, (4,), device="cuda")
    >>> with loss_parallel():
    >>>     loss = F.cross_entropy(dist_input, target, reduction="mean")
    >>>     loss.backward()
    >>> ...
N)_enable_custom_loss_ops_disable_custom_loss_ops     z/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/distributed/tensor/parallel/loss.pyr   r      s     T 	s   
placements.dimreturnc                     [        U 5      S:X  d  [        S5      eU S   R                  U5      (       d  [        SU S35      eg)N   zLCurrently loss_parallel() only supports input on one-dimensional DeviceMesh.r   zUloss_parallel() should be enabled only when the input tensor is sharded on dimension .)len
ValueErroris_shard)r   r   s     r   _find_all_reduce_mesh_dimr!   P   sV    z?aZ
 	
 a=!!#&&cdgchhij
 	
 r   meshc                     [        U [        5      (       a.  U R                  U:X  a  U $ [        SU SU R                   S35      e[        U [        R
                  5      (       a  [        R                  " XUSS9$ [        S[        U 5       35      e)Nz	Expected z	 but got r   F)device_meshr   	run_checkzUnsupported type )	
isinstancer   r   RuntimeErrortorchr   
from_local	TypeErrortype)tensorr   r"   s      r   _cast_to_dtensorr-   \   s     &'""
*M:,i@Q@Q?RRSTUU	FELL	)	)!!u
 	
 +DL>:;;r   op_callargskwargsc                 :   [         R                  R                  XU5      n[         R                  R                  R	                  UR
                  5      n[        U[        5      (       a  U$ [        U[        5      (       a  US   $ [        S[        U5       S35      e)Nr   zUnexpected tensor meta type: r   )r   _op_dispatcherunwrap_to_op_infosharding_propagator_propagate_tensor_metaschemar&   r   tupler'   r+   )r.   r/   r0   op_infotensor_metas        r   r5   r5   l   s    
 $$66wfMG((<<SSK +z**	K	'	'1~:4;L:MQOPPr   c                    U(       a   U R                   [        R                  :X  d   e[        R                  " U [        R
                  R                  S9u  pVU R                  U[        R                  S9n U R                  5       S:X  a  U nOR[        R                  " XSS9n[        R                  " U[        R                  R                  R                   X44S9nX-
  n[        R"                  " [        R$                  " U5      USS9n	[        R                  " U	[        R                  R&                  R                   X44S9n	[        R(                  " U	5      n
Xz-
  nU(       d  UR                  U5      nU$ )N)type_promotion_kind)dtypememory_formatr   T)keepdim)reduceOpgroup)r<   r(   halfutilselementwise_dtypesELEMENTWISE_TYPE_PROMOTION_KINDDEFAULTtocontiguous_formatnumelamaxfuncol
all_reducec10dReduceOpMAXnamesumexpSUMlog)xr   half_to_floatr"   mesh_dimcomputation_dtyperesult_dtypeshiftedx_maxshifted_sumexpshifted_logsumexpresults               r   _log_softmaxr^      s$   ww%**$$$&+&>&>	uDDLL'# 	
$E4K4KLAwwyA~

140!!DMM--224:J
 )YYuyy13EN&&!2!2!7!7?ON 		.1(F<(Mr   c                 |   [        [        US   5      n[        [        US   5      n[        [        US   5      nUR                  n[        UR                  U5      n[        XU5      n[        UR                  XEUR                  U5      n	[        UR                  UR                  US9n
[        U	U
U	R                  S9$ )Nr   r      r9   requires_grad)r   r   intbool_specr!   r   r5   r^   _local_tensorr"   r
   rc   )r.   r/   r0   rT   r   rU   specrV   output_tensor_metaresres_specs              r   _log_softmax_handlerrl      s    
 	Wd1gA
sDG
CtAw'M77D(#>H/vF
qDIIx
PC		&H '' r   c                     [        [        US   5      n[        [        R                  US   5      nUR	                  U5      $ )Nr      )r   r   r(   r<   rF   )r.   r/   r0   grad_outputinput_dtypes        r   _log_softmax_backward_handlerrq      s7    
 wQ(Ku{{DG,K>>+&&r   rT   targetweightlocal_weight	reductionignore_indexinput_shapechannel_dimrV   c
                 Z  ^^ U R                  5       mSmTS:  a  SmS[        S[        4UU4S jjn
Ub  U
" U5      nUc   eU
" U5      nX-  n [        R                  " X:g  US5      nUR	                  T5      n[        UTS9nUR                  XU	5      n[        R                  " U TU5      nUR                  UX5      nUR                  T5      * n[        R                  " X:g  US5      nU[        R                  R                  :X  a  TS:  a  U R                  SS	5      nUU4$ Ub}  [        U R                  5      nS
UT'   WR!                  U5      n[        R                  " UTU5      R                  T5      n[        R                  " X:g  US5      nUR#                  5       nO!X:g  R#                  5       R%                  U 5      nU[        R&                  R                  :X  a  UR#                  5       nUU4$ U[        R(                  R                  :X  a  UR#                  5       U-  nUU4$ )Nr   r`   r   rs   r   c                 n   > TS:  a+  S/T-  nU R                   S   UT'   U R                  U5      nU$ U nU$ )Nr   r   )shapeview)rs   r{   wrx   n_dimss      r   _weight_view'_nll_loss_forward.<locals>._weight_view   sQ    A:E "(aE+E"A  Ar   offset_shape
offset_dimr   g        )r   r   r(   where	unsqueezer   _partition_valuegather_reduce_valuesqueezer   NONEvaluenew_fulllistr{   expandrP   rF   rR   MEAN)rT   rr   rs   rt   ru   rv   rw   rx   r"   rV   r   r}   local_wsafe_targetsafe_target_partial_placementsafe_target_partial_result_partialresult_reducedr]   total_weight	new_shapewsumr~   s          `               @r   _nll_loss_forwardr      s    UUWFKz	V 	 	 	  '''|,K++f4fa@K((5L %++V,==H \\![2FGN&44^TTN$$[11F[[/;FINN(((VaZzz"c*|##M	!#	+HHY||A{L9AA+N{{614;xxz.33588; IMM''' < 
inn**	*,<r   c                    [        [        US   5      nUS   nUS   n[        [        US   5      n[        [        US   5      nUR                  5       S:  a  SOSnUR                  n	[        U	R                  U5      n
[        [        U	R                  U/5      U5      n[        5       4U	R                  R                  -  n[        XKU	R                  5      nS nUb  [        X\U	R                  5      n[        U	R                  R                  5       Vs/ s H  oU
:X  a  [        S5      O	[        5       PM     nnUR                  U	R                  U5      R                   nUR"                  S   UR                   R"                  U   :X  d   eU[$        R&                  R(                  :X  a  UnOUn[+        U5      nXEsUS'   US'   [-        U [/        U5      U5      n[1        UR                   UR                   Ub  UR                   OS UUUUR"                  UU	R                  U
5
      u  nn[3        U	R                  UUS9n[        UUUR4                  S9U4$ s  snf )Nr   r   r`   rn      ra   rb   )r   r   rd   r   rf   r!   r   r   r   r   r"   ndimr-   ranger	   redistributerg   r{   r   r   r   r   r5   r7   r   r
   rc   )r.   r/   r0   rT   rr   rs   ru   rv   rx   rh   rV   target_placementsall_replicate_placementsrt   isharded_placementsoutput_placementsri   r]   r   out_specs                        r   _nll_loss_forward_handlerr     s6   
 	Wd1gA!WF!WFS$q'"IT!W%Luuw!|!K77D(+FH " ;-@+ !*~		>fCFL!&DIIN
 AFdiinn@U
@U1XE!H9;6@U 	 
 **4996HIWW!!!$(=(=k(JJJJINN(((-4 :DDGT!W/tfM,	 & 2			FL 499&7EWXH 	 ..	

 	 =
s   $Iro   r   c                    UR                  5       S:  a  SOSnU[        R                  R                  :X  a  X-  n UR	                  U5      n[
        R                  " X%:g  US5      n[
        R                  " U5      n[        XxS9nUR                  U5      R                  5       nUR                  XU
5      nUR                  R                  c   eUR                  R                  R                  UR                  5      S-
  n[
        R                   " UR"                  S   UR$                  S9nUR                  5       S:X  a  XU'   OUR                  5       S:X  a  XUU4'   OeUR'                  US5      nUR"                  nUR)                  SUR"                  U   5      nUUUU4'   UR+                  U5      R'                  US5      nUR                  5       U R                  5       s=:  a  S:  a  O  OU R	                  U5      n Ub  [-        UR                  5       5       Vs/ s H  nSPM     nnUR"                  S   UU'   UR)                  U5      n[/        UR"                  5      nSUU'   UR1                  U5      n[
        R2                  " UX5      nU U-  n [
        R                  " X%:g  U S5      n U[
        R4                  " U5      -   U -  $ s  snf )Nr`   r   r   r   g      ?)devicer   )r   r   r   r   r   r(   r   
zeros_liker   r   flattenr   mask_bufferdatarF   r<   aranger{   r   	transposereshaper|   r   r   r   r   rQ   )ro   rT   rr   rs   ru   rv   r   rw   rx   r"   rV   r   
grad_inputr   masked_safe_targetgrad_update	arange_1dgrad_input_tintermidate_shapegrad_input_2d_r   r}   w_targets                           r   "_nll_loss_and_log_softmax_backwardr   X  s    uuw{!KINN(((!0k*F++f4fa@K!!!$J %+V%%k2::<K*;;KxX((--999#//4477
8H8HICOK  #,>,E,EI
 	uuw!|)4%&	
A4?9001!++K<(..$,,R1EF7Bi!334"''(9:DD[RTU
~~+//+/a/!++K8 %aeeg/1Q	/!'a	+	* M	!#	+MM)$<<;7!H,++f4k1EK 1%44# 0s   !Kc                    [        [        US   5      n[        [        US   5      nUS   nUS   n[        [        US   5      n[        [        US   5      n[        [        US   5      n	UR	                  5       S:  a  SOSn
UR
                  n[        UR                  U
5      n[        [        UR                  U
/5      U
5      n[        5       4UR                  R                  -  n[        X]UR                  5      nUb  [        XnUR                  5      n[        U5      nXVsUS'   US'   [        XUR                  5      US'   [        U [!        U5      U5      n[#        UR$                  UR$                  UR$                  Ub  UR$                  OS UUU	UR&                  U
UR                  U5      n[)        UR                  UR                  US9n[        UUUR*                  S	9$ )
Nr   r   r`   rn   r         ra   rb   )r   r   rd   r   r   rf   r!   r   r   r   r   r"   r   r-   r   r5   r7   r   rg   r{   r
   rc   )r.   r/   r0   ro   rT   rr   rs   ru   rv   r   rx   rh   rV   r   r   ri   r]   r   s                     r   _nll_loss_backward_handlerr     s   
 wQ(KWd1gA!WF!WFS$q'"IT!W%LQ(Luuw!|!K77D(+FH " ;-@+ !*~		>fCF!&DIIN :DDGT!W|tyyQDG/tfM/!!	 & 2			F 		&H ** r   c                  ^    [         R                  R                  R                  [        5        g N)r   r2   _custom_op_handlersupdatecustomized_loss_opsr   r   r   r   r     s    ..556IJr   c                  p    [          H,  n [        R                  R                  R	                  U 5        M.     g r   )r   r   r2   r   pop)	custom_ops    r   r   r     s&    (	2266yA )r   )A
contextlibtypingr   r   r(   torch._prims_common_prims_commonrB   )torch.distributed._functional_collectivesdistributed_functional_collectivesrJ   "torch.distributed.distributed_c10ddistributed_c10drL   r   torch.distributed.device_meshr   torch.distributed.tensorr   r   r	   &torch.distributed.tensor._dtensor_specr
   r   ,torch.distributed.tensor._ops._embedding_opsr   'torch.distributed.tensor._ops._math_opsr   r   r   (torch.distributed.tensor.placement_typesr   opsaten__all__contextmanagerr   r7   rd   r!   r-   _ops
OpOverloadobjectdictstrr5   r^   rl   rq   Sizer   r   r   r   default_log_softmax_backward_datanll_loss_forwardnll_loss2d_forwardnll_loss_backwardnll_loss2d_backwardr   r   r   r   r   r   <module>r      s    !  # : : 1 1  4 > > J E 
 ? yy~~ 
 - -d	%	3*? 	c 	c 	<in-<5?<< QZZ""Q

Q fQ 	Q&4ZZ""

 f 	>'ZZ""'

' f' 	'F F F  VF  6"	F 
 F  F  F  F  F  F  66>F RAZZ""A

A fA 	AVB5B5B5 B5 V	B5
 B5 B5 B5 B5 B5 B5 B5 B5J8ZZ""8

8 f8 	8x 	3##++-J!!#<##%>""$>$$&@ KBr   