
    sh                      S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKJrJr  S SKJr  S SKJrJrJrJrJrJr  S SKrS SKJr  S SKrS SKrS SKJs  Jr  S SKJ r   S SK!J"r"J#r#  S S	K$J%r%  S S
K&J'r'  S SK(J)r)J*r*J+r+  S SK,J-r-  SSK.J/r/J0r0J1r1J2r2  SSK3J4r4  SSK5J6r6J7r7J8r8  SSK9J:r:  SSK;J<r<J=r=J>r>  SSK?J@r@  SSKAJBrB  SSKCJDrD  SSKEJFrFJGrGJHrHJIrI  SSKJJKrKJLrL  SSKMJNrNJOrOJPrPJQrQ  SSKJRrRJSrSJTrTJUrUJVrVJWrWJXrXJYrYJZrZJ[r[J\r\J]r]J^r^J_r_  SSK`JarbJcrcJdrdJere  SSKfJgrg  SSKhJiri  SSKjJkrkJlrlJmrmJnrnJoroJprpJqrqJrrrJsrsJtrtJuruJvrvJwrwJxrxJyry  SS KzJ{r{J|r|J}r}J~r~JrJr  SS!KJrJrJrJrJr  SS"KJr  \(       a&  S S#KJr  S S$KJr  S S%KJr  SS&K7Jr  SS'KJr  \" S(5      r\GR$                  " \5      r\GR*                  GR-                  \S)5      r\GR*                  GR-                  \S*5      r\GR*                  GR-                  \S+5      r\:" 5       r9 " S, S-5      r\" S5      SWS. j5       r\" S5      SWS/ j5       r " S0 S15      r\GR<                   " S2 S35      5       r\GR<                   " S4 S55      5       r        SXS6 jr " S7 S8\t5      r\" 5       GRF                  rSYS9 jrSYS: jrSZS; jrSYS< jrS[S= jrS\S> jr " S? S@\o5      rS]SA jrS^S_SB jjr " SC SD\s5      r\GR_                  SE5         " SF SG\5      r " SH SI5      r\GR<                   " SJ SK5      5       r " SL SM5      r\GR<                   " SN SO5      5       r " SP SQ\n\\\\\\4   4   4   5      r " SR SS\\   5      r " ST SU\5      rS`SV jrg)a    )annotationsN)IterableSequence)	lru_cache)AnyCallablecastOptionalTYPE_CHECKINGUnion)
PRECEDENCE)get_interface_for_device)identitypreserve_rng_state)is_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)has_triton_package   )free_symbol_is_type
prefix_strsymbol_is_typeSymT)ValueRanges   )configirmetrics)AsyncCompile)	code_hashget_pathPyCodeCache)DefaultHandler)triton_heuristics)benchmarker)AutotuneHintDevicePropertiesTRITON_MAX_BLOCKTRITON_MAX_RSPLIT)get_max_y_gridnext_power_of_2)BaseSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)cache_on_selfDelayReplaceLineget_bounds_index_exprget_fused_kernel_nameget_kernel_metadatais_welford_reductionPlaceholderprefix_is_reduction	sympy_dotsympy_product
sympy_substriton_typetriton_version_uses_attrs_dictupcast_compute_type)_opsReductionType	StoreModeV)"get_kernel_category_by_source_code   )BlockPatternMatcher)ArgNameBackendFeatureConstexprArgCSECSEVariableDeferredLineIndentedBufferInplacedBufferOpOverridesPythonPrinter
RemovedArgSizeArg	TensorArgWorkspaceArgWorkspaceZeroMode)constant_reprIterationRangesIterationRangesEntryIterationRangesRoot
SIMDKernelSIMDScheduling)	config_ofequal_1_arg_indicesnon_constexpr_signatureshould_unwrap_unspec_argsignature_to_meta)SymbolicCallArg)
ModuleType)TypeVarDtypePropagationOpsHandler)IRNode)SIMDKernelFeatures_T
perf_hintsschedulefusionc                  J    \ rS rSr% Sr0 rS\S'   0 rS\S'   \S
S j5       r	Sr
g	)OpDtypeSupportv   z
Some Triton ops such as libdevice and tl.math only support float32 and float64.
This class records which dtypes are supported by specific IR ops.
z"dict[str, OrderedSet[torch.dtype]]supported_dtypeszdict[str, bool]convert_outputsc                    UR                   n[        [        R                  [        R                  /5      U R
                  U'   X R                  U'   g N)__name__r   torchfloat32float64ro   rp   )clsfuncconvert_outputop_names       r/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/_inductor/codegen/triton.pyregister_upcastOpDtypeSupport.register_upcast   s;    --(2EMM5==3Q(RW%'5G$     N)rx   zCallable[..., str]ry   boolreturnNone)rs   
__module____qualname____firstlineno____doc__ro   __annotations__rp   classmethodr|   __static_attributes__r   r~   r{   rm   rm   v   s1    
 <>8=')O_)6 6r~   rm   c                 x    [        5       (       d  gSSKn [        U R                  R                  S5      (       a  gg)zX
import AttrsDescriptor if the triton version is new enough to have this
class defined.
 r   NAttrsDescriptorz4from triton.compiler.compiler import AttrsDescriptor)r   triton.compiler.compilerhasattrcompiler)tritons    r{   gen_attr_descriptor_importr      s3     # v''):;;Er~   c                     [        5       n U R                  S5        [        5       =n(       a  U R                  U5        U R                  S5        U R	                  5       $ )NzD
        import triton
        import triton.language as tl
        a  
        from torch._inductor.runtime import triton_helpers, triton_heuristics
        from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
        from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
        )rM   splicer   	writelinegetvalue)imports	attr_descs     r{   gen_common_triton_importsr      s[    GNN	 /00y0)$NN	 r~   c                     \ rS rSrSr\" \R                  \R                  /5      r	\" \R                  \R                  \R                  /\	Q5      r\ V VVVs0 s H$  nU[        R                  " [         U    S3SSS9_M&     snnnn r\ V VVVs0 s H2  nU[        R                  " [         U   R%                  5        S3SSS9_M4     snnnn r\SS j5       r\SS	 j5       rS
rgs  snnnn f s  snnnn f )TritonSymbols   zM
Stores sympy.Symbol instances and constants associated with triton codegen.
offsetTintegernonnegativeBLOCKr   positivec                4    U R                   UR                     $ rr   )block_sizessymtrw   trees     r{   get_block_sizeTritonSymbols.get_block_size   s    tyy))r~   c                4    U R                   UR                     $ rr   )block_offsetsr   r   s     r{   get_block_offsetTritonSymbols.get_block_offset   s      ++r~   r   N)r   rW   r   zsympy.Symbol)rs   r   r   r   r   r   r   R0_INDEXR1_INDEXreduction_typesXBLOCKYBLOCKZBLOCKblock_typessympySymbolr   r   upperr   r   r   r   r   ).0r   r   r   s   0000r{   r   r      s    !$--!?@Odkk4;;VoVWK  D 	ellj./v6RVWWM  	  D 	ell$%%'(.t
 	
  	K * * , ,#
s   +C'
9C/
r   c                      \ rS rSr% S\S'   S\S'   S\S'   S\S	'   S
\S'   SS jrSS jrSS jrSS jrSS jr	\
SS j5       rSrg)IndexingOptions   str	index_strOrderedSet[str]	mask_varsOptional[str]
expand_strr   _has_rindex
sympy.Exprindexc                ,    [        U R                  5      $ rr   )r   r   selfs    r{   has_maskIndexingOptions.has_mask   s    DNN##r~   c                J    [        U R                  [        R                  5      $ rr   )r   r   r   TMPr   s    r{   has_indirectIndexingOptions.has_indirect   s    "4::txx88r~   c                    U R                   $ rr   )r   r   s    r{   
has_rindexIndexingOptions.has_rindex   s    r~   c                :    [        S U R                   5       5      $ )Nc              3  V   #    U  H  n[        U5      R                  S 5      v   M!     g7f)tmpNr   
startswithr   masks     r{   	<genexpr>.IndexingOptions.has_tmpmask.<locals>.<genexpr>   s"     J>43t9''..>   ')anyr   r   s    r{   has_tmpmaskIndexingOptions.has_tmpmask   s    J4>>JJJr~   c                :    [        S U R                   5       5      $ )Nc              3  V   #    U  H  n[        U5      R                  S 5      v   M!     g7f)rNr   r   s     r{   r   ,IndexingOptions.has_rmask.<locals>.<genexpr>   s"     H3t9'',,r   r   r   s    r{   	has_rmaskIndexingOptions.has_rmask   s    HHHHr~   c                z    U R                   (       a)  SR                  [        [        U R                   5      5      $ S$ )N & r   )r   joinmapr   r   s    r{   mask_strIndexingOptions.mask_str   s'    7;~~uzz#c4>>23Q6Qr~   r   Nr   r   r   r   )rs   r   r   r   r   r   r   r   r   r   propertyr   r   r   r~   r{   r   r      sP    N$9 KI R Rr~   r   c                  |   \ rS rSr% S\S'   S\S'   S\S'   S\S	'   S
\S'   S\S'   S
\S'   SrS\S'   \S#S j5       r\S#S j5       r\S#S j5       r	\S#S j5       r
          S$S jr\            S%S j5       r        S&S jrS'S(S jjrS)S jrS*S jrS+S jrS,S jrS,S jrS,S jrS,S  jrS,S! jrS"rg)-BlockPtrOptions   BlockParametersparamsr   constant_offset	list[int]orderr   r   Sequence[sympy.Expr]broadcast_shapez
list[bool]broadcasting_dimsfinal_shapeNzOptional[list[int]]_boundary_checkc                .    U R                   R                  $ rr   )r   shaper   s    r{   r   BlockPtrOptions.shape   s    {{   r~   c                .    U R                   R                  $ rr   )r   block_shaper   s    r{   r   BlockPtrOptions.block_shape   s    {{&&&r~   c                .    U R                   R                  $ rr   )r   stridesr   s    r{   r  BlockPtrOptions.strides       {{"""r~   c                .    U R                   R                  $ rr   )r   offsetsr   s    r{   r  BlockPtrOptions.offsets   r  r~   c                b  ^	 [        U R                  U R                  5       VVs/ s H(  u  pVU(       a  [        R                  R
                  OUPM*     nnn[        XU5      n[        R                  R                  m	[        U R                  5      =(       a<    [        U5      [        U5      :g  =(       d    [        U	4S j[        Xs5       5       5      nU(       a  U(       a0  SU S[        R                  R                  U R                  5       S3n[        XR                  U5      nU$ s  snnf )z
Generate a broadcast and a reshape for the block pointer.
This restores stride-0 dimensions which were removed from the block pointer.
c              3     >#    U  H6  u  pTR                  US 5      =(       d    TR                  X5      (       + v   M8     g7frE   N)statically_known_equals)r   pre_dimpost_dimsizevarss      r{   r   @BlockPtrOptions.codegen_broadcast_and_reshape.<locals>.<genexpr>  sL      
 *O%G 44Wa@ K77J  *Os   >Atl.broadcast_to(, ))zipr   r   r   SOnetriton_reshaperC   graphr  r   lenkernelindex_to_str)
r   valueinitial_shaper   allow_implicitdimis_broadcastingpre_broadcast_shaperequire_broadcastr  s
            @r{   codegen_broadcast_and_reshape-BlockPtrOptions.codegen_broadcast_and_reshape  s    ),$$d&<&<)
)$ +EGGKK3) 	 
 u5HI 77## 6 67 	
#$K(88  
 *--@)N  	 !2&ugR0E0EdFZFZ0[/\\]^E u&:&:KH=
s   /D+c                V  ^^ [         R                  R                  mS	U4S jjnU" U R                  5      U l        U" U R                  5      U l        U R                   Vs/ s H  nTR                  US5      PM     nnU R                   Vs/ s H  nTR                  US5      PM     n	n[        U	5      (       a  SU	S'   [        U R                  U	5       VV
s/ s H  u  pU
(       a  M  UPM     nnn
[        X5       Vs/ s H  n[        U5      PM     snmU4S jn[        S
0 [        R                  " U 5      R                  5        VVs0 s H  u  pX" U5      _M     snnD6n U Vs/ s H  n[        R                  U5      PM     nn[         R                   R"                  (       a&  US   R$                  S:X  d   eUR'                  S5        [         R                   R(                  n[         R                   R*                  (       d  [-        U R                  5      [-        [         R                   R.                  5      U-
  :X  aN  [         R                   R0                  R3                  5       (       a!  U[4        R6                  R8                  /U-  -  n[;        U [         R                  R                  R=                  U5      [?        [A        [C        [-        U R                  5      5      5      5      UUUUS9nURE                  U5        U$ s  snf s  snf s  sn
nf s  snf s  snnf s  snf )z,Helper to create a  BlockPtrOptions instancec                R   > U  Vs/ s H  nTR                  U5      PM     sn$ s  snf rr   )lookup_precomputed_size)exprsexprr  s     r{   lookup_size+BlockPtrOptions.create.<locals>.lookup_size=  s&    GLMutH44T:uMMMs   $r   rE   Fc                d   > [        U T5       VVs/ s H  u  pU(       a  M  UPM     snn$ s  snnf )z@Removes any broadcasting or singleton dims from a given sequence)r  )ititemis_removableremovable_dimss      r{   remove_dims+BlockPtrOptions.create.<locals>.remove_dims_  s7     +.b.*A*A&D# *A  s   ,,x)r   r   r   r   r   r   r   )r'  zIterable[sympy.Expr]r   list[sympy.Expr]r   )#rC   r  r  r   r  r  r   allr  r   r   dataclassesasdictitemsr   r   r  no_x_dimprefixpopnum_reduction_dimsinside_reductionr  numelsfeaturesis_reductionr   r  r  r   r&  listreversedrangecompute_boundary_check)r   r   range_treesr   get_max_blockr)  strider   r  singleton_dimsis_singletonr   dimsr1  keyvalr   r   reduction_ndimresultr0  r  s                       @@r{   createBlockPtrOptions.create0  s    77##	N #6<<0$V^^4
 GMnn
FTFH,,VQ7n 	 
 AG@R@R
@RH,,S!4@R 	 
 ~!&N2 &)););^%L
%L! %L 	 
 14N0VW0V#d)0VW	 ! 
5@5G5G5O5U5U5WX5WsK$$5WX

 GRRkd}33D9kR88q>((C///OOA44))FNN#s188??';n'LL!!..00 EGGKK=>99K GG,,DD_Uxc&,,&7 89:#+/
 	%%m4


 X Y Ss*   !LLL*L LL *L&c                B    [         R                  U   n[        XU05      $ )z>
Replaces instances of {symt}_offset with the new expression.
)r   r   r<   )r   r(  replacementr   roffsets        r{   replace_offsetBlockPtrOptions.replace_offset  s$      --d3$+ 677r~   c           	       ^  SU 4S jjn[         R                  R                  n/ T R                  QnU(       d  U Vs/ s H
  oc" U5      PM     nnT R                  S:w  a  U SU" T R                  5       S3OUSU" T R
                  5       3SU" T R                  5       3SU" T R                  5       3SU" T R                  5       3S	U" U5       3/nS
SR                  U5       S3$ s  snf )z
Codegen a call to tl.make_block_ptr()

Args:
    name: variable name for pointer
    roffset: should rn_offset be included in offsets=..., for use with tl.advance()

Returns:
    "tl.make_block_ptr(...)"
c                   > [         R                   H*  nTR                  U [        R                  " S5      U5      n M,     U $ Nr   )r   r   rT  r   Integer)r(  r   r   s     r{   remove_roffsets/BlockPtrOptions.format.<locals>.remove_roffsets  s5    %55**4q1A4H 6Kr~   r    + (r  zshape=zstrides=zblock_shape=zorder=zoffsets=ztl.make_block_ptr(r  )r(  r   r   r   )
rC   r  r  r  r   r   r  r   r   r   )r   namerS  rZ  fr  r   argss   `       r{   formatBlockPtrOptions.format  s    	
 HH!!!DLL/=DEW6v.WGE ''1, &Qt3345Q7Qtzz]O$q'(1T--./0Qtzz]O$qzl#
 $DIIdO#4A66 Fs   C*c           
     <   [         R                  R                  n[        R                  R                  5        VVs0 s H  u  p4XA" [        U   5      _M     nnn[        [        U R                  5      5       Vs/ s GH  nUR                  U R                  U   [        R                  R                  5      (       a  MB  UR                  U R                  U   U R                   U   5      (       a  Mu  UR                  U R                  U   [#        U R                   U   U5      5      (       a  M  [         R$                  R&                  (       a5  U R                   U   [        R                  [(        R*                     :X  a  GM  UPGM     snU l        gs  snnf s  snf )z6List of indices to pass to tl.load(boundary_check=...)N)rC   r  r  r   r   r8  r   rC  r  r   r  r  r   r  Zerostatically_known_multiple_ofr   r<   r  r9  r   r   r   )r   rF  r  r   
block_sizeblock_to_maxidxs          r{   rD  &BlockPtrOptions.compute_boundary_check  sG   77## %2$=$=$C$C$E/
$E  j&677$E 	 /
 S_- 
-44T\\#5FU  !==JJsOT%5%5c%:	  !==JJsOZ0@0@0E|%T  HH%%((-1J1J4;;1WW C- 
/

 
s%   F:AF?/F29F/AFFc                8    U R                   c   eU R                   $ rr   r   r   s    r{   boundary_checkBlockPtrOptions.boundary_check  s     ##///###r~   c           	         [         R                  U   nU R                   Vs/ s HA  nU R                  X2U5      U R                  U[        R
                  R                  U5      -
  PMC     nnU$ s  snf )aF  
Codegen string to pass to tl.advance(name, ...).

Advance is the difference between offsets in each loop iteration.
To compute it, we replace rN_offset with multiples of RN_BLOCK.
Since we expect rN_offset to vary in range(0, rN_numel, RN_BLOCK), the first
iteration has rN_offset=0, while the second has rN_offset=RN_BLOCK.
)r   r   r  rT  r   r  rc  )r   r   rblockr   advances        r{   advance_roffsetBlockPtrOptions.advance_roffset  sw     **40 ,,

 ' ##FD9%%feggllDAB ' 	 
 
s   AA.c                    gNFr   r   s    r{   r   BlockPtrOptions.has_indirect      r~   c                :    [        S U R                   5       5      $ )Nc              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7frr   )r   r   r   )r   r(  s     r{   r   -BlockPtrOptions.has_rindex.<locals>.<genexpr>  s'      
(  m&C&CDD(r   )r   r   r   s    r{   r   BlockPtrOptions.has_rindex  s"     
((
 
 	
r~   c                "    U R                  5       $ rr   )r   r   s    r{   r   BlockPtrOptions.has_rmask  s      r~   c                    grs  r   r   s    r{   r   BlockPtrOptions.has_tmpmask  ru  r~   c                4    [        U R                  5       5      $ rr   )r   rk  r   s    r{   r   BlockPtrOptions.has_mask  s    D'')**r~   rj  r   r4  )
r  r   r  r   r   r   r  r   r   r   )r   r   r   r   rE  zlist[IterationRangesRoot]r   r   rF  Callable[[str], int]r   r   )r(  r   rR  r   r   r   r   r   T)r]  r   r   r   )rF  r  r   r   )r   r   )r   r   r   r   r   )rs   r   r   r   r   r   r   r   r   r  r  r"  staticmethodrO  rT  r`  rD  rk  rp  r   r   r   r   r   r   r   r~   r{   r   r      se   ))!!%%+/O(/! ! ' ' # # # #++ ,+ *	+
 + 
+Z TT $T /	T
 #T ,T 
T Tl88-78?C8	8!7F
:$&
!+r~   r   c                   [        U[        5      (       a  [        U[        5      (       d   eU Vs/ s H"  n[        R                  R	                  U5      PM$     nnU Vs/ s H"  n[        R                  R	                  U5      PM$     nnXE:X  a  U $ U Vs/ s H  ofS:w  d  M
  UPM     snU:w  a  SU  SSR                  U5       S3$ Sn/ nU HK  n	U[        U5      :  a   XU   :X  a  UR                  S5        US-  nM2  U	S:X  d   eUR                  S	5        MM     U[        U5      :X  d   eU  S
SR                  U5       S3$ s  snf s  snf s  snf )z7Workaround https://github.com/openai/triton/issues/28361ztl.reshape(z, [r  z])r   :rE   r   [])
isinstancerA  rC   r  r  r   r  append)
r  	old_shape	new_shaper   old_shape_strnew_shape_strsrg  expandsizes
             r{   r  r    sF    i&&:i+F+FFF?HIyeQXX**51yMI?HIyeQXX**51yMI% -=aH=->UG3tyy'?&@CC
CF]##c0B(BMM#1HC3;;MM&!  #m$$$$WAdii'(**% JI .s   )E )E
	E#Ec                  0   \ rS rSrS S jrS S jrS S jrS S jrS S jrS S jr	S S jr
S S	 jrS S
 jrS S jrS S jrS S jr\rS S jrS!S jrS S jrS S jrS S jrS S jrS S jrS S jrS S jrS S jrS S jrS S jrS S jrS S jrS S jrS S jr Sr!g)"TritonPrinteri  c                    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S[        R                  R
                   S3$ )NrE   libdevice.trunc(r   ).to(r  r  r_  _printrC   r  index_dtyper   r(  s     r{   _print_TruncToIntTritonPrinter._print_TruncToInt  M    499~"""t{{499Q<89qxx?S?S>TTUV	
r~   c                    [         R                  " 5       (       a$  [        R                  R                  (       a  U nU$ SU S3nU$ )Nztl.full([], z, tl.float64))r   	is_fbcodert   versionhip)r   r(  rets      r{   _print_FloatTritonPrinter._print_Float   s@    %--"3"3FC 
 !m4C
r~   c                    [        UR                  5      S:X  d   eU R                  UR                  S   [        S   S-
  5      nU S3$ )NrE   r   Atom      ?z.to(tl.float64))r  r_  parenthesizer   )r   r(  r  s      r{   _print_ToFloatTritonPrinter._print_ToFloat'  sI    499~"""diilJv,>,DEO$$r~   c                   UR                   u  p#UR                  (       a8  UR                  (       a'  U R                  UR                   S[        S   S-
  5      $ U R	                  U5      nU R	                  U5      nSU SU S3$ )N % r  r  z!triton_helpers.remainder_integer(r  r  )r_  is_nonnegative	stringifyr   r  r   r(  quotdivquot_sdiv_ss         r{   _print_PythonModTritonPrinter._print_PythonMod,  sp    II	3#5#5>>$))UJv4F4LMMT"C 26("UG1EEr~   c                ,   UR                   (       d   eUR                  u  p#UR                  (       a8  UR                  (       a'  U R                  UR                  S[        S   S-
  5      $ U R                  U5      nU R                  U5      nSU SU S3$ )N // r  r  z!triton_helpers.div_floor_integer(z,  r  )
is_integerr_  r  r  r   r  r  s         r{   _print_FloorDivTritonPrinter._print_FloorDiv4  s|    II	3#5#5>>$))VZ5G#5MNNT"C 26(#eWAFFr~   c                P    U R                  UR                  S[        S   S-
  5      $ )N / r  r  )r  r_  r   r  s     r{   _print_IntTrueDivTritonPrinter._print_IntTrueDiv?  s#    ~~dii
60BS0HIIr~   c                    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S[        R                  R
                   S3$ NrE   libdevice.floor(r   r  r  r  r  s     r{   _print_floorTritonPrinter._print_floorD  r  r~   c                    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S[        R                  R
                   S3$ r  r  r  s     r{   _print_FloorToIntTritonPrinter._print_FloorToIntJ  r  r~   c                    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S[        R                  R
                   S3$ NrE   libdevice.ceil(r   r  r  r  r  s     r{   _print_ceilingTritonPrinter._print_ceilingP  K    499~""" TYYq\!: ;5AUAU@VVWXXr~   c                    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S[        R                  R
                   S3$ r  r  r  s     r{   _print_CeilToIntTritonPrinter._print_CeilToIntT  r  r~   c                ,    SU R                  U5       S3$ )Nzlibdevice.sqrt(().to(tl.float32)))r  r  s     r{   _helper_sqrtTritonPrinter._helper_sqrtX  s    !$++d"3!44EFFr~   c                    SU R                  UR                  S   5       SU R                  UR                  S   5       S3$ )Nlibdevice.pow(r   r  rE   r  )r  r_  r  s     r{   _print_FloatPowTritonPrinter._print_FloatPow[  s?    T[[167r$++diiPQl:S9TTUV	
r~   c                    U R                  UR                  S   5      nU R                  UR                  S   5      nU R                  UR                  S   5      nSU SU SU S3$ )Nr   rE   r   	tl.where(r  r  )doprintr_  )r   r(  cpqs        r{   _print_WhereTritonPrinter._print_Whereb  s_    LL1&LL1&LL1&1#Rs"QCq))r~   c                   [        UR                  5      S:X  a  U R                  UR                  S   5      $ [        UR                  5      S-  n[        U5      nU R                  U" UR                  SU 6 5      nU R                  U" UR                  US 6 5      n[	        S XV4 5       5      u  pVUS;   d   SU S35       eS	U S
U SU SU SU S
U SU SU S3$ )z3
Helper for max/min code genereration.
cmp: > or <
rE   r   r   Nc              3  .   #    U  H  nS U S3v   M     g7f)(r  Nr   r   r3  s     r{   r   6TritonPrinter._print_min_max_helper.<locals>.<genexpr>w  s     .v!q1Xvs   )><zUnexpected comparator: ''r  z * ( z= z) + )))r  r_  r  typetuple)r   r(  cmpmidrw   abs          r{   _print_min_max_helper#TritonPrinter._print_min_max_helperh  s    
 tyy>Q;;tyy|,,$))n!4jKKTYYt_-.KKTYYst_-. .v..j C$<SE"CC 1#T!AcU"QCtA3d1#Qse1QCrBBr~   c                &    U R                  US5      $ )Nr  r  r  s     r{   
_print_MinTritonPrinter._print_Min{      ))$44r~   c                &    U R                  US5      $ )Nr  r  r  s     r{   
_print_MaxTritonPrinter._print_Max~  r  r~   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrE   tl_math.abs(r   r  r  r_  r  r  s     r{   
_print_AbsTritonPrinter._print_Abs  s9    499~"""dkk$))A,78::r~   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrE   zlibdevice.cos((r   r  r  r  s     r{   _print_OpaqueUnaryFn_cos&TritonPrinter._print_OpaqueUnaryFn_cos  :    499~""" TYYq\!: ;;LMMr~   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrE   zlibdevice.cosh((r   r  r  r  s     r{   _print_OpaqueUnaryFn_cosh'TritonPrinter._print_OpaqueUnaryFn_cosh  :    499~"""!$++diil";!<<MNNr~   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrE   zlibdevice.acos((r   r  r  r  s     r{   _print_OpaqueUnaryFn_acos'TritonPrinter._print_OpaqueUnaryFn_acos  r  r~   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrE   zlibdevice.sin((r   r  r  r  s     r{   _print_OpaqueUnaryFn_sin&TritonPrinter._print_OpaqueUnaryFn_sin  r  r~   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrE   zlibdevice.sinh((r   r  r  r  s     r{   _print_OpaqueUnaryFn_sinh'TritonPrinter._print_OpaqueUnaryFn_sinh  r  r~   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrE   zlibdevice.asin((r   r  r  r  s     r{   _print_OpaqueUnaryFn_asin'TritonPrinter._print_OpaqueUnaryFn_asin  r  r~   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrE   zlibdevice.tan((r   r  r  r  s     r{   _print_OpaqueUnaryFn_tan&TritonPrinter._print_OpaqueUnaryFn_tan  r  r~   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrE   zlibdevice.tanh((r   r  r  r  s     r{   _print_OpaqueUnaryFn_tanh'TritonPrinter._print_OpaqueUnaryFn_tanh  r  r~   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrE   zlibdevice.atan((r   r  r  r  s     r{   _print_OpaqueUnaryFn_atan'TritonPrinter._print_OpaqueUnaryFn_atan  r  r~   c                    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S[        R                  R
                   S3$ )NrE   zlibdevice.llrint(r   r  r  r  r  s     r{   _print_RoundToIntTritonPrinter._print_RoundToInt  sM    499~"""DIIaL 9:%@T@T?UUVW	
r~   c                    [        UR                  5      S:X  d   eUR                  u  p#UR                  (       a  US:  d   e[        SU S35      eU R	                  U[
        S   5      nSU SU SU*  3$ )	Nr   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .Mulzlibdevice.nearbyint(1e * z) * 1e)r  r_  r  
ValueErrorr  r   )r   r(  numberndigits
number_strs        r{   _print_RoundDecimal!TritonPrinter._print_RoundDecimal  s    499~"""))Q;;abiajjkl  &&vz%/@A
'yJ<vwhZPPr~   r   N)r(  r   r   r   )r(  r   r  r   r   r   )"rs   r   r   r   r  r  r  r  r  r  r  r  r  r  r  r  _print_PowByNaturalr  r  r  r  r  r   r  r  r  r  r  r  r  r  r  r'  r   r   r~   r{   r  r    s    
%
FGJ


YYG

 **C&55;NOONOONOO
Qr~   r  c                *    [        [        U 5      5      $ )zCConvert torch.dtype to triton type and upcast [b]float16 to float32)r=   r?   dtypes    r{   triton_compute_typer-    s    *5122r~   c                `    U [         R                  :X  a  [         R                  n [        U 5      $ )z@Convert torch.dtype to triton type, with fix for storing tl.bool)rt   r   int8r=   r+  s    r{   triton_store_typer0    s"    



ur~   c                    [        U 5      (       a1  U R                  (       a   U R                  S::  a  [        R                  $ [        U 5      $ )z0Implicit upcasts used for Triton reduction types   )r   	is_signeditemsizert   int32r?   r+  s    r{   upcast_acc_dtyper6    s3    5??u~~7J{{u%%r~   c                *    [        [        U 5      5      $ )z:Convert torch.dtype to triton type, with reduction upcasts)r-  r6  r+  s    r{   triton_acc_typer8    s    /677r~   c                F    U R                   S:*  =(       a    U R                  $ )Nr   )r4  is_floating_pointr+  s    r{   low_precision_fpr;    s    >>Q:5#:#::r~   c                    [        U [        5      (       d  gU R                  n[        U[        R                  5      (       a  [	        U5      $ S$ rs  )r  rK   r,  rt   r;  )varr,  s     r{   low_precision_fp_varr>    s<    c;''IIE&0&D&DE"O%Or~   c                  2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )TritonCSEVariablei  c                h   > [         TU ]  XU5        [        [           " 5       U l        Uc   S5       eg )Nz!TritonCSEVariable must have dtype)super__init__r   r   r   )r   r]  boundsr,  	__class__s       r{   rC  TritonCSEVariable.__init__  s2    u-#C* E"EE r~   c                p   U H  n[        U[        5      (       a'  U R                  R                  UR                  5        M?  [        U[        R
                  5      (       d  M`  [        R                   H<  n[        XE5      (       d  M  U R                  R                  [        U    S3/5          M     M     g )Nr   )
r  r@  r   updater   r   r   r   r   r   )r   r]  r_  kwargsargr   s         r{   update_on_args TritonCSEVariable.update_on_args  s    C#011%%cmm4C.. *55D%c00--*T2B1C4/H.IJ 6 r~   )r   )rD  zValueRanges[Any]r,  torch.dtyper   r   )rs   r   r   r   rC  rK  r   __classcell__rE  s   @r{   r@  r@    s    F r~   r@  c                     SSK Jn   U " 5       $ )Nr   rd   )!torch._inductor.dtype_propagationre   rd   s    r{   get_dtype_handlerrR    s    L%''r~   c                :   ^ ^^ SS jmSU4S jjmSU UU4S jjnU$ )z
Codegen helper to upcast arguments to float32, depending on the config and dtype.
This decorates tl.math/libdevice codegen functions.
c                    [         R                  R                  (       + =(       aD    [        U [        5      =(       a-    U R
                  [        R                  [        R                  4;   $ rr   )	r   r   codegen_upcast_to_fp32r  rK   r,  rt   float16bfloat16)r=  s    r{   needs_upcast*maybe_upcast_float32.<locals>.needs_upcast  sD    444 =3,=		emmU^^<<	
r~   c                2   > T" U 5      (       a  SOSnU  U 3$ )N.to(tl.float32)r   r   )r=  upcast_stringrX  s     r{   maybe_upcast_arg.maybe_upcast_float32.<locals>.maybe_upcast_arg  s$    -9#->->)B}o&&r~   c                L   >^  [         R                  T T5        SUU UU4S jjnU$ )Nc                   > U  Vs/ s H  nT" U5      PM     nnUR                  5        VVs0 s H  u  pEUT" U5      _M     nnnT" U0 UD6nT=(       a7    [        U4S j[        R                  " XR	                  5       5       5       5      nU(       d  S O#[        [        5       TR                  5      " U 0 UD6n	U	[        R                  S 4;  n
U
(       a  U	b  S[        U	5       S3OSnU U 3$ s  snf s  snnf )Nc              3  4   >#    U  H  nT" U5      v   M     g 7frr   r   )r   r=  rX  s     r{   r   Kmaybe_upcast_float32.<locals>.decorator.<locals>.wrapped.<locals>.<genexpr>  s      6-ScS!!-Ss   .to(r  r   )r8  r   	itertoolschainvaluesgetattrrR  rs   rt   ru   r=   )r_  rI  rJ  upcast_argsrK  rL  upcast_kwargsrN  any_needs_upcastresult_dtypeneeds_downcastdowncast_stringry   rx   r]  rX  s               r{   wrapped8maybe_upcast_float32.<locals>.decorator.<locals>.wrapped  s   <@ADS+C0DKAHNWHCS"23"77MW ;8-8F-  # 6-6__T==?-S6 3
 ( .0$--@$Q&Q 
 *%--1FFN "l&> {<013 
 Xo.//' BWs
   C0C5r   )rm   r|   )rx   rn  ry   r]  rX  s   ` r{   	decorator'maybe_upcast_float32.<locals>.decorator  s$    &&t^<	0 	0. r~   r   r   )rx   Callable[..., Any]r   rr  r   )ry   rp  r]  rX  s   ` @@r{   maybe_upcast_float32rs    s    
' : r~   c                     \ rS rSrSr\R                  " \R                  5      r\	  SV   SWS jj5       r
\	SXS j5       r\	S 5       r\S 5       r\	\" 5       S	 5       5       r\	S
 5       r\	S 5       r\	\" 5       S 5       5       r\	\" 5       S 5       5       r\	\" 5       S 5       5       r\	\" 5       S 5       5       r\	\" 5       S 5       5       r\	\" 5       S 5       5       r\	\" 5       S 5       5       r\	S 5       r\	S 5       r\	S 5       r\	S 5       r\	S\R>                  SSS.S j5       r \	\" 5       S 5       5       r!\	\" 5       S 5       5       r"\	\" 5       S 5       5       r#\	\" 5       S 5       5       r$\S 5       r%\	S 5       r&\	\" 5       S  5       5       r'\	\" 5       S! 5       5       r(\	\" 5       S" 5       5       r)\	\" 5       S# 5       5       r*\	\" 5       S$ 5       5       r+\	\" 5       S% 5       5       r,\	\" 5       S& 5       5       r-\	\" 5       S' 5       5       r.\	\" 5       S( 5       5       r/\	\" 5       S) 5       5       r0\	\" 5       S* 5       5       r1\	\" 5       S+ 5       5       r2\	\" 5       S, 5       5       r3\	\" 5       S- 5       5       r4\	\" 5       S. 5       5       r5\	\" 5       S/ 5       5       r6\	\" 5       S0 5       5       r\	\" 5       S1 5       5       r7\	S2 5       r8\	S3 5       r9\	S4 5       r:\	S5 5       r;\	S6 5       r<\	S7 5       r=\	S8 5       r>\	S9 5       r?\	S: 5       r@\	S; 5       rA\	S< 5       rB\	S= 5       rC\	S> 5       rD\	S? 5       rE\	\" 5       S@ 5       5       rF\	\" 5       SA 5       5       rG\	\" 5       SB 5       5       rH\	\" 5       SC 5       5       rI\	\" 5       SD 5       5       rJ\	SE 5       rK\	\" 5       SF 5       5       rL\	\" 5       SG 5       5       rM\	\" 5       SH 5       5       rN\	\" 5       SI 5       5       rO\	\" SJSK9SL 5       5       rP\	\" SJSK9SM 5       5       rQ\	\" 5       SN 5       5       rR\	\" 5       SO 5       5       rS\	SP 5       rT\	SQ 5       rU\	\" 5       SR 5       5       rV\	SS 5       rW\	\" 5       ST 5       5       rXSUrYg)YTritonOverridesi0  zMap element-wise ops to TritonNTc                J         SS jnUb=  [        U" X!5      [        R                  R                  5      [        R                  l        U[        R
                  :X  a  SU  S3$ U[        R                  :X  a  U  S3$ U(       a  [        U5      nO[        U5      nU  SU S3$ )Nc                    X:X  a  g[         R                  [         R                  4nX;   a  X;   a  X:w  a   S5       eU [         R                  :X  d  U[         R                  :X  a  gU [         R                  :X  d  U[         R                  :X  a  gg)Nr   zCConversions between float8_e5m2 and float8_e4m3fn is not supported!r2  r   )rt   float8_e4m3fnfloat8_e5m2)	src_dtype	dst_dtype
fp8_dtypess      r{   _get_min_elements_per_thread>TritonOverrides.to_dtype.<locals>._get_min_elements_per_thread<  s     % ##!!J '+*U U	U 
 E---e>O>O1OE///9@S@S3Sr~   r  z != 0)z.to(tl.int8).to(tl.uint8)rc  r  )rz  rM  r{  rM  r   int)	maxrC   r  min_elem_per_threadrt   r   uint8r-  r0  )r3  r,  rz  use_compute_typesr}  	out_dtypes         r{   to_dtypeTritonOverrides.to_dtype5  s    	"	/:		6   ,/,Y>,,,AHH(
 EJJqc= ekk! S122+E2I)%0ID1%%r~   c                    UR                   UR                   :X  d   eU R                  U:w  a  U  S[        U5       S3n U  S[        U5       S3n[        U5      U:w  a  U S[        [        U5      5       S3nU$ )Nrc  r  z, bitcast=True))r4  r,  r=   r?   )r3  r,  rz  outs       r{   to_dtype_bitcast TritonOverrides.to_dtype_bitcastn  s    !!U^^333 77i#T+i013A4E*+?;u%.Ek*=e*DEFaHC
r~   c                    [         R                  R                  U5      n[        U" U 5      5      n[	        U5      nUS:X  a  U$ SU SU SU S3$ )Nz
tl.float32tl.full(r  r  )rt   _prims_commondtype_to_typerV   r-  )r  r,  r   type_
triton_valr=   s         r{   _shaped_constant TritonOverrides._shaped_constant  s[    ##11%8"5<0
)%0,& %:,bQ??r~   c                "    U R                  X/ S9$ )Nr   )r  )rw   r  r,  s      r{   constantTritonOverrides.constant  s    ##E#;;r~   c                    SU  S3$ )Nr  r  r   r3  s    r{   absTritonOverrides.abs       aS""r~   c                    SU  SU S3n[        U 5      (       d  [        U5      (       aN  [        5       R                  X5      nU[        R                  [        R
                  4;   a  U S[        U5       S3nU$ )Nr  r  r  rc  )r>  rR  truedivrt   rV  ru   r=   r3  yr  r  s       r{   r  TritonOverrides.truediv  sp    !Cs!n""&:1&=&=)+33A9IU]]EMM::T+i"8!9;
r~   c                    SU  SU S3n[        U 5      (       d  [        U5      (       aN  [        5       R                  X5      nU[        R                  [        R
                  4;   a  U S[        U5       S3nU$ )Nr  r  r  rc  )r>  rR  modrt   rV  ru   r=   r  s       r{   r  TritonOverrides.mod  sp    !Cs!n""&:1&=&=)+//5IU]]EMM::T+i"8!9;
r~   c                    SU  S3$ )Nzlibdevice.abs(r  r   r  s    r{   libdevice_absTritonOverrides.libdevice_abs        s!$$r~   c                f    [         R                  (       a  SU  S[        R                   S3$ SU  S3$ )z
When use_fast_math, use the ftz (flushing to zero) variant
of exponent computation.

Check https://github.com/triton-lang/triton/issues/5735 for
more details.
libdevice.exp2(r"  r  ztl_math.exp()r   use_fast_mathru  _LOG_2_Er  s    r{   expTritonOverrides.exp  s8     $QCs?+C+C*DAFF!!A&&r~   c                    SU  S3$ )Nzlibdevice.exp(r  r   r  s    r{   libdevice_expTritonOverrides.libdevice_exp  r  r~   c                    SU  S3$ )Nr  r  r   r  s    r{   exp2TritonOverrides.exp2       !1%%r~   c                    SU  S3$ )Nzlibdevice.expm1(r  r   r  s    r{   expm1TritonOverrides.expm1       "!A&&r~   c                    SU  S3$ Nzlibdevice.sqrt(r  r   r  s    r{   sqrtTritonOverrides.sqrt  r  r~   c                    SU  S3$ r  r   r  s    r{   libdevice_sqrtTritonOverrides.libdevice_sqrt  r  r~   c                   [         R                  R                  nUS:X  a  gUS:X  a	  SU  SU  S3$ US:X  a  U  S3$ Uc:  [        R                  " [        R
                  " S	[        R                  5      U 5      $ [        S
U< 35      e)Ncompile_errorzcompile error!runtime_errorz"triton_helpers.device_assert_then(z == 0, "injected assert fail", r  accuracyz + 1r   z:unrecognized config triton.inject_relu_bug_TESTING_ONLY = )	r   r   inject_relu_bug_TESTING_ONLYopsmaximumr  rt   r5  AssertionError)r3  bugs     r{   reluTritonOverrides.relu  s    mm88/!#O# 8s:YZ[Y\\]^^JS:[;;s||Au{{;Q?? LSGT r~   c                    SU  SU S3$ )Nztriton_helpers.minimum(r  r  r   r  r  s     r{   minimumTritonOverrides.minimum      (2aS22r~   c                    SU  SU S3$ )Nztriton_helpers.maximum(r  r  r   r  s     r{   r  TritonOverrides.maximum  r  r~   c                    SU  SU SU S3$ )Nr  r  r  r   )r  r  r  s      r{   whereTritonOverrides.where  s    1#Rs"QCq))r~   rE   )constraintsr,  is_purepackc                    [        U5      nSR                  U Vs/ s H  n[        U5      PM     sn5      nUc&  SR                  S/U V	s/ s H  n	SPM     sn	-   5      nSU  SU SU SU SU S	U S
3$ s  snf s  sn	f )Nr  z=rr   ztl.inline_asm_elementwise('z', 'z', [z	], dtype=z
, is_pure=z, pack=r  )r-  r   r   )
asmr  r,  r  r  inputsr=   i
input_refs_s
             r{   inline_asm_elementwise&TritonOverrides.inline_asm_elementwise  s     *%0YY71A78
))TF6-B6ac6-B$BCK,SEk]$zlR[\g[hhrszr{  |C  DH  CI  IJ  K  	K  8-Bs   A5A:
c                    SU  S3$ )Nztl_math.cos(r  r   r  s    r{   cosTritonOverrides.cos  r  r~   c                    SU  S3$ )Nzlibdevice.cos(r  r   r  s    r{   libdevice_cosTritonOverrides.libdevice_cos  r  r~   c                    SU  S3$ )Nztl_math.sin(r  r   r  s    r{   sinTritonOverrides.sin  r  r~   c                    SU  S3$ )Nzlibdevice.sin(r  r   r  s    r{   libdevice_sinTritonOverrides.libdevice_sin  r  r~   c                    [        S5      e)Nz/ops.index_expr not implemented outside a kernelNotImplementedError)rw   r(  r,  s      r{   
index_exprTritonOverrides.index_expr  s    !"STTr~   c                    [        S5      e)Nz+ops.masked not implemented outside a kernelr  )r   bodyothers      r{   maskedTritonOverrides.masked  s    !"OPPr~   c                    SU  S3$ )Nzlibdevice.lgamma(r  r   r  s    r{   lgammaTritonOverrides.lgamma       #1#Q''r~   c                    SU  S3$ )Nzlibdevice.erf(r  r   r  s    r{   erfTritonOverrides.erf"  r  r~   c                    SU  S3$ )Nzlibdevice.cosh(r  r   r  s    r{   coshTritonOverrides.cosh'  r  r~   c                    SU  S3$ )Nzlibdevice.sinh(r  r   r  s    r{   sinhTritonOverrides.sinh,  r  r~   c                    SU  S3$ )Nzlibdevice.acos(r  r   r  s    r{   acosTritonOverrides.acos1  r  r~   c                    SU  S3$ )Nzlibdevice.acosh(r  r   r  s    r{   acoshTritonOverrides.acosh6  r  r~   c                    SU  S3$ )Nzlibdevice.asin(r  r   r  s    r{   asinTritonOverrides.asin;  r  r~   c                    SU  S3$ )Nzlibdevice.asinh(r  r   r  s    r{   asinhTritonOverrides.asinh@  r  r~   c                    SU  SU S3$ )Nzlibdevice.atan2(r  r  r   r3  r  s     r{   atan2TritonOverrides.atan2E       "!Bqc++r~   c                    SU  S3$ )Nzlibdevice.atan(r  r   r  s    r{   atanTritonOverrides.atanJ  r  r~   c                    SU  S3$ )Nzlibdevice.atanh(r  r   r  s    r{   atanhTritonOverrides.atanhO  r  r~   c                    SU  SU S3$ )Nzlibdevice.copysign(r  r  r   r  s     r{   copysignTritonOverrides.copysignT  s     %QCr!A..r~   c                    SU  S3$ )Nzlibdevice.erfc(r  r   r  s    r{   erfcTritonOverrides.erfcY  r  r~   c                    SU  S3$ )Nzlibdevice.erfinv(r  r   r  s    r{   erfinvTritonOverrides.erfinv^  r  r~   c                    SU  SU S3$ )Nzlibdevice.hypot(r  r  r   r  s     r{   hypotTritonOverrides.hypotc  r  r~   c                    SU  S3$ )Nzlibdevice.log10(r  r   r  s    r{   log10TritonOverrides.log10h  r  r~   c                    SU  S3$ )Nzlibdevice.log2(r  r   r  s    r{   log2TritonOverrides.log2m  r  r~   c                    SU  SU S3$ )Nzlibdevice.nextafter(r  r  r   r  s     r{   	nextafterTritonOverrides.nextafterr  s     &aS1#Q//r~   c                    U  SU 3$ Nr   r   r  s     r{   logical_andTritonOverrides.logical_andw      Cs|r~   c                    U  S3$ )Nz == 0r   r  s    r{   logical_notTritonOverrides.logical_not{  s    E{r~   c                    U  SU 3$ Nz | r   r  s     r{   
logical_orTritonOverrides.logical_or  r3  r~   c                    SU  SU S3$ )Nr   ^ r  r   r  s     r{   logical_xorTritonOverrides.logical_xor  s    1#S1~r~   c                    U  SU 3$ r0  r   r  s     r{   bitwise_andTritonOverrides.bitwise_and  r3  r~   c                    SU  3$ )N~r   r5  s    r{   bitwise_notTritonOverrides.bitwise_not  s    1#wr~   c                    U  SU 3$ r9  r   r  s     r{   
bitwise_orTritonOverrides.bitwise_or  r3  r~   c                    U  SU 3$ )Nr=  r   r  s     r{   bitwise_xorTritonOverrides.bitwise_xor  r3  r~   c                    U  SU 3$ )Nz << r   r  s     r{   bitwise_left_shift"TritonOverrides.bitwise_left_shift      D}r~   c                    U  SU 3$ )Nz >> r   r  s     r{   bitwise_right_shift#TritonOverrides.bitwise_right_shift  rP  r~   c                     SU S3nSU  SU S3$ )Nr  ).to(tl.uint32)ztl.rand(r  r  r   seedr   s     r{   randTritonOverrides.rand  s%    VHO,$r&++r~   c                     SU S3nSU  SU S3$ )Nr  rU  z	tl.randn(r  r  r   rV  s     r{   randnTritonOverrides.randn  s%    VHO,4&6(!,,r~   c           	     ,    SU S3nSU  SU SU SU S3	$ )Nr  rU  ztriton_helpers.randint64(r  r  r   )rW  r   lowhighs       r{   	randint64TritonOverrides.randint64  s1    VHO,*4&6("SED6KKr~   c                    [        S5      e)Nz.ops.load_seed not implemented outside a kernelr  )r]  r   s     r{   	load_seedTritonOverrides.load_seed  s    !"RSSr~   c                    SU  S3$ )Nzlibdevice.rsqrt(r  r   r  s    r{   rsqrtTritonOverrides.rsqrt  r  r~   c                    SU  S3$ )Nzlibdevice.log1p(r  r   r  s    r{   log1pTritonOverrides.log1p  r  r~   c                    SU  S3$ )Nzlibdevice.tan(r  r   r  s    r{   tanTritonOverrides.tan  r  r~   c                    SU  S3$ )Nzlibdevice.tanh(r  r   r  s    r{   tanhTritonOverrides.tanh  r  r~   c                    SU  S3$ )Nztl.sigmoid(r  r   r  s    r{   sigmoidTritonOverrides.sigmoid  s     QCq!!r~   c                    SU  SU  SU  S3$ )Nz(libdevice.signbit(z) != 0) if (z).dtype is tl.float32 else z < 0r   r  s    r{   signbitTritonOverrides.signbit  s#     "!L3NqcQUV	
r~   c                    SU  SU S3$ )Nzlibdevice.fmod(r  r  r   r  s     r{   fmodTritonOverrides.fmod  s     !2aS**r~   c                    SU  SU S3$ )Nr  r  r  r   r  s     r{   powTritonOverrides.pow  s      s"QCq))r~   c                    SU  S3$ )Nztl_math.log(r  r   r  s    r{   logTritonOverrides.log  r  r~   c                    SU  S3$ )Nzlibdevice.log(r  r   r  s    r{   libdevice_logTritonOverrides.libdevice_log  r  r~   F)ry   c                    SU  S3$ )Nzlibdevice.isinf().to(tl.int1)r   r  s    r{   isinfTritonOverrides.isinf       "!M22r~   c                    SU  S3$ )Nzlibdevice.isnan(r  r   r  s    r{   isnanTritonOverrides.isnan  r  r~   c                    SU  S3$ )Nzlibdevice.nearbyint(r  r   r  s    r{   roundTritonOverrides.round  s     &aS**r~   c                    SU  S3$ )Nr  r  r   r  s    r{   floorTritonOverrides.floor  r  r~   c                H    U  SU 3nU  SU 3nSU  SU SU SU SU SU S	3$ )
Nr  r  z
tl.where((z
 < 0) != (z < 0), tl.where(z != 0, z - 1, ), r  r   )r  r  r  rems       r{   floordivTritonOverrides.floordiv  sV    
 D}3qclA3j+;C5vVTXSYY\]a\bbcddr~   c                l   [         R                  " S[        R                  5      n[         R                  " [         R
                  " X5      [        R                  5      n[         R                  " [         R
                  " X5      [        R                  5      n[         R                  " X#5      nU SU  S3$ )Nr   rc  .dtype))r  r  rt   r5  r  ltr/  sub)r3  zleftrightr  s        r{   signTritonOverrides.sign  so    LLEKK(||SVVA\EJJ7cffQlUZZ8ggd"d1#W%%r~   c                    SU  S3$ )Nr  r  r   r  s    r{   truncTritonOverrides.trunc  r  r~   c                    U  SU 3$ )Nr  r   r  s     r{   truncdivTritonOverrides.truncdiv  s     D}r~   c                    SU  S3$ )Nr  r  r   r  s    r{   ceilTritonOverrides.ceil  r  r~   r   )NT)r,  rM  rz  zOptional[torch.dtype])r,  rM  rz  rM  )Zrs   r   r   r   r   mathr*  er  r  r  r  r  r   r  rs  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rt   ru   r  r  r  r  r  r  r  r  r  r  r  r  r  r
  r  r  r  r  r  r  r!  r$  r'  r-  r1  r6  r:  r>  rA  rE  rH  rK  rN  rR  rX  r[  r`  rc  rf  ri  rl  ro  rr  ru  rx  r{  r~  r  r  r  r  r  r  r  r  r  r  r   r   r~   r{   ru  ru  0  s/   (yy H ,0	6&6& )6& 6&p    @ @ < < #  #     %  % '  ' %  % &  & '  ' &  & &  &  " 3 3 3 3 * * "&emmTPQK K #  # %  % #  # %  % U U Q Q (  ( %  % &  & &  & &  & '  ' &  & '  ' ,  , &  & '  ' /  / &  & (  ( ,  , '  ' &  & 0  0                     , , - - L L T T '  ' '  ' %  % &  & "  " 
 
 +  + *  * #  # %  % /3 0 3 /3 0 3 +  + '  ' e e & & '  '  
 &  &r~   ru  r   c                  h    \ rS rSrSr\S 5       r\S 5       r\S 5       r	\S 5       r
\S 5       rSrg	)
TritonKernelOverridesi  a  Map element-wise ops to Triton within a TritonKernel

Unlike TritonOverrides, these assume the code is going to be inserted into
the body of the main triton kernel and so it may use indexing and mask
variables which are assumed to already be defined in the current scope.
c                j    [         R                  R                  5       nS/U-  nU R                  XUS9$ )NrE   r  )rC   r  triton_tensor_ndimr  )rw   r  r,  ndimr   s        r{   r  TritonKernelOverrides.constant'  s7    
 xx**,d
##E#>>r~   c                x   [         R                  R                  USS9n[        U[        5      (       d   e[         R                  R
                  S:X  a  [        R                  O[        R                  nU[        R                  [        R                  4;  a  UOUn[        R                  R                  n S[        R                  l        [         R                  R                  R                  [         R                  R                  UR                  [!        U5      US9nU[        R                  l        U[        R                  [        R                  4;  aZ  [         R                  R                  R                  [         R                  R                  U R#                  Xb5      [%        U5      S9nOUnUR&                   Ht  n[)        U[*        R,                  5      (       d  M$  [        R.                  " U[         R                  R                  R0                  UR2                     R4                  5      nMv     X$:w  aP  [         R                  R                  R                  [         R                  R                  U R#                  Xd5      US9nUR6                  Ul        U$ ! U[        R                  l        f = f)NF	block_ptrtl.int32rD  r,  r+  )rC   r  indexingr  r   r  rt   r5  int64r   test_configsruntime_triton_dtype_assertcsegeneratecomputer   r4   r  r?   free_symbolsr   r   r   promote_typesvarname_mapr]  r,  r   )rw   r(  r,  r  r  origr=  	index_vars           r{   r   TritonKernelOverrides.index_expr0  s   88$$TU$;(O4444 &'XX%9%9Z%GekkU[[u{{EKK&@@k "">>		C>CF;((,,''  "",T2	 ( C ?CF;ekk22((,,''  S()%0 ( C  E!..	!)TXX66!//qxx||77	GMME / #hhll++HH$$LL2% ,  !**
9 ?CF;s   7A*J" "J9c           
        U bm  [         R                  R                  bR  [        R                  R
                  R                  [        R                  R                  U  S3[         R                  S9n UR                  R                  SS9nU(       d   S5       eSnU HH  nUR                   H5  nUR                  S:w  d  [        UR                  S   5      (       d  M2  S	n  MF     MJ     U(       a  S OUn[        R                  R                  XS
9 nU" 5       n	S S S 5        U(       a  W	R                  R                   (       a  [        U5      n[        R                  R
                  R                  [        R                  R                  SU	 S[#        U5       SU	 S3[$        R&                  " U5      U	R(                  S9n[*        R,                  " WX5      n
OW	n
U
R.                  R1                  W5        U
$ ! , (       d  f       N= f)N.to(tl.int1)r+  output)opz)graph for body does not contain an outputFloadrE   Tr  r  z.shape, r  r  r  )rt   r  r  rC   r  r  r  r  r   r  
find_nodesr_  targetr_   
mask_loadsrD  is_boolrV   r   wrapr,  r  r  r   discard)r   r  r  nodes
need_wherenoderJ  r  new_maskrN  r  s              r{   r  TritonKernelOverrides.maskedc  s    1 1 =88<<((  &%jj ) D 

%%%2AAAu
 Dyy::'+CCHHQK+P+P!%J !  #XX   3xVF 4 }}$$UHHLL))  6((=+?*@6('R"''.ll	 * E ))Hf4CCh'
' 43s   G77
Hc                    [         R                  R                  R                  U 5      nSU S[         R                  R                  R	                  SU5       S3$ )Ntl.load( + load_seed_offsetr  )rC   r  r_  inputseed_offset)r]  r   r=  s      r{   rc  TritonKernelOverrides.load_seed  sI    hhmm!!$'se3qxx}}889KVTUUVW	
r~   c                   SU  S3n[         R                  R                  R                  U5      =n(       a  U$ [         R                  R                  R	                  U R
                  S9n[         R                  R                  R	                  [        R                  S9n[         R                  R                  R                  U SU SU  S35        [         R                  R                  R                  XU45        X44$ )Nzfrexp(r  r+  r  z = triton_helpers.frexp()rC   r  r  try_getnewvarr,  rt   r5  r  r   put)r3  	cache_keycse_valmantissaexponents        r{   frexpTritonKernelOverrides.frexp  s    QCqM	hhll**95575N88<<&&QWW&588<<&&U[[&9	""j8*$<QCqA	
 	
x$89##r~   r   N)rs   r   r   r   r   r   r  r  r  r  rc  r  r   r   r~   r{   r  r    sm     ? ? 0 0d * *X 
 
 $ $r~   r  c                  V    \ rS rSr% SrS\S'   S\S'   SS jrSS	.SS
 jjrS rS r	Sr
g)HelperFunctionsi  z#An ordered set of helper functions.zdict[str, str]_templates_seen	list[str]finalized_helpersc                     0 U l         / U l        g rr   r  r  r   s    r{   rC  HelperFunctions.__init__  s    !!#r~   _triton_helper_fn	base_namec                   U R                   R                  U5      nUb  U$ U [        U R                  5       3nX@R                   U'   U R                  R	                  UR                  US95        U$ )a  This accepts a function definition with the function name
left as a format specifier e.g.

    @triton.jit
    def {name}(arg0, arg1):
        return arg0 + arg1

We add the templated code to the function set and return the name
assigned to that function.

)r]  )r  getr  r  r  r`  )r   template_coder  existing_namer]  s        r{   addHelperFunctions.add  su     ,,00?$  S!7!789:.2]+%%m&:&:&:&EFr~   c                ,    [        U R                  5      $ rr   )iterr  r   s    r{   __iter__HelperFunctions.__iter__  s    D**++r~   c                     U R                   U   $ rr   )r  )r   rg  s     r{   __getitem__HelperFunctions.__getitem__  s    %%c**r~   r  Nr   r   )r  r   r   r   )rs   r   r   r   r   r   rC  r  r  r  r   r   r~   r{   r  r    s+    -##  $ 4G ,,+r~   r  c                      \ rS rSr% Sr\R                  " \S9rS\	S'   \R                  " \S9r
S\	S'   \R                  " \S9rS\	S'   \R                  " \S9rS\	S'   SS	 jrS
rg)r   i  zE
Class representing ND block dimensions, for block pointer analysis.
)default_factoryr4  r   r   r  r  c                    [        U 5      n[        S X4 5       5      u  p4U" S0 U Vs0 s H  oUX5   XE   -   _M     snD6$ s  snf )z 
Concatenates block parameters.
c              3  N   #    U  H  n[         R                  " U5      v   M     g 7frr   )r6  r7  r  s     r{   r   *BlockParameters.__add__.<locals>.<genexpr>  s     BMq[''**Ms   #%r   )r  r  )r   r  rw   r  r  rK  s         r{   __add__BlockParameters.__add__  sL     4jBTMBB9a8as16AF?*a8998s   Ar   N)r  r   r   r   )rs   r   r   r   r   r6  fieldrA  r   r   r   r  r  r  r   r   r~   r{   r   r     sf     *//EEE$/$5$5d$KK!K + 1 1$ GGG + 1 1$ GGG:r~   r   c                  4    \ rS rSrSrS rS	S jrS rS rSr	g)
"CooperativeReductionWorkspaceCachei  z
The scratch space used for cooperative reductions can be reused
after two reduction loops.  This keeps track of what can be reused.
c                    Xl         / U l        / U l        [        R                  " [        R
                  5      U l        SU l        SU l        g rX  )	r_  current_loop
prior_loopcollectionsdefaultdictdequeready_for_reuse
loop_countstore_count)r   r_  s     r{   rC  +CooperativeReductionWorkspaceCache.__init__  s>    	*66{7H7HIr~   c                    U R                   R                  U5      nU(       a  UR                  5       $ U R                  R	                  US5      u  p4U R
                  R                  XU45        X44$ rs  )r  r  popleftr_  	workspacer  r  )r   nbytescachedws_name	ws_offsets        r{   allocate+CooperativeReductionWorkspaceCache.allocate  sc    %%))&1>>##!YY00?  &9!=>##r~   c                    U R                    H%  u  pnU R                  U   R                  X#45        M'     U R                  U l         / U l        U =R                  S-  sl        g NrE   )r	  r  r  r  r  )r   r  r  r  s       r{   on_loop_end.CooperativeReductionWorkspaceCache.on_loop_end  sT    *.//&FY  (//0DE +:++1r~   c                H    U R                   nU =R                   S-  sl         U$ r  )r  )r   priors     r{   increment_store_count8CooperativeReductionWorkspaceCache.increment_store_count  s#      Ar~   )r_  r  r  r	  r  r  N)r  r   )
rs   r   r   r   r   rC  r  r  r   r   r   r~   r{   r  r    s    
$r~   r  c                  ,    \ rS rSr% S\S'   S rS rSrg)FixedTritonConfigi  zdict[str, int]r   c                     U R                   U   $ rr   r   r   r.  s     r{   r  FixedTritonConfig.__getitem__	  s    {{4  r~   c                    XR                   ;   $ rr   r%  r&  s     r{   __contains__FixedTritonConfig.__contains__  s    {{""r~   r   N)rs   r   r   r   r   r  r)  r   r   r~   r{   r#  r#    s    !#r~   r#  c                  "    \ rS rSrSrSS jrSrg)	TritonCSEi  zy
Subclasses CSE to apply the current load mask to the cache key to avoid CSEing
variables across separate masked blocks.
c                b    [         R                  R                  =n(       a  XR                  4$ U$ rr   )rC   r  
_load_maskr]  )r   r  r   s      r{   augment_keyTritonCSE.augment_key  s*    88&&&4&yy))r~   r   N)r  r   r   zUnion[str, tuple[str, str]])rs   r   r   r   r   r/  r   r   r~   r{   r,  r,    s    
r~   r,  c                    ^  \ rS rSr% \rS\S'   \rS\S'   Sr	   SK     SLU 4S jjjr
SMS	 jrSNS
 jrS rS rS rS rSNS jrS r\SOS j5       rSSSSS. SPS jjr SQ       SRS jjrSQS jr        SSS jrS rSTS jr SU         SVS jjrS r  SW               SXS jjrSOS jrSYS jr          SZS jr   S[S  jr!S[S! jr"S" r#S# r$S$ r%S% r&S& r'      S\S' jr(S]S( jr)        S^S) jr*          S_S* jr+S+ r,S`S, jr-S- r.S. r/S/ r0\1S0 5       r2SUS1 jr3\1S2 5       r4\1S3 5       r5S4 r6SaS5 jr7S6 r8SUSbS7 jjr9ScS8 jr:SdS9 jr;SeS: jr<SfS; jr=      SgS< jr>SfS= jr?ShS> jr@SiS? jrASjS@ jrBSNSA jrCSkSB jrD\ES`SC j5       rFSlSD jrGSmSE jrH\ESnSF j5       rISoSG jrJSlSH jrK      SpSI jrLSJrMU =rN$ )qTritonKerneli  r  helper_functionszCallable[[sympy.Expr], str]kexprTNc                8  > X0l         X@l        [        TU ]  " U40 UD6  [	        U R
                  U R                  5      U l        [        5       U l	        [        5       U l
        [        [           " 5       U l        X l        [        R                   " 5       U l        [$        [&        [&        4   " 5       U l        [+        5       U l        [.        R0                  " [$        5      U l        [.        R4                  " 5       U l        [        [8           " 5       U l        S U l        U R>                  (       a  U RA                  U RB                  5        U RD                  (       a  U RG                  5         U RI                  5         U RD                  (       a  U RK                  5         g g rr   )&optimize_maskfixed_configrB  rC  r,  newvar_prefixsuffixr  rM   post_loop_combinepost_loop_storer   r   outside_loop_varsr  rd  countblock_ptr_iddictr   block_ptr_to_bufferr  r3  r
  r  pointer_advancementsCounter_load_countsr(   autotune_hintstriton_metar=  codegen_reduction_numelsr  cooperative_reductioninit_cooperative_reductioncodegen_range_treeinit_cooperative_reduction_mask)r   tilingr  r6  r7  rI  rE  s         r{   rC  TritonKernel.__init__#  s-    $1(*6*T//=1?1A/=/?!+C!2#6 %OO-#'S>#3  / 1##D) 	! 7B6I6I6K )6859  ))$))4%%++-!%%002 &r~   c                    [        U5      $ rr   )r=   )r   r,  s     r{   dtype_to_strTritonKernel.dtype_to_strJ  s    5!!r~   c                z    U R                   =(       a)    [        R                  R                  U R                  5      $ rr   )r=  rC   choices should_use_cooperative_reductionr?  r   s    r{   rR  -TritonKernel.should_use_cooperative_reductionM  s-    $$ 
)S)SMM*
 	
r~   c                6  ^  T R                   (       d   eT R                   H'  nUR                  c  M  U=R                  S-  sl        M)     T R                  S   nT R                  (       a  [        UT R                  S   5      nT R                  R                  U5      T l        [        T R                  5      T l
        T R                  R                  S5        [        U 4S jT R                   5       5      (       a  T R                  R                  S5        gg)z/One time setup code for cooperative reductions.NrE   r3  r   a              RSPLIT_NEXT_POWER_OF_2: tl.constexpr = triton_helpers.constexpr_next_power_of_2(RSPLIT)
            RSPLIT_IS_POWER_OF_2: tl.constexpr = RSPLIT == RSPLIT_NEXT_POWER_OF_2
            HAS_RSPLIT: tl.constexpr = RSPLIT > 1
            rsplit_id = tl.program_id(0)
            num_rblocks = (rnumel + RBLOCK - 1) // RBLOCK
            rsplit_chunk = (num_rblocks + RSPLIT - 1) // RSPLIT * RBLOCK
            rsplit_start = rsplit_chunk * rsplit_id
            rsplit_end = rsplit_chunk * (rsplit_id + 1)
            c              3  v   >#    U  H.  nUR                   (       d  M  TR                  U5      (       + v   M0     g 7frr   )r@  _has_constant_mask)r   r   r   s     r{   r   :TritonKernel.init_cooperative_reduction.<locals>.<genexpr>n  s4      
(   .''---(s   99z>rsplit_end = tl.where(rsplit_end < rnumel, rsplit_end, rnumel))rG  rE  grid_dimr>  r7  r   r_  
semaphoressemaphores_namer  %cooperative_reduction_workspace_cacher  r   r   r   )r   r   	sem_counts   `  r{   rH  'TritonKernel.init_cooperative_reductionR  s    )))) $$D}}(" % KK$		4+<+<X+FGI#yy33I>5WII6
2 					
  
((
 
 

 IIP
r~   c                .   SnU R                   (       d  U S3nU R                  R                  SU 35        U R                  5       (       a  U R                  R	                  S5        g U R                   (       a   eU R                  R                  S5        g )Nz$tl.arange(0, RSPLIT_NEXT_POWER_OF_2)z	[None, :]zrsplit_arange = z                if RSPLIT_IS_POWER_OF_2:
                    rsplit_mask: tl.constexpr = None
                else:
                    rsplit_mask = rsplit_arange < RSPLIT
                zSrsplit_mask = xmask if RSPLIT_IS_POWER_OF_2 else ((rsplit_arange < RSPLIT) & xmask))r9  r  r   _has_constant_xmaskr   )r   rsplit_aranges     r{   rJ  ,TritonKernel.init_cooperative_reduction_maskw  s~    >}},oY7M		.}o>?##%%II }}$$IIer~   c                Z   U R                    H}  nUR                  (       d  U R                  XR                  5        M1  U R                  (       d  MD  U R                  R                  UR                   SU R                  U5       35        M     U R                  (       a  [        S U R                    5       5      (       aP  U R                  SSSS9nU R                  U5      nU R                  R                  SU R                  U5       35        g U R                  U R                  5        g g )Nzbase = c              3  8   #    U  H  oR                   v   M     g 7frr   )is_loopr   r   s     r{   r   2TritonKernel.codegen_range_tree.<locals>.<genexpr>  s     =,<D<<,<   baseTr   zrbase = )rE  rd  iteration_ranges_codegen_headerr  r=  r   r:  iteration_ranges_ranges_coder   _get_reduction_symbols_flatten_reduction_indicesr   r  codegen_reduction_indices)r   r   rn_basesrbases       r{   rI  TritonKernel.codegen_range_tree  s    $$D<<44T99E&&& 		##{{m74+L+LT+R*ST %   =D,<,<===66Dd 7  77A		  8D,=,=e,D+E!FG ..tyy9 !r~   c                    g)z
Indicate whether we need provide numel as arguments for the generated
kernel calls in the benchmark.

Should be true for pointwise/reduction kernels but false for triton
matmul kernels.
Tr   r   s    r{   need_numel_argsTritonKernel.need_numel_args  s     r~   c                    U R                   =(       a4    [        R                  R                  U R                  U R
                  5      $ rr   )r=  rC   rQ  should_use_persistent_reductionr?  rG  r   s    r{   ru  ,TritonKernel.should_use_persistent_reduction  s5    $$ 
)R)RMM455*
 	
r~   c                
   U R                   (       ar  [        U R                  5      U R                  S-   :X  aL  U R                  (       a  U R                  S   S:H  $ [
        R                  R                  U R                  5      $ g)NrE   r   F)	persistent_reductionr  r>  r<  r7  rC   rQ  want_no_x_dimr?  r   s    r{   ry  TritonKernel.want_no_x_dim  sb    %%DKK D$;$;a$??  ((2a7799**4==99r~   c                    g)Nztl.device_assertr   r   s    r{   assert_functionTritonKernel.assert_function  s    !r~   F)
copy_shapedense_indexingoverride_maskr  c          
     	  ^ ^^^^^ T R                  T5      mTR                  nSn[        [           " 5       mU GH  n[	        U[
        R                  5      (       d   eU=(       d    [        U[        R                  5      nU(       a  MQ  [        U[        R                  5      (       a@  T R                  R                  UR                     n	TR                  U	R                   5        M  [        U[        R"                  [        R$                  [        R&                  [        R(                  [        R*                  [        R,                  45      (       a  GM  [        R.                   V
s/ s H  n
[        X5      (       d  M  [0        U
   PM      nn
[3        U5      S:X  d   SUR                   35       eTR5                  US    S35        GM     [6        R8                  R:                  =(       d    U=(       d    T R<                  SL=(       a    TS:g  nSnSn[        [           " 5       nT R?                  5        HF  nURA                  URB                  5      (       a  SnOSnUR5                  URD                   S35        MH     U(       a  T RF                  (       a  [6        R8                  RH                  (       a  U(       d  T R<                  (       d~  [3        TU-
  5      S:X  al  T RK                  T5      (       dV  U(       aO  T RL                  S:X  a?        SS	 jm      SU 4S
 jjm      SUU4S jjmSUUUU 4S jjnU" 5       nUb  U$ SnT RO                  T5      n[	        T[
        RP                  5      (       a  U(       a  U S3OT RS                  5       nSU SU S3nT RT                  (       a"  T RW                  5       (       d  [        S/5      mO
[        5       mT R<                  (       a  TR5                  T R<                  5        [Y        UTUUT5      $ U(       a/  U(       d(  U(       a  U S3OT RS                  5       nSU SU S3nUmOU(       d  U(       a  SU SU S3nUmU(       a  [        U/5      mT R<                  (       a  TR5                  T R<                  5        T R[                  T5        [Y        UTUUT5      $ s  sn
f )z?
Compute the index and mask to pass to tl.load() or tl.store()
FrE   zAmbiguous type: r   r   NTr  c                    [         R                  " XR                  5       5      nUc  g[        UR                  /[
        R                  U5      /U/[
        R                  U5      /S9$ )zg
Matches expressions of the form:
    idx = s * xindex

This implies stride (s,), and shape (XBLOCK,).
Nr   r   r  r  )rF   match_affine_block_exprsymbolr   numelr   r   r   )r   
range_treerG  s      r{   match_affine_block1TritonKernel.indexing.<locals>.match_affine_block  sj     -DD,,. >&%++,!.!=!=j!I J#H*;;JGH	 r~   c                  >^^ UR                  5       n[        R                  " S[        R                  " [        R
                  U/S9S9u  p4[        S[        TR                  5      U R                  [        X#5      5      U R                  [        X#U5      5      -   5      n[        R                  " XUR                  U5      nUc  gUu  nnn	[        R                  " U5      n
[         R"                  R$                  mTR'                  UR(                  5      m[+        UU4S jU
 5       5      (       a  g[,        R/                  U5      n[1        XS   5      /[3        U
SS USS 5       VVs/ s H%  u  p[        R4                  " [1        X5      U5      PM'     snn-   nU	 Vs/ s H#  n[7        X[,        R9                  U5      05      PM%     nn[;        UUUUS	9$ s  snnf s  snf )
a  
Matches higher-dimensional blocks coming from FloorDiv and ModularIndexing.

Example expression to match:
   sN * ((rindex//(d1 * ... * d(N-1))))
       + s1 * ModularIndexing(rindex, 1, d1)
       + ...
       + s(N-1) * ModularIndexing(rindex, d1 * ... * d(N-2), d(N-1))

This iterates over a block of shape (dN, ..., d1) and stride
(sN, ..., s1). (d1,...,d(N-1)) and (s1,...,sN) are
wildcards that we match.

Note that dN does not appear in the expression, but we solve for it
using range tree numels and the other dims.
zdenom modulo)exclude)rw   r   Nc              3     >#    U  H9  nTR                  UT5      (       + =(       a    TR                  U5      (       + v   M;     g 7frr   )rd  statically_known_power_of_2)r   r  	max_blockr  s     r{   r   ETritonKernel.indexing.<locals>.match_mod_div_block.<locals>.<genexpr>f  sG       ". !==eYOO H$@@GGH!-s   AAr   rE   r  )r  r   symbols	functoolspartialWildr  r  range_tree_nodesr=  r   r   rF   match_mod_div_block_exprr  get_slice_numelsrC   r  r  r  r:  r   r   r   r   r  Minr<   r   r   )r   r  r  denommodulonum_dimsmatch_resultrJ  r  block_index_exprsslice_numelslinear_block_sizer  r  r   r(  r   r  r  r   s                    @@r{   match_mod_div_block2TritonKernel.indexing.<locals>.match_mod_div_block'  s   ( '--/	 !&"!))%**ykJ! --.HY$>?++oi&OPQ	  3KKj&6&6   ' !	%2CCDI 77++ NN:+<+<=	  ".  
   %2$@$@$L!-A?1 '*,qr*:DH&E&E
 IIg&7?E&E1 !2	3 !2 -*H*H*TU !2	  3 ' +#)	 3s   ,,G"*G c                :   > TT4 H  nU" X5      nUc  M  Us  $    g)zE
Match a block indexing subexpression involving a single range tree.
Nr   )r(  r  
match_funcmatchr  r  s       r{   match_block_pointer_subexpr:TritonKernel.indexing.<locals>.match_block_pointer_subexpr  s3     ''#J 't8E($# r~   c            	     |  > [        TTR                  R                  5        V Vs0 s H  u  pXR                  _M     snn 5      nTR	                  SS9nU Vs/ s H'  n[
        R                  " X$R                  5       5      PM)     nn[        S U 5       5      n[        5       n[        X55       H@  u  pH[        UR                  UR                  5      5      S:  a    g T" X5      n	U	c    g Xy-  nMB     U[        U5      -
  n
TR                  T5        [         R#                  UU
UTTR$                  S9$ s  snn f s  snf )NT)reorderc              3  @   #    U  H  oR                  5       v   M     g 7frr   )r  re  s     r{   r   ETritonKernel.indexing.<locals>.match_block_pointer.<locals>.<genexpr>  s     *Q[T;;==[   rE   )r   r   rE  r   rF  )r<   r  r8  r(  active_range_treesrF   get_subexpr_involving_symbolr  r   r   r  r  intersectionr  sumfilter_masksr   rO  r  )vtindex_relative_to_xyr_indexrE  r   index_subexprsrange_symbolsblock_paramssubexprr   r   r   r   r  r   s              r{   match_block_pointer2TritonKernel.indexing.<locals>.match_block_pointer  sM   .8$2G2G2M2M2OP2O$!AvvI2OP/+ #55d5C !,	" !, (DD3[[] !,	  " !+*Q[*Q Q.0%(%EMD =55g6J6JKLqP# 9GF~# *L &F 5s>7JJ !!),&--'$* +'"&.. .  E Q"s   D3.D9z.shaper  r  z, tl.int32)xmaskr  r  .shape))r   r   r  rY   r   Optional[BlockParameters])r(  r   r  rY   r   r  )r   zOptional[BlockPtrOptions]).prepare_indexingr  r   r   r  r   r   r   r   r   r   r   r  r  r]  rH  r   UNBACKED_INTSIZEPRECOMPUTED_SIZEINDEXFLOATUNBACKED_FLOATr   r   r  r  r   r   r  r.  r  r  var_listr:  allow_block_ptruse_block_ptris_indirect_indexingr  r  rY  dense_size_strr7  r_  r   r  )r   r   r~  r  r  r  
index_varsr   r=  cse_varr   prefix_matches
need_dense
have_densehave_loop_varsdense_mask_varsr   r  optionsr   r   r   r  r  r  s   ``                   @@@@r{   r  TritonKernel.indexing  s6    %%e,''

sO%	Cc5<<0000# ~]22(J TXX..((..sxx8  !2!23%%II))JJJJ''
 
 
 !. 9 9" 9%c0 %Jt$ 9  "
 >*a/N3CCHH:1NN/!2 3489? D MM(( ++d* qj	 	 
$S/+++-D&&t}}55!%"
4;;-t 45 . $$++!OOI/0A5--e44  J.!/B*,`!`/B`*`D .A*  * *Z *+G"
%%e,	eU]]++2<J<v.$BUBUBWJ":,b;GI  )A)A)C)C&y1	&L	doo."9iZQVWWj2<J<v.$BUBUBWJ*9+R
|1EI'IJ*9+R
|7KI'I"M?3I??MM$//*)$y)ZUSSy"s   *SSc                   UR                  5       nU(       d  SnOU(       a  US:X  d   eSU< S3nOSU< 3nU R                  (       a  U R                  S   R                  (       a  UR	                  5       (       a  S[        U R                  5       3nU R                  R                  [        X SUR                  USS	9 35      5        XR                  U'   [        R                   HL  nUR                  U5      n[        S
 U 5       5      (       a  M-  U R                   U   n	Xi;  d   S5       eXU'   MN     Xd4$ UR                  U5      nXd4$ )Nr   , other=0.0, boundary_check=z, padding_option='zero'r+  r   = F)rS  c              3     #    U  HC  n[         R                  R                  R                  U[        R
                  " S 5      5      v   ME     g7fr   N)rC   r  r  r  r   rY  )r   r   s     r{   r   1TritonKernel.codegen_block_ptr.<locals>.<genexpr>	  s;      "1 GG$$<<VU]]STEUVV"1s   AAz@duplicate advancement for pointer '{block_ptr}' at type '{symt}')rk  r=  rE  rd  r   nextr>  r  r   rL   r`  r@  r   r   rp  r5  rA  )
r   r]  r=  r  r  checkr  r   advance_offsetsadvancementss
             r{   codegen_block_ptrTritonKernel.codegen_block_ptr  sg    '')EM)))'y0GHE'y1E!!  $,,##%%#D):):$;#<=IIIKs8??3?+N*OP 37$$Y/ &55"*":":4"@  "1   #88> 4 V4 +:Y' 6"  !,Ir~   c                    UR                  XBR                  UR                  S5      nU S[        [        R
                  R                  U5      5       S3nSU SU U S3$ )NFrc  r  	tl.store(r  )r"  r   r   r0  rC   r  	get_dtype)r   r]  r  r  r  r  s         r{   codegen_block_ptr_store_line)TritonKernel.codegen_block_ptr_store_line  sh    66'')=)=u

 '/0A0A$0GHIK9+RwugQ77r~   c                    U(       d  U(       d  g [        U[        R                  5      (       d   eU R                  USS9n[        U[        5      (       d   eUR
                  nUR                  5       (       a  UR                  OS nU(       a  [        U R                  U5      5      OS nU R                  Xc(       a  SOS X5      n	U R                  U5      n
U R                  R                  XS[        R                  S9  g )NFr  0)
assignmentr,  )r  r   Exprr  r   r   r   r   texprrename_indexingindirect_assertget_load_bufferr  r  rt   r5  )r   r(  r  lowerr   r  r   r   size_strlinebuffers              r{   check_boundsTritonKernel.check_bounds"  s     $

++++===7(O4444&&	(0(9(9(;(;8$$8=5--d344 ##esx
 %%h/&5Lr~   c                &   UR                  5       (       d  UR                  5       (       a  U R                  $ U R                  (       a?  U R                  S   R
                  (       a!  UR                  5       (       d  U R                  $ U R                  $ )Nr+  )	r   r   r  r=  rE  rd  r   r  loads)r   r  s     r{   r  TritonKernel.get_load_buffer<  sk      ""h&:&:&<&<<<!!  $,,'')) 99::r~   c           
     
  ^^^^^ U R                   R                  T5      nU R                  mTT==   S-  ss'   [        nU R	                  U5      mUnU R                  USS9nUR                  5       mUR                  5       n[        S U R                  U5      R                  5        5       5      nU R                  U5      (       a  Sn	OiU(       d  Sn	O_U R                  (       aL  U R                  S   R                  (       a.  UUUUU4S jn
TT   mSn	[        R                   " ["        S	U
5      nOS
n	U(       d  T(       aB  UR%                  5       (       a-  U R&                  (       a  S[)        U R&                  5       3nOSnOS
n Sn[*        R,                  R.                  (       a"  U R0                  R3                  5       nUT   S:  n U R                  U5      (       + =(       a(    U R                  (       + =(       a    U(       + =(       a    UnS
nU(       a  SnS n[4        R6                  R9                  T5      n[;        T5      (       a8  UnU[<        R>                  [<        R@                  4;   a  [<        RB                  nGOW[E        U[F        5      (       aK  U RI                  TX6U5      u  nnSU U U	 U S3nURK                  UURL                  URN                  S5      nO[[E        U[P        RR                  5      (       a  SU SU S3nURT                  nO&SU SURV                   SURX                   U	 U U S3
nU[<        R>                  [<        R@                  4;   a4  [*        R,                  RZ                  (       a  US-  n[<        RB                  nU[<        R\                  :X  a0  [<        R^                  R`                  c  US-  n[<        R\                  nU Rc                  U5      nU Rd                  Rg                  UU" U5      US9nURh                  S:  a  TT==   S-  ss'   [E        U[j        5      (       d   eURl                  Ul6        U(       a  SU SU S3nU Rd                  Rg                  UUUS9nURl                  (       a  URn                  (       a  SnOU[<        R\                  :X  a  SnOSnU R&                  (       a  [)        U R&                  5      OUnSURX                   SU SU S3nU Rd                  Rg                  UUUS9nU R                  (       a  URq                  5       (       d"  T(       d  U Rr                  Ru                  U5        U$ )NrE   Tr  c              3  *   #    U  H	  oS :H  v   M     g7fr
  r   )r   r  s     r{   r   $TritonKernel.load.<locals>.<genexpr>`  s      
MqFM   z, eviction_policy='evict_last'r+  c                 6   > TT   T :  a  T(       d  T(       a  gg)N
evict_lastevict_firstr   )expected_countr   indirect_indexingload_countsr]  s   r{   decide_later'TritonKernel.load.<locals>.decide_lateri  s    t$~5"3'$r~   z, eviction_policy='<EP>'z<EP>r   z, other=r  z, cache_modifier='.cg'r  r  r\  r  r  r[  r  r+  r  r  z0.0Truer  r  );r_  r  rC  r   r  r  r   r   r   get_strides_of_loadrf  is_broadcastedr=  rE  rd  r  r  r3   r   _load_otherrV   r   r   skip_l1_cacher?  buffer_read_countsrC   r  r  r_   rt   rV  rW  ru   r  r   r  r"  r   r   r   rY  r   r   r   rU  r   r  r  r  r  r  	use_countr@  r   r:  r   r<  r  )r   r]  r   r=  	make_lineoriginal_indexr  r   is_coalescedepr  r  has_read_depsr  r  cachemodappend_broadcastr,  r  r  load_buffer
result_varzero	other_valr  r   r  r  s    `                      @@@@r{   r  TritonKernel.loadK  s   iiood#''DQCK	 55e<==$=7((*
**,  
 44^DKKM
 
 ~..1B1B""t'7'7';'C'C% % ).N+B!))*:FLQIB:8+<+<+>+>"=1A1A#B"CD%E	 ==&&!%!A!A!C.t4q8M	 ##N33 )))!! 	 	 /H!!$'#D))D 77 (O44#'#9#9$u#U 	5!)UGB4zC==(..0D0Dd NEMM::!#d>*:"=#+#6#6 !#d8+=+=*>c(BSBSATUWTXY^X_`h_iijk %--88MM88))

"u}}'8'8'@ &

**84XX&&{IdO5&Q
!#"*&78888'11
%j\4D3EQGD**;E*JJ!!** Dejj(!DD7;7G7GM$"2"23T  #8#4#4"5R
|2i[PQR!XX..{D.N
$$X-?-?-A-A*""&&z2r~   c           	        U R                   R                  U5      nUnU R                  USUS L S9nXR                   R                  ;   nU R	                  U5      n	U(       a,  U	(       a%  U R
                  R                  [        US5      5        [        U[        5      (       a(  U R                  XU5      u  pU R                  XXU5      nO_Uc$  SU SUR                   SU SUR                   S3	nO8US	:X  a$  S
U SUR                   SU SUR                   S3	nO[        SU 35      e[        R                   " 5       nU R"                  (       d;  U R$                  (       a*  UR'                  U R)                  XR
                  5      5        U R
                  R                  [        X5      5        U R"                  (       d  U R*                  R-                  U5        UR/                  5         g )NT)r  r  ztl.debug_barrier()r  r\  r  r  r  
atomic_addztl.atomic_add(z, sem='relaxed')zstore mode=)r_  r  r  inplace_buffersr  storesr   rL   r  r   r  r  r   r   r  
contextlib	ExitStackr=  rG  enter_contextguard_cooperative_storer<  r  close)r   r]  r   r  moder=  r  r  
is_inplacer  r  r  r  
exit_stacks                 r{   storeTritonKernel.store  s    iit$==ttt|=T YY666
,,^<.KK!!,t5I"JKh00#55dJI44	%D \se4(:(:';3ugRHYHYGZZ[\D\!#C5X-?-?,@E7"XM^M^L__opD%D6&:;;))+
$$)C)C$$T%A%A$%TUl467$$""&&u-r~   c                    U R                   R                  5       nUR                  [        USU S35      5        UR	                  5       $ )z
For cooperative reductions only one thread block should write out the result.
We rotate which thread block does each write for better parallelism
zif rsplit_id == (z % RSPLIT):)r[  r   r   rL   indent)r   r]  r  rg  s       r{   r  $TritonKernel.guard_cooperative_store  sC    
 88NNPd.?uK,PQR}}r~   c                   U R                   R                  [        R                  5        U R                  R                  US   5      nU R                  US   5      n	U R                  US   5      n
U R                  US   5      nU(       a  U R                  R                  US   5      OSnU(       a  U R                  US   5      OSnU[        R                  :X  a  SnO"U[        R                  :X  a  SnO[        S5      eU R                  R                  U R                  S	U S
U S
U	 S
U
 S
U S
U S
U S
U S
U S
U S
U S3US9nU$ )z#
See [Note: Inductor bucketize op]
r   rE   r   r   r   r  ztl.int64z5Bucketize only supports indexing with int32 and int64z'triton_helpers.bucketize_binary_search(r  z, )r+  )rD  r  r(   ONE_ELEMENT_PER_THREADr_  r  r  rt   r5  r  r  r  r  r  )r   rf  
boundariesboundary_indicesindexing_dtyper  sortersorter_indicesboundaries_ptrboundary_sizeboundaries_underlying_numelboundary_stride
sorter_ptrsorter_stridetriton_dtyperN  s                   r{   	bucketizeTritonKernel.bucketize	  s^   $ 	 C CDA7))*Q-8&*&7&7
1&F#++JqM:39TYY__VAY/v
8>))&)4FU[[(%Lu{{*%L%G  ""LL5fXRbr2M1NbQ`Paac nBgRl"]O2  ! # 
 r~   c                    U R                  5       nUS:X  a  SU S3$ U R                  nS/X#-
  -  S/U-  -   nU SSR                  U5       S3$ )	NrE   z!triton_helpers.promote_to_tensor(r  r  r   r  r  r  )r  r<  r   )r   r  ndimsnreducesizess        r{   reduction_resizeTritonKernel.reduction_resize8	  sh    '')A:6ugQ??)))VHw,>>$))E*+1--r~   c           
         U R                   S:X  a  U$ U R                  5       U R                   -
  nU R                  5       nUSU S/-   n[        U R                  R                  U[        X%U5      US95      $ )z3
Reshape to RBLOCK, collapsing all reduction dims.
rE   NRBLOCKr+  )r<  r  dense_size_listr   r  r  r  )r   r  r  r,  target_ndimr  target_shapes          r{   reduction_collapse_dims$TritonKernel.reduction_collapse_dimsA	  s    
 ""a'L--/$2I2II,,.$\k2hZ?HHu\JRW  
 	
r~   c                  ^ ^^^2^3^4^5^6^7 S8S jn[         R                  " U5       Vs/ s H  ofR                  PM     nn[         R                  " XT5      n[	        S U 5       5      (       aJ  [
        R                  " U[
        R                  5      n[
        R                  " T[
        R                  5      mT R                  (       d   e[        S T R                   5       5      nT R                  U5        [        U5      nT R                  (       a  UR                  T R                  5        T R                  S   R                  S   n	T R!                  5       m3T R#                  U3U 4S jU5      nT R%                  5       T R&                  -
  m4      S9U4UUU 4S jjm5        S:U54S jjn
U4UU6U 4S	 jnUTU4nUT R(                  R*                  ;   a  T R(                  R*                  U   $ [-        U5      n[/        U5      nT R(                  R1                  US
9n[        S U 5       5      Ul        SR5                  U5      m2U24S jm7T R6                  (       Ga<  [8        R:                  R=                  TU5      nT R#                  [>        U5      nS;U U74S jjnTS:X  a  OG[A        U[B        5      (       a)  [E        UU5       VVs/ s H  u  nnU" UU5      PM     nnnO	U" UU5      nTS;   a  [G        T R(                  RI                  T RJ                  SU	 SW S3[L        RN                  RP                  S:X  a  [
        RR                  O[
        RT                  S
95      nSSS.T   m6U" T RJ                  UUU5        GOTS:X  a=  T RV                  (       a  T RY                  UTUT7UT5      nGOT R[                  TU5      nGOpTS:X  aS  [A        W[\        5      (       d   eUu  nnn[C        UU 4S jT R_                  T RJ                  UUUT4T5       5       5      nGOTS:X  a  T Ra                  TU5      nGO[A        W[b        5      (       d   eT R(                  RI                  T RJ                  T5" T RJ                  [G        U5      S 5      UR                  S
9nGOT R(                  Re                  SU 3US
9n[8        R:                  Rg                  TU5      nT R#                  [>        U5      n[A        U[B        5      (       d5  T Rh                  Rk                  U ST R!                  5        SU SU S35        TS;   a  SU S3nT Rl                  Ro                  5       nT Rh                  Rk                  U ST R!                  5        S[
        Rp                  " U5      Rr                   ST Ru                  U5       S35        SSS.T   m6T RJ                  Rw                  SU S U S!T6 S"U SU SU SU	 S#U S$T7" U S%3U5       S&U S$T7" U S%3U5       S&35        U" T Rx                  UUU5        GO[{        T5      (       a  T RY                  UTUT7UT5      nGOTS:X  Ga%  SU S'3nSU S(3nT Rh                  Rk                  U ST R!                  5        S)U S35        T Rh                  Rk                  U S*T R!                  5        SU S35        T RJ                  Rw                  S+U S U S,U SU SU S[|        R~                   S-35        T RJ                  Rw                  S+U S$T7" U S%3U5       S+U S$T7" U S%3U5       S+3	5        UnT R(                  R1                  TS
9nT R                  T Rx                  UUUUT4T5      nO[8        R                  " TU5      nU" UU5      n T RJ                  Rk                  U S$T7" U U5       35        U[
        R                  :X  a/  U S.3n![        T5      n"U
" T Rx                  [G        U5      U!U"5        O'U
" T Rx                  [G        U5      [G        U5      S 5        T RV                  (       Ga  [8        R:                  Rg                  TU5      n[        R                  " 5       n#T Rx                  T R                  4 H3  n$U$Rk                  S/5        U#R                  U$R                  5       5        M5     TS;   a  T Rx                  Rk                  U S0T R                  U S135       35        T R                  U S23UU5      n%T Rl                  Ro                  5       nT R                  UU[
        Rp                  " U5      Rr                  5      n&U" T R                  UU%U&5        GOD[{        T5      (       a  TS:X  d   eUu  n'n(n)T R                  U'[/        U5      US   5      n*T R                  U([/        U5      US3   5      n+T R                  U)[/        U5      US4   5      n,T R                  T R                  U'U(U)U*U+U,T4T5	        OTS:X  ae  Uu  nnT R                  U[/        U5      US   5      n-T R                  U[/        U5      US3   5      n.T R                  T R                  UUU-U.T4T5        O:T R                  U[/        U5      U5      n/U
" T R                  [G        U5      U/S 5        U#R                  5         UT R(                  R*                  U'   [A        U[B        5      (       a  [        S5 U 5       5      (       d   eT R                  R                  U5        TS6;   a  [        U5      S3:X  d   e[        U5      U-  n[        U5      [        U5      :X  d   e[E        X5       HJ  u  n0n1U1c   eU0R                  U1:w  d  M  T Rx                  Rk                  U0 S$U0 S7[        U15       S35        ML     U$ [A        U[        5      (       d   eT R                  R                  U5        UR                  US   :w  a8  US   c   eT Rx                  Rk                  U S$U S7[        US   5       S35        U$ s  snf s  snnf )<Nc                    U R                   [        R                  [        R                  4;   a%  [        R
                  " U [        R                  5      $ U $ rr   )r,  rt   rV  rW  r  r  ru   r  s    r{   maybe_upcast,TritonKernel.reduction.<locals>.maybe_upcastY	  sF     ;;MMNN UEMM2 r~   c              3  f   #    U  H'  o[         R                  [         R                  4;   v   M)     g 7frr   )rt   rV  rW  r  s     r{   r   )TritonKernel.reduction.<locals>.<genexpr>i	  s      M_U]]ENN33_s   /1c              3  >   #    U  H  oR                    S 3v   M     g7fr   Nr:  re  s     r{   r   rF  o	       M<LDkk]$/<L   r+  r   c                p   > TR                   R                  TR                  SU  ST S3U R                  S9$ )Nr  r  r  r+  r  r  r  r,  )r  r  r   s    r{   <lambda>(TritonKernel.reduction.<locals>.<lambda>~	  s:    dhh''"1#R'7q9gg ( r~   c           
        > TS;   nU(       a  SOSnTR                  XT5      nTS;   a  TR                  U ST SU ST S35      nOTR                  U ST S	U ST S35      nUb  U S
U S3nU$ )z3
Helper to generate a reduction call, e.g. tl.sum.
)r   r  minprodtriton_helperstl)r  rQ  r   z2(r  r  r  rc  )r?  r8  )	r  r  result_type
use_helpermoduler  r,  reduction_typer   s	        r{   final_reduction/TritonKernel.reduction.<locals>.final_reduction	  s     (+HHJ)3%F00FE/--ha/r%3%qA --ha/qr#a@ & 'k]!4Lr~   c                D   > T" XU5      nU R                  U SU 35        g)z=
Generate a reduction and assign it to an existing variable.
r  N)r   )r  r  r  rU  rY  s       r{   final_reduction_define6TritonKernel.reduction.<locals>.final_reduction_define	  s(     $F;?EMMZLE734r~   c                   > TR                  XT5      nTR                  XT5      nU R                  SU SU ST SU SU ST SU STR                  U S35       S	35        g )
N                z_val, z_idx = triton_helpers.z_with_index(r  )
                r  _idx
                )r?  r   r8  )r  r  r  r   r  r,  root_opr   s       r{   final_argreduce/TritonKernel.reduction.<locals>.final_argreduce	  s    00FE00FEMMF:,.DWI\Z_Y``bchbiiklokp qC 5 5D6I JK Lr~   r+  c              3  P   #    U  H  n[        US    5      (       a  M  Uv   M     g7fr  )r9   )r   r=  s     r{   r   rF  	  s!      *
 C(;CF(CCC5s   &	&r   c                B   > T(       d  U $ [         R                  TX5      $ rr   )r  r  )tvalfvalconds     r{   
where_cond*TritonKernel.reduction.<locals>.where_cond	  s    (..tT@@r~   c                n   > TR                   R                  TR                  T" X5      U R                  S9$ )Nr+  rM  )r  defaultr   rk  s     r{   _mask_value+TritonKernel.reduction.<locals>._mask_value	  s3    xx((LL*U"<EKK )  r~   online_softmax_reduce)argmaxargminr  zindex, r  r  r  rQ  welford_reducewelford_combinec              3  n   >#    U  H*  nTR                   R                  TR                  UTS 9v   M,     g7f)r+  N)r  r  r  )r   r  r,  r   s     r{   r   rF  	  s6      #" HH%%dllE%G"s   25r   = tl.full(r  r  _indexr_  _next, z_next = triton_helpers.z%imum_with_index(
                    z(index
                )
                r  _nextrb  _max_sumz, float('-inf'),  = tl.zeros(z
                    zG_next = triton_helpers.online_softmax_combine(
                        z+
                    )
                    z.to(tl.int8)zif HAS_RSPLIT:z_bval = _val_bvalrE   r   c              3  B   #    U  H  n[        U[        5      v   M     g 7frr   )r  r@  r  s     r{   r   rF  
  s     LAz!%677s   )rt  rq  rc  )r  rK   r   rK   )r  r   rU  r   r   r   )r  r   r  r   rU  r   r   r   )r   rK   )Spytreetree_leavesr,  tree_mapr   rt   r  ru   r=  r   rE  r  sortedr.  r  r:  r  _map_tuple_or_scalarr  r<  r  reduction_cacher8  r6  r  r   r   rx  r   	Reductiondefault_valuerV   r  r  r  r   r  r  rC   r  r  r5  r  rG  rt  welford_reduce_fallbackr   _welford prepare_softmax_twopass_fallbackrK   namedvardefault_accumulatorr  r   r?  select_index_dtypeiinfor  rN  r   r:  r7   r   r  %online_softmax_reduce_final_reductionget_reduction_combine_fnr   r-  r  r  r;  r  r"  r8  *codegen_cooperative_reduction_peer_combinewelford_reduce_final_reductionr  r5  r<  rH  r  r@  r  )8r   r,  rz  rX  r  rC  rL  original_dtypesmasksreduction_range_prefixr\  rd  r  acc_typetorch_acc_typer  rn  ro  r  dmasked_valueaccumulator_indexmeanm2weightaccumulatorr  accumulator_maxaccumulator_sum
result_max
result_sum
combine_fnupdatedaccumulator_casted_strrU  r  bufpeer_valpeer_idxresult_mean	result_m2result_weight	peer_meanpeer_m2peer_weightpeer_maxpeer_sumpeersr=  
orig_dtyperj  r  r  rY  rc  rk  s8   `` `                                              @@@@@@r{   	reductionTritonKernel.reductionR	  s   	 170B0B50IJ0I990IJ4M_MMM++Iu}}EI''u}}=E$$$$MD<L<LMM% u??LL)!%!1!1"!5!<!<Q!? ,,.))
 
 %%'$*A*AA		 '	 		 	4
	5
	5 
	5 '	
	5
 
	5	 	 6	00088++I66"9-))4((///?
) *
 *
  

 zz% 	A
 $$$ll00KG//wGG  !88 E5))>A%>QR>QdaAq 1>QR*5':!55$'HH%%*+A*B',W^_88//:= $kk"[[ & %! &+e<^LLL*l<M  #33--!%!4!4"NE:xQV"J "&!=!=eU!KJ#44!,9999%1"r6" #!%dBU"# 
  #:: "BB5%P
!,<<<<!XX..LL#DLL#l2CTJ&,, / 
 ((++a
|,<N+SKll66~yQG//wGGgu--		##"m;t/B/B/D.ERyPRS[R\\]^ !55&'
|6$:!"mm>>@		##()T5H5H5J4K2{{;/334Bt7H7H7U6VVWY &+e<^L##W%6$77Nwi X M$5#6brBXAY ZS{m5,A;!O P Q"#3z5F4Gu2MO`'a&b c  **JEV &n55!00z8U
  #::$%j\"6$%j\"6 		##&'{43F3F3H2IIZ[cZddef 		##&'|D4G4G4I3J"XJVWX ##$%W_,= >()O+<BugRH\H\G] ^ ##$%S6Gu4M)_(` a$%S6Gu4M)_(` a (
!XX__5_9
!GG**##
  88S
$[%8&&"m3z';'G&HI 

* 1<}L-I*"5e"<K*..J.#	 +..J[AQSW %%%ll66~yQG#--/J..0D0DE./((6 F
 !55&&00!l(4+@+@J<tAT+U*VW  JJ!l%()W #mm>>@JJU[[-E-I-I   4 4j(HU%n55%)99998B5Y KK!1)!<gaj	 II/	:GAJ #MM!#3I#>
 33((!
  #::)3&
JJJ 0 ;WQZ  JJ 0 ;WQZ ::(( GG 0 ;W '((#j/5$ .8  +j%((LLLLLL""))*5 !LL?+q000"%j/O"Cz?c/&::::#&z#CZ!---99
***44%s3%t,?
,K+LAN $D"  j*;<<<<""&&z2 ?1#55&q)555&&00!l#j\6I/Z[J\6]5^^_` _ Kd  Ss   qqc                   U R                  XU5      nU R                  XU5      n[        S5       Vs/ s H%  n[        U R                  R	                  US95      PM'     snu  pxUR                  SU SU SU SU SU S[        R                   SU SU R                  U 5       SU SU R                  U 5       S35        Xx4$ s  snf )Nr   r+  
            r  z9 = triton_helpers.online_softmax_reduce(
                )
            r  )	r?  rC  r   r  r  r   r   r  r8  )	r   r  r  r  r  r,  r  r  r  s	            r{   _online_softmax_reduce#TritonKernel._online_softmax_reduce
  s     66vPUV66vPUVMRSTX!VX#dhhooEo&B"CX!V
L:, ' !O#4Bse2f>R>R=S TLD11ZLBC DLD11ZLBC D		
 %% "Ws   ,Cc           	     D  ^ ^^ UUU 4S jX#U4 5       u  p#nSU SU SU SU S3	n[        S5       Vs/ s H%  n[        T R                  R                  TS95      PM'     n	nTR	                  SR                  U	5       SU 35        [        U 4S jU	 5       5      n
U
$ s  snf )	z+
Helper to codegen triton_helpers.welford.
c              3  J   >#    U  H  nTR                  TUT5      v   M     g 7frr   )r?  )r   r  r  r,  r   s     r{   r   (TritonKernel._welford.<locals>.<genexpr>
  s*      
+ ((>>+s    #ztriton_helpers.welford(r  r  r   r+  r  c              3  F   >#    U  H  nTR                  U5      v   M     g 7frr   )r8  )r   r  r   s     r{   r   r  
  s     Xud33E::s   !)rC  r   r  r  r   r   r  )r   r  r  r  r  r  r,  welfordr  welford_resultsresult_valuess   ``    `    r{   r  TritonKernel._welford
  s    
F+
& ,D6B4r&C5JFKAhOh3txxU;<hODIIo67s7)DEXXX	 Ps   ,Bc                   U R                  5       U R                  -
  nU S3nU S3n	U S3n
U R                  R                  U SU R	                  5        SU S35        U R                  R                  U	 SU R	                  5        SU S35        U R                  R                  U
 SU R	                  5        SU S35        US:X  a=  Uu  pnU R
                  R                  SU S	U	 S	U
 S
U SU	 SU
 SU SU SU S35        O9US:X  d   eU R
                  R                  SU S	U	 S	U
 SU SU SU	 SU
 S35        U R
                  R                  SU SU" U S3U5       SU	 SU" U	 S3U	5       SU
 SU" U
 S3U
5       S35        UnU R                  R                  US9nU R                  R                  US9nU R                  U R                  UUUUU	U
UU5	      $ )z%Helper to codegen a welford reduction_mean_m2_weightr}  r  r  ru  r_  ry  z<_next = triton_helpers.welford_combine(
                    z,
                    z#
                )
                rt  z;_next = triton_helpers.welford_reduce(
                    z1, roffset == 0
                )
                z            r  rz  r  r+  )r  r<  r  r   r  r  r   r  r  r  r:  )r   r  rX  r  rk  r  r,  r  r  accumulator_m2accumulator_weightr  r  r  r  r  r  s                    r{   rt  TritonKernel.welford_reduce
  s    %%'$*A*AA#E*&<s+ *|73		m<(;(;(='>b
!L	
 			l4+>+>+@*AH:QO	
 			!",t/B/B/D.ERzQRS	
 ..$DfLLW^$4G<N;O P MN#326H5I JF"RD6( + "%5555LLW^$4G<N;O PG2k]"^,<B?Q>R S 	MZ;-u(={KL MC
n-=U+C^ TU V J2D1EU/KM_$`#a b	
 !HHOO%O0	e422""

 
	
r~   c
                    U R                  XXgX5      n
X#U/n[        X5       H  u  pUR                  U SU 35        M     X#U4$ )z0Helper to codegen call to triton_helpers.welfordr  )r  r  r   )r   r  r  r  r  r  r  r  r  r,  rf  result_exprsresult_exprr  s                 r{   r  +TritonKernel.welford_reduce_final_reduction3  sV     vRD#>"%l";KMM[MUG45 #< }44r~   c                    U R                  XXVU5      nX#/n	[        X5       H  u  pUR                  U
 SU 35        M     X#4$ Nr  )r  r  r   )r   r  r  r  r  r  r  r,  rf  r  r  r  s               r{   r  2TritonKernel.online_softmax_reduce_final_reductionG  sT     ,,VxeT"/"%l";KMM[MUG45 #< %%r~   c                N    U R                   (       a  U R                   S   $ [        $ )NRSPLIT)r7  r+   r   s    r{   
max_rsplitTritonKernel.max_rsplitQ  s"    $$X..  r~   c                   U R                   S   nU R                  5       (       d  SOSnXBR                  -  U R                  5       -  nU R                  R                  U5      u  pxU R                  R                  SU SU SU R                  U5       S[        U5       SU S	U S
U S3SS9  U R                  R                  U SU S[        U5       S35        U S3$ )z
Generate code to save a [XBLOCK, RSPLIT] temporary workspace, where each thread block writes a different
column.  After the barrier, every thread block loads the completed value so that it can compute the final
value independently.
r3  zxindex < xnumelNrb  z_ws = (r  z).to(tl.pointer_type(z))
                tl.store(z%_ws + (xindex * RSPLIT + rsplit_id), r  r  Tstripz_peers = tl.load(z_ws + (xindex * RSPLIT + rsplit_arange), rsplit_mask, eviction_policy='evict_first', other=triton_helpers.if_mask(rsplit_mask, r  _peers)r>  r_  r4  r  r[  r  r:  r   r  r=   r;  r   rV   )	r   r  r,  default_valxnumelr   r  r  r  s	            r{   r  7TritonKernel.codegen_cooperative_reduction_peer_combineV  s#    S!(,(@(@(B(B ..(4??+<<!GGPPQWX%%GG9C0A0A)0L/MMbcnotcubv w$%J:,VXY]X^ _  	& 	
 	&&l+J< 8eers~e  eA  ACD	
 V$$r~   c                   U R                   (       d   eSU l         U R                  USS9nSU l         U R                  R                  U5      n[        R
                  " 5       nU R                  (       a*  UR                  U R                  XR                  5      5        [        U[        5      (       aZ  U R                  R                  [        UU R                  UUUR                  U5      USUR!                  5       < 35      5      5        O][        U["        5      (       d   eU R                  R                  [        USU SUR$                   SU SUR&                   S	3	5      5        UR)                  5         g )
NFTr  r  r  r\  r  r  r  )r=  r  r_  r  r  r  rG  r  r  r;  r  r   r   rL   r  r`  rk  r   r   r   r  )r   r]  r   r  r  r=  r  s          r{   store_reductionTritonKernel.store_reductionq  sV    $$$$ %==$=7 $iit$))+
%%$$,,T3G3GH h00  **55  ,+H,C,C,E+HI	 h8888  **uD););(<CwbIZIZH[[\] 	r~   c           	     :  ^^^
^^^^ [        5       mTR                  S5        [        5       m
[        S5       V^s/ s H#  m[	        U
UU4S j[        U5       5       5      PM%     nnSR                  S [        R                  R                  U5       5       5      nTR                  SU S35        [        5       mSmS	S
K
Jn  U" 5       m " U
UUUU4S jS[        5      nTR                  5          [        R                  " U" 5       5         U" U6 n	SR                  S U	 5       5      n	TR                  SU	 35        S S S 5        S S S 5        U R                   R#                  TR%                  5       TS9$ s  snf ! , (       d  f       NC= f! , (       d  f       NL= f)Nz@triton.jitr   c              3  V   >#    U  H  nTR                  S T SU 3TU   S9v   M      g7f)rJ  r  r+  N)r  )r   nr  dtypesr  s     r{   r   ,TritonKernel._lift_helper.<locals>.<genexpr>  s.     X1#,,QCq}F1I,>s   &)r  c              3  8   #    U  H  n[        U5      v   M     g 7frr   r   r  s     r{   r   r    s     R.Qc!ff.Qrg  zdef {name}():r  r   rd   c                  >   > \ rS rSr        SU UUUU4S jjrSrg)+TritonKernel._lift_helper.<locals>.CSEProxyi  c                z   > TSU 3-  m[        TU5      " U0 UD6nTR                  T[        T	U5      " U0 UD6US9$ )Nr  r+  )rg  r  )
r   r]  r_  rI  output_dtyper  dtype_handlerhelperhelper_name	overridess
        r{   _default4TritonKernel._lift_helper.<locals>.CSEProxy._default  sk     4&z)&!   # " #
 ||It,d=f=& $  r~   r   N)r]  r   r_  ztuple[Any, ...]rI  dict[str, Any]r   r   )rs   r   r   r   r  r   )r  r  r  r  r  s   r{   CSEProxyr    s-    '6@N r~   r  c              3  8   #    U  H  n[        U5      v   M     g 7frr   r  )r   r  s     r{   r   r    s     B'F'rg  return r  )rM   r   rJ   rC  r  r   rd  re  from_iterableru  rQ  re   r%   r"  rC   set_ops_handlerr3  r  r   )r   fnnum_argsr  r  r_  	signaturere   r  outputsr  r  r  r  r  s      ``     @@@@@r{   _lift_helperTritonKernel._lift_helper  sK     !'e 1X
 XhXX 	 
 IIRioo.K.KD.QRR	=267#%	 *P24	 	~ 	$ ]]_a//
;$iGiiB'BBGwwi01 <_
 $$(():k(RRU
J <;__s)   *E60F2E;>F;
F		F
Fc                $  ^ ^ T R                   (       d   eT R                  (       a   S5       e[        S T R                   5       5      nT R	                  U5        [        U5      nT R                  (       a   S5       e/ n/ n[        S U 5       5      n[        R                  " T R                  R                  T R                  5      nT R                  U[        U5      U5      nT R                  5       T R                   -
  n	[#        X15       GH3  u  pT R                  R                  T R                  U
 S[%        U5       S3US9nT R                  R                  T R                  SU S	T R'                  5        S3US9n
UR)                  U
5        [+        U5      nT R,                  (       a  M  T R                  R/                  US9nT R1                  5       nS
US'   SS	R3                  U5       S3nUR4                  (       a  SOSnT R6                  R9                  U SU S	U S	U S35        UR)                  U5        GM6     S mUU 4S jnU" ST" U5       SU	 S	U S3UUU5      nT R,                  (       d  U Vs/ s H!  nU" SU S3[;        UR<                  5      S9PM#     nnU" [        U5      [        U5      5      nU" [        U5      U5      n[#        UU5       VVs/ s H  u  nnU" SU S	U S3UR<                  S9PM      nnn[#        UUU5       H+  u  nnnT R                  R9                  U SU S	U S35        M-     OUnU H*  n[?        U[@        5      (       d   e[        U5      Ul!        M,     [        U5      $ s  snf s  snnf )NTODOc              3  >   #    U  H  oR                    S 3v   M     g7frH  rI  re  s     r{   r   $TritonKernel.scan.<locals>.<genexpr>  rJ  rK  z(ops.scan not supported inside ops.maskedc              3  8   #    U  H  n[        U5      v   M     g 7frr   r?   r   r,  s     r{   r   r          Fve*511vrg  rc  r  r+  r  r  r  r+  r  r  zfloat('nan')z-1rw  c                2    SR                  S U  5       5      $ )Nr  c              3  *   #    U  H	  o S 3v   M     g7f,Nr   r   r  s     r{   r   1TritonKernel.scan.<locals>.csv.<locals>.<genexpr>       <VEgQKVr  r   rf  s    r{   csvTritonKernel.scan.<locals>.csv      88<V<<<r~   c                4  > [        U5      n[        U5       Vs/ s H  oP SU SU 3PM     nn[        U4S jU 5       5      (       a,  U Vs/ s H  nTR                  R	                  U5      PM      sn$ U Vs/ s H  nTR                  R                  US9PM     n	nTR                  R                  T" U	5       SU  35        [        X5       H-  u  pU(       a  X*l	        TR                  R                  Xz5        M/     [        U	5      $ s  snf s  snf s  snf )Nr  c              3  Z   >#    U  H   nTR                   R                  U5      v   M"     g 7frr   r  containsr   r  r   s     r{   r   :TritonKernel.scan.<locals>.cse_multiple.<locals>.<genexpr>  #     LI488$$Y//   (+r+  r  )r  rC  r5  r  r  r  r  r   r  r   r  r  )r  rf  r  r  r  r  
cache_keysr  _dtyperesult_varsr  r  r   s              r{   cse_multiple'TritonKernel.scan.<locals>.cse_multiple  s    FA;@8D8aF"QCr%18JDLLLLAKLIY/LLGMNvV488???8vKNLL""{#$Cv. *-[)E%
+0(Y3 *F %% ELNs   D%D9#Dztl.associative_scan((r  ztriton_helpers.select_one((z1), rbase == (RBLOCK - 1), dim=-1, keep_dims=True)ztl.where(roffset > 0, z = tl.where(roffset > 0, )"r=  rG  r   rE  r  r  r.  r  r  r  r  r  r  r  r  r  r<  r  r-  r  r  r8  rx  r  r<  r   r:  r  r   r?   r,  r  r@  r   )r   r  r  rf  r  broadcasted_valuesaccumulatorscse_computecombine_helper_fnr  r  r,  value_dtyper  r  reduced_sizern  r  partial_scan_varspartial_scan_varpartial_reduce_vars	accs_nextfull_scan_vars	full_scanpartial_scanr  acc_nextpartial_reducer  r  s   `                            @r{   scanTritonKernel.scan  s    $$$$--5v5-MD<L<LMM% u??N$NN"FvFF''(9(94<<H --j#f+vN%%'$*A*AA/LE((++'1%89; , K
 HH%%";-r$2E2E2G1HJ & E
 %%e,&u-H,,,"hhooEo:#335#&R !"499\#:";1=,1,C,C.		##"m;|nBwir(STU ##K05 08	=	& )#C(:$;#<CuBGXFYYZ[	
 (( ):#
 ):$	 12B1CCtu-.>.D.DE ):   # #5#6>Q8RSI'l(;=NON 03>CT/U
 0V+I|	 ,YKr,qI&,, 0V   :=<)<:5+~ &&"m#<XJbHXXYZ: ,K%Jj*;<<<<#-e#4J  & [!!;#s   ?(N%%Nc                L  ^ ^ T R                   (       d   eT R                  (       a   S5       e[        S T R                   5       5      nT R	                  U5        [        U5      nT R                  (       a   S5       eT R                  (       d   S5       e[        R                  " T R                  R                  T R                  5      nT R                  5       T R                  -
  n[        S U 5       5      n[!        U5      [!        U5      :X  d   e[#        U5       VV	s/ s H#  u  pU" SU	 ST R%                  5        S3X   S	9PM%     n
nn	S
 mUU 4S jnT R                  S   R&                  (       d   eT R)                  T R                  S   5      (       a  SOSn[!        U5      S:X  a/  SU
S    SU
S    SU SU SU SU S3nU" U[!        U5      XQ5      nO[+        S5      e[-        X5       H  u  nnX_l        UR0                  Ul        M     [        U5      $ s  sn	nf )Nr  c              3  >   #    U  H  oR                    S 3v   M     g7frH  rI  re  s     r{   r   $TritonKernel.sort.<locals>.<genexpr>F  rJ  rK  z(ops.sort not supported inside ops.maskedz3ops.sort is only supported in persistent reductionsc              3  8   #    U  H  n[        U5      v   M     g 7frr   r  r  s     r{   r   r1  Q  r  rg  r  r  r  r+  c                2    SR                  S U  5       5      $ )Nr  c              3  *   #    U  H	  o S 3v   M     g7fr  r   r	  s     r{   r   1TritonKernel.sort.<locals>.csv.<locals>.<genexpr>[  r  r  r  r  s    r{   r  TritonKernel.sort.<locals>.csvZ  r  r~   c                4  > [        U5       Vs/ s H  o@ SU SU 3PM     nn[        U
4S jU 5       5      (       a,  U Vs/ s H  nT
R                  R                  U5      PM      sn$ [        U5       Vs/ s H  nT
R                  R	                  X4   S9PM      nnT
R
                  R                  T	" U5       SU  35        [        Xu5       H-  u  pU(       a  X(l        T
R                  R                  Xh5        M/     [        U5      $ s  snf s  snf s  snf )Nr  c              3  Z   >#    U  H   nTR                   R                  U5      v   M"     g 7frr   r  r  s     r{   r   :TritonKernel.sort.<locals>.cse_multiple.<locals>.<genexpr>_  r  r  r+  r  )rC  r5  r  r  r  r  r   r  r   r  r  )r  r  r  r  r  r  r  r  r  r  r   s            r{   r  'TritonKernel.sort.<locals>.cse_multiple]  s    ;@8D8aF"QCr%18JDLLLLAKLIY/LLEJ1XNX488???;XKNLL""{#$Cv. *-[)E%
+0(Y3 *F %% ELNs   D%D7%Dr+  r   rnumelr   ztriton_helpers.sort_with_index(r   rE   z	, stable=z, descending=zUnhandled sort)r=  rG  r   rE  r  r  r.  rx  r  r  r  r  r  r  r<  r  r  	enumerater  r@  rV  r  r  r   rD  )r   r  rf  stable
descendingr  r   r  r  r  r  r  r;  r  r  r  	input_varr  s   `                @r{   sortTritonKernel.sort=  s)    $$$$--5v5-MD<L<LMM% u??N$NN"(( 	
A	
(  ''(9(94<<H%%'$*A*AAFvFF6{c&k)))
 &f-	
 . "5'D,?,?,A+B!DFI .	 	 
	=	& #00002243C3CB3GHHhv;!12DQ2G1HK]^_K`Ja b82cU)F8=AO  'tS[%HK !122%(%=!J	#(  ) 0 0J &> [!!Q
s   "*H c                   U R                   (       dV  U R                  (       dE  U R                  (       d4  U R                  (       d#  U R                  (       d  U R
                  (       d  gU R                   Vs/ s H  oR                  (       d  M  UPM     nnU R                  (       GaW  [        U5      S:  GaG  [        U5       H  u  p1U R                  R                  US9   UR                  nU R                  (       a  SOSnU R                  (       a  SOU S3nU R                  R                  SU S	U S
U S
UR!                  5        S3	5        SSS5        U R                  R                  US-   S9   U R#                  XR                  5        SSS5        M     U R                  R                  [        U5      S9   U R%                  U R                  5        U R                  R'                  U R                   5        U R                  R'                  U R                  5        U R                  R'                  U R                  5        U R                  R'                  U R                  5        SSS5        [)        / [        U5      Q5       GHl  u  p1U R                  R                  US-   S9   U R*                  UR,                     R/                  5        H  u  pxU[        U5      S-
  :  ar  X#S-      n	U R*                  U	R,                     U   n
[0        R3                  U	5      n[5        U	R6                  U5      n[9        X5       VVs/ s H  u  pXU-  -
  PM     nnnU R                  R                  [;        U R<                  U   U SU S
[>        R@                  RC                  U5       S35      5        M     SSS5        U RD                  RG                  U RH                  5        URK                  5         GMo     OU R                  R'                  U R                   5        U R                  R'                  U R                  5        U R                  R'                  U R                  5        U R                  R'                  U R                  5        U R                  R'                  U R                  5        U R                  (       ai  U R                  (       d  U R
                  (       aG  U RL                   S3nU R                  R'                  SU S3SS9  U RN                  RQ                  5         U R                  R'                  U R
                  5        U R                   RS                  5         U R                  RS                  5         U R                  RS                  5         U R                  RS                  5         U R                  RS                  5         U R
                  RS                  5         gs  snf ! , (       d  f       GN= f! , (       d  f       GMX  = f! , (       d  f       GN= fs  snnf ! , (       d  f       GN= f)z
Concat output code from index_code, loads, compute, stores,
suffix into self.body.

For pointwise kernels, this is called just once at the end.

For reduction kernels, this generates a loop over the reduction
axis.
Nr   )r   rsplit_startr  
rsplit_endr  zfor zoffset in range(r  zBLOCK):rE   z = tl.advance(r  z + tl.program_id(1)zR
                if HAS_RSPLIT:
                    triton_helpers.x_grid_barrier(r`  Tr  )*indexing_coder  r  r  r:  r;  rE  rd  r=  r  r<  r  r"  r:  rG  r   r   ri  rm  r   rB  rA  r   r8  r   r   r   r  r  rL   r@  rC   r  r  r  
invalidater<  cache_clearrZ  r[  r  clear)r   r   
loop_treeslevelr:  
loop_startloop_endr  advancement	prev_treeprev_advancement
prev_blockprev_num_itercurprevsem_ptrs                   r{   codegen_bodyTritonKernel.codegen_body}  s    zz{{||%%##'+'7'7H'7t<<d'7
H   S_q%8(4YY%%U%3![[F373M3MSVJ(,(B(B6(RWHX  II''vh&6zl"XJbQWQ]Q]Q_P``gh 4 YY%%UQY%788yyI 87  5 !!Z!9..tyy9		  !3!34		  ,		  .		  - :  ((@)J*?(@AYY%%UQY%7262K2K		3eg3.	 !3z?Q#66(219(=I/3/H/H )0'0), *7)E)Ei)PJ,3IOOZ,PM 25[1S+1SIC !$]&: :1S ( +
 		++( $ 8 8 C#,+^I;bI^I^_jIkHllm n!3 84 ##D$:$:;  "9  B< IIT//0IITZZ(IIT\\*IIT[[)		//0%%""d&:&:--..ABGII33:) <    66BBD		--.  "

$$&""$] I 43 87 :9,+ 87sR   6W4W4!A0W95X?B0X/BX6
X0AX69
X	
X	
X-0X66
Y	c                   / nU R                  5       (       Ga)  / nU R                  SU/ 5        U GH  n[        U[        5      (       a  UR	                  [        U5      5        M5  [        U[        5      (       aM  UR	                  [        [        R                  R                  R                  UR                  5      5      5        M  [        U[        R                  5      (       aC  UR	                  [        [        R                  R                  R                  U5      5      5        M  [        S[        U5       35      e   U$ )Nr   z!Unsupported numel argument type: )rr  add_numel_to_call_argsr  r  r  r   ra   rC   r  r  	size_hint
inner_exprr   r  r#  r  )r   r_  
numel_argsrJ  s       r{   kernel_benchmark_extra_args(TritonKernel.kernel_benchmark_extra_args  s    !!+-J''J;!c3''KKC)_55KKAGG$4$4$>$>s~~$N OPUZZ00KKAGG$4$4$>$>s$C DE$'Hc%TUU " r~   c                z   [        5       nU R                  R                  5       u  p4pVUR                  / SQ5        UR	                  5          [
        R                  " 5       n/ n[        XE5       GH  u  pS[        U5       3n[        R                  R                  U	5      nU(       a  UR                  U S[        R                  R                  R                  UR                  5       5       S[        R                  R                  R                  UR!                  5       5       SUR#                  5        SUR%                  5        S3
5        GOU	[        R                  R&                  ;   a  [        R                  R&                  U	   nUR                  U S[        R                  R                  R                  UR)                  5       5       S[        R                  R                  R                  UR+                  5       5       SUR,                   SUR.                   S3
5        GO[1        U
[2        5      (       a\  [        R                  R                  R5                  U
R6                  5      nSU
R8                  ;   a  S	nUR                  U S
U 35        O[1        U
[:        5      (       ay  [        R                  R=                  5       n[        R                  R                  R5                  U
R                  5      nUR                  U SU SU SU
R.                   S35        O[?        SU	 35      eURA                  U5        GM     URC                  U RE                  5       5        UR                  SSRG                  U5       S35        S S S 5        UR                  / SQ5        [        R                  R=                  5       nURH                  nUR	                  5          UR                  S[        R                  RJ                  RM                  U5       S35        UR	                  5          UR                  [        R                  RJ                  RO                  U5      5        SU 3nUR                  U SU S35        UR                  [Q        [R        RT                  5       SU S35        S S S 5        S S S 5        UR                  / SQ5        UR	                  5          UR                  S[        R                  RJ                  RM                  U5       S35        UR	                  5          UR                  [        R                  RJ                  RO                  U5      5        UR                  S[Q        [R        RT                  5       S35        S S S 5        S S S 5        UR                  / SQ5        UR	                  5          UR                  S5        UR                  S5        UR                  S5        UR                  S5        UR                  SU 35        UR                  S5        UR                  S5        S S S 5        U$ ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       N= f! , (       d  f       N= f! , (       d  f       U$ = f)N)r   r   zdef get_args():arg_z = rand_strided(r  z
, device='z	', dtype=r  r  r   r  z = torch.zeros(z*Don't find the buffer or const tensor for r  r  )
r`  zdef call(args):zwith r  streamz = get_raw_stream(z.run(*args, stream=)r`  r`  z def benchmark_all_configs(args):z.benchmark_all_configs(*args))r`  r`  zif __name__ == '__main__':z<from torch._inductor.runtime.benchmarking import benchmarkerr   zargs = get_args()z:ms = benchmarker.benchmark_gpu(lambda: call(args), rep=40)z	num_gb = zgb_per_s = num_gb / (ms / 1e3)z<print(f"{ms:.3f}ms    {num_gb:.3f}GB    {gb_per_s:.2f}GB/s"))+rM   r_  python_argdefs
writelinesr"  rd  r=  r  r  rC   r  try_get_bufferr   r  
size_hintsget_size
get_stride
get_devicer  	constantsr  rG  devicer,  r  rR   rY  r(  r]  rT   get_current_device_or_throwKeyErrorr  extendr\  r   r   
device_opsdevice_guard
set_devicer   r8   KERNEL_NAME)r   num_gbrN  _argdefs	call_argsr  r  name_cnt	var_namesarg_namearg_sigvar_namer  const_tensorsymval_hintrj  r=  current_devicer   stream_names                       r{   codegen_kernel_benchmark%TritonKernel.codegen_kernel_benchmark  s   !,0II,D,D,F)Y56]]_ (HI%(%>!!$x.!12gg,,X6$$#*$4QWW5E5E5P5PQTQ]Q]Q_5`4aacdedkdkdtdtdd  AD  AO  AO  AQ  eR  dS  S]  ^a  ^l  ^l  ^n  ]o  ox  y|  yF  yF  yH  xI  IJ  K !2!22#$77#4#4X#>L$$#*$4QWW5E5E5P5PQ]QbQbQd5e4ffhijipipiyiy  jE  jE  FR  FY  FY  F[  j\  i]  ]g  ht  h{  h{  g|  |E  FR  FX  FX  EY  YZ  [  11"#''"2"2"<"<W\\"JK
 %4&'$$z[M%BC66WW@@BFGG,,66w}}EE$$#*OE7*VHIV]VcVcUddef #DXJO    *A &?B T==?@wtyy';&<A>?K N 	9:<<>$$]]_uQWW%7%7%D%DU%K$LANO  GG&&11%8 !'ug.  K=0B5'!KL  ;22344G}TUV !  	JK]]_uQWW%7%7%D%DU%K$LANO  GG&&11%8   c+"9"9:;;XY	 !  	DE]]_N R 01L y12=>N   g _X ! _  ! _ _  sf   MY&AY73BY%6Y7)AZ6A%Z	ZA;Z+
Y"%
Y4	/Y77
Z	
Z	Z
Z(+
Z:c                    [         R                  " SR                  [        R                  R
                  R                  S5      5      5      $ )Nzl
            from torch._dynamo.testing import rand_strided
            {}
            import torch
        get_raw_stream)textwrapdedentr`  rC   r  rn  import_get_raw_stream_asr   s    r{   imports_for_benchmark_kernel)TritonKernel.imports_for_benchmark_kernelK  s:     F177%%>>?OPQ
 	
r~   c                    U R                   (       a  gU R                  (       a  gU R                  (       a  U R                  (       d   egU R                  (       a  gg)Nr7  rG  rx  r  	pointwise)r7  rG  rx  r=  r   s    r{   _get_heuristicTritonKernel._get_heuristicT  sD    !''*&&(((()""r~   c                    [         R                  R                  R                  5       [         R                  " 5       [
        R                  [
        R                  [
        R                  R                  [
        R                  [
        R                  [
        R                  [
        R                  [
        R                  [
        R                  R                  [
        R                  R                   [
        R                  R"                  S.n [         R$                  R&                  b  SU S'   [
        R(                  " 5       (       a  SU S'   [
        R*                  (       aL  [
        R*                  U S'   [
        R,                  U S'   [
        R.                  U S'   [
        R0                  U S'   [
        R2                  (       a9  [
        R2                  U S	'   [
        R4                  U S
'   [
        R6                  U S'   U $ )N)backend_hash$are_deterministic_algorithms_enabledassert_indirect_indexingautotune_local_cacheautotune_pointwiseautotune_remote_cacheforce_disable_cachesdynamic_scale_rblockmax_autotunemax_autotune_pointwisemin_split_scan_rblockspill_thresholdstore_cubinTis_hipr  profile_bandwidthprofile_bandwidth_regexprofile_bandwidth_output/profile_bandwidth_with_do_bench_using_profilingcoordinate_descent_tuning coordinate_descent_search_radius'coordinate_descent_check_all_directions)rt   utils_tritontriton_hash_with_backendr  r   r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )inductor_metas    r{   inductor_meta_common!TritonKernel.inductor_meta_common`  sx    "KK//HHJ494^4^4`(.(G(G$*$?$?"(--"B"B%+%A%A$*$?$?$*$?$?"//&,&C&C%+]]%H%H%}}<<!==44
 ==(&*M(#)-M+&##171I1IM-.7=7U7UM348>8W8WM45FF KL ++00 56 77 <= >> CD r~   c                  ^ ^! [        5       n0 nU R                  R                  5        H  u  pE[        U5      (       a  U R                  (       d  M(  [
        R                  R                  R                  U5      n[        U[        [        R                  45      (       d  SnO[        [        U5      5      nXsU'   M     Uc  UR                  [        5       5        [
        R                  R!                  5       R"                  nUS:X  a  UR                  S5        OUR                  S5        [$        R&                  (       a  UR                  U R)                  5       5        U R*                  R-                  5       u  m n	m!n	[/        T!5       H  u  p[        U[0        5      (       d  M  [3        [        R4                  UR6                  5      nU[
        R                  R                  R8                  ;   d  Mj  [1        UR:                  [
        R                  R                  R8                  U   5      T!U
'   M     [<        [>           " 5       nU R@                   GH'  nXR*                  RB                  ;   a(  URE                  U R*                  RB                  U   5        XR*                  RF                  ;   am  U[
        R                  RH                  ;  aO  XRH                  ;  a@  URE                  [3        [J        U R*                  RF                  U   5      RL                  5        XR*                  RN                  ;   d  M  U R*                  RN                  U   n[        U[P        5      (       a   eURE                  U5        GM*     [S        T T!5       HX  u  nn[        U[T        5      (       d  M  URV                  [X        RZ                  :X  d  M=  URE                  UR:                  5        MZ     []        U5      nU R_                  5        H[  n[1        UR`                   S3URb                  5      nT!Re                  U5        T Re                  [g        UR:                  5      5        M]     U U!4S jnU Rh                   HY  nURj                  (       a  U Rl                  (       a  M'  URn                  c  M6  U" UR`                  Rq                  5        S35        M[     U Rr                  (       a  U" S5        [u        T!U Rv                  T S	9nU[x        Rz                  " [
        R                  R!                  5       5      0 S
.n[
        R                  R|                  =(       d    [
        R                  R~                  nU R                  5       R                  [        U R                  5      [?        [        R                  5      UUU R                  U R                  U R                  S.U R                  5       EnU Rr                  (       a  U Rl                  US'   S n[$        R&                  (       d  [$        R                  (       a  U R                  5       S-  nUUS'   [        T!5      /US'   [        T!5       H  nSUS   T!U   R:                  '   M     UU lN        U R                  5         U R                   H%  nUR                  S5        UR                  U5        M'     U R                  (       a5  SU R                  5        SU R                  R$                  < SU< SU< S3	nOU R                  (       a>  U R                  R                  5       nSU R                  5        SU< SU SU< SU< S3nO^Sn[        U5      S:X  a  [        [        T!5      5      S:X  a  SnOSnSU R                  5        SU< SU SU< SU< S U R                   S3nUR                  U5        UR                  S!U=(       d    [?        [        R                  5       S"SR                  S# T  5       5       S$35        UR                  5          U R                  U5        U R*                  R                  5        H  u  nnUR                  U S%U 35        M     UR                  U R                  5        S S S 5        [$        R&                  (       a   UR                  U R                  U5      5        UR                  5       $ ! , (       d  f       NS= f)&Ni    cpuz"triton_helpers.set_driver_to_cpu()z"triton_helpers.set_driver_to_gpu()r  c                   > [        5       (       a  TR                  [        U 5      5        TR                  [        U SS95        g )NT)is_constexpr)r>   r  rI   rG   )rw  argdefsr  s    r{   add_constexpr_arg6TritonKernel.codegen_kernel.<locals>.add_constexpr_arg  s2    -//  h!78NN78$?@r~   r   r  )
size_dtyper  )r  rj  ri  )	grid_typerD  kernel_namemutated_arg_namesoptimize_memr9  num_loadnum_reductionrx  g    eAkernel_num_gbconfigsrE   ri  r   z$
                @triton_heuristics.z(
                    config=zI,
                    filename=__file__,
                    triton_meta=z$,
                    inductor_meta=z;
                )
                @triton.jit
            z!(
                    size_hints=z%,
                    reduction_hint=r   r2  ztile_hint=TileHint.SQUARE,ztile_hint=TileHint.DEFAULT,r  zH
                    filename=__file__,
                    triton_meta=z*,
                    min_elem_per_thread=zdef r  c              3  @   #    U  H  oR                  5       v   M     g 7frr   )	full_namer  s     r{   r   .TritonKernel.codegen_kernel.<locals>.<genexpr>Z  s     Cc[bVWKKMM[br  r  r  )arM   r>  r8  r9   r=  rC   r  r  symbolic_hintr  r  r   rY  r-   r   r   rk  r  r   benchmark_kernelr  r_  rb  r<  rR   r	   r   r(  inv_precomputed_replacementsr]  r   r   	mutationsinput_buffersr  r  removed_buffersrN   
inner_nameoutput_buffersrQ   r  rT   	zero_moderU   ZERO_ON_CALLr  r  r:  r  r  rG   rE  r@  rx  
tensor_dimr   rG  r`   r  r)   rO  is_inferenceis_backward_get_grid_typers   setrD  r8   DESCRIPTIVE_NAMEr9  r  r  r  r  estimate_kernel_num_bytesr\   r]   rE  rU  r3  r   r7  r  r?  get_reduction_hintr  r^   r  rq  r   r"  codegen_static_numelsaliasesr  r~  r   )"r   r]  codere  r:  r  
numel_hintrY  device_typer  r  rJ  r  mutated_argsmutationmutation_argargnamer   sizeargr  triton_meta_signaturerE  r  r  rr  arg_numr  heuristics_linereduction_hint	tile_hintoldnewr  r  s"                                   @@r{   codegen_kernelTritonKernel.codegen_kernel  sF   
![[..0MF"6**43H3H))77>Jj3*>?? !	+C
O<	!*v) 1, <KK134''==?DDKe#@A@A&&D==?@#'99#;#;#= Iq	*FA#w'' ellCHH5QWW--JJJ#*!''"2"2"O"OPV"W$IaL + "#(H99222  !8!8!BCII555AGG$;$;;$8$88  )B)B8)LMXX 99333#yy77A%lJ????  . '6  3LGS3--MM%6%C%CC  . 4 l+++-DU3TZZ@GW%NN77<<01 .	A $$D  T%>%>&!2!2!4 5U;< % %%h' 1$"2"2G!
 /&--agg.Q.Q.ST'
 ww++Bqww/B/B ,,.77!$"5"56{;;<!-(!//
 '')
 %%484M4MM01""f&>&>335;F-3M/*"+I"6!7I +95G@AK$Yw%7%<%<= 6 '++FNN2KK , #$$($7$7$9#: ; --447 8!!, 0##0"3 4O ""!]]==?N#$$($7$7$9#: ;  *~ .$$2#3 4!!, 0##0"3 4	O I:!#/	:;q@ <I =I#$$($7$7$9#: ;  *~R	{ ;!!, 0##0"3 4))-)A)A(B C	O 	O$473{6678$))Cc[bCc:c9ddfg	
 [[]&&t, II--/S#c#/0 0KK		"	  ""KK55f=>}} ]s   3A'c''
c5c                   [         R                  R                  R                  U 5      n [	        U [
        R                  [        45      (       a  [        U 5      n[        U5      nU$ Sn[         R                  R                  R                  X5      (       dI  US:  a  [        SU  35      eUS-  n[         R                  R                  R                  X5      (       d  MI  U$ )N   i @  z!Failed to find static RBLOCK for r   )rC   r  r  simplifyr  r   rY  r  r-   statically_known_leqr#  )r;  rL  s     r{   _get_persistent_RBLOCK#TritonKernel._get_persistent_RBLOCKg  s    !!**62fu}}c233f+C!#&C 
 Cgg&&;;FHH?$'H%QRRq gg&&;;FHH 
r~   c                P     [         R                  U 5        g! [         a     gf = f)NTF)r2  r  r#  )r;  s    r{   has_persistent_RBLOCK"TritonKernel.has_persistent_RBLOCKu  s*    	//7 		s    
%%c                   S	S jnU R                    GHt  nUR                  (       a  U R                  (       ai  [        R                  R
                  R                  UR                  5      nU" U5      (       a)  UR                  UR                   S[        U5       35        UR                  (       a  U R                  (       a  U R                  (       a1  U R                  U R                  UR                  5      5      nSU S3nOU R                  UR                  5      nUR                  UR                  R!                  5        SU 35        UR                  S:X  d  GMO  U R"                  (       d  GMc  UR                  S5        GMw     g)
ay  
We get a small speedup from hard coding numels if they are static.

This code stomps on the passed-in values by writing an constant to the top of the kernel.

In a kernel like:
def KERNEL_NAME(in_ptr0, in_ptr1, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):

We would add
xnumel = 4096
r0_numel = 768

After the signature, before the kernel code, if we decided to make these static. As its hardcoded, it becomes
a better signal to triton on how to unroll and do some static indexing. So, it's not so much that downstream
knows that its a static numel, as that you just plop a constant into the kernel.
c                B    [        U [        R                  [        45      $ rr   )r  r   rY  r  )r(  s    r{   is_static_integer=TritonKernel.codegen_static_numels.<locals>.is_static_integer  s    dU]]C$899r~   znumel = z*triton_helpers.constexpr_next_power_of_2((z + RSPLIT - 1) // RSPLIT)zBLOCK: tl.constexpr = r3  zXBLOCK: tl.constexpr = 1N)r(  r   r   r   )rE  r@  r=  rC   r  r  r  r  r   r:  r  rx  rG  r4  r  r  r   r9  )r   r  r  r   simplified_tree_numelr  rL  s          r{   r  "TritonKernel.codegen_static_numels}  s   $	: $$D$$(=(=()(8(8(A(A$**(M%$%:;;NNdkk](3?T;U:V#WX  T%>%>-- JJt';';DJJ'GHEFugMfgC55djjAC$++"3"3"5!66LSERS{{c!dmmm9: %r~   c                   [        U R                   Vs/ s H  n[        UR                  (       + 5      PM     sn5      nU R                  (       a  US:X  d   e[
        R                  $ US:X  a  [
        R                  $ US:X  aN  [        [        U R                  U R                  5      5      (       a  [
        R                  $ [
        R                  $ US:X  a  [
        R                  $ [        SU 35      es  snf )NrE   r   r   z"Unsupported number of dimensions: )r  rE  r  r@  rG  r&   CooperativeReductionGridGrid1Dr   r   needs_yz_grid_overflowGrid2DWithYZOverflowGrid2DGrid3Dr#  )r   r   r  s      r{   r  TritonKernel._get_grid_type  s    8H8HI8H***+8HIJ%%6M6$===!V$+++!V3t22D4D4DEFF(===$+++!V$+++=aSABB Js   $C6c                   U R                    H  n[        UR                  [        R                  [        R
                  45      (       a  UR                  nO)[        R                  R                  R                  X5      nUR                  (       a  U R                  (       d  M  UR                  U5        UR                  [        U5      5        M     g rr   )rE  r  r  r   rY  r   rC   r  wrapper_codegenerate_numel_exprr@  r=  r  r  )r   r]  rt  	arg_typesr   r(  s         r{   rX  #TritonKernel.add_numel_to_call_args  s    $$D$**u}}ell&CDDzzww++??K$$(=(=(=  &  d, %r~   c                   [         R                  R                  nUR                  5         U R                  R                  5       u  pEpFU R                  XU5        U R                  R                   H  nUR                  U5        M     UR                  UUSUU R                  S9  [        U R                  R                  5       H  nUR                  U5        M     g )NT)r   r  rE  )rC   r  r  write_triton_header_oncer_  rb  rX  workspace_argsgenerate_workspace_allocationgenerate_kernel_callrE  rB  generate_workspace_deallocation)r   r]  r  wrapperr  rt  r  wss           r{   call_kernelTritonKernel.call_kernel  s    ''&&((*%)YY%=%=%?"a##DY?))**B11"5 + 	$$(( 	% 	
 499334B33B7 5r~   c                   [         R                  R                  nU R                  R	                  5       u  p#pB[        X45       H  u  pV[        U[        5      (       d  M  [         R                  R                  (       a  UR                  SU SU S35        MU  SU S3nUR                  U5        SU S3nUR                  U5        M     g )Nz:AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_check_inf_and_nan("z", z));zassert not z.isnan().any().item()z.isinf().any().item())
rC   r  r  r_  rb  r  r  rS   cpp_wrapperr   )r   r  r  rt  arg_signaturesrJ  arg_signaturer  s           r{   codegen_nan_checkTritonKernel.codegen_nan_check  s    ''&&*.))*B*B*D'n"%i"@C-3377&&%%TUXTYY\]`\aade )-BCD%%d+(-BCD%%d+ #Ar~   c                    [        U0 UD6$ rr   )r@  )r   r_  rI  s      r{   create_cse_varTritonKernel.create_cse_var  s     $1&11r~   c                   UR                    SU R                  U R                  UR                  5      5       3nUR                  R
                  (       a  U R                  R                  U5        g U R                  R                  U5        g r  )	r]  r4  r  r(  rootrd  rE  r   r  )r   entryr  s      r{   codegen_iteration_ranges_entry+TritonKernel.codegen_iteration_ranges_entry  sd    **SD,@,@,L!M NO::((. II%r~   c                >   UR                   c   eU R                  UR                   5      nU R                  nUS:w  a  SU S3OSnU R                  (       a'  U R                  (       a  UR
                  (       a  U S3nSUR                  R                  5        SU U 3$ )Nr  rc  r  r   z + rsplit_startztl.arange(0, zBLOCK))r  indexing_size_strr  rG  rx  r@  r:  r   )r   r  r  r  r9  s        r{   rj  )TritonKernel.iteration_ranges_ranges_code  s    +++%%e&6&67&&*5*C4}A&&&))""x/Fu||1134F4&IIr~   c                ^    U R                   nU R                  5       nS/U-  nSU SU SU S3$ )NrE   r  r  r  )r  r  )r   r  r  r  r  r  s         r{   iteration_ranges_scalar_code)TritonKernel.iteration_ranges_scalar_code  sC     &&&&(sTz$r%;-q99r~   c                0   UR                   c   eSUR                    S3nU R                  U5      (       a#  SU SUR                   S-    SUR                    S3nUR                  R                  X"5      nU R                  S:w  a  U S	U R                   S3$ U$ )
Nztl.program_id(r  r  z + tl.program_id(rE   z) * tl.num_programs(r  r  rc  )rX  r  	pid_cacher  r  )r   r  rK  pids       r{   iteration_ranges_get_pid%TritonKernel.iteration_ranges_get_pid  s    ~~)))u~~.a0 &&u-- cU+ENNQ,>+??STYTbTbSccefCoo!!#+z)U$t//022
r~   c                   UR                   S:H  =(       aq    UR                  (       + =(       aY    U R                  (       + =(       aA    [        R                  R
                  R                  UR                  [        5       5      (       + $ r  )	rX  has_zdimrG  rC   r  r  r  r  r,   )r   r  s     r{   r  #TritonKernel.needs_yz_grid_overflow  sa    NNa YNN"Y...Y GG$$99%++~GWXX		
r~   c                    U R                   (       a   U R                   UR                  5        S3   $ [        UR                  5          $ )Nr   )r7  r   r*   )r   r:  s     r{   r  TritonKernel.max_block  s;    $$'7u%=>>//r~   c                   U R                   (       d  gU R                  (       a[  UR                  R                  5        S3U R                  ;   a0  U R                  UR                  R                  5        S3   S:X  a  gO:[        R
                  R                  R                  UR                  S5      (       a  gUR                  (       a-  U R                  (       a  U R                  UR                  5      nO?UR                  S:X  a  U R                  (       a  SnOU R                  UR                  5      nUR                  (       a#  U R                  (       a  X R                  5       -  n[        R
                  R                  R!                  UR                  U5      (       ae  UR"                  S:g  =(       dO    UR$                  =(       d<    [        R
                  R                  R'                  UR                  [)        5       5      $ g)NFr   rE   Tr3  )r6  r7  r:  r   rC   r  r  r  r  r@  rx  r  r9  r  rG  r  rd  rX  r  r  r,   )r   r   r  s      r{   rV  TritonKernel._has_constant_mask  sm   !!DKK$5$5$7#8!>$BSBS!S  DKK$5$5$7#8!>?1D E ww77

AFF !:!:33DJJ?I[[CDMMIt{{3I!;!;!OO$55I 7788YOO" W==W77##88^EUV r~   c                f    U R                   S   nUR                  S:X  d   eU R                  U5      $ )Nr   r3  )rE  r:  rV  )r   xtrees     r{   r_   TritonKernel._has_constant_xmaskG  s5      #||s"""&&u--r~   c                    U R                    H9  nU R                  U5      (       d  M  UR                  UR                   S35        M;     UR                  S5        g )Nr   r   )rE  rV  r  r:  )r   r   r   s      r{   r  TritonKernel.filter_masksL  sL    $$D&&t,,!!T[[M"67 %
 	&!r~   c                    [        [        R                  5      S U R                    Vs/ s H  n[        U   PM     sn$ s  snf rr   )rA  r   r   r<  r   )r   r   s     r{   get_reduction_prefixes#TritonKernel.get_reduction_prefixesT  sG     ]::;<Ud>U>UV
V tV
 	
 
s   ?c                   U R                    Vs/ s H  o"R                  (       d  M  UPM     nnSR                  [        S U 5       5      5      nUR	                  SU R                  U5       35        U R                    Vs/ s H3  nUR                  (       d  M  [        R                  UR                     PM5     nn[        U5      nUR	                  SU R                  U5       35        gs  snf s  snf )zN
Generates code that flattens ND reduction numels, block sizes, etc. into 1D.
r"  c              3  >   #    U  H  oR                    S 3v   M     g7f)r  NrI  re  s     r{   r   8TritonKernel.codegen_reduction_numels.<locals>.<genexpr>a  s     "U_Tkk]%#8_rK  z	rnumel = zRBLOCK: tl.constexpr = N)
rE  r@  r   r  r   r4  r   r   r   r;   )r   r  r   reduction_treesr;  	rn_blocksrn  s          r{   rF  %TritonKernel.codegen_reduction_numels[  s    
 -1,<,<R,<D@Q@Q4,<RF"U_"UUV	$**V"4!567
 ((
(   1M%%dii0( 	 

 y)/

60B/CDE S

s   C,C,C1!C1c                    U R                  5       nU Vs/ s H  n[        R                  " U U 340 UD6PM      sn$ s  snf )z;
Helper to initialize symbols like rn_numel, rn_base, etc.
)r+  r   r   )r   r9  rI  rn_prefixesr:  s        r{   rk  #TritonKernel._get_reduction_symbolsm  sA     113JUV+xx0;F;+VVVs   %=c                    U R                  5       nU R                  SSSS9n[        [        U5      S-
  5       Vs/ s H  n[	        X#S-   S 5      PM     sn[
        R                  " S5      /-   $ s  snf )z
Compute coefficients to convert ND reduction indices to linear indices.
For example:
  rindex = r0_index * r1_numel * ... * rn_numel + ... + rn_index.
r  Tr   rE   N)r+  rk  rC  r  r;   r   rY  )r   r4  	rn_numelsrg  s       r{   _get_reduction_index_coeffs(TritonKernel._get_reduction_index_coeffst  s~     113//PT/U	;@[AQTUAU;V
;VCM)!GI./;V
]]1 	 
s   A0c                8    U R                  5       n[        X!5      $ )z;
Compute linear reduction indices from N dimensional ones.
)r8  r:   )r   
multi_indscoeffss      r{   rl  'TritonKernel._flatten_reduction_indices  s     113,,r~   c                   U R                  SSSS9nU R                  SSSS9nU R                  U5      nUR                  SU R                  U5       35        U R                  U5      nUR                  SU R                  U5       35        g)zH
Generates code that converts ND reduction indices into linear indices.
r   Tr   r   z
roffset = z	rindex = N)rk  rl  r   r  )r   r  
rn_offsetsrn_indsrS  rindexs         r{   rm  &TritonKernel.codegen_reduction_indices  s    
 00d 1 

 --gtQU-V 11*=
4#4#4W#=">?@009	$"3"3F";!<=>r~   c                   UR                   nUR                  (       a%  UR                  UR                   SU SU S35        OUR                  cD  UR                  UR                   SU R                  U5       35        UR                  U S35        OUR                  b  U SU R                  U5       3nOU R                  X S35      nUR                  U SU R                  U5       SUR                  5        S3UR                   SU 3/5        U R                  U5      (       a(  U R                  5       nUR                  U S	U S
35        g UR                  U SUR                   SU S35        g )Nr  z	offset + rh  z
offset = 0r   z	offset = r"  r   zmask = tl.full(z, True, tl.int1)zmask = z < r  )r:  rd  r   r]  rX  rj  r  r  rc  r  r   rV  r  )r   r  r  r3  r  r7  s         r{   ri  ,TritonKernel.iteration_ranges_codegen_header  sc    LL==NNejj\QCy4@A^^#NNejj\T-N-Nu-U,VWXNNaS
+,+Id&G&G&N%OP88#VMOOc4#@#@#G"HAGGI;V[\zzl#dV, ""5))'')ENNaSw6FGHNNaS

|3qc?@r~   )rC  rD  r>  r@  r[  r  r7  r3  r=  r  r6  r<  rA  r:  r;  rZ  rE  )r   TN)rK  zdict[str, sympy.Expr]r7  zOptional[FixedTritonConfig]r   r   r,  rM  r   r   r   r   )r   r   )r   )r]  r   r=  r   r  r   r   ztuple[str, str])r(  r   r  r   r  r   r   r   )r]  r   r   r   rr   )
r]  r   r   r   r  rK   r  rB   r   r   NN)rf  rK   r&  z.tuple[str, sympy.Expr, sympy.Expr, sympy.Expr]r'  rK   r(  rM  r  r   r)  z Optional[tuple[str, sympy.Expr]]r*  zOptional[CSEVariable]r   rK   )r  r   r,  rM  r   r   )
r,  rM  rz  rM  rX  rA   r  +Union[CSEVariable, tuple[CSEVariable, ...]]r   rG  )r,  rM  )r]  r   r   r   r  rG  )r  tuple[torch.dtype, ...]r   r   )r  rH  r  zUCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]rf  tuple[CSEVariable, ...]r   rI  )
r  rH  rf  rI  r=  r   r>  r   r   rI  )r   r  )r   z type[triton_heuristics.GridExpr])r]  r   r  zOptional[IRNode]r  )r   r@  )r  rX   )r  rY   r   r   )r  rY   r  r   r   r   )r  rY   r   r   )r:  r   r   r  )r   rY   r   r   )r   r   r   r   )r  rM   r   r   )r9  r   r   zlist[sympy.Symbol]r  )r;  r4  r   r   )r  rY   r  rM   r   r   )Ors   r   r   r   r  r  r   r  r4  r  rC  rN  rR  rH  rJ  rI  rr  ru  ry  r   r|  r  r  r  r  r  r  r  r  r2  r8  r?  r  r  r  rt  r  r  r  r  r  r  r-  r@  rU  r\  r~  r  r  r  r  r  r  r  r  r  rX  r  r	  r  r  rj  r  r  r  r  rV  r_  r  r2   r+  rF  rk  r8  rl  rm  ri  r   rN  rO  s   @r{   r2  r2    s   %I%%).E&.O
 48%3%%3
 2%3 
%3 %3N"

#J*:0

 " " fTfTR	 EG- - !- -<- 	- ^8MM M 	M
 M4EP SW(( *(3>(FO(	(T  480411 C1 &	1
 $1 1 11 .1 
1f.
"DD D &	D
 ;D 
5DL&DO&"9
v5(&!
%6(( ( ;	(T1Sfm"'m"
m" (m" 
!m"^>"'>" (>" 	>"
 >" 
!>"@b%H Xt

 % %N]~    $;LC
-8(,2&J:(:14:	:
0
(T.
" 
 
F$W 
 
-? A(A0>A	A Ar~   r2  c            
      z  ^  \ rS rSr% \rS\S'   \" \R                  \R                  \R                  \R                  \R                  \R                  \R                  \R                   /5      rSU 4S jjr\SS j5       rS rS rSSS jjr S   SS	 jjr        SS
 jr        SS jrS rSrU =r$ )TritonSchedulingi  z	type[Any]kernel_typec                   > [         TU ]  U5        Ub  [        US5      (       d  g UR                   H+  n[	        U[
        [        45      (       d  M   [        Ul        M-     g )Nr  )	rB  rC  r   r  r  r1   r/   debug_triton_codedebug_device_str)r   	schedulerr  rE  s      r{   rC  TritonScheduling.__init__  sN    #GIw$?$?OOD$0B CDD(9% $r~   c                    [         R                  R                  (       d  [         R                  R                  (       a'  [	        / U R
                  Q[        R                  P5      $ U R
                  $ rr   )r   r   cooperative_reductionsforce_cooperative_reductionsr   backend_featuresrH   REDUCE_TO_SINGLE_ELEMENT)rw   rj  s     r{   get_backend_features%TritonScheduling.get_backend_features  sR     MM00}}99P#&&P(O(OP  ###r~   c                  ^ [         R                  R                  n[        X5      u  p4U(       a  UR	                  U5        [
        R                  (       a  SSKJnJ	m  [        U4S jU 5       5      (       db  U Vs/ s H%  n[        Xe5      (       d  M  UR                  5       PM'     nnUR	                  UR                   SSR                  U5       35        g g g s  snf )Nr   )r.   ForeachKernelSchedulerNodec              3  <   >#    U  H  n[        UT5      v   M     g 7frr   )r  )r   r  rZ  s     r{   r   3TritonScheduling.codegen_comment.<locals>.<genexpr>  s      CPa
1899=s   z Fused node name list: r  )rC   r  r  r6   r   r   debug_fusiontorch._inductor.schedulerr.   rZ  r   r  get_namecommentr   )	r   node_scheduler  origins_detailed_originsr.   r  
node_namesrZ  s	           @r{   codegen_comment TritonScheduling.codegen_comment  s    ''&&%8%P"g&
  CP   +*!!7 !AJJL*  
 !!''>tyy?T>UV s   <CCc                   [         R                  R                  nXR                  ;   a  UR                  U   nU$ [        R
                  R                  (       a$  [        U[        R
                  R                  5      OSn[        U5      S S nSR                  SXvUR                  5       /5      nXTR                  U'   [        R
                  R                  (       a  UOSnUR                  [        [        R                  5      U5      nUR                  [        [        R                   5      U5      nUR                  SS5      n[#        [%        UR'                  5       5      S5      u  pn[)        5       n[*        R-                  5       (       a  [*        R                  X5        UR/                  S	U< S
35        UR1                  USS9  [         R                  R3                  5       nUR/                  SUR4                   S35        SU 3n[7        X$5      u  nnUSU-   S-   U-   -  nUR9                  X\R;                  5       U5        [<        R>                  " S5      (       a  [<        R@                  " X[U5        U$ )Nr   r   r  r   triton_z#pragma CMT#pyzasync_compile.triton(z, '''Tr  z''', device_str='z')z# kernel path: r`  kernel_metadata)!rC   r  r  src_to_kernelr   r   descriptive_namesr5   rD   r   next_kernel_suffixunique_kernel_namesreplacer   r8   r  rq  r#   r"   r  rM   async_compileuse_process_poolr   r   rk  r  r6   define_kernelr   r    is_metric_table_enabledlog_kernel_metadata)r   src_codera  r  r  r  
fused_namekernel_category	subs_name	_basenamer  kernel_pathcompile_wrapperr|  metadata_commentrb  detailed_originss                    r{   rs  TritonScheduling.define_kernel  s*   ''&&,,,!//9Kj c ==22 &mV]]5T5TU 
 AJ2ANO((?8R8R8TUK /:!!(+'-}}'H'HiI
  ''K,H,H(I;WH''K,C,C(DiPH  ''s;H(08>>;K1Ld(S%I+,.O--// $$Y9%%(=i]%&PQ""84"8WW@@BN%%(9.:M:M9Nb&QR!0>(;M(S%G%w 58H HH!!5579I ../@AA++KhOr~   c                    U R                  USS9n[        R                  " U5      nU R                  XB[	        S U 5       5      S9$ )zk
Benchmark fused list of nodes and return the execution time
in milliseconds on randomly generated inputs.
T)r  c              3  @   #    U  H  oR                  5       v   M     g 7frr   r_  r   r  s     r{   r   9TritonScheduling.benchmark_fused_nodes.<locals>.<genexpr>3  s     :WQVA::<<QVr  )rd  )generate_kernel_code_from_nodesr$   r  benchmark_codegened_moduler   )r   r  n_spills_thresholdrv  r  s        r{   benchmark_fused_nodes&TritonScheduling.benchmark_fused_nodes+  sS    
 77PT7Ux(..
:WQV:W0W / 
 	
r~   c                  ^^	^
^^^ [        [        R                  R                  5      n[	        5          UR                  [        R                  R                  5       5         SmU4S jm
U
U4S jnU
4S jnUb  UO[        S/5      n[        R                  SUTR                  5        U" 5       mTb   TTR                  4sSSS5        sSSS5        $ TR                  5       m	TR                  mTR                  m T" TR                  " T	6 S   5        TR(                  n[+        U5      S
:X  d   eUS   R,                  U:  a  ['        S	5      mOS[.        R0                  " U	UU4S j5      m[+        TR2                  5      S:  a  T[.        R0                  " U	U4S j5      -
  m[        R                  SUT5        U" 5         TTR                  4sSSS5        sSSS5        $ ! [         as  n[         R"                  R$                  (       a  e [        R                  SUU5        ['        S	5      mU" 5         TTR                  4s SnAsSSS5        sSSS5        $ SnAff = f! , (       d  f       O= f SSS5        g! , (       d  f       g= f)z$Benchmark an already compiled moduleNc                    > T R                   c   e[        R                  R                  T R                   5      S   S-   $ Nr   z.kernel_perf__file__ospathsplitextr  s   r{   cache_file_pathDTritonScheduling.benchmark_codegened_module.<locals>.cache_file_pathA  s6    ||///ww''5a8>IIr~   c                    > T" 5       n [        U S5       nUR                  [        T5      5        S S S 5        g ! , (       d  f       g = f)Nwopenwriter   )r  fdr  mss     r{   store_cache@TritonScheduling.benchmark_codegened_module.<locals>.store_cacheE  s0    &($_HHSW% %__s	   9
Ac                    > T" 5       n [         R                  R                  U 5      (       a.  [        U 5       n[	        UR                  5       5      sS S S 5        $ g ! , (       d  f       g = frr   )r  r  existsr  floatreadr  r  r  s     r{   
load_cache?TritonScheduling.benchmark_codegened_module.<locals>.load_cacheJ  sJ    &(77>>$''dr$RWWY/ $ $s   A
A*unknown%kernel src code for %s written to: %sr   z*Exception (%s) in compiling fused nodes %sinfrE   c                 4   > T" TR                   " T 6 S   5      $ rX  
clone_argsr_  callwrapped_jit_functions   r{   rN  =TritonScheduling.benchmark_codegened_module.<locals>.<lambda>z      D!5!@!@$!G!JKr~   c                 "   > TR                   " T 6 $ rr   r  r_  r  s   r{   rN  r    s     4 ? ? Fr~   z+The fused kernel for %s took %.3f ms to run)r   rC   r  r  r   rj  rk  r   r~  debugr  get_argsr  rh  r  	Exceptionr   r   .disallow_failing_autotune_kernels_TESTING_ONLYr  	launchersr  n_spillsr'   benchmark_gpur  )r   r  r  rd  device_interfacer  r  r  r  r_  r  r  r  r  s    `       @@@@@r{   r  +TritonScheduling.benchmark_codegened_module6  s/    4AGG4G4GH ##AGG$G$G$IJBJ&
 )4
*i[:Q  II7
 B~3<<'= KJ ! B <<>D88D#&;; ()44d;A>? -66Iy>Q&&& |$$'995\ !..K +==>Bk77F B II=
 Ms||#] KJ ! N  
(==OO		@
 5\3<<''a KJ ! N
(M KJJ !  sh   .I="AI"	I=(I"=G"B:I"	I="
I,AIII"	I=II""
I0	,I==
Jc                   UR                  S5      nU=(       a     [        S UR                  5        5       5      nU R                  nU(       a  SSKJn  UnU(       a  SUS'   UR                  S5      (       a
  SUS	'   SUS'   [        R                  UR                  5      (       d  UR                  S	5      (       a   eSUS	'   [        R                  R                  XaX#5      nU" U0 UD6nU R                  XU5      $ )
Nr-  c              3  @   #    U  H  oR                  5       v   M     g 7frr   )is_split_scan)r   r  s     r{   r   9TritonScheduling.create_kernel_choices.<locals>.<genexpr>  s      (
-NT  -Nr  rE   )TritonSplitScanKernelFoverride_cooperative_reductionr@  Toverride_persistent_reduction)contains_opr   scheduler_nodesrL  triton_split_scanr  r2  r  reduction_numelr  rC   rQ  triton_kernel_kwargsadd_multi_kernel_choices)	r   kernel_featureskernel_argskernel_kwargsis_scanr  rL  r  r  s	            r{   create_kernel_choices&TritonScheduling.create_kernel_choices  s    "--f5 
C (
-<-L-L-N(
 %
 +/*:*:@/K>CM:; &&v..=AM9:>CM:;11/2Q2QRR$(()HIIII=BM9:		66+
 k;];,,V-PPr~   c           	     $   U/n[         R                  R                  (       d  U$ UR                  =(       a    UR	                  S5      (       + nUR
                  =(       a    UR	                  S5      (       + nU(       a%  UR                  U R                  " U0 UDSS0D65        U(       a  UR                  R                  n[        R                  R                  R                  US5      (       ae  UR                  U R                  " U0 UDSS0D6=n5        U(       a7  UR                  (       a&  UR                  U R                  " U0 UDSSS.D65        [        U5      S:  a-  USS   H  n	UR                  U	l        M     UR!                  S S9  U$ )	Nr  r  Fi   )r  r  rE   c                    U R                   $ rr   )rx  )ks    r{   rN  ;TritonScheduling.add_multi_kernel_choices.<locals>.<lambda>  s
    q'='=r~   )rK  )r   r   multi_kernelrx  r  rG  r  rL  r?  r  rC   r  r  r  r  must_keep_buffersr@  )
r   r  r  r  kernelsoptional_persistentoptional_cooperativer;  r  kernel2s
             r{   r  )TritonScheduling.add_multi_kernel_choices  s    (.h}}))N$99 
-BSBS+C
 ?
  &;;  
MDUDU,E
 A
 NN   # 38  __44Fww44VUCC!--$' 8= E '5+E+ENN(((+ <A:?	 w<!"12;,2,D,D) ' LL=L>r~   c                  ^^^^^^^ U4S jmU4S jnUUU4S jnS/ pTSn[         R                  R                  n[        U5      [         R                  l        [         R                  R                  n[        U5      [         R                  l        [
        R                  S:  n	[
        R                  S:  n
U R                  USU	U
SS9nU GH  u  pnU Vs/ s H  oR                  5       PM     nnU VVs/ s H  nU  H  nUR                  5       PM     M     nnnUR                  [        [        R                  5      S5      n[        R                   " U5      m["        R%                  S	UTR&                  5        U" 5       u  mmTb'  UT-  nUT-  nUR)                  TR&                  5        M  TR+                  5       mTR,                  mTR.                  mT" TR0                  " T6 S   5        TR2                  n[5        U5      S
:X  d   eUS   R6                  S:  a  [9        S5      =mmO7[:        R<                  " UUU4S j5      m[:        R<                  " UU4S j5      m["        R%                  S[        S U 5       5      TT5        U" 5         UT-  nUT-  nUR)                  TR&                  5        GM     U[         R                  l        U[         R                  l        XFU4$ s  snf s  snnf )Nc                    > T R                   c   e[        R                  R                  T R                   5      S   S-   $ r  r  r  s   r{   r  @TritonScheduling.benchmark_combo_kernel.<locals>.cache_file_path  s6    <<+++77##CLL1!4~EEr~   c                   > T" 5       n [         R                  R                  U 5      (       aC  [        U 5       n[	        S UR                  5       R                  5        5       5      sS S S 5        $ g! , (       d  f       g= f)Nc              3  8   #    U  H  n[        U5      v   M     g 7frr   )r  )r   r  s     r{   r   NTritonScheduling.benchmark_combo_kernel.<locals>.load_cache.<locals>.<genexpr>  s      E3Daq3Drg  rF  )r  r  r  r  r  r  splitr  s     r{   r  ;TritonScheduling.benchmark_combo_kernel.<locals>.load_cache  sW    "$Dww~~d##$Z2  E2779??3D EE  Z  Zs   .A11
A?c                    > T" 5       n [        U S5       nUR                  [        T5      S-   [        T5      -   5        S S S 5        g ! , (       d  f       g = f)Nr  r  r  )r  r  r  r  ms_clones     r{   r  <TritonScheduling.benchmark_combo_kernel.<locals>.store_cache  s=    "$DdCBR3X67 !s   *A
Ar   g        T)subkernel_nodescustom_part_algorithmenable_autotunemixed_sizesonly_gen_src_coderh  r  rE   r  c                 4   > T" TR                   " T 6 S   5      $ rX  r  r  s   r{   rN  9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>/  r  r~   c                 (   > TR                   " T 6 S   $ rX  r  r  s   r{   rN  r  2  s    0;;TB1Er~   zDThe fused kernel for %s took %.3f ms to run, %.3f ms to clone inputsc              3  @   #    U  H  oR                  5       v   M     g 7frr   r  r  s     r{   r   :TritonScheduling.benchmark_combo_kernel.<locals>.<genexpr>7  s     <A::<<r  )rC   r  r  r   inplaced_to_remover   combo_kernels_autotunecombo_kernel_allow_mixed_sizesgenerate_combo_kernel_code	get_nodesr_  rp  r   r8   rq  r$   r  r~  r  r  r  r  r  rh  r  r  r  r  r  r'   r  )r   	node_listr  r  total_ms	file_listtotal_clone_msremoved_buffers_originplaced_to_remove_origr  r  kernel_code_listrv  r  
node_groupr  fused_node_listsr  r  namesr  r_  r  r  r  r  r  r  s                        @@@@@@@r{   benchmark_combo_kernel'TritonScheduling.benchmark_combo_kernel  s   
	F	 	8
  ) # ww66",-A"B"#''"<"<%/0G%H" 77!;;;a?::%"&+#" ; 
 (8#H=GHZT 0ZH/?O/?eAQZZ\\/?EO''K,C,C(DiPH""8,CII7
 &<LB~B(*  .<<>D88D#&;;  %00$7:;,66Iy>Q&&&|$$q( %e,X !..K '44E IIV<<<	 MNHh&NS\\*e (8f #7%<"22i  IOs   K.7$K3r   )rP  zOptional[Scheduler]r   r   )rj  ztorch.device)   )r   tuple[float, str])r  N)rd  zOptional[OrderedSet[str]]r   r   )r  rg   r  	list[Any]r  r  r   list[TritonKernel])r  r2  r  r  r  r  r   r  )rs   r   r   r   r2  rL  r   r   rH   FOREACH	BUCKETIZEINPLACE_BUFFERSMASKED_SCATTER_WITH_INDEXSCANSORTTRITON_TEMPLATESTUPLE_REDUCTIONrU  rC  r   rW  re  rs  r  r  r  r  r  r   rN  rO  s   @r{   rK  rK    s   )K)!""$$**44++**		
: $ $48t	
 RVU$5NU$	U$n#Q+#Q #Q &	#Q
 
#QJ33 3 &	3
 
3jZ3 Z3r~   rK  c                4   / nU R                  5       nUb!  [        U[        R                  5      (       d   eU(       a1  UR                  c$  UR                  U R                  5        S35        U$ SSKJn  U R                  5       nUc   eU R                  R                  U5      n[        U[        U45      (       d   S[        U5       35       e[        R                  R!                  U5         ["        R$                  nUR'                  U R)                  5       5      R+                  5       nU["        l        S S S 5        UR                  U R                  5        S35        UR                  [,        R.                  " WS5      5        U$ ! , (       d  f       NX= f)Nz" Unfinalized multi template bufferr   )CUDACombinedSchedulingz]Scheduling backend should be SIMD or CUDACombined when generating debug Triton strings, got: z Triton code:z    )get_template_noder  r   MultiTemplateBuffermake_kernel_renderr  r_  0torch._inductor.codegen.cuda_combined_schedulingr  rh  rP  get_backendr[   r  rC   r  set_current_devicer    generated_kernel_countr  r  r  r  r"  )r  linesmulti_templater  rj  backendold_generated_kernel_counttriton_codes           r{   rN  rN  D  s_   E++-N!Z@V@V%W%WWW.;;C((JKL2 L/	
 "!!!..,,V4'N4J#KLL 	
klpqxlykz{	
L WW''/ *1)G)G&!AA eg  .HG* 0 	(67X__[&9:L 0/s   .A	F		
Fr   )r  r   r  r   r  r   r   r   rE  )r,  rM  r   rM  )r,  rM  r   r   )r=  zUnion[CSEVariable, Any]r   r   )r   re   r  )ry   r   r   zCallable[[_T], _T])r  r.   r   r  )
__future__r   r
  r  r6  r  rd  loggingr  r  r  collections.abcr   r   r   typingr   r   r	   r
   r   r   r   sympy.printing.precedencer   rt   torch._loggingtorch.utils._pytreer  _pytreer  torch._dynamo.device_interfacer   torch._dynamo.utilsr   r   torch._prims_commonr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._tritonr   utils._sympy.symbolr   r   r   r   utils._sympy.value_rangesr   r   r   r   r    rq  r!   	codecacher"   r#   r$   ops_handlerr%   runtimer&   runtime.benchmarkingr'   runtime.hintsr(   r)   r*   r+   runtime.runtime_utilsr,   r-   rP  r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   virtualizedr@   r  rA   rB   rC   wrapper_benchmarkrD   block_analysisrF   commonrG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   simdrV   rW   rX   rY   rZ   r[   triton_utilsr\   r]   r^   r_   r`   r  ra   typesrb   rc   rQ  re   rf   simd_kernel_featuresrg   rh   	getLoggerrs   r~  _logginggetArtifactLoggerperf_hint_logschedule_log
fusion_logrm   r   r   r   	dataclassr   r   r  r  r  r  r-  r0  r6  r8  r;  r>  r@  rR  rs  ru  _initialize_pointwise_overridesr  r  r   r  r#  r   r  r,  r2  rK  rN  r   r~   r{   <module>r?     s3   "        	  .  F F  0   $ $ C < 0 / K K 2 X X 4 " " ( 8 8 ( ' .  D W W     C B B /    "   %  L8	B!00<H~~//*E^^--hA
6 6  4 $ 4 *, ,: R R R6 N+ N+ N+b++/+<P++>aQM aQH 	3
&8
;P *(.bi&k i&X  / / 9D$O D$N$+ $+N : : :&! !H # # #
%uS%S/-A'BBC 
U&A:/0 U&ApLL3~ L3^r~   