
    shE                   2   S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKJr  S SKJrJrJrJrJrJrJr  S SKJr  S SKrS SKrS SKrS SKJr  S SKJr  S SKJrJrJ r   S S	K!J"r"J#r#J$r$J%r%  S
SK&J'r'  SSK(J)r)J*r*J+r+  SSK,J-r-  SSK.J/r/  SSK0J1r1J2r2J3r3  SSK*J4r4J5r5  SSK6J7r7  SSK8J9r9J:r:  SSK+J;r;J<r<J=r=  SSK>J?r?J@r@JArAJBrBJCrCJDrDJErEJFrFJGrGJHrHJIrI  SSKJJKrKJLrLJMrM  SSKNJOrO  SSKPJQrQJRrRJSrSJTrT  SSKUJVrV  SSKWJXrXJYrYJZrZJ[r[  \(       a
  S SK\J]r]J^r^J_r_  \R                  " \a5      rb\R                  R                  \aS5      re\R                  R                  \aS5      rf\R                  R                  \aS5      rg\T" 5       R                  ri\" / S Q5      rj\R                   " S! S"5      5       rl " S# S$\l5      rm " S% S&\l5      rnS4S' jro\" S(\Q\QS)9rp " S* S+\S\p   \\p   5      rq " S, S-\<5      rr\R                  " S.S/9 " S0 S15      5       rs " S2 S3\t5      rug)5    )annotationsN)Counter)AnyCallableGenericno_type_checkOptionalTYPE_CHECKINGUnion)TypeVar)immutable_dict)
OrderedSet)FloorDivIdentityModularIndexing)free_symbol_is_type
prefix_strsymbol_is_typeSymT   )counters   )configir	scheduler)prologue_preserves_zero_mask)	code_hash)	MemoryDepStarDepWeakDep)IRNodeTritonTemplateBuffer)!indexing_dtype_strength_reduction)
green_textyellow_text)BaseSchedulerNodeBaseScheduling	WhyNoFuse)cache_on_selfexpr_fits_within_32bitget_dtype_sizeIndentedBufferPlaceholderprefix_is_reduction'set_kernel_post_grad_provenance_tracingsympy_index_symbolsympy_product
sympy_subsunique)ops
OpsWrapperV   )BlockPatternMatcher)CSEVariableindex_prevent_reorderingKernelPythonPrinter)MultiKernel)DisableReductionEnableReductionNodeScheduleMarkerSIMDKernelFeatures)IterableIteratorSequence
perf_hintsschedulefusion)zyxr0_r1_c                    ^  \ rS rSrSr\R                  R                  \R                  R                  S.               S	U 4S jjjr\	\
\S
S j5       5       5       rSS jr\	\
\SS j5       5       5       rSrU =r$ )IterationRangesP   a  
Each range tree represents multiple sets of iteration indexing
in a single tiled dimension in the output kernel.

If you have two loops ranges one (4, 3, 2) and another (4, 6),
then the range tree will be:
        4 (i0)
    3 (i1)  6 (i3)
    2 (i2)
Where i0 is shared between both loops, but then the split into
different indexing vars.  All loop ranges must iterate over
the same number of elements.
)divisorlengthc                  > [         T
U ]  5         Xl        X l        X0l        X@l        XPl        Xpl        Xl        X`l	        Xl
        g N)super__init__namevar_list
var_rangesnumelprefixrP   rQ   kernelroot)selfrV   rW   rX   rY   rZ   r[   rP   rQ   r\   	__class__s             p/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/_inductor/codegen/simd.pyrU   IterationRanges.__init__`   s=     		 $
	    c                ,    [        U R                  5      $ rS   )r.   rZ   r]   s    r_   is_reductionIterationRanges.is_reductionx   s     #4;;//ra   c                ,    [        U R                  5      $ rS   )r0   rV   rc   s    r_   symbolIterationRanges.symbol~   s    !$)),,ra   c                |    [         R                  " 5        VVs0 s H  u  pX!_M	     nnnX0R                     $ s  snnf rS   )r   itemsrZ   )r]   symtrZ   prefix_to_symts       r_   rk   IterationRanges.symt   s;     <F;K;K;MN;M<4&,;MNkk** Os   8)	rP   r[   rQ   rV   rY   rZ   r\   rW   rX   )rV   strrW   list[sympy.Symbol]rX   dict[sympy.Symbol, sympy.Expr]rY   
sympy.ExprrZ   rn   r[   
SIMDKernelr\   IterationRangesRootreturnNonert   boolrt   zsympy.Symbol)rt   r   )__name__
__module____qualname____firstlineno____doc__sympySOnerU   propertyr)   r   rd   rg   rk   __static_attributes____classcell__r^   s   @r_   rN   rN   P   s    . ww{{ % 3	
    " 
 0 0   0- +   +ra   rN   c                     ^  \ rS rSr S                     SU 4S jjjrSS jrSS jrSS jrSS jr    SS jr	SS jr
    SS	 jrS
rU =r$ )rs      c          
        > Uc  0 n[         TU ]  U/ 0 UUUU S9  X@l        0 U l        X`l        U(       a  U R
                  (       a  U	b   eXpl        Xl        Xl        Xl	        g )N)rV   rW   rX   rY   rZ   r[   r\   )
rT   rU   indexnodes	pid_cacherd   is_loop
tensor_dimgrid_dimhas_zdim)r]   rV   rY   rZ   r   r[   r   r   r   r   r   r^   s              r_   rU   IterationRangesRoot.__init__   sx     I 	 	
 
=?
 *3 t00X5EFF$  ra   c                >    SU R                   < SU R                   S3$ )NzIterationRangesRoot(, z, ...))rV   rY   rc   s    r_   __repr__IterationRangesRoot.__repr__   s    %dii]"TZZLGGra   c                f    U R                   R                  5        H  nUR                  5         M     g rS   )r   valuescache_clear)r]   nodes     r_   r   IterationRangesRoot.cache_clear   s%    JJ%%'D (ra   c                2    [        U R                   S35      $ )Nr   )r0   rZ   rc   s    r_   	index_symIterationRangesRoot.index_sym   s    !T[[M"788ra   c                   [         R                  R                  R                  X-  U R                  5      (       a  [        U R                  5       U5      nO[        U R                  5       X5      nX0R                  ;  a  [        U R                   [        [         R                  R                  5       3UUUU 5      nU[         R                  R                  UR                  5       '   U R                   R#                  UR                  5       5        X R$                  UR                  5       '   X@R                  U'   U R                  U   $ )z6
Lookup a given RangeTreeEntry, creating it if needed
)r6   graphsizevarsstatically_known_equalsrY   r   r   r   r   IterationRangesEntryrZ   nextr[   iter_vars_countrange_tree_nodesrg   rW   appendrX   )r]   rP   rQ   exprr   s        r_   lookupIterationRangesRoot.lookup   s     7733G4DdjjQQDNN,g6D"4>>#3WEDzz!';;-QXX%=%= >?@D 8<AHH%%dkkm4MM  /-3OODKKM*#JJtzz$ra   c                    [         R                  R                  n/ n[        U5       H'  nUR	                  U R                  X$5      5        X$-  nM)     / [        U5      Q$ rS   )r~   r   r   reversedr   r   )r]   lengthsrP   itervarsrQ   s        r_   construct_entries%IterationRangesRoot.construct_entries   sT     ''++w'FOODKK89&G ( %(#$$ra   c                j    U R                  U5       Vs/ s H  o"R                  5       PM     sn$ s  snf rS   )r   rg   )r]   r   es      r_   	constructIterationRangesRoot.construct   s+    $($:$:7$CD$Cq
$CDDDs   0c           
     |  ^^^	 UR                    Vs/ s H,  n[        R                  R                  R	                  U5      PM.     nnU Vs/ s H)  oD(       d  M  UR
                  U R
                  :X  d  M'  UPM+     nnUR                  S S9  [        R                  R                  m/ m/ m	UUU	4S jnU H|  n[        R                  R                  R                  UR                  T5      (       d8  U" U R                  T[        UR                  T5      5      5        UR                  mU" U5        M~     [        R                  R                  R                  U R                   T5      (       d,  U" U R                  T[        U R                   T5      5      5        / [#        T5      Q/ [#        T	5      Q4$ s  snf s  snf )z,Figure out vars from this tree used in indexc                    [         R                  R                  R                  U R                  [
        R                  S9$ )N)fallback)r6   r   r   	size_hintrP   r   unbacked_symint_fallbackrJ   s    r_   <lambda>4IterationRangesRoot.vars_and_sizes.<locals>.<lambda>   s-    !''**44		F$C$C 5 ra   keyc                   > TR                  U R                  5       5        TR                  U R                  5        TU R                  -  mg rS   )r   rg   rQ   )r   rP   
index_varssizess    r_   add/IterationRangesRoot.vars_and_sizes.<locals>.add   s5    dkkm,LL%+Gra   )free_symbolsr6   r[   r   getrZ   sortr~   r   r   r   r   r   rP   r   r   rY   r   )
r]   r   sr   nr   r   rP   r   r   s
          @@@r_   vars_and_sizes"IterationRangesRoot.vars_and_sizes   sV    <A;M;MN;Ma**..q1;MN!CEqQ188t{{+BEC

 	 	

 ''++
	, D77##;;DLL'RRDKK$,,)HIJ,,I  ww77

GLLGXdjj'%BCD&*%&(:(5/(:::7 OCs   3F4
F9F95F9)r   r   r   r   r   r   r   rS   )rV   rn   rY   rq   rZ   rn   r   intr[   rr   r   Optional[dict[str, str]]r   rw   r   Optional[int]r   r   r   rw   rt   ru   rt   rn   rt   ru   rx   )rP   rq   rQ   rq   rt   r   )r   list[sympy.Expr]rt   zlist[IterationRangesEntry])r   r   rt   ro   )r   rq   rt   z+tuple[list[sympy.Symbol], list[sympy.Expr]])ry   rz   r{   r|   rU   r   r   r   r   r   r   r   r   r   r   s   @r_   rs   rs      s     /3(!(! (! 	(!
 (! (! ,(! (! "(!  (! (! 
(! (!TH9 .%'%	#%E;;	4; ;ra   rs   c                     ^  \ rS rSr            SU 4S jjrSS jrSS jrSS jrSS jrSS jr	SS jr
SS	 jrS
rU =r$ )r   i  c                  > [         TU ]  UUR                  U-  UR                  UR                  UR
                  UUUR                  UR                  S9	  XPl        [        R                  " S 5      " U R                  5      U l        X@l        g )N)	rV   rY   rW   rX   rZ   rP   rQ   r[   r\   )rT   rU   rY   rW   rX   rZ   r[   r\   parent	functools	lru_cache_codegencodegenr   )r]   rV   rP   rQ   r   r   r^   s         r_   rU   IterationRangesEntry.__init__  sx     	,,'__((==== 	 
	
  **40?	ra   c                    SU R                    SU R                   SU R                   SU R                   SU R                   S3$ )NzIterationRangesEntry(r   ))rV   rP   rQ   r   rX   rc   s    r_   r   IterationRangesEntry.__repr__  sH    &tyykDLL>DKK=PRSWS\S\R]]_`d`o`o_ppqrrra   c                N   ^ U4S jU l         S U R                   l        TU l        g )Nc                    > T $ rS    )rV   s   r_   r   /IterationRangesEntry.set_name.<locals>.<lambda>   s    tra   c                     g rS   r   r   ra   r_   r   r   !  s    4ra   )r   r   rV   )r]   rV   s    `r_   set_nameIterationRangesEntry.set_name  s    ##/ 	ra   c                8    U R                   R                  5         g rS   )r   r   rc   s    r_   r    IterationRangesEntry.cache_clear$  s      "ra   c                X    [         R                  R                  U 5        U R                  $ rS   )r6   r[   codegen_iteration_ranges_entryrV   rc   s    r_   r   IterationRangesEntry._codegen'  s    	//5yyra   c                   / n[        U R                  [        R                  5      (       a  U$ [        U R                  [        [
        45      (       d   [        U R                  5      5       eU R                  R                  SS   H{  n[        U[        R                  [        R                  45      (       a  M4  UR                  n[        U5      S:  d  MQ  [        S U 5       5      (       d  Mj  UR                  U5        M}     U$ )Nr7   r   c              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7frS   )r   r   SIZE.0r   s     r_   	<genexpr>8IterationRangesEntry.precomputed_args.<locals>.<genexpr>4  s!      ,:AQN1dii00's   '))
isinstancer   r~   Symbolr   r   typeargsIntegerr   lenallr   )r]   precomputed_argsargsymbolss       r_   r   %IterationRangesEntry.precomputed_args+  s    -/dii..##$))h%@AAR4		?RA99>>!"%CcEMM5<<#@AA**w<!# ,:A, ) ) %++C0 &  ra   c                ,    [        U R                  5      $ rS   )hashrV   rc   s    r_   __hash__IterationRangesEntry.__hash__:  s    DIIra   c                b    [        U[        5      (       d   eU R                  UR                  :H  $ rS   )r   r   rV   )r]   others     r_   __eq__IterationRangesEntry.__eq__=  s)    %!56666yyEJJ&&ra   )r   r   rV   r   )rV   rn   rP   rq   rQ   rq   r   rq   r   rN   rt   ru   r   )rV   rn   rt   ru   r   )rt   r   rt   r   )r   objectrt   rw   )ry   rz   r{   r|   rU   r   r   r   r   r   r   r   r   r   r   s   @r_   r   r     sk      	
    
.s
# ' 'ra   r   c                    U [        S5      :X  a  gU [        S5      :X  a  g[        R                  " U 5      (       a  g[        U 5      $ )Ninfzfloat("inf")z-infzfloat("-inf")zfloat("nan"))floatmathisnanrepr)values    r_   constant_reprr  B  s<    e	%-		E		;ra   CSEVariableType)bounddefaultc                     ^  \ rS rSr% Sr\rS\S'   S\S'   SrS\S'   S	\S
'      S7           S8U 4S jjjr	\
\\S9S j5       5       5       rS:S jr\
S;S j5       rS<S jr            S=S jrS>S jrS?S jrS@S jrS<S jrS<S jrSAS jrS9S jrSBS jrSCS jrS;S jrSDS jr      SES jr      SES jrSFS jrSGS jr \!      SHS  j5       r"\#\$RJ                  RL                  4       SIS! jj5       r'    SJS" jr(\#      SKS# j5       r)SLS$ jr*SLS% jr+SMS& jr,    SDS' jr-SNSOS( jjr.SPS) jr/SQS* jr0SRSSS+ jjr1\2Rf                        STS, j5       r4SUS- jr5\!S. 5       r6S/ r7S0 r8S1 r9S2 r:S3 r;S4 r<SVS5 jr=S6r>U =r?$ )Wrr   iO  zg
Common base class for Triton/Halide codegen which both use flattened indexing rather than loop nests.
zCallable[[sympy.Expr], str]sexprkexprFrw   allow_block_ptrrn   kernel_namec                  >^  Uc  0 n[         T	T ]  5         UT l        UR                  5       T l        [        5       T l        [        5       T l        UR                  5        VVs0 s H/  u  pgU[        R                  R                  R                  U5      _M1     snnT l        / T l        0 T l        [         R"                  " 5       T l        UR'                  5       T l        Ub  UOT R+                  5       T l        Ub  UOT R/                  5       T l        T R3                  5       T l        S T l        [8        R:                  " S 5      SU 4S jj5       nUT l        T R?                  U5        g s  snnf )Nc                   > [         R                  R                  R                  U TR	                  5       5      n TR
                   H  nTR                  X5      n M     TR                  U 5      $ rS   )r6   r   r   simplify_with_rangesrX   range_treescombine_contiguous_dimscombine_modular_indexing_pairs)r   treer]   s     r_   simplify_indexing.SIMDKernel.__init__.<locals>.simplify_indexing}  sY    GG$$99%ARSE((44UA ) 66u==ra   )r   rq   ) rT   rU   featuresget_mutations	mutationsr,   bodyindexing_coderj   r6   r   r   simplifynumelsr  r   	itertoolscountr   rd   inside_reduction should_use_cooperative_reductioncooperative_reductionshould_use_persistent_reductionpersistent_reductionwant_no_x_dimno_x_dimr   r   r   r  initialize_range_tree)
r]   tilingr  r   override_persistent_reductionoverride_cooperative_reductionrZ   valr  r^   s
   `        r_   rU   SIMDKernel.__init__Y  sM    I !//1"$	+-FLlln
FT{vFAGG$$--c22n
 79JL(0 ( 5 5 7 .9 +668 	" -8 *557 	!
 **,(, 
		T	"	> 
#	> "3""9-=
s   #6E)c                :    [        S U R                   5       5      $ )Nc              3  8   #    U  H  n[        U5      v   M     g 7frS   )r.   )r   rZ   s     r_   r   0SIMDKernel.num_reduction_dims.<locals>.<genexpr>  s     I[6&v..[   )sumr   rc   s    r_   num_reduction_dimsSIMDKernel.num_reduction_dims  s     IT[[IIIra   c                    [         erS   NotImplementedError)r]   dtypes     r_   dtype_to_strSIMDKernel.dtype_to_str      !!ra   c                T    U R                  U R                  R                  5       5      $ rS   )r;  r  select_index_dtyperc   s    r_   index_dtypeSIMDKernel.index_dtype  s       !A!A!CDDra   c                    gNFr   rc   s    r_   r(  SIMDKernel.want_no_x_dim      ra   c                  ^ [        U4S j[         5       5      nU(       + =(       d    U(       + nS	S jn/ SQn	SS/n
U(       a  U
nOU(       a  U	nOX-   nU" X5      nU" U	[        5      n/ n[        U5       H|  u  nn[        U5      nUR	                  U5      nUR	                  U5      nUc  UOUnUR                  [        U S3TU   UUU UU=(       a    U R                  (       + UUST;   S9
5        M~     U$ )
Nc              3  6   >#    U  H  oT;   d  M
  Uv   M     g 7frS   r   )r   rZ   r   s     r_   r   3SIMDKernel.construct_range_trees.<locals>.<genexpr>  s      %
!-v61AFF   		c                d   ^ [        U4S jU  5       5       VVs0 s H  u  p#X2_M	     snn$ s  snnf )Nc              3  6   >#    U  H  oT;   d  M
  Uv   M     g 7frS   r   )r   r.  masks     r_   r   OSIMDKernel.construct_range_trees.<locals>.filtered_index_map.<locals>.<genexpr>  s     2U#3PT33#rI  )	enumerate)seqrL  idxr.  s    `  r_   filtered_index_map<SIMDKernel.construct_range_trees.<locals>.filtered_index_map  s4    )22U#2U)U)UXS)U  s   ,)rJ   rI   rH   rK   rL   r   rH   )r   r   r   r   r   )rt   zdict[Any, int])r   all_prefixesrN  r.   r   r   rs   r'  )r]   r   r#  rd   r   r)  active_prefixesno_r_dimrQ  	grid_dimsreduction_dimstensor_dimstensor_dim_mapgrid_dim_mapr  irZ   r   r   r   s       `               r_   construct_range_trees SIMDKernel.construct_range_trees  s    % %
!-%
 
 (';|+;	
 $	(K#K#4K ,KI))\B"?3IAv.v6L'++F3J#''/H!)AxE#he$6N'(J1J1J-J)% F] 4& ra   c                    U R                  UU R                  U R                  R                  5       U R                  U R
                  5      nU R                  R                  U5        g rS   )r\  r#  r  rd   r   r)  r  extend)r]   r   r  s      r_   r*   SIMDKernel.initialize_range_tree  sR    00!!MM&&(KKMM
 	,ra   c                    g)zZ
Hook called right before codegen with every index that will be
used in the fused kernel.
Nr   )r]   indicess     r_   finalize_indexingSIMDKernel.finalize_indexing  s    ra   c                p    U R                   nSU l          U R                  XU5      X@l         $ ! X@l         f = frC  )r#  store)r]   rV   r   r  priors        r_   store_reductionSIMDKernel.store_reduction  s5    %% %	*::d51$)!E!s   - 5c                    grC  r   rc   s    r_   r$  +SIMDKernel.should_use_cooperative_reduction  rE  ra   c                    grC  r   rc   s    r_   r&  *SIMDKernel.should_use_persistent_reduction  rE  ra   c                t    [        [        R                  R                  S U R                   5       5      5      $ )Nc              3  T   #    U  H  oR                   R                  5       v   M      g 7frS   )rX   rj   r   r  s     r_   r   (SIMDKernel.var_ranges.<locals>.<genexpr>  s"      *4DD%%''4Ds   &()dictr!  chainfrom_iterabler  rc   s    r_   rX   SIMDKernel.var_ranges  s4    OO)) *484D4D* 
 	
ra   c                :    [        S U R                   5       5      $ )Nc              3  P   #    U  H  n[        UR                  S L5      v   M     g 7frS   )r   r   rp  s     r_   r   0SIMDKernel.triton_tensor_ndim.<locals>.<genexpr>  s#     Q@P3td233@Ps   $&)r4  r  rc   s    r_   triton_tensor_ndimSIMDKernel.triton_tensor_ndim  s    Q@P@PQQQra   c                \    S/U R                  5       -  nSX!'   SSR                  U5       S3$ )Nru   :[r   ])ry  join)r]   r[  r   s      r_   indexing_size_strSIMDKernel.indexing_size_str  s7    42244499U#$A&&ra   c                   S/U R                  5       -  nU R                   H_  nUR                  c  M  UR                  (       a  U R                  (       d  M6  UR
                  R                  5        S3XR                  '   Ma     U$ )N1BLOCK)ry  r  r   rd   r#  rZ   upper)r]   r   r  s      r_   dense_size_listSIMDKernel.dense_size_list  sp    //11$$D&$$(=(=(=,0KK,=,=,?+@)Foo& % ra   c                L    U R                  5       nSSR                  U5       S3$ )Nr}  r   r~  )r  r  r]   r   s     r_   dense_size_strSIMDKernel.dense_size_str  s)    $$&499U#$A&&ra   c                   [        U[        5      (       d  U$ UR                  S   nU R                  R	                  U5      =nc  U$ [        XUR                  05      n[        R                  R                  R                  U5      n[        UUR                  R                  5       UR                  R                  [        R                  R                   UR                  R"                  5      R%                  5       05      $ )Nr   )r   r   r   r   r   r2   r   r6   r   r   r  r\   r   r   r~   r   r   rY   rg   )r]   r   rJ   	tree_node	new_indexs        r_   r  )SIMDKernel.combine_modular_indexing_pairs	  s    %11LJJqM..22155I>Lu)..&9:	GG$$CCIN	((*INN,A,AGGKK!5!5-&(
 	
ra   c                    [         R                  R                  R                  U5      =n(       a  Uu  pE[	        U R                  XB5      U5      $ U R                  X5      $ rS   )r6   r   r   expand_floor_divr   _combine_contiguous_dims)r]   r   r  
expand_resr  denominators         r_   r  "SIMDKernel.combine_contiguous_dims  sU     ))::5AA:A%/"ID99)JKXX00==ra   c                   [        U[        R                  [        R                  45      (       a  U$ UR	                  U5      u  p4[        U5      S::  a  U$ [        R                  R                  R                  X4[        U/X45      5      u  pVnXT:X  a  U$ UR                  U5      n[        U[        [        X6" U5      5      5      5      n	U	$ )z9
More aggressive simplification to merge contiguous dims
r7   )r   r~   r   r   r   r   r6   r   r   _simplify_loopsr:   r   r2   rr  zip)
r]   r   r  r   r   	new_sizesreindex_prunenew_index_varsr  s
             r_   r  #SIMDKernel._combine_contiguous_dims$  s     eemmU\\:;;L //6
u:?L%&WW%5%5%E%E7S&
"	F L	2ud3z7>;R+S&TU	ra   c                   ^ ^ T R                   S   R                  =(       d    T R                  m[        R                  U U4S j5       nU" 5       $ )Nc               3    >#    T R                   R                  5       (       d  T R                  (       a   eS v   g T(       a  T R                  5         ST l         S v   T(       a  T R                  5         ST l        g ! ST l        f = f7f)NFT)r  rd   r#  codegen_body)r]   should_flushs   r_   ctx)SIMDKernel.disable_reduction.<locals>.ctx;  sn     ==--//0000 !!#$)D!-%%'(,%%s   AB	A= 5B	=	BB	)r  r   r%  
contextlibcontextmanager)r]   r  r  s   ` @r_   disable_reductionSIMDKernel.disable_reduction8  sE    ''+33Qt7Q7Q		"	"	- 
#	-$ ura   c                    [        U5      [        U R                  5      :X  d   e[        XR                  5       VVs/ s H  u  p#UR                  U5      PM     snn$ s  snnf rS   )r   r  r  r   )r]   r   rQ   rangess       r_   
set_rangesSIMDKernel.set_rangesP  s^    7|s4#3#34444 #&g/?/?"@
"@ V$"@
 	
 
s   Ac                  ^^^^ [        S U 5       5      (       a  U  Vs/ s H  n/ PM     sn/ 4$ [        R                  R                  mU  Vs/ s H  n/ PM     snmU  Vs/ s H  nTR	                  U5      PM     snm[
        R                  " 5       mS
UUUU4S jjn        SS jn/ nSnU GH]  n	/ n
U	 GH?  nTR                  US5      (       a  U
R                  S 5        M/  U[        T5      :  aJ  TR                  TU   S5      (       a0  US-  nU[        T5      :  a  TR                  TU   S5      (       a  M0  US-   [        T5      :  az  TR                  UTU   5      (       a`  TR                  UTU   5      (       d  [        eTU   n[        UTU   5      nU
R                  U" UU" X5      U" US-   U5      5      5        GM  U
R                  [        R                  " U" X5      5      5        GMB     UR                  U
5        GM`     [        S T 5       5      (       d   ST S	U 35       eTU4$ s  snf s  snf s  snf )Nc              3  >   #    U  H  n[        U5      S :H  v   M     g7fr   N)r   )r   rQ   s     r_   r   5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>^  s     6gFs6{ag   c                   > TR                  U5      nTR                  TU    U5      (       d  [        e[        TU    U5      TU '   TU    R	                  U5        [        T5      $ rS   )r  statically_known_multiple_of	CantSplitr   r   r   )r[  r   
new_ranges	remainingsv	var_counts     r_   	add_range5SIMDKernel._split_iteration_ranges.<locals>.add_rangef  s]    ;;t$D229Q<FF#IaL$7IaLqM  &	?"ra   c                    ^ ^^ SUUU 4S jjnU$ )Nc                    > TU T   -  U T   -   $ rS   r   )	flat_varsidx1idx2sizes    r_   getterISIMDKernel._split_iteration_ranges.<locals>.make_combined.<locals>.getterr  s    io-	$??ra   )r  r   rt   rq   r   )r  r  r  r  s   ``` r_   make_combined9SIMDKernel._split_iteration_ranges.<locals>.make_combinedo  s    @ @ Mra   r   r7   c                6    [         R                  R                  $ rS   )r~   r   Zero)_s    r_   r   4SIMDKernel._split_iteration_ranges.<locals>.<lambda>}  s    EGGLLra   c              3  z   #    U  H1  n[         R                  R                  R                  U5      S :H  v   M3     g7f)r7   Nr6   r   r   r   r   s     r_   r   r    s*     Iy!177##--a0A5ys   9;zfailed to set ranges  )r[  r   r   rq   rt   r   )r  rq   r  r   r  r   rt   z(Callable[[list[sympy.Expr]], sympy.Expr])r   r6   r   r   r  r!  r"  r   r   r   statically_known_gtr  r  r   operator
itemgetter)groupsr   groupr  gr  r  return_getters_groupscurrent_grouplength_groupreturn_gettersr  size1size2r  r  r  r  s                 @@@@r_   _split_iteration_ranges"SIMDKernel._split_iteration_rangesW  sx    6g666$*+F5BF+R//WW:@-A&Qb&-A
-34VR[[^V4	OO%		# 	#		$'	/2	5	 !##LN$--dA66"))*@A#c)n49S9Sm,: :
 "Q&M $c)n49S9Sm,: : !1$s9~5":P:P)M2; ; ::i6  (%m4E$T9]+CDE"))%!%m;%ma&7? #)) ++Im,JK= %B "((8G $J IyIII 	
#I;ay9	
I 000G , .B4s   H=I"Ic                   [         R                  R                  n[        US   5      S:X  a7  UR	                  [        U5      [        US   5      U-  5      (       a  US   U/4n U R                  X5        g! [         a     gf = f)Nr7   r   TF)r6   r   r   r   r   r1   r  r  )clsr  r   reduction_numelr   s        r_   is_compatibleSIMDKernel.is_compatible  s     77##wqz?a,,f%gaj)O; 
 qzO#45G	''8 		s   %A7 7
BBc                X   U R                    Vs0 s H  o"R                  UR                  _M     nnU R                  (       d7  U H1  n[	        U5      (       d  M  [
        R                  R                  X4'   M3     / UR                  5       QnU R                  XQU R                  5      $ s  snf rS   )r  rZ   rY   r#  r.   r~   r   r   r   map_kernel_groups_to_node_sizesr  )r]   r   rtr+  rZ   r  s         r_   split_and_set_rangesSIMDKernel.split_and_set_ranges  s     150@0@A0@"))RXX%0@A$$ &v..%*WW[[FN ! $6==?#33FT__UU Bs    B'c           
     T   [        U5      [        U5      :X  a%  [        S [        X!5       5       5      (       a  U" U6 $ U R                  X5      u  pE/ [        R
                  R                  U" U6 5      QnU VVs/ s H  ow Vs/ s H
  o" U5      PM     snPM     snn$ s  snf s  snnf )aY  
We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1).

To do this we need to split up the iteration space of i0 into something like:
    for i1 in s0:
      for i2 in s1:
        i0 = i1*s1 + i2
        ....

This function matches and resplits lengths to the groups of
this kernel to enable tiled + non-tiled fusions.
c              3     #    U  H?  u  p[         R                  R                  R                  [	        U5      U-
  5      S :H  v   MA     g7fr  r6   r   r   r  r1   )r   rJ   r  s      r_   r   =SIMDKernel.map_kernel_groups_to_node_sizes.<locals>.<genexpr>  s=      /
, GG%%mA&6&:;q@,s   AA	)r   r   r  r  r!  rs  rt  )	r  r  r   r  r  r  r   fnsfns	            r_   r  *SIMDKernel.map_kernel_groups_to_node_sizes  s    & w<3v;&3 /
G,/
 ,
 ,
 w'',/,G,G,X)
LY__22:z3JKL8MN8M,"H,8MNN,Ns   :	B$BB$B$c                6    [        U[        R                  5      $ rS   )r   r   TMPr]   r   s     r_   is_indirect_indexingSIMDKernel.is_indirect_indexing  s    "5$((33ra   c                  ^ U R                  U5      (       a  gS/[        U R                  5      -  nUR                   Hn  nX0R                  ;  a  M  U R                  U   n[        UR                  [        5      (       d   eX$R                  R                  ==   UR                  -  ss'   Mp     [        R                  R                  R                  m[        U4S j[        X R                  R!                  5       5       5       5      $ )NFr7   c              3  J   >#    U  H  u  pT" U5      T" U5      :g  v   M     g 7frS   r   )r   	idx_range
iter_ranger  s      r_   r   ,SIMDKernel.is_broadcasted.<locals>.<genexpr>  s*      
)P%	 Y8J#77)Ps    #)r  r   r   r   r   r   r   rs   r   rQ   r6   r   r   r  anyr  r   )r]   r   index_numelsrg   entryr  s        @r_   is_broadcastedSIMDKernel.is_broadcasted  s    $$U++sS--((F222))&1Eell,?@@@@++,<, ) 77##,, 
),\;;;M;M;O)P
 
 	
ra   c                    [        U[        5      (       a)  SSR                  [        U R                  U5      5       S3$ U R                  U R                  U5      5      $ )a`  
Convert an index expr to a string that can be used in output code.
e.g. a sympy expression "s2" may actually appear as "ks1" in the generated kernel.

Index expressions often need to be passed in as arguments to the triton kernel.
Rename_indexing and codegen_indexing keep track of the needed indices and add
new parameters to the function signature.
r}  r   r~  )r   listr  mapindex_to_strr  rename_indexingr  s     r_   r  SIMDKernel.index_to_str  sQ     eT""tyyT%6%6!>?@BBzz$..u566ra   c                   U R                  U5      n[        U[        R                  R                  R
                  5      n[        UR                  [        R                  5      5      (       d-  [        UR                  [        R                  5      5      (       a3  UR                  [        R                  R                  R
                  5      n[        UR                  [        R                  5      5      (       a  UR                  [        R                  5       Ho  nUR                  n[        U5      S:  d  M   [        S U 5       5      (       d  M9  U[        R                  R                  R                  U5      0n[        X5      nMq     U R                  U5      n[        U[         5      (       d  UOUR"                  S   nU R%                  U5      $ )Nr   c              3  v   #    U  H/  n[        U[        R                  [        R                  45      v   M1     g 7frS   )r   r   r   PRECOMPUTED_SIZEr   s     r_   r   .SIMDKernel.prepare_indexing.<locals>.<genexpr>  s0      ,$ #1tyy$2G2G&HII$s   79)r  r2   r6   r   r   precomputed_replacementsr   atomsr~   floorceilingsubsr   r   lookup_precomputed_sizer   r   r   codegen_indexing)r]   r   ar   replacements
simp_indexs         r_   prepare_indexingSIMDKernel.prepare_indexing  sQ    &&u-5!''"2"2"K"KLu{{5;;'((CEMM0J,K,KJJqww//HHIE u{{5==)**[[/ ..w<!# ,$, ) ) %&qww'7'7'O'OPQ'R#SL&u;E 0 ++E2
 )X>>JJOOTUDV 	 $$Z00ra   c                   U R                    Vs/ s H(  o"R                  (       a  U R                  (       d  M&  UPM*     nnU(       ay  [        U5      S:  aj  [	        S U 5       5      nSR                  S US U  5       5      SU* S  :X  d$   US U  Vs/ s H  o"R                  PM     sn5       e[        US U 5      US U& U$ s  snf s  snf )Nr7   c              3  >   #    U  H  oR                   S ;   v   M     g7f)xyzNrZ   r   ts     r_   r   0SIMDKernel.active_range_trees.<locals>.<genexpr>6  s     95aE)5r   c              3  8   #    U  H  oR                   v   M     g 7frS   r  r  s     r_   r   r  7  s     ;]88]r3  zyx)r  rd   r#  r   r4  r  rZ   r   )r]   reorderr  treesr"  s        r_   active_range_treesSIMDKernel.active_range_trees1  s    ''
'!~~AVAVA' 	 
 s5zA~9599E77;U6E];;ueVW~M "'-P"/Q-P M %U6E]3E&5M

Ps   %CCCc                8   [         R                  R                  R                  XR	                  5       5      n[        UR                  [        S9 H  nX R                  ;   d  M  0 nU R                  U   R                  5        H.  n[         R                  R                  R                  U5      X4'   M0     [        U5      S:  a5  [        U R                  U   R                  U5      U R                  U   l        U R                  U   R                  5         M     U$ )Nr   r   )r6   r   r   r  rX   sortedr   rn   r   r   r	  r   r2   r   r   )r]   r   symr  pss        r_   r
  SIMDKernel.codegen_indexing=  s    ww44T??;LM$++5C+++  "//4EEGB'(ww'7'7'O'OPR'SL$ H|$q(6@--c277$7D))#.3 %%c*224 6 ra   c                    [        S5      e)NzNYI: codegen_nan_checkr8  rc   s    r_   codegen_nan_checkSIMDKernel.codegen_nan_checkN  s    !":;;ra   c                    [        S5      e)NzNYI: call_kernelr8  )r]   rV   r   s      r_   call_kernelSIMDKernel.call_kernelQ  s    !"455ra   c              #     #    U R                   nU R                  nU(       a  [        R                  " X5      n[        R
                  " U5      nXl         X l         Uv   X0l         X@l        g! X0l         X@l        f = f7f)z:Context manager to add an additional mask to tl.load/storeN)
_load_mask_load_otherr4   logical_andr5   _unwrap)r]   rL  r  rg  	prior_vals        r_   
mask_loadsSIMDKernel.mask_loadsT  sj     
 $$	??4/D!!$' 	)J#O( $O(s   AA=A, A=,A::A=c                &   U R                   R                  5        VVs0 s H  u  p#X#R                  _M     nnn[        X5      n0 nU R                   H5  n[        UR                  5      n[        XXS05      [        XXS05      -
  Xh'   M7     U$ s  snnf )a  
This gets the stride of the index for each of the tiling variables
(technically, it does it at index 0)

For example, if
xindex = x0 + 512*x1 + 1024*r0
x0 = (xindex//512)
x1 = (xindex % 512)
r0 = rindex // 1024

this function would return
{xindex: 512, rindex: 1024}
r7   r   )r   rj   r   r2   r  r0   rV   )	r]   r   kvindex_to_tile_indexesindex_in_tile_varsstrides
range_treer   s	            r_   get_strides_of_loadSIMDKernel.get_strides_of_loadh  s     8<7L7L7R7R7T U7TtqFF7T U'E**J":??3A#$6A?*"FC GJ +
  !Vs   Bc                d    [        U[        5      (       a  [        [        X5      5      $ U " U5      $ rS   )r   tupler  )r  r  s     r_   _map_tuple_or_scalarSIMDKernel._map_tuple_or_scalar  s(    eU##R((%yra   c           	     "   / n[        [        U R                  R                  R	                  5       5      5      nU R                  R                  5       u  p4  nU R                  R                  5       n[        R                  R                  R                  [        U R                  R	                  5       5      5      n[        U5       GH;  u  pxX;  a  UR                  S5        M  [        R                  R!                  U5      n	[        R                  R                  R                  U	5      n
X:  a  ["        [$           " 5       nSnXX    HT  n['        U[(        [*        45      (       a  UR-                  SU 35        US-  nM9  UR-                  UR.                  5        MV     [        U5      U-  nOU
n[        R                  R1                  U5      n[3        U5      nUR                  UU-  S[5        Xr:  5      -   -  5        GM>     [7        U5      $ )a  
Try the best to estimate the total size (in bytes) of the
kernel's inputs and outputs, which is used for estimating the memory
throughput of this kernel. This information is used for checking how
far we are from the peak memory bandwidth. It's important that
we want to avoid overestimating the sizes of the inputs and outputs,
because it can wrongfully give us a very large memory traffic value,
which may be even larger than the theoretical bandwidth and thus
become very misleading. This is particularly problematic for cases
where we slice some inputs. In those cases, we should only count
the size of the "slices" instead of the original inputs, because
only the slices contribute to the real memory traffic.
r   no_index_dep_r7   )r   r3   r   inplace_buffersr   python_argdefsr  buf_accessesr6   r   r   r   r1   r   rN  r   	get_numelr   r   r   r   r    r   r   	get_dtyper+   r   r4  )r]   nbytesninplace_argsr  	call_argsrB  	out_numelr[  r   	arg_numelbuf_sizerb  no_index_dep_countdeprY   r:  
dtype_sizes                    r_   estimate_kernel_num_bytes$SIMDKernel.estimate_kernel_num_bytes  s    F499#<#<#C#C#EFG!YY557a}}113 GG$$..}T[[=O=O=Q/RS		*FA &a ))#.Iww''11)<H# %S/+%&"',C!#'9::m4F3G$HI*a/*CII. - Gy0 GG%%c*E'.JMM%*,C8I4J0JKL9 +: 6{ra   c           	        [        U R                  R                  5      S:X  aG  [        U R                  R                  5      S:X  a$  [        U R                  R                  5      S:X  a  gU R                  R                  5       u  p#pESnU GHr  n[        R                  R                  U5      nU(       d  M,  UR                  5       n	[        U	R                  5      S:X  d  MW  [        U	R                   V
s/ s H  oS:X  d  M
  U
PM     sn
5      S:X  a  M  [        R                  " U	R                  5      nUc  UnM  Xk:w  d  M  [        SU S3SU S	U 3-   5      n[        R!                  U5        U Vs/ s Ht  n[        R                  R                  U5      (       aK  [        R                  " [        R                  R#                  U5      R                  5       R                  5      OSPMv     nnU Vs/ s H`  n[        R                  R                  U5      (       a7  [        R                  R#                  U5      R                  5       R                  OSPMb     nnU Vs/ s HE  nU[        R                  R$                  ;   a  S
O!U[        R                  R&                  ;   a  SOSPMG     nnU V
s/ s H  oR(                  PM     nn
[        SU SU SU 3SU SU S3-   5      n[        R!                  U5          g   [+        SU S35      n[        R!                  U5        gs  sn
f s  snf s  snf s  snf s  sn
f )zZ
Print message if the kernel have mixed layout inputs.
Only care about 4D tensor for now.
r7   r   N   r   zExpected stride order z, but found stride orderr  z for kernel 
GraphInputIntermediateBufferz  param names z
  buf names z
  strides z	
  sizes z
  sources 
z%All the inputs for the triton kernel z have uniform layout)r   r   input_buffersoutput_buffersr@  rA  r6   r   try_get_buffer
get_layoutr  r   get_stride_orderstrider%   logwarning
get_buffergraph_inputsname_to_bufferrV   r$   )r]   r  argdefsrG  
_signaturer  uniform_stride_orderarg_namebuflayoutrJ   stride_ordermsgrV   stride_order_list	size_listsource_listargdef_namess                     r_   warn_mix_layoutSIMDKernel.warn_mix_layout  s    		''(A-DII,,-2DII--.!3
 ,0II,D,D,F)J#!H''((2C^^%F6;;1$6;;9;aq&;9:a?!226==A'/+7()9%01E0FF^_l^<}EFC KK$ %.) %.D 7711$77 ++GG..t4??AHH "	"
 %. & ) %.	! %.D 7711$77 **40;;=BB!" %.	  ! %.# %.D	  177#7#77 %  177#9#99 2!	"
 %.   # 5<#<GqFFGL#<%(nYK|\m[no&ykk]"MNC KK$a "b 3K=@TU
 	C[ :)!# $=s'   6	L(
L(
5A;L-6A'L2#AL75L<c                   [         R                  " XSU5      nSU l        [         R                  " U R                  R
                  U5      n[         R                  " X45      nSU l        [         R                  " X%5      n[         R                  " Xf5      n[         R                  " XSU5      n[        R                  " XXU45      $ )Nr4  FT)r4   	reductionr#  
index_exprr  r  truedivsubmulr5   r-  )	r]   r:  r  sum_rnumelmeandxdx2m2s	            r_   welford_reduce_fallback"SIMDKernel.welford_reduce_fallback  s    }}U5%8 % = =uE{{4( $WWU!ggbo]]54!!4V"455ra   c                    [         R                  " XSU5      n[         R                  " X#5      n[         R                  " U5      n[         R                  " XSU5      n[        R
                  " X645      $ )Nmaxr4  )r4   ro  rr  expr5   r-  )r]   r:  r  vmaxrr  r~  vsums          r_    prepare_softmax_twopass_fallback+SIMDKernel.prepare_softmax_twopass_fallback  sT    }}U5%8gge"ggcl}}U5#6!!4,//ra   c                    [         erS   r8  rc   s    r_   codegen_kernelSIMDKernel.codegen_kernel  r=  ra   c                    g rS   r   rc   s    r_   r  SIMDKernel.codegen_body"      ra   c                    g rS   r   )r]   r  s     r_   r   )SIMDKernel.codegen_iteration_ranges_entry%  r  ra   )r*  r+  r  r   r%  r  r  r#  r   r  r)  r   r'  r   r  r  )NNN)r+  dict[str, sympy.Expr]r  rA   r   r   r,  Optional[bool]r-  r  rt   ru   r   )r:  ztorch.dtypert   rn   r   rv   )r   r   r#  rw   rd   rw   r   r  r)  rw   rt   list[IterationRangesRoot])r   zdict[str, str]rt   ru   )rb  Sequence[sympy.Expr]rt   ru   )rV   rn   r   rq   r  r9   rt   ru   )rt   rp   )r[  r   rt   rn   )rt   z	list[str])r   rq   rt   rq   )r   rq   r  rs   rt   rq   )rt   z'contextlib.AbstractContextManager[None])r   rq   rt   ro   )r  Iterable[sympy.Expr]r   Sequence[Sequence[sympy.Expr]]rt   zStuple[list[list[sympy.Expr]], list[list[Callable[[list[sympy.Expr]], sympy.Expr]]]])r  r  r   r  r  rq   rt   rw   )r   r  rt   list[list[sympy.Expr]])r  r  r   r  rt   r  )r   rq   rt   rw   )r   rq   rt   rn   F)r  rw   rt   r  )r   rq   rt   rq   r   rS   )rV   rn   r   zOptional[IRNode]rt   ru   )rL  zUnion[str, OpsWrapper]r  Union[int, float]rt   zIterator[str])r   rq   rt   rp   )r  r   )@ry   rz   r{   r|   r}   pexprr  __annotations__r  rU   r   r)   r   r5  r;  r@  r(  r\  r*  rc  rh  r$  r&  rX   ry  r  r  r  r  r  r  r  r  staticmethodr  classmethodr~   r   r   r  r  r  r  r  r  r  r  r
  r$  r'  r  r  r/  r8  r<  rN  rl  rz  r  r  r  r   r   r   r   s   @r_   rr   rr   O  s    */E&.&&!OT! /38<9=-.%-. %-. ,	-.
 (6-. )7-. 
-. -.^ J   J" E E4+4 4 	4
 &4 4 
#4l-*
R'
'
$>>':>	>':	(0
 J1$J1/MJ1
J1 J1X 
 ',ggkk	$ 0 $	
 
 ,
V5
V	
V O$O 0O
 
 O O84
,7$1$1 
$1L
"<6 )*)3D)	) )&0  
=~EN
60" ra   rr   c                     \ rS rSr% \rS\S'   S rS r\r	\r
S r  S S jr\      S!S j5       rS"S	 jr    S#S
 jrS rSS. S$S jjrS r S%           S&S jjrS r\\R0                  " S5      S'S j5       5       r\      S(S j5       r\      S)S j5       r\        S*S j5       r\  S+S j5       r\\R>                  R@                  4 S,S jj5       r!S r"S-S jr#S%S jr$S r%S r&Sr'g).SIMDSchedulingi)  z	type[Any]kernel_typec                &    [        S U 5       5      $ )Nc              3     #    U  H7  n[         R                  R                  R                  [	        U5      5      v   M9     g 7frS   r  r   s     r_   r   *SIMDScheduling.group_fn.<locals>.<genexpr>-  s-     P%QQWW%%..}Q/?@@%s   ?A)r;  r  s     r_   group_fnSIMDScheduling.group_fn,  s    P%PPPra   c                	  ^^ [        U[        R                  5      (       d  [        U[        R                  5      (       a  [        R                  R                  X5      $ UR                  u  nu  pEUR                  u  nu  mm[        X5      nUR                  5       (       a3  UR                  5       (       d  UR                  5       (       a  U" S5        OGUR                  5       (       a2  UR                  5       (       d  UR                  5       (       a  U" S5        UR                  5       (       a;  UR                  5       (       a&  UT:H  =(       a    UT:H  nU(       d  U" SUTUT5        U$ UR                  5       (       Gd  UR                  5       (       Gd  UT:X  a  UT:X  d  UR                  5       (       d  U" SUTUT5        gUR                  5        Hk  nUR                  5       (       a    OUUR                  5       UR                  5       -  (       d  MB  UR                  u  nu  pXI:X  a  XZ:X  a  M_  U" SUU	UU
5          g   [        X4S5       HN  u  pUR                  5       (       d  M  [        UR                  5       [        5      nU(       d  U" U S35        Us  $    U R                  UR                  5       XE5      nU R                  UR                  5       XE5      nU R                  UR                  5       UR                  5       -   XE5      n[         R"                  R$                  (       ab  Sn['        U5      S	:  a*  ['        U5      S	:  a  Xs=:H  =(       a    U:H  Os  nOUU:H  nO['        U5      S	:  a  UU:H  nU(       d  U" S
UUU5        ggUR                  5       (       d  UR                  5       (       a  US:X  a  TS:w  d   eUTT-  :X  a  [)        UU4S jUR                  5        5       5      (       d	  U" S5        g[         R"                  R*                  (       ag  UR                  5       (       dR  [-        U R                  UR                  5       U5      R/                  5       5      US4TTS44;   nU(       d  U" S5        U$ gUT:w  a  U" S5        UT:H  $ UR                  5       (       a  UR                  5       (       a   eU R1                  X!5      $ )z
Hook called by Scheduler to determine if the Triton backend
can fuse node1 and node2.  These nodes might already be
FusedSchedulerNodes.
z&Split scan cannot fuse with reductionsz1numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)z5numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)Fz:numel/rnumel mismatch prologue mismatch (%s, %s), (%s, %s))node1node2z is not TritonTemplateBufferTr   ztiling mismatch (%s, %s, %s)r7   c              3  p   >#    U  H+  n[         R                  TT4UR                  5       5      v   M-     g 7frS   )rr   r  
get_ranges)r   r   numel2rnumel2s     r_   r   *SIMDScheduling.can_fuse.<locals>.<genexpr>  s3      . ,,fg->OO.s   36z"nodes numel/rnumel incompatibilityzinvalid tiling for reductionznodes numel incompatibility)r   r   ForeachKernelSchedulerNodecan_fuser  r(   is_split_scanrd   is_template	get_nodesused_buffer_namesget_buffer_namesr  get_template_noder"   select_tilingr   triton tiling_prevents_pointwise_fusionr   r    tiling_prevents_reduction_fusionr;  r   can_fuse_horizontal)r]   r  r  r  numel1rnumel1whyreduction_can_fuser   	pro_numel
pro_rnumelr   	node_nameis_triton_templatetiling1tiling2tiling3condis_reduction_tiling_validr  r  s                      @@r_   r  SIMDScheduling.can_fuse/  ss    eYAABBj977G
 G
 77@@NN${{F${{FG%  )<)<)>)>!!##<=  ""5+>+>+@+@!!##<=E$6$6$8$8!'6!1!Hg6H%G &%!!##E,>,>,@,@f$G);((**O ! !& 1++--!  $557%:P:P:RR$59ZZ22I & 38M \ & ) ' * $)# !2& !$UN4F G==?? *4++-/C*& .yk)EFG-- !H (():FLG(():FLG((!EOO$55vG }}==w<!#7|a'&<<W<&'1\A%"g-D6	 !!!##(:(:(<(<a<GqL00')) "__.   <= MMBB!--//05**5??+<fELLN1  !,1- 5:;4412V##!!##E,>,>,@,@@@''55ra   c           
       ^^^^^^^ / m[         [        R                     " 5       m[         [           " 5       m[         [           " 5       mS mUU4S jnUU4S jnU4S jnUUUU4S jn[        R
                  UUUU4S j5       nUU4S jn	U H  n
U
T;   a  M  TR                  U
5        U" U
5      (       aT  U	" U
T5      (       a  U" 5           S S S 5        T(       a"  U" U
5      (       d  T=(       d    [        T5      mOS mU" U
5        M}  U" U
5      (       a#  U" 5          TR                  U
5        S S S 5        M  [        ST ST S	U
R                  S
    35      e   T$ ! , (       d  f       N= f! , (       d  f       M  = f)Nc                ~   > U R                   u  nu  p#UT:H  =(       a    UT:H  =(       d    UTT-  :H  =(       a    US:H  $ Nr7   r  r   r  
node_numelnode_rnumelrY   ru  s       r_   fits_in_main_body@SIMDScheduling.generate_node_schedule.<locals>.fits_in_main_body  sF    +,77(A(
%'AK6,A efn,A1Ara   c                `   > U R                   u  nu  p#UT:H  =(       a    US:H  =(       a    TS:g  $ r  r  r  s       r_   fits_outside_reductionESIMDScheduling.generate_node_schedule.<locals>.fits_outside_reduction  s2    +,77(A(
&K;!+;K!Kra   c                d   > U R                   R                   H  nUR                  T;   d  M    g   g)NTF)read_writesreadsrV   )r   readcurrent_loop_buffer_usages     r_   expect_improved_memory_usageKSIMDScheduling.generate_node_schedule.<locals>.expect_improved_memory_usage  s,    ++99 99 , ra   c                  > TR                  U 5        TR                  U 5        TR                  U R                  R                   Vs/ s H  oR
                  PM     sn5        U R                  5       (       a  [        U [        R                  5      (       a|  [        U R                  [        R                  5      (       aS  [        U R                  R                  [        R                  5      (       d   TR                  U R                  5       5        g TR                  U R                  R                    Vs/ s H  oR
                  PM     sn5        g s  snf s  snf rS   )r   r   updater  r  rV   rd   r   r   SchedulerNoder   r   ComputedBufferdataScanget_namewrites)r   rJ   r  donenode_schedulenot_ready_yet_nodess     r_   schedule_node_in_loopDSIMDScheduling.generate_node_schedule.<locals>.schedule_node_in_loop  s    HHQK  #%,,amm>Q>Q-R>Qff>Q-RS
   q)"9"9::qvvr'8'899"166;;88#''

5)00!--BVBV1WBVQ&&BV1WX .S 2Xs   E6Ec               3  b  >#    T(       a  TS   [         L a  TR                  5         OTR                  [        5        T(       a1  TR	                  T[        5        TR	                  TS-   [         5        S mS v   TR                  [         5        TR                  5         T R                  5         g 7f)Nr  r7   )r?   popr   r>   insertclear)r  maybe_split_indexr  r  s   r_   end_current_reduction_loopISIMDScheduling.generate_node_schedule.<locals>.end_current_reduction_loop  s      r!2o!E!!#$$%56 $$%68HI$$%6%:OL$(!  1%%'%++-s   B,B/c                   > TS:X  a  gTU R                   -  (       d  gU(       a  [        US   [        [        45      (       a   e[	        T5      $ )Nr7   Fr  )	ancestorsr   r?   r>   rw   )r   r  r  ru  s     r_   #requires_closing_previous_reductionRSIMDScheduling.generate_node_schedule.<locals>.requires_closing_previous_reduction  sS    {&7 b!O5E#F* *   +,,ra   zunexpected group: (r   z) != r7   )r   r   r&   rn   r  r  r   r   r   r9  r  )r]   r   rY   ru  r  r  r  r  r  r  r   r  r  r  r  r  s     ``       @@@@@r_   generate_node_schedule%SIMDScheduling.generate_node_schedule  s^   #%)5568 )o/$.sO$5!+/		L		Y 	Y" 
	"	"	. 
#	.	- Dt|HHTN &&6t]KK35 6 -5QRV5W5W(9(OS=O% )-%%d+'--/1!((. 21 *)%6(%

1O - 4 ' 65 21s   E E1 
E.	1
F 	c                    UR                  5       n[        US S9R                  u  nu  pEU R                  X$U5      n[        R                  SU5        U R                  [        XdU5      5      $ )z;
Given a set of pre-fused nodes, generate a Triton kernel.
c                4    [        U R                  5       5      $ rS   r   rd   r   s    r_   r   -SIMDScheduling.codegen_node.<locals>.<lambda>#  s    c!..:J6Kra   r   zSchedule:
 %s)r  r}  r  r  schedule_logdebugcodegen_node_schedulerA   )r]   r   r   r  rY   ru  r  s          r_   codegen_nodeSIMDScheduling.codegen_node  sj     04~~/? ,KLRR?E33E&I+];))}V<
 	
ra   c                   [         R                  " [         R                  5      R                  n[	        U 5      (       d  gU Vs/ s H8  nUR                  5       (       d  M  UR                  5       R                  5       PM:     nn[        S U 5       5      (       d  g[        R                  R                  R                  X5        U H,  n[        R                  R                  R                  XR5        M.     gs  snf )NFc              3  8   #    U  H  n[        U5      v   M     g 7frS   )r*   )r   r  s     r_   r   8SIMDScheduling.can_use_32bit_indexing.<locals>.<genexpr>>  s     FID)$//Ir3  T)torchiinfoint32r}  r*   has_tensor_outputrX  storage_sizer   r6   r   r   	guard_leq)rY   buffersint_maxrd  	buf_sizesr  s         r_   can_use_32bit_indexing%SIMDScheduling.can_use_32bit_indexing,  s    
 ++ekk*..%e,, 
$$& ,CNN))+ 	 
 FIFFF 	
""52DGG&&t5 
s   C9!"C9c                0   UR                   nU R                  X!R                  UR                  5      nU R	                  X/SU05      nU H  nU R                  X%5        M     [        R                  " U5        U H  n[        R                  " U5         UR                  5       nS S S 5        U R                  WX%5      n[        R                  R                  (       a  [        UU5        [         R#                  SU5        Xul        ['        U5      Ul        M     A[)        U5      S:  a  [        U5      nOUu  n[        R                  " U5         UR+                  5        H  n	U	R-                  5         M     S S S 5        U R/                  U5        UR1                  UR$                  5        [        R2                  (       a  UR5                  5         [        R6                  (       a  UR7                  US   R$                  5        [        R8                  =R:                  UR:                  -  sl        [        R8                  =R<                  UR<                  -  sl        [        R8                  R>                  R@                  (       a  [        RB                  (       a  US   RD                  RG                  5       n
UR+                  5        H  n	U	RI                  5       nX;  a  M  U	RJ                  c   eU	RJ                  RM                  5       nUc  MH  [N        S   S==   S-  ss'   [        R8                  R>                  RQ                  SURR                  < SU S	35        M     U RU                  5         g ! , (       d  f       GN= f! , (       d  f       GN= f)
Nr  z+Generating kernel code with kernel_name: %sr7   r   inductorintermediate_hookszrun_intermediate_hooks(r   r   )+r  r  rY   r  create_kernel_choices!codegen_node_schedule_with_kernelr=   merge_workspaces_inplacer6   set_kernel_handlerr  define_kernelr   traceenabledr/   r[  r  r  r   r   scheduler_nodesmark_runcodegen_commentr'  nan_assertsr$  rl  r   removed_buffersinplaced_to_removewrapper_codesupports_intermediate_hooksgenerate_intermediate_hooksr   live_output_buffersr  r   get_origin_noder   	writelinerV   free_buffers_in_scheduler)r]   kernel_featuresr  r+  kernelsr[   src_coder  final_kernelr   	live_outsrV   origin_nodes                r_   r  $SIMDScheduling.codegen_node_scheduleH  s   '55##00/2Q2Q
 ,,X
O'D
 F22=I ,,W5F%%f-!002 .,,X}MK||##7! IIC[Q!,(2F   w<!&w/L%O\!!,/'779 : 0 	]+  !9!9:**,!!(()?)?@	<#?#??	""l&E&EE" GG  <<22  
;;=I'779}}(yy,,,"ii779*Z()=>!C>GG((221+2B2B1ERvQO : 	&&(k .-& 0/s   M4(N4
N	
Nc                (    U R                   " U0 UD6/$ rS   )r  )r]   r  kernel_argskernel_kwargss       r_   r
  $SIMDScheduling.create_kernel_choices  s'     
 	
ra   c           	     ^   U   [         R                  " 5       n0 nU H  nU[        L a!  UR                  UR	                  5       5        M-  U[
        L a  UR                  5         MH  UR                  5         UR                  UR                  5       5      nUR                  [        R                  UR                  R                  U5      R                  5       5      5        M     UR!                  UR#                  5       5        U H  nU[        L a!  UR                  UR	                  5       5        M-  U[
        L a  UR                  5         MH  [%        UR                  5        UR                  UR                  5       5      nUR'                  U5        M     S S S 5        g ! , (       d  f       g = frS   )r  	ExitStackr>   enter_contextr  r?   closedecide_inplace_updater  r  r  rr  fromkeys_bodyindexing_from_argsr   rc  keysr#   r   )r]   r  r[   stackall_indexingr   r   s          r_   r  0SIMDScheduling.codegen_node_schedule_with_kernel  s>   ((*EL &++''(@(@(BC_,KKM..0!'!<!<T__=N!OJ '' JJ99*ELLN & $$\%6%6%89 &++''(@(@(BC_,KKM 6djjA!'!<!<T__=N!OJLL, &- VVs   FF
F,Fonly_gen_src_codec                  UR                   u  nu  pgUS:X  d   eUR                  R                  UR                  5      u  p0 n
UR                  5       n/ nU H  nUR	                  5       nUR                  U5        X-  (       d  M/  [        U5      S:X  d   eX[        [        U5      5      '   UR                  R                  [        [        U5      5      5        / nM     [        U5      S:X  d   eU   U(       d  U/UQ H  nUR                  5         M     U	" 5       nUR                  S5         U H1  nUR                  UR                  UR                  5       5      5        M3     UR                   R#                  [%        5       5        SSS5        UR&                  R)                  5        GH@  u  nnSU S3nU
R+                  UR-                  5       / 5      =n(       d  M6  [/        S U 5       5      n[0        R2                  " SU(       + 5         UR                  U5         U H  n[        UR	                  5       5      S:X  aB  [        U5      S:X  a3  [5        U5      (       a#  U=R6                  UR	                  5       -  sl        UR                  UR                  UR                  5       5      5        M     UR                   R#                  [%        5       5        SSS5        SSS5        GMC     SSS5        [9        W[:        5      (       d!  UR=                  S	5        UR=                  S
SS9  [>        R@                  " U5         UR&                  RC                  5        H  nSU S3nUR=                  USS9  M     UR                  S5         [9        U[:        5      (       a  UnOUR=                  S5        URD                  nSSS5        / UQUPUQn[0        RF                  (       aH  URI                  5       S-  nURK                  5        SW SURM                  U5      RO                  5        3nU(       a  WsSSS5        $ U RQ                  WUU5      n[0        RR                  RT                  (       a  [W        UU5        SSS5        U RY                  W5        UR[                  WUR                  5        [>        R\                  =R^                  UR^                  -  sl/        [>        R\                  =R`                  UR`                  -  sl0        U Rc                  5         g! , (       d  f       GN= f! , (       d  f       GN^= f! , (       d  f       GM  = f! , (       d  f       GNn= f! , (       d  f       GN= f! , (       d  f       GN= f)zw
Codegen a triton template

If `only_gen_src_code` the src code will be returned instead of codegen'd into the wrapper
r7   r   z<STORE_OUTPUT>Nz<LOAD_INPUT_>c              3  @   #    U  H  oR                  5       v   M     g 7frS   )can_codegen_without_upcasts)r   p_ns     r_   r   2SIMDScheduling.codegen_template.<locals>.<genexpr>  s      5ESc7799^   ztriton.codegen_upcast_to_fp32z<DEF_KERNEL>z	<ARGDEFS>F)strictg    eArT  )2r  r   make_kernel_renderr  r  r   r   r   iterprologue_fused_inputsr   r  set_subgraph_bodyr   r  r  cse
invalidater   named_input_nodesrj   r   r  r   r   patchr   #prologue_fused_inputs_preserve_zeror   rn   finalize_hookr6   r  r1  codebenchmark_kernelrN  imports_for_benchmark_kernelcodegen_kernel_benchmarkgetvaluer  r  r  r/   r  r'  r   r  r  r  )r]   template_nodeepilogue_nodesprologue_nodesr6  r  _numelru  r[   renderbuf_name_to_prologue_grouptemplate_readsprologue_groupprologuenamesr   partial_code
input_namebuffersubgraph_namecan_codegen_without_upcastprologue_noder   r  num_gbr  s                             r_   codegen_templateSIMDScheduling.codegen_template  s    ,11F{{&++>>}?Q?QR%'"&88:&H--/E!!(+%%5zQ&@N4U+<=,,00d5k1BC!# ' >"a'''$ +<^<DMMO = "8L))*:;*DLL!<!<T__=N!OP +

%%jl3 <
 '-&>&>&D&D&F"
F".zl! <%?%C%COO%r& >  25 5ES5 2.  7=W9W $55mD1?$'(F(F(H$IQ$N(+N(;q(@'CM'R'R(.(R(R,9,J,J,L)*(R !. 5 5$*$?$?(5(@(@(B%&!" 2@ #JJ11*,?! E  'G \ ,,,&&~6&&{5&A !!&) %66;;=
".zl! <**=*G > ))*:;lC00+H ../?@+00H < NnMmMnMM&&99;cA::<=Rj66v>GGIJL  !1 *)4 ,,X}fMK||##7{S; *> 	]+;(:(:;	6#9#99	""f&?&??"&&(_ <;& ED 3 Vt <; *)s   3<U/AT!
AU".UU"B:T3	U$UA	U<6U*A5U<>U<!
T0	+U3
U=U
U	U
U'*
U9	4U<<
Vc                    [         R                  R                  R                  [         R                  R                  R                  5       5        g rS   )r6   r   r  r  
device_opssynchronizerc   s    r_   codegen_syncSIMDScheduling.codegen_sync-  s-    	&&qww'9'9'E'E'GHra   c           
        SSK Jn  U Vs/ s H  owR                  5       PM     nn0 0 p[        X5       Hl  u  p[	        US S9R
                  u  nu  pU R                  XU5      nU R                  UX5      nUUX4X'   UR                  U[        UX5      U(       + S9X'   Mn     UR                  UU UU	U
S9n[        R                  S[        U5      U Vs/ s H  n[        U5      PM     sn5        / nU GHC  nU Vs/ s H  owR                  5       PM     nnU" UUS9n[        UU5       H  u  pU R                  X   S	   UR                  X   5      5        X   nX   S	   nU(       dL  [         R"                  " U5         [$        R&                  " U5       H  nUR)                  5         M     S S S 5        [         R*                  =R,                  UR,                  -  sl        [         R*                  =R.                  UR.                  -  sl        M     UR1                  5       nUR3                  UUU45        GMF     U$ s  snf s  snf s  snf ! , (       d  f       N= f)
Nr7   )ComboKernelc                4    [        U R                  5       5      $ rS   r  r   s    r_   r   ;SIMDScheduling.generate_combo_kernel_code.<locals>.<lambda>=      #ann>N:Ora   r   )r  optimize_mask)r   triton_schedulingcustom_algorithm
kernel_mapnode_info_mapz1ComboKernels: %d nodes partitioned into %s groups)enable_autotunemixed_sizesr   )triton_combo_kernelrg  r  r  r}  r  r  r  create_triton_kernelrA   horizontal_partitionr[  r  r   r  create_sub_kernelr6   r  r@   
only_nodesr  r   r  r  r  r   )r]   subkernel_nodescustom_part_algorithmrp  rq  r6  rg  r   fused_node_listssubkernel_mapnode_schedule_mappnr   r  rY   ru  r  r+  
partitionspkernel_code_list
node_groupr[   	subkernelr   s                            r_   generate_combo_kernel_code)SIMDScheduling.generate_combo_kernel_code0  sQ    	59HINN,I+-r(_?IB!$U0O!P!V!VA 77fMM''uEF$165$H! + @ @+M5I"-o !A !M @ !55!"2$+ 6 

 			? '(ZSVZ(	

 $J=GHZT 0ZH  /'F
 !-=>	66%)!,,,]->? *-	 1 5a 8(--i8$6$A$A-$PD MMO %Q 9 ''9+D+DD'**i.J.JJ* ? ,,.H##Xvz$BC- %.  c J. )  I 98s   H?I=I	 .I
Ic                   UR                  5       nUR                  nUR                  n[        R                  S:  =(       d    [        R                  S:H  =(       a    UnU R                  X#XE5      nU Hk  u  pxn	U R                  Xq/U5      n
U R                  U/5        [        R                  SU
5        UR                  [        R                  R                  U
5        Mm     U R                  5         g )Nr7   z"ComboKernels: generated kernel %s.)get_subkernel_nodesuse_custom_partition_algorp  r   combo_kernel_allow_mixed_sizesr  r  r  r[  r  r'  r6   r   r  r  )r]   combo_kernel_noderw  rx  rp  rq  r  r   r[   r  r  s              r_   codegen_combo_kernel#SIMDScheduling.codegen_combo_kernelm  s    +??A 1 K K+;;;;a? 
11Q6P;P 	  ::O
 $4Ha,,X7JFSK  "3!45II:KHqww33[A	 $4 	&&(ra       c           
       ^ ^^
 TS:H  nSU UU
4S jjnUR                  5       u  nm
[        U5      S::  a  [        T
5      S::  a  / $ UR                  5       u  nm
U" UU(       a  UOT
UR                  U5      5      nU Vs/ s H=  n[        T R	                  UR
                  UT5      UR                  UR                  S9PM?     n	nU	$ s  snf )Nr7   c                  > [        UR                  5      [        U5      :X  d   SUR                  < SU< 35       eUR                  UR                  /n[	        S [
        R                  R                  U5       5       5      (       d   e[
        R                  R                  U5       Vs/ s HF  nUR                  [        R                  R                  ;  d  M-  [        U[        5      (       d  MD  UPMH     nn[        UR                   Vs/ s H  oDR                  PM     sn5      nSS jn[        TR!                  U" U5      /U 5      SSS9/nU GH  n[        R                  R"                  R%                  UR&                  UR                  5      n	[        U	5      [        U5      :X  d   e U	R'                  S5      S-   n
U
[        U5      :X  a  M  [	        S	 XS
  5       5      (       a  M   U" US
U
 5      U" XS
 5      4n[        R                  R"                  R+                  [-        S [/        X5       5       5      5      nUR                  U;   a  US-  n[        R1                  US   5      (       a  US-  n[        R1                  US   5      (       a  US-  n[        R                  R"                  R+                  U[-        [
        R                  " UT5      5      -
  5      S:  d  GM  UR3                  [        TR!                  U" US
U
 5      U" XS
 5      /T5      UUR                  S95        GM     U$ s  snf s  snf ! [(         a     GM  f = f)z@
Compute tiling candidates by dividing up the iteration ranges.
zrw.range_vars=z ranges=c              3  N   #    U  H  n[        U[        [        45      v   M     g 7frS   )r   r   r   )r   rL  s     r_   r   HSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>  s&      EC 3G 455Es   #%c                f    [         R                  R                  R                  [	        U 5      5      $ rS   r  )r  s    r_   collapse_rangesNSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.collapse_ranges  s"    ww''00v1FGGra   noner   )r+  rV   scorer7   c              3  *   #    U  H	  oS :H  v   M     g7fr  r   r   s     r_   r   r    s     ;?a6?s   Nc              3  :   #    U  H  u  pUS :w  d  M  Uv   M     g7fr  r   )r   r  rZ  s      r_   r   r    s      "1EST1Es   	r   r+  r  rV   )r  r  rt   rq   )r   
range_varsr  r  r   r!  rs  rt  rV   r6   r   r  r   r   r   CandidateTilingcreate_partial_tilingr   stride_hintsr   
ValueErrorr   r1   r  is_good_sizer   )is_pointwiser  rwdep_sourcesrL  depswrite_namesr  tilingsr6  splittiled_groupsr  r  r  reduction_rangess                r_   tile_ranges5SIMDScheduling.candidate_tilings.<locals>.tile_ranges  s#    r}}%V4S8H	&6SS4 88RYY/K $??88E     %??88EEC88177#:#::  sI. E   %"))%D)3hh)%DEKH
  44(01<  G ''**77		2==Q7|s6{222
#MM!,q0EF+ ;76?;;; ! < $F6EN3#F6N3  ((22! "14V1E" 
 88{*QJE"//Q@@QJE"//Q@@QJE GG$$..ioofFV.W XX 
 NN'#&#<#<$3F6EN$C$3F6N$C!" !0$ #(!$
Q l N[ &E: " s0   ,,MM3MM1#MM
MMr  )r  rw   rt   list[CandidateTiling])r  r   "pointwise_or_reduction_read_writesr  complete_partial_tilingr+  r  rV   )r  r   rY   r  r  r  pointwise_rangespartial_tilingsr+  full_tilingsr  s   `  `      @r_   candidate_tilings SIMDScheduling.candidate_tilings  s     '!+\	 \	| .2__->** A%#.>*?1*DI .2__->**% ,2B33LA
 *	
 * 22MM5/ ll[[ * 	 	
 	
s   ?ACc                    / SQ[        U5      * S nSS/S[        U5       n[        / [        X15      Q[        XB5      Q5      $ )z;
Create a tiling dict from pointwise and reduction splits.
)rH   rI   rJ   NrK   rL   )r   r   r  )r  	pw_tilingreduction_tilingpw_prefixesreduction_prefixess        r_   create_tilingSIMDScheduling.create_tiling  sT     &s9~o&78#U^,Cc2B.CDVc+)VC0B,UV
 	
ra   c                R    U R                  U(       a  UO/ U(       d  U5      $ / 5      $ rS   )r  )r  r+  r  s      r_   r  $SIMDScheduling.create_partial_tiling  s0       "F&F
 	
,.
 	
ra   c                    [        UR                  5       5      nSU;   nX#-  nU[        U5      -  /nU(       a  XG4OXt4nU R                  " U6 $ )zR
Given a tiling for only pointwise or reduction dimensions, adds the missing one.
rJ   )r  r   r1   r  )	r  r+  rY   r  splitsr  total_numelmissing_tilingtiling_argss	            r_   r  &SIMDScheduling.complete_partial_tiling  s^     fmmo&f}-%f(==> )5V$>:R 	   +..ra   c           
     <   US:H  n[         [        [        [        R                  4      " 5       n[
        R                  " U5       GH  n[        U[        R                  5      (       d  M%  UR                  5       nU(       d  [        US   5      S:X  a  MP  Xt(       a  SOS   nU/n	UR                  R                  5        V
s/ s H7  n
[        U
[        5      (       d  M  [        U
R                  5      S:  d  M5  U
PM9     nn
U GHk  n
/ U
R                  R!                  5       Qn[        R"                  R$                  n[&        R(                  R*                  n[-        U5       H&  u  nu  nnUU-  nUR/                  X5      (       d  M&    O   UR1                  X5      (       d  M  WS-   nU(       a  USU OUUS n/ nU H  u  nn[2        R4                  " U
R6                  U5      n[9        SUR;                  [<        5      UR;                  [>        5      -   [        U5      5      n[2        R@                  " UUUU5      nUb  US   OU/nURC                  U5        M     U	RE                  U5        GMn     U	 H  n[9        S[        U5      [F        RH                  RJ                  -
  5      nUS-   n[M        USU 5      nU4[O        UUS 5      -   nURQ                  U RS                  U RU                  UU5      UU5      5        M     GM     [W        U[        SS9nU$ s  sn
f )z
Creates N-dimensional tiling candidiates, attempting to simplify loads/stores
by tiling the kernel into higher dimensions.

Returns a list of tilings ranked by dimensionality.
r7   r   Nr   T)r   reverse),r   rr  rn   r~   Exprr?   filterr   r   r  r  r   r  reads_and_writesr   r  rj   r   r   r6   r   r   rN  statically_known_geqr   r8   get_subexpr_involving_symbolr   r}  r"  r   r   match_mod_div_block_exprr_  r   r   r  	max_tilesr1   r;  r   r  r  r  )r  r  pointwise_numelr  r  r  r   node_rangesranges_to_tilenode_tilingsrL  memory_depsall_var_rangespointwise_vars_numelr   pointwise_end_idxvarrY   reduction_start_idxrX   index_tilingr   num_dimsmatch_resultdimsnode_tilingnum_leading_dimsfirst_trailing_dimcollapsed_leading_dimcollapsed_splitsranked_tilingss                                  r_   get_nd_tilingsSIMDScheduling.get_nd_tilings+  s"    '!+T#uzz/235#**=9DdI$;$;<< //+KCA$71$< )lBN*+L  ++<<>>Cc9- 25cjj/A2E >  
 # "73::#3#3#5!6',ww{{$77++7@7P3%|U(E1(44,   8Q  77(   '8!&;# $ ##7$78'(;(<=   "",JC/LL		3E
  #H-O0LLN+ H $7#O#OsE8$L /;.F<?UGD ''-% #-( ##L1c #h  ,#&q#k*:V]]=T=T*T#U %5%9"(5kBUCU6V(W%$9#;e 2 34? $  //112BLQ''  ,U :v  
 as   LL9Lc                  ^^ TS:H  nU R                  U/T/5      nU(       d  [        R                  R                  (       a  [        R                  R                  S::  a  [
        R                  [        R                  ::  a  [        R                  " U5       Ho  n[        R                  R                  (       a  M$  [        U R                  XbT5      5      S:  d  ME  [
        R                  [        R                  " S5      5          U$    U$ [         ["           " 5       n[$        R&                  " 5       n[        R                  " U5       Hl  nU R                  XbT5       HS  n	U	R(                  U;   a  M  U	R(                  b  UR+                  U	R(                  5        X==   U	R,                  -  ss'   MU     Mn     UR/                  5        V	V
s/ s H  u  pU	R0                  PM     nn	n
[        R                  R                  S:  aG  U(       a@        SS jn[3        S[        U5      5       H  nU" US   X   5      nUc  M  U/U-   n  O   [        U5      S:  a  [
        R                  SU5        [        R                  R4                  (       a  U R7                  XT5      U-   nU H9  m[9        T[:        5      (       d   e[=        UU4S jU 5       5      (       d  M7  Ts  $    U$ s  sn
n	f )	z
Heuristics to decide how to tile kernels.
Currently, we tile based on stride-1 dimensions.

Returns:
    `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel`

r7   r   z
                                Reduction over non-contiguous dims.
                                Consider setting config.triton.tile_reductions to True.
                                r   c                   U S   U R                  SS5      p2US   UR                  SS5      pT[        R                  R                  R	                  X5-
  5      S:X  a  g [        R                  R                  R	                  X5-
  5      S:  a  XE4X#4su  p#u  pE[        R                  R                  R	                  X5-
  5      S:  d   e[        R                  R                  R                  X55      (       d  g U[        X55      UU S   S.nU$ )NrJ   rI   r7   r   rK   )rH   rI   rJ   rK   )r   r6   r   r   r   r  r   )tiling0r  a0a1b0b1
new_tilings          r_   convert_tiling_to_3d:SIMDScheduling.select_tiling.<locals>.convert_tiling_to_3d  s     !w{{3':B w{{3':B77##--bg6!;77##--bg6:*,B8&HRhrww''11"':Q>>>ww''DDRLL !")"5>	
 "!ra   zpossibly bad tiling: %sc              3     >#    U  HW  n[        U[        R                  5      (       d  M$  [        R	                  TR                  5       UR                  5       TS 9v   MY     g7f))r  N)r   r   r  rr   r  r   r  )r   r   r  r+  s     r_   r   /SIMDScheduling.select_tiling.<locals>.<genexpr>  sR       *DdI$;$;<	
((MMOT__%6 )  *s
   #A"8A")r  r  r  r  rt   zOptional[dict[str, sympy.Expr]])r  r   r  tile_reductionsr  perf_hint_loglevelloggingWARNINGr?   r  r   r  infotextwrapdedentr   rn   collectionsr   rV   r   r  most_commonr+  rangeprefer_nd_tilingr  r   rr  r   )r  r  rY   r  r  default_tilingr   
seen_namescandidate_tilescandidate_tilingr  r  r  r[  new_3d_tilingr+  s      `           @r_   r  SIMDScheduling.select_tiling  s    '!+ **E7_4EFV]]%B%B]]$$)""goo5+22=AD"MM999 5 5d? STWXX%**$OO!$ !! B "!_&
4?4G4G4I#**=9D$'$9$9$$W #((J6%**6NN#3#8#8915E5K5KK1 %X : ,;+F+F+H7
+H'  ##+H 	 7

 ==""a'L"."9N"0"0 1c.12 4"1%~'8! !,&3_~%EN 3 ~"8.I ==))""=I ! 
 %Ffd++++  *	    % C7
s   K'c                    g rS   r   rc   s    r_   flushSIMDScheduling.flush  r  ra   c                    grC  r   rc   s    r_   ready_to_flushSIMDScheduling.ready_to_flush  rE  ra   c                   [        S U 5       5      (       d  [        US S9R                  u  nu  pEU R                  XU5      nU R	                  XdU5      nU R                  U[        XdU5      S9nU R                  Xh5        [        R                  " SU5         [        R                  " U5         UR                  5       n	S S S 5        S S S 5        OIUS   R                  U5      u  pn[        R                  " SU5         U R                  UUU
SS9n	S S S 5        W	R                  [!        ["        R$                  5      S	5      n	U	$ ! , (       d  f       N= f! , (       d  f       NJ= f! , (       d  f       N[= f)
Nc              3  @   #    U  H  oR                  5       v   M     g 7frS   )r  )r   r   s     r_   r   ASIMDScheduling.generate_kernel_code_from_nodes.<locals>.<genexpr>  s     2Eq==??Er=  c                4    [        U R                  5       5      $ rS   r  r   s    r_   r   @SIMDScheduling.generate_kernel_code_from_nodes.<locals>.<lambda>  rj  ra   r   )r  rJ  r   Tr5  triton_)r  r}  r  r  r  r  rA   r  r   rF  r6   r  r  get_prologue_template_epiloguer_  replacern   r-   KERNEL_NAME)r]   r   rJ  r  rY   ru  r  r+  r[   r   rV  templateepilogues                r_   generate_kernel_code_from_nodes.SIMDScheduling.generate_kernel_code_from_nodes  sU   2E222!$U0O!P!V!VA 77fMM''fEF%%+M&I & F 22=I/1AB$$V,!002 - CB
 ,18+R+R,(H 02BC00&*	 1  D ##C(?(?$@)L! -, CB DCs0   E/E E?E'
E	E
E$'
E5c                    g rS   r   )r]   r  s     r_   r  SIMDScheduling.codegen_comment4  r  ra   c                    [         erS   r8  )r]   r   r  r[   s       r_   r  SIMDScheduling.define_kernel7  r=  ra   r   N)r   z<Union[scheduler.FusedSchedulerNode, scheduler.SchedulerNode])rY   rq   r  z<Iterable[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]rt   rw   )r  rA   )r  rA   rt   zlist[SIMDKernel])rt   Optional[str]r  )rw  zlist[BaseSchedulerNode]rx  rw   rp  rw   rq  rw   r6  rw   rt   zlist[tuple[str, Any, Any]])rt   r  )r  r  r  r  rt   r  )r+  r  r  rw   rt   r  )r+  r  rY   rq   r  rq   rt   r  )rt   z"list[dict[str, tuple[sympy.Expr]]])rt   r  rv   )(ry   rz   r{   r|   rr   r  r  r  r  can_fuse_verticalr  r  r  r  r  r  r
  r  r_  rd  r  r  r  r   r   r  r  r  r  r  r~   r   r   r  r  r  r  r  r  r   r   ra   r_   r  r  )  s   'K'QF6P !"^@
P
$ M 
 6A)F
1
	
 -F SXt	tlI #(; 0;   $;  	; 
 ;   ;  
$; z)( y  yv 

,

@T

	

 

 
$
 
 
	
 
 /%/ / $	/
 
/ /( o
 
,o ob 3877;;p	p pd<"ra   r  T)frozenc                  H    \ rS rSr% S\S'   S\S'   SrS\S'   \S	 5       rS
rg)r  i;  r  r+  r   r  Nr  rV   c                |    [         R                  R                  R                  U 5      n U S:  =(       a    U S-  S:H  $ )z@Somewhat arbitrary heuristic used to boost scores for some sizesr  r   r  )r   s    r_   r  CandidateTiling.is_good_sizeA  s5     GG&&q)Bw(AFaK(ra   r   )	ry   rz   r{   r|   r  rV   r  r  r   r   ra   r_   r  r  ;  s)    !!JD-) )ra   r  c                      \ rS rSrSrg)r  iH  r   N)ry   rz   r{   r|   r   r   ra   r_   r  r  H  s    ra   r  )r  r  rt   rn   )v
__future__r   r  r  dataclassesr   r!  r  r  r  r  r   typingr   r   r   r   r	   r
   r   typing_extensionsr   r~   r  torch._loggingtorch.fx.immutable_collectionsr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   r   _dynamo.utilsr   r  r   r   r   analyze_preserves_zero_maskr   	codecacher   dependenciesr   r   r    r!   r"   optimize_indexingr#   runtime.runtime_utilsr$   r%   r&   r'   r(   utilsr)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   virtualizedr4   r5   r6   block_analysisr8   commonr9   r:   r;   r<   multi_kernelr=   simd_kernel_featuresr>   r?   r@   rA   collections.abcrB   rC   rD   	getLoggerry   r[  _logginggetArtifactLoggerr  r  
fusion_logdoprintr  rS  	dataclassrN   rs   r   r  r	  rr   r  r  	Exceptionr  r   ra   r_   <module>r8     s   "           X X X %    9 / L L  & $ $ F ! 6 6 - A ; D D    - , / P P %  << !00<H~~//*E^^--hA
 	78 5+ 5+ 5+px;/ x;v;'? ;'| +;TW('/*B WtO"^ O"d  d#	) 	) $	)		 	ra   