
    shU                    X	   % S SK r S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	J
r
  S SKJr  S SKJrJrJrJrJr  S SKrS SKrS SKrS SKJr  S SKJrJr  S SKJr  S SKJrJrJ r   S S	K!J"r"J#r#J$r$  S
SK%J&r&  SSK'J(r(J)r)J*r*J+r+J,r,J-r-  SSK.J/r/  SSK0J1r1J2r2J3r3J4r4J5r5J6r6J7r7  SSK8J9r9J:r:J;r;J<r<J=r=J>r>J?r?J@r@JArAJBrBJCrCJDrD  SSKEJFrFJGrGJHrHJIrI  SSKJJKrKJLrLJMrMJNrNJOrOJPrPJQrQJRrRJSrSJTrTJUrUJVrV  SSKWJXrXJYrYJZrZJ[r[J\r\J]r]J^r^J_r_J`r`JaraJbrbJcrcJdrdJere  \R                  S:H  rg\R                  " S5      S 5       ri\R                  R                  \lS5      rm\" / SQ5      rnSSSSSSSS S!S!S".
ro\" / S#Q5      rpS$S%S&S'S(S)S*S+S,S-S..
rqS/S0S1.rr\R                  \R                  /ru\R                  \R                  \R                  \R                  \R                  \R                  \R                  \R                  \R                  /	r}\~\R                     \S2'   \R                  \R                  \R                  \R                  \R                  /r\~\R                     \S3'   S4 rS5 r  SwS6\\GR                     4S7 jjrS8 rS9\RS:\GR                  S;\S<\GR                  S=\GR                  S>\L4S? jrS@\\\N4   SA\SB\SC\R                  SD\\\4   4
SE jrSF\RSG\SH\4SI jr\R                  S6\GR                  SJ\GR                  4SK j5       r\R                  S6\GR                  SJ\GR                  SL\4SM j5       r\R                   SxS6\GR                  SJ\GR                  SL\\   4SN jj5       r\GR                    " SO SP5      5       r " SQ SR\55      r " SS ST5      r " SU SV\U5      r\GR+                  SW5         " SX SY\5      r\GR+                  SZ5        \GR/                  5          " S[ S\\5      r " S] S^\S5      r " S_ S`\5      r " Sa Sb\5      rSc\/S>\\\R                     \x4   4Sd jr " Se Sf5      r " Sg Sh\5      r " Si Sj\5      r " Sk Sl\5      r " Sm Sn\25      r " So Sp5      r " Sq Sr5      r\GR                    " Ss St5      5       r\GR                    " Su Sv5      5       rg)y    N)Sequence)Enum)AnyCallablecastOptionalUnion)dependencies)is_float_dtypeis_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)free_symbol_is_typesymbol_is_typeSymT   )counters   )	codecacheconfigcpp_buildercpu_vec_isairmetrics)LoopBody)BaseSchedulerNodeBaseSchedulingExternKernelSchedulerNodeForeachKernelSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)cache_on_selfget_bounds_index_exprget_fused_kernel_namehas_free_symbolsis_multi_outputs_templateis_welford_reductionparallel_num_threadsPlaceholdersympy_index_symbolsympy_index_symbol_with_prefixsympy_product
sympy_subs)NullKernelHandleropsOpsValueV   )BackendFeatureBracesBufferCSECSEVariableDataTypePropagationDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferKernel
KernelArgsOpOverridesOptimizationContext)_get_dtype_from_loopbodies_get_loop_bodycexprcexpr_indexcodegen_randCppCSEVariableDTYPE_TO_CPP
INDEX_TYPELocalBufferContextmay_unify_binary_op_mask_typepromote_args(template_fusion_with_epilogues_supportedunify_mask_base_typevalue_to_cppwin32c                       [         (       a  S$ S$ )Nz__declspec(dllexport) _IS_WINDOWS     o/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/_inductor/codegen/cpp.pyget_export_declarationrX   W   s    &1k"9r9rV   schedule)+*^||minmaxrZ   r[   r\   r^   r_   argminargmaxr]   welford)
sumprodxor_sumr^   r_   r`   ra   anywelford_reducewelford_combine)
r_   r^   rc   rd   re   rg   rh   r`   ra   rf   z
at::Tensorlongdoubleboolzstd::stringzc10::ScalarTypezat::MemoryFormatz
at::Layoutz
at::Devicez
at::Scalar)
Tensorintfloatrk   str
ScalarTypeMemoryFormatLayoutDevicenumberzstd::vectorzstd::optional)Listr   VECTORIZABLE_DTYPESMASKED_VECTORIZABLE_DTYPESc                    U[         ;   a  [        R                  nU S;   a  gU S:X  a  gU S;   a  [        U   nU[        R                  :X  a  U S;   a  [        [        R
                     n[        U5      (       a  SU S3OS	U S
3n[        U5      (       a  S	U S3OS	U S3nU S;   a  UOUnU S;   a  U$ SU SU S3$ [        U 5      (       a  S[        U    S3$ [        U 5      e)N)re   rc   rf   r   rd   r5   )r_   ra   r^   r`   r`   ra   -std::numeric_limits<>::infinity()std::numeric_limits<>::min()>::max())r_   ra   )r_   r^   IndexValue<z>{0, }Welford<>())	DTYPE_LOWP_FPtorchfloat32rH   rk   rn   r   r*   AssertionError)reduction_typedtypecdtypemin_varmax_varinit_vars         rW   reduction_initr      s$    22;;e$EJJ>5I#I!%++.F e$$ $F8=9'xx8 	 e$$ #6(-8'xx8 	
 -0AA7w / 	
 vhfXJb9	

 N++,u-.c22

((rV   c                     [         [        U      n[        U 5      (       a  SU S3$ U S;   a1  U[        R                  :X  a  [         [        R
                     nSU S3$ U$ )Nr   >ry   r   )rH   r<   r*   r   rk   rn   )r   r   scalar_types      rW   reduction_acc_typer      sd    9%@AKN+++a((--EJJ&u{{3K[M++rV   indexc           	      z   U[         R                  :H  nU S:X  a  U(       a  SOSnU SU SU 3$ U S:X  a  U SU 3$ U S:X  a  U SU 3$ U S	:X  a  U S
U 3$ U S;   a  U  SU SU S3$ U S:X  a	  SU SU S3$ U S:X  a8  [        U[        5      (       a  Uu  pxn	O[	        X5      u  pxn	SU SU SU SU	 S3	$ U S;   ax  [        US5      (       aK  UR                  [         R                  :X  a-  UR                  (       d  Ub  U  SU SU SU S3$ U  SU SU S3$ Ub  U  SU SU SU S3$ U  SU SU S3$ [        U 5      e)Nrc   |rZ    rd    * re    ^ rf    || )r^   r_   z_propagate_nan(, )rg   welford_combine(rh   , {})ry   r   z	_combine(z, static_cast<float>(), )))	r   rk   
isinstancetuplereduction_projecthasattrr   is_vecr   )
r   var
next_valuer   	src_dtypeis_boolconjunctionmeanm2weights
             rW   reduction_combiner      s    5::%G$c#a}Aj\22c*&&"c*&&d:,''' !R
|1EE))!#bA66**j%(()Df0LDf!#d4&2$bDD--J((  EJJ.%% ()3%7LZLX[\a[bbcdd &&iu4I*UWX $%Yse2j\E7!LL$%Yse2j\CC

((rV   c                 X    [        U 5      (       a  U S3U S3U S34$ U S;   a  U S3$ U$ )Nz.meanz.m2z.weightry   z.index)r*   )r   accs     rW   r   r     sF    N++e}SkcU'?::	/	/f~JrV   codeiter_varnew_iter_var
loop_startloop_endreturnc                    [        5       n[        R                  " 5        nUR                  S[         SU S[        U5       S3U S[        U5       SU S3-   5        UR                  UR                  5       5        [        U R                  5       H  u  px[        U[        [        45      (       d   eSn	[        U[        5      (       a  UR                  n	UR                  n[        R                   " S	U -   S	-   U U5      n
U	(       a  [        X5      n
UR                  U
5        M     SSS5        U$ ! , (       d  f       U$ = f)
a  
f(iter_var) is transformed to f(new_iter_var) under the inner loop
  \/
for (new_iter_var = loop_start; new_iter_var < loop_end; new_iter_var++) {
    f(new_iter_var)
}
Please be careful while using this function,
as the variable defined in f(iter_var) will be invalid outside the for loop.
For example:
auto tmp0 = in_ptr[x0]; ->
for (new_x0 = start; new_x0 < end; new_x0++){
    auto tmp0 = in_ptr[new_x0];
}
The tmp0 is invalid outside the loop.
zfor (r    = ; < ; ++)N\b)r7   
contextlib	ExitStack	writelinerI   rE   enter_contextindent	enumerate_linesr   ro   r;   namelineresub)r   r   r   r   r   transformed_codestack_r   deferred_namenew_lines              rW   move_code_under_inner_loopr   	  s<   , $~				5""J<qc+j2I1J!Lc+h"7!8<.LM	
 	,3356 -GA     !M$-- $		yyvve
3e;~PTUH'@&&x0 . 
 , - 
 	, s   D
D44
Eacc_varacc_typer   r   lenc                     [        5       n[        R                  " 5       (       a  SU  SU SU S3O
U SU  SU S3nUR                  U 5        UR	                  SU S	3S
SU  SU" X#5       S3S/5        U$ )a  
MSVC don't support dynamic array(VLA). So we use std::unique_ptr here.
Ref: https://stackoverflow.com/questions/56555406/creating-dynamic-sized-array-using-msvc-c-compiler
MSVC is the only one compiler without VLA. support. Since MSVC can't get good performance here.
We just use unique_ptr make it works on MSVC.
For other compilers, we continue to use VLA to get best performence.
auto z_arr = std::make_unique<z[]>();r   _arr[];for (int i = 0; i < ; i++){    z
_arr[i] = r   r   )r=   r   
is_msvc_clr   
writelines)r   r   r   r   r   init_fncode_bufferacc_decls           rW   reduction_prefix_arrayr   9  s     !"K !!## y0
$se2Fz7)5R0 
 XJ("3%v.7):gn&D%EQG		
 rV   bufferr   new_namec                 `   [        U R                  5       H  u  p4[        U[        [        45      (       d   e[        U[        5      (       a1  [
        R                  " SU -   S-   U UR                  5      Ul        Mh  [
        R                  " SU -   S-   U U5      U R                  U'   M     g )Nr   )r   r   r   ro   r;   r   r   r   )r   r   r   ir   s        rW   replace_acc_namer   Z  s    V]]+
 
 	
 
 dL))u$058XJSDI!vve&7%&?H:PTUFMM! ,rV   r   c                     U R                  U5      (       d  [        R                  R                  $ XS-   0n[	        X5      n[        R
                  " X0-
  5      $ Nr5   )hassympySZeror0   simplify)r   r   replacement	new_indexs       rW   	stride_atr   i  sF    99S>> ww||a.K5.I>>)+,,rV   
vec_lengthc                   ^^^^	 SmSm	UUU4S jnU	UU4S jnU n[         R                  " SSS9nU R                  [        5      (       a  U R	                  [        TU5      U5      n [         R                  " SSS9nU R                  [
        5      (       a  U R	                  [        TXg5      U5      n [         R                  " U 5      n X:w  a  [        U TT5      $ U $ )ai  
Simplifies the index expression within the range of a vectorized loop.
Given a vectorized loop variable `var` in the range of a loop with `vec_length`,
this function transforms the `index` into an equivalent form. It handles
simplifications for cases where `var` can be expressed as `vec_length * a + b`,
where `b` ranges from 0 to `vec_length - 1`. The function reduces occurrences
of `FloorDiv` and `ModularIndexing` in the `index` with best-effort optimizations.

NOTE:
The simplified index expression is intended for analysis purposes only, not
for code generation. It replaces `FloorDiv` and `ModularIndexing` with free variables
which are not dependent on the loop variable `var` in the vectorized range. Check
https://github.com/pytorch/pytorch/pull/117221#discussion_r1449746217 for more details.

Examples:
1. If `var` is `x3` and `vec_length` is 16, and `x3 = 16*a + b`, then
   `FloorDiv(x3, div)` or `ModularIndexing(x3, div, mod)` becomes a free variable
   when `div` is divisible by 16.
2. `ModularIndexing(x3, 1, mod)` can be simplified to `x3 + c` where `c` is a free
   variable when `mod` is divisible by 16.
r   c                    > [        TU 5      n[        R                  " U T5      T:X  a   [        R                  " T ST 35      nTS-  mU$ )N_div_cr5   )r   r   gcdSymbol)divisorresultdiv_freevar_idr   r   s     rW   visit_indexing_div7simplify_index_in_vec_range.<locals>.visit_indexing_div  sK    #w'99Wj)Z7\\SE/?"@AFaNrV   c                 "  > [        TX5      n[        R                  " U T5      T:X  a"  [        R                  " T ST 35      nTS-  mU$ U S:X  a>  [        R                  " UT5      T:X  a#  T[        R                  " T ST 35      -   nTS-  mU$ )N_mod_cr5   )r   r   r   r   )r   modulusr   mod_freevar_idr   r   s      rW   visit_modular_indexing;simplify_index_in_vec_range.<locals>.visit_modular_indexing  s     g799Wj)Z7\\SE/?"@AFaN  \eii<
J5<<3%vn5E(FGGFaNrV   r   T)integerr   )r   Wildr   r   replacer   r   simplify_index_in_vec_range)
r   r   r   r   r   original_indexdivmodr   r   s
    ``     @@rW   r  r  u  s    0 NN	 N
**Y
-CyyhsC02DE
**Y
-Cyy!!oc3<>TUNN5!E*5#zBBLrV   c                 >    U(       a  [        XU5      n [        X5      $ N)r  r   )r   r   r   s      rW   stride_at_vec_ranger	    s     +E
CU  rV   c                   .    \ rS rSr% Sr\\S'   \\S'   Srg)ParallelDepthi  zo
A class representing parallel depth.
Includes the starting depth of parallelism and the depth of parallelism.
parallel_depthstart_depthrU   N)__name__
__module____qualname____firstlineno____doc__rm   __annotations____static_attributes__rU   rV   rW   r  r    s    
 rV   r  c                   r   ^  \ rS rSr\S\S\4S j5       rSSS\\\	\
4      4U 4S jjrS	 rS
 rS rSrU =r$ )OuterLoopFusedSchedulerNodei  node1node2c                    UR                   UR                   L d   e[        S X4 5       5      (       d   e[        S X4 5       5      (       av  U " UR                   [        U5      [        L a  [        UR                  5       5      OU/[        U5      [        L a   [        UR                  5       5      -   U5      $ U/-   U5      $ U " UR                   X/U5      $ )Nc              3   \   #    U  H"  n[        U5      [        [        [        4;   v   M$     g 7fr  )typer  r$   r"   .0nodes     rW   	<genexpr>3OuterLoopFusedSchedulerNode.fuse.<locals>.<genexpr>  s2      
 ' J+" '   *,c              3   D   #    U  H  n[        U5      [        L v   M     g 7fr  r  r  r  s     rW   r  r          T^TtDz88^    )	schedulerallrf   r  r  listget_outer_nodes)clsr  r  outer_loop_fusion_depths       rW   fuse OuterLoopFusedSchedulerNode.fuse  s     %//111 
 
 
 
 	
 
 Te^TTT E{&AA ..01  E{&AA ..01 (!   (! & u8OPPrV   r&  r#   outer_fused_nodesc                    > UU l         X0l        / nU R                    HH  n[        U[        [        45      (       d   eUR                  [        UR                  5       5      5        MJ     [        TU ]%  X5        g r  )
r.  r+  r   r$   r"   extendr(  	get_nodessuper__init__)selfr&  r.  r+  flatten_snodes_node	__class__s         rW   r3  $OuterLoopFusedSchedulerNode.__init__  so      	 (?$++Eem5G%HIIII!!$u'8"9: , 	3rV   c                     U R                   $ r  )r.  r4  s    rW   r)  +OuterLoopFusedSchedulerNode.get_outer_nodes  s    %%%rV   c           
      j  ^ S[         S[         S[        S[        S[        4
U4S jjm[        [	        U5      S-
  5       H4  nX   R
                  nXS-      R
                  nT" UUUS5      (       a  M4    g	   U H  n[        R                  " [        R                  UR                  S U 5      n[	        UR                  5      U:  d  MP  [        U[        R                  5      (       d  Mq  [        UR                  U   [        R                  5      (       d  M  US
-  UR                  U   :  d  M    g	   g)Nleft_loop_nestright_loop_nestloop_fusion_depthcurrent_checking_depthr   c                   >^^ U R                   (       d   eUR                   (       d   eU R                   U   mUR                   U   m/ SQn[        UU4S jU 5       5      (       d  gUS:  d   eUS-
  =nS:  aL  US-   nU[        U R                   5      :  d   eU[        UR                   5      :  d   eT" U UUU5      (       d  gg)N)r   sizeoffsetstepsc              3   V   >#    U  H  n[        TU5      [        TU5      :H  v   M      g 7fr  )getattr)r  attr_compareleft_loop_levelright_loop_levels     rW   r  aOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._inner.<locals>.<genexpr>  s2       )F O\:/>?(Es   &)Fr5   r   T)loopsr'  r   )r=  r>  r?  r@  outer_loops_attr_compare_listrH  rI  _inners        @@rW   rM  NOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._inner	  s     "''''"((((,223IJO.445KL-)   )F   $)))%6%::!a?)?!)C&-N4H4H0IIII-O4I4I0JJJJ"#%*	  !rV   r5   r   F,  T)LoopNestrm   rk   ranger   	loop_nest	functoolsreduceoperatormulrangesr   r   Integer)	r4  cpp_kernel_proxy_listr+  idxr=  r>  cpp_kernel_proxyouter_rangesrM  s	           @rW   "check_outer_fusion_loop_level_attr>OuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr   s;   (	$(	%(	  #(	 %(	(	
 (	T 23a78C27AAN3!G<FFO'	   9 !6$++ ''(@)@AL $++,/FF|U]];;$++,CDMM  !3&"))*ABC ) !6, rV   c                 H   US   R                   n[        U5      nU Vs/ s H(  nUR                  R                  U R                  5      PM*     snUl        US   nX5R                  l        UR                  R                  S U R                   UR                  l        U$ s  snf Nr   )kernel_groupOuterLoopFusedKernelrR  from_loop_levelr+  innerkernelrK  )r4  rY  ra  outer_loop_fused_kernelproxyouter_fused_proxys         rW   merge_outer_fusion_kernels6OuterLoopFusedSchedulerNode.merge_outer_fusion_kernelsV  s     -Q/<<"6|"D /)
. OO++D,H,HI.)
% 2!4-D##*,=,G,G,M,M*d**-
##) ! )
s   /B)r.  r+  )r  r  r  r  classmethodr   r,  r(  r	   r"   r$   r3  r)  r]  ri  r  __classcell__r7  s   @rW   r  r    sg    !Q%!Q.?!Q !QF44  &8-&G HI4 &Tl! !rV   r  c                   >    \ rS rSrS
S\4S jjrS rS rS rS r	Sr
g	)RecordOptimizationContextih  	func_namec                 ,    Xl         S U l        S U l        g r  )rp  current_nodeopt_ctx)r4  rp  s     rW   r3  "RecordOptimizationContext.__init__i  s    "596:rV   c                    [         R                  (       d   e[         R                  R                  (       d   e[         R                  R                  U l        U R                  c   e[        R                  U R                  R
                  ;   a-  U R                  R
                  [        R                     U l        O[        5       U l        U R                  c   eU R                  U R                  l        U $ r  )	r4   interpreterrr  rA   keymetars  rp  ops_namer:  s    rW   	__enter__#RecordOptimizationContext.__enter__n  s    }}}}}))))MM66  ,,,""d&7&7&<&<<,,112E2I2IJDL.0DL||''' $rV   c                     U R                   (       d   eU R                  (       d   eU R                  U R                   R                  [        R                  '   g r  )rr  rs  rx  rA   rw  r4  exc_typeexc_valexc_tbs       rW   __exit__"RecordOptimizationContext.__exit__|  s>        |||:>,,2667rV   c                     U R                   $ r  )rs  r:  s    rW   get_opt_ctx%RecordOptimizationContext.get_opt_ctx  s    ||rV   c                 @    U R                   (       d   eU R                   $ r  )rr  r:  s    rW   get_fx_node%RecordOptimizationContext.get_fx_node  s           rV   )rr  rp  rs  N)rR   )r  r  r  r  ro   r3  rz  r  r  r  r  rU   rV   rW   ro  ro  h  s#    ;# ;
G
!rV   ro  c                   (   \ rS rSrSr\S 5       r\S 5       r\S 5       r\SNS j5       r	\S 5       r
\S	 5       r\S
 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r \S 5       r!\S  5       r"\S! 5       r#\S" 5       r$\S# 5       r%\S$ 5       r&\S% 5       r'\S& 5       r(\S' 5       r)\S( 5       r*\S) 5       r+\S* 5       r,\S+ 5       r-\S, 5       r.\S- 5       r/\S. 5       r0\S/ 5       r1\S0 5       r2\S1 5       r3\S2 5       r4\S3 5       r5\S4 5       r6\S5 5       r7\S6 5       r8\S7 5       r9\S8 5       r:\S9 5       r;\S: 5       r<\S; 5       r=\S< 5       r>\S= 5       r?\S> 5       r@\S? 5       rA\S@ 5       rB\SA 5       rC\SB 5       rD\SC 5       rE\SD 5       rF\SE 5       rG\SF\HR                  SG\HR                  4SH j5       rJ\SF\HR                  SG\HR                  4SI j5       rK\SF\HR                  SG\HR                  4SJ j5       rL\SK 5       rM\SL 5       rNSMrOg)OCppOverridesi  zMap element-wise ops to C++c                     SU  SU  SU S3$ )N	decltype()( + r   rU   abs     rW   addCppOverrides.add      1#Rs#aS**rV   c                     SU  SU  SU S3$ )Nr  r   - r   rU   r  s     rW   r   CppOverrides.sub  r  rV   c                     SU  SU  SU S3$ )Nr  r  r   r   rU   r  s     rW   rV  CppOverrides.mul  r  rV   Nc                    [        U [        5      (       d   eUc  U R                  n[        R                  R                  XU5      n[        R                  R                  R                  [        R                  R                  U5      nUR                  SX4SU05        U[        ;   a5  U[        R                  :X  a!   [        R                  R                  XXQ5        U$ )Nto_dtyper   )r   rG   r   r4   re  get_to_dtype_exprcsegeneratecomputeupdate_on_argsr   r   rn   cache_dtype_convert)xr   r   use_compute_typesexprcsevars         rW   r  CppOverrides.to_dtype  s    !^,,,,Ixx))!I>&&qxx'7'7>j1*{I6NOM!i5;;&>> HH((vErV   c                 X    U[         ;   d   U S[         S35       eS[         U    SU  S3$ )Nz missing from z.DTYPE_TO_CPPzc10::bit_cast<>(r   )rH   r  )r  r   r   s      rW   to_dtype_bitcastCppOverrides.to_dtype_bitcast  s=    $U~hZ}&UU$U 34Bqc;;rV   c                     SU  S3$ )Nz	std::abs(r   rU   r  s    rW   absCppOverrides.abs      1#QrV   c                     SU  S3$ )Nz	std::sin(r   rU   r  s    rW   sinCppOverrides.sin  r  rV   c                     SU  S3$ )Nz	std::cos(r   rU   r  s    rW   cosCppOverrides.cos  r  rV   c                     SU  SU  S3$ )Nr  z)(-r   rU   r  s    rW   negCppOverrides.neg      1#S1%%rV   c                     SU  S3$ )Nz	std::exp(r   rU   r  s    rW   expCppOverrides.exp  s     1#QrV   c                     SU  S3$ )Nz
std::exp2(r   rU   r  s    rW   exp2CppOverrides.exp2      A3a  rV   c                     SU  S3$ )Nzstd::expm1(r   rU   r  s    rW   expm1CppOverrides.expm1      QCq!!rV   c                     SU  S3$ )Nz	std::erf(r   rU   r  s    rW   erfCppOverrides.erf  r  rV   c                     SU  S3$ )Nz
std::erfc(r   rU   r  s    rW   erfcCppOverrides.erfc  r  rV   c                     SU  S3$ )Nzcalc_erfinv(r   rU   r  s    rW   erfinvCppOverrides.erfinv      aS""rV   c                     SU  S3$ )Nz
std::sqrt(r   rU   r  s    rW   sqrtCppOverrides.sqrt  r  rV   c                     SU  S3$ )Nz1 / std::sqrt(r   rU   r  s    rW   rsqrtCppOverrides.rsqrt  s    s!$$rV   c                     [         R                  R                  nUS:X  a  U  SU  S3$ Uc  SU  S3$ [        SU< 35      e)Naccuracy + decltype()(1)zstd::log1p(r   8unrecognized config cpp.inject_log1p_bug_TESTING_ONLY = r   cppinject_log1p_bug_TESTING_ONLYr   r  bugs     rW   log1pCppOverrides.log1p  sW    jj66*SQCt,,[ 1%% J3'R rV   c                     SU  S3$ )Nz	std::tan(r   rU   r  s    rW   tanCppOverrides.tan  r  rV   c                     SU  S3$ )Nz
std::tanh(r   rU   r  s    rW   tanhCppOverrides.tanh	  r  rV   c                 0    [         (       a  SU  S3$ SU  S3$ )z
On windows std::signbit only support float type.
Ref: https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/signbit?view=msvc-170
z std::signbit(static_cast<float>(r   zstd::signbit(r   rS   r  s    rW   signbitCppOverrides.signbit  s/     { /qc4	
 !1%	
rV   c                     SU  SU S3$ )Nz	std::pow(r   r   rU   r  s     rW   powCppOverrides.pow  s    1#Rs!$$rV   c                     SU  S3$ )Nz	std::log(r   rU   r  s    rW   logCppOverrides.log  r  rV   c                     SU  S3$ )Nzstd::nearbyint(r   rU   r  s    rW   roundCppOverrides.round!  s     1%%rV   c                     SU  S3$ )Nzstd::floor(r   rU   r  s    rW   floorCppOverrides.floor%  r  rV   c                 H    U  SU 3nU  SU 3nSU  SU SU SU SU SU S	3$ )
N /  % ((z
 < 0) != (z	 < 0) ? (z != 0 ? z - 1 : z) : r   rU   )r  r  quotrems       rW   floordivCppOverrides.floordiv)  sR     Cs|3qclA3j9SE$wtfDQUPVVWXXrV   c                     SU  S3$ )Nz
std::ceil(r   rU   r  s    rW   ceilCppOverrides.ceil0  r  rV   c                     SU  S3$ )Nzstd::trunc(r   rU   r  s    rW   truncCppOverrides.trunc4  r  rV   c                     U  SU 3$ Nr  rU   r  s     rW   truncdivCppOverrides.truncdiv8  s     Cs|rV   c                     SU  SU S3$ )Nz
std::fmod(r   r   rU   r  s     rW   fmodCppOverrides.fmod=  s    A3b1%%rV   c                     SU  S3$ )Nzstd::isinf(r   rU   r  s    rW   isinfCppOverrides.isinfA  r  rV   c                     SU  S3$ )Nzstd::isnan(r   rU   r  s    rW   isnanCppOverrides.isnanE  r  rV   c                     SU  S3$ )Nzstd::lgamma(r   rU   r  s    rW   lgammaCppOverrides.lgammaI  r  rV   c                     SU  S3$ )Nz
std::acos(r   rU   r  s    rW   acosCppOverrides.acosM  r  rV   c                     SU  S3$ )Nzstd::acosh(r   rU   r  s    rW   acoshCppOverrides.acoshQ  r  rV   c                     SU  S3$ )Nz
std::cosh(r   rU   r  s    rW   coshCppOverrides.coshU  r  rV   c                     SU  S3$ )Nz
std::sinh(r   rU   r  s    rW   sinhCppOverrides.sinhY  r  rV   c                     SU  S3$ )Nz
std::asin(r   rU   r  s    rW   asinCppOverrides.asin]  r  rV   c                     SU  S3$ )Nzstd::asinh(r   rU   r  s    rW   asinhCppOverrides.asinha  r  rV   c                     SU  SU S3$ )Nzstd::atan2(r   r   rU   r  ys     rW   atan2CppOverrides.atan2e      QCr!A&&rV   c                     SU  S3$ )Nz
std::atan(r   rU   r  s    rW   atanCppOverrides.atani  r  rV   c                     SU  S3$ )Nzstd::atanh(r   rU   r  s    rW   atanhCppOverrides.atanhm  r  rV   c                     SU  SU S3$ )Nzstd::copysign(r   r   rU   r'  s     rW   copysignCppOverrides.copysignq  s    s"QCq))rV   c           	         SU  S3SU  S34n[        S U 5       5      (       a  [        S U 5       5      $ [        5       n[        R                  R
                  R                  [        R                  S9n[        R                  R
                  R                  U R                  S9nUR                  SU S35        UR                  S	U S
U  SU S35        [        R                  R                  R                  U5        XC4n[        X5       H.  u  pg[        R                  R
                  R                  Xg5        M0     XC4$ )Nfrexp()[0])[1]c              3   x   #    U  H0  n[         R                  R                  R                  U5      S Lv   M2     g 7fr  r4   re  r  try_getr  	cache_keys     rW   r  %CppOverrides.frexp.<locals>.<genexpr>x  (     WJyqxx||##I.d:J   8:c              3   t   #    U  H.  n[         R                  R                  R                  U5      v   M0     g 7fr  r:  r<  s     rW   r  r>  y  &     U*Y--i88*   68r   zint32_t r   r   z = std::frexp(, &r   )r'  r   r7   r4   re  r  newvarr   int32r   r   r  splicezipput)r  
cache_keysr   exponentmantissacse_varsr=  cse_vars           rW   frexpCppOverrides.frexpu  s   aS%s$'77
WJWWWU*UUU~88<<&&U[[&988<<&&QWW&5(1-.xjqcXJbIJ	%'"%j";IHHLLY0 #<!!rV   c                     SU  SU S3$ )Nzstd::hypot(r   r   rU   r'  s     rW   hypotCppOverrides.hypot  r+  rV   c                     SU  S3$ )Nzstd::log10(r   rU   r  s    rW   log10CppOverrides.log10  r  rV   c                     SU  S3$ )Nz
std::log2(r   rU   r  s    rW   log2CppOverrides.log2  r  rV   c                     SU  SU S3$ )Nzstd::nextafter(r   r   rU   r'  s     rW   	nextafterCppOverrides.nextafter  s     2aS**rV   c                     [         R                  R                  nUS:X  a  gUS:X  a  U  S3$ US:X  a  U  SU  S3$ Uc	  SU  S	U  S
3$ [        SU< 35      e)Ncompile_errorcompile error!runtime_error	; throw 1r  r  r  z	std::max(, decltype()(0))7unrecognized config cpp.inject_relu_bug_TESTING_ONLY = r   r  inject_relu_bug_TESTING_ONLYr   r  s     rW   reluCppOverrides.relu  s|    jj55/!#O#S	?"JSQCt,,[qcQCu55 I#Q rV   c                     SU  SU S3$ )Nzmin_propagate_nan(r   r   rU   r  s     rW   minimumCppOverrides.minimum      #A3b1--rV   c                     SU  SU S3$ )Nzmax_propagate_nan(r   r   rU   r  s     rW   maximumCppOverrides.maximum  rm  rV   c                     U  SU SU 3$ )N ?  : rU   )r  r  cs      rW   whereCppOverrides.where  s    Cs#aS!!rV   c                     SU  SU S3$ )Nzmod(r   r   rU   r  s     rW   r  CppOverrides.mod  s    aS1#QrV   c                 (    [        U [        U   5      $ r  )rO   rH   )valr   s     rW   constantCppOverrides.constant  s    Ce!455rV   c                    [        [        R                  R                  U 5      5      n[        R                  R                  R                  [        R                  R                  U[        U 5      S9n[        R                  " X15      $ )Nbounds)
rD   r4   re  rename_indexingr  r  r  r&   r2   r  )r  r   idx_strr   s       rW   
index_exprCppOverrides.index_expr  s`    0067hhll##HHg.CD.I $ 
 ||C''rV   c                 6   [        5       n[        R                  R                  R	                  5       nUR                  SU S35        [        R                  R                  U5         UR                  5          U" 5       nUR                  SU S35        S S S 5        S S S 5        UR                  S5        [        R                  R                  R                  U5        [        USU S35      nU  SU SU 3$ ! , (       d  f       Nj= f! , (       d  f       Ns= f)	Nr    = [&]return r   r  z())rr  z() : )r7   r4   re  r  rF  r   swap_buffersr   r  rH  rO   )maskbodyotherr   body_varr   
other_codes          rW   maskedCppOverrides.masked  s    ~ 88<<&&(xj/0XX""4($++-VFNNWVHA./ +8( 	s	% "%9XJc)BC
s8*E*66 +8-((s$   'D
8C9D
9
D	D


Dc                     U  SU 3$ )N && rU   r  s     rW   logical_andCppOverrides.logical_and      D}rV   c                     SU  3$ )N!rU   r  s    rW   logical_notCppOverrides.logical_not      1#wrV   c                     U  SU 3$ )Nr   rU   r  s     rW   
logical_orCppOverrides.logical_or  r  rV   c                     U  SU 3$ )N != rU   r  s     rW   logical_xorCppOverrides.logical_xor  r  rV   c                     SU  SU  SU S3$ )Nr  r   & r   rU   r  s     rW   bitwise_andCppOverrides.bitwise_and  r  rV   c                     SU  SU  S3$ )Nr  z)(~r   rU   r  s    rW   bitwise_notCppOverrides.bitwise_not  r  rV   c                     SU  SU  SU S3$ )Nr  r   | r   rU   r  s     rW   
bitwise_orCppOverrides.bitwise_or  r  rV   c                     SU  SU  SU S3$ )Nr  r  r   r   rU   r  s     rW   bitwise_xorCppOverrides.bitwise_xor  r  rV   c                    [        5       nUR                  S5        UR                  5          [        U R                     nUR                  SU SU S35        UR                  SU SU SU S35        UR                  5          UR                  S	U  S
35        S S S 5        UR                  S	U  SU SU  SU S3	5        S S S 5        UR                  S5        U$ ! , (       d  f       NG= f! , (       d  f       N2= f)N[&]()constexpr decltype() max_shift = sizeof(z) * CHAR_BIT;$if ((static_cast<std::make_signed_t<>>() < 0) || ( >= max_shift))return decltype(z)(0);z#)(static_cast<std::make_unsigned_t<z) << r   ()r7   r   r   rH   r   r  r  r   scalar_ts       rW   bitwise_left_shiftCppOverrides.bitwise_left_shift  s    ~w[[]#AGG,HNN%aS(=hZ}U NN6xjA3kRSQTTcd !1!E:; NN"1#%H
RUVWUXX]^_]``bc  	t  ]s$   AC+C&C+
C(	$C++
C9c           
         [        5       nUR                  S5        UR                  5          [        U R                     nUR                  SU SU SU S35        UR                  SU SU SU S	35        UR                  5          UR                  S
U  SU  S35        S S S 5        UR                  S
U  SU  SU S35        S S S 5        UR                  S5        U$ ! , (       d  f       ND= f! , (       d  f       N2= f)Nr  r  r  z ) * CHAR_BIT - std::is_signed_v<z>;r  r  r  r  r  r  z >> max_shift); >> r   r  r  r  s       rW   bitwise_right_shift CppOverrides.bitwise_right_shift  s    ~w[[]#AGG,HNN%aS(=hZGghpgqqst NN6xjA3kRSQTTcd !1!BqcIJ NN-aS1#T!B?@  	t	  ]s$   AC.C#C.
C+	'C..
C<seedrC  c                     SU  SU S3$ )Nznormalized_rand_cpu(r   r   rU   r  rC  s     rW   randCppOverrides.rand  s    %dV2fXQ77rV   c                     SU  SU S3$ )Nz
randn_cpu(r   r   rU   r  s     rW   randnCppOverrides.randn  s    D6F81--rV   c           	           SU  SU SU SU S3	$ )Nzrandint64_cpu(r   r   rU   )r  rC  lowhighs       rW   	randint64CppOverrides.randint64!  s#    vRxr#ba@@rV   c                     SU  SU  SU  S3$ )Nr  z)(1) / (decltype(z)(1) + std::exp(-r   rU   r  s    rW   sigmoidCppOverrides.sigmoid%  s    1#.qc1B1#RHHrV   c           
      X   [        5       nSU  S3nSU  S3nUR                  S5        UR                  5          UR                  SU  SU SU S35        UR                  S	U  S
U SU S35        UR                  S5        S S S 5        UR                  S5        U$ ! , (       d  f       N!= f)Nr  )(0)r  r  auto left = z > 0 ? rs  r   auto right = z < 0 ? return left - right;r  r7   r   r   )r  r   scalar_zero
scalar_ones       rW   signCppOverrides.sign)  s    ~!!D) 4(
w[[]NN\!GJ<s;-qQRNN]1#WZLK=PQRSNN12  	t ]s   AB
B)rU   NT)Pr  r  r  r  r  staticmethodr  r   rV  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r  r  r  r  r  r  r  r!  r$  r)  r-  r0  r3  rP  rS  rV  rY  r\  rh  rk  ro  ru  r  r{  r  r  r  r  r  r  r  r  r  r  r  r  r   Exprr  r  r  r  r  r  rU   rV   rW   r  r    s   %+ + + + + + ( (T < <             & &     ! ! " "     ! ! # # ! ! % % 	 	     ! ! 	
 	
 % %     & & " " Y Y ! ! " "   & & " " " " # # ! ! " " ! ! ! ! ! ! " " ' ' ! ! " " * * " "  ' ' " " ! ! + +   . . . . " "     6 6 ( ( 7 7          + + & & + + + +  &  " 85:: 8uzz 8 8 .EJJ .

 . . A

 AEJJ A A I I 
 
rV   r  r  c                      ^  \ rS rSrSrU 4S jr\S 5       r\S 5       r\S 5       r	\S 5       r
\S 5       r\S	 5       r\S
 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r \S 5       r!\S 5       r"\S  5       r#\S! 5       r$\S" 5       r%\S# 5       r&\S$ 5       r'\S% 5       r(\S& 5       r)\S' 5       r*\S( 5       r+\S) 5       r,\S* 5       r-\S+ 5       r.\S, 5       r/\S- 5       r0\S. 5       r1\S/ 5       r2\S0 5       r3\S1 5       r4\S2 5       r5\S3 5       r6\S4 5       r7\S5 5       r8\S6 5       r9\S7 5       r:\S8 5       r;\S9 5       r<\S: 5       r=\S; 5       r>\S< 5       r?\S= 5       r@\S> 5       rA\S? 5       rB\S@ 5       rC\SA 5       rD\SB 5       rE\SC 5       rF\SD 5       rG\SE 5       rH\SF 5       rI\SG 5       rJ\SH 5       rK\SI 5       rL\SJ 5       rM\SK 5       rN\STSL j5       rO\SM 5       rP\SN 5       rQ\SO 5       rR\SP 5       rS\TSQ 5       rU\TSR 5       rVSSrWU =rX$ )UCppVecOverridesi:  z.Map element-wise ops to aten vectorization C++c                    >^ [         TU ]  U 5      mUU4S jn[        [        5      R	                  5        H@  u  pE[        USS 5      [        :X  d  M  US;  d  M$  [        TXC" UR                  5      5        MB     T$ )Nc                    >^  UU U4S jnU$ )Nc                    > U  Vs/ s HT  n[        U[        [        R                  45      (       d*  [        U[        5      (       d  M?  UR
                  (       a  MR  UPMV     nnU  Vs/ s H/  n[        U[        5      (       d  M  UR
                  (       d  M-  UPM1     nn[        U 5      nU(       a  U(       a  / nU  H  n[        U[        [        R                  45      (       a  [        U[        R                  5      (       a7  UR                  (       d&  [        R                  " U[        R                  5      nO%[        R                  " U[        R                  5      n[        U[        5      (       a  UR                  OUnUR                  U5        M     U(       a@  [!        U5      S:X  a  [#        U5      nO%T
[$        R&                  :X  a  [#        USS  5      USS & U(       a  U(       a  [        [(        R*                  [,        5      (       d   eU Vs/ s H}  n[        U[        5      (       ac  UR
                  (       dR  T
[$        R.                  [$        R0                  [$        R2                  4;  a  [(        R*                  R5                  U5      OUPM     nnU(       a  T
" U0 UD6$ [7        [$        T5      n[9        UT
R:                  5      nUc   eU" U 0 UD6$ s  snf s  snf s  snf )Nr   r5   )r   rm   r   r  rG   r   r(  	is_numberr2   r  r   int64r{  r3   valueappendr   rL   r  ru  r4   re  CppVecKernelr  r  r  	broadcastr2  rF  r  )argskwargsargscalarsvectorsnew_argsnew_arg
scalar_opsscalar_funcr7  funcr4  s            rW   wrapper6CppVecOverrides.__new__.<locals>.wrap.<locals>.wrapperM  s`     $#!#UZZ'899"37  AD

 #    $#!#~6 ;>:: #  
  :w!H#%cC+<==)#uzz::3==&)nnS%++&F&)ll3&D/9#x/H/H#))cC ,  $  8})#/#9!6!66'3HQRL'A w%ahh====  (0  (0G !+7N C C(/$($3$8$8$3$9$9$3$=$=("%"	 HH..w7 ")) (0   $ 4V44 "'!=J")*dmm"DK&222&777@ s*   >KKK'K"K"K"BK'rU   )r  r  r7  r4  s   ` rW   wrap%CppVecOverrides.__new__.<locals>.wrap@  s    @8D NrV   r7  )r  r  )	r2  __new__varsr  itemsrF  r  setattr__func__)r*  r  kargsr  r   methodr4  r7  s         @rW   r  CppVecOverrides.__new__=  sq    ws#O	b !1779LDv{D1\Ad S G dD$9: : rV   c                     U  SU 3$ )Nr  rU   r  s     rW   r  CppVecOverrides.add      Cs|rV   c                     U  SU 3$ )Nr  rU   r  s     rW   r   CppVecOverrides.sub  r  rV   c                     U  SU 3$ Nr   rU   r  s     rW   rV  CppVecOverrides.mul  r  rV   c                     U  SU 3$ r  rU   r  s     rW   truedivCppVecOverrides.truediv  r  rV   c                     U  S3$ )Nz.abs()rU   r  s    rW   r  CppVecOverrides.abs      F|rV   c                     U  S3$ )Nz.sin()rU   r  s    rW   r  CppVecOverrides.sin  r  rV   c                     U  S3$ )Nz.cos()rU   r  s    rW   r  CppVecOverrides.cos  r  rV   c                     U  S3$ )Nz.exp()rU   r  s    rW   r  CppVecOverrides.exp  r  rV   c                     U  S3$ )Nz.exp2()rU   r  s    rW   r  CppVecOverrides.exp2      G}rV   c                     SU  S3nU  SU 3$ )Nr  r  z	.exp() - rU   )r  vec_ones     rW   r  CppVecOverrides.expm1  s#     aS%IgY''rV   c                     U  S3$ )Nz.erf()rU   r  s    rW   r  CppVecOverrides.erf  r  rV   c                     U  S3$ )Nz.erfc()rU   r  s    rW   r  CppVecOverrides.erfc  r  rV   c                     U  S3$ )Nz	.erfinv()rU   r  s    rW   r  CppVecOverrides.erfinv      IrV   c                     U  S3$ )Nz.sqrt()rU   r  s    rW   r  CppVecOverrides.sqrt  r  rV   c                     [        [        R                  [        5      (       d   e[        U [        5      (       d   eU R
                  c   e[        R                  R                  U R
                  5       SU  SU S3$ )N( == r   r   r4   re  r  rG   r   _get_mask_typer'  s     rW   eqCppVecOverrides.eq  i    !((L1111!^,,,,ww"""(())!''231QCtA3a@@rV   c                    [        [        R                  [        5      (       d   e[        U [        5      (       d   eU R
                  [        R                  :X  aN  UR
                  [        R                  :X  d   e[        [        R                  R                  X45      u  p#U SU 3$ U R
                  c   e[        R                  R                  U R
                  5       SU  SU S3$ )Nr  r$  r   )r   r4   re  r  rG   r   r   rk   rN   r  r'  )r  r(  x_casty_casts       rW   neCppVecOverrides.ne  s    !((L1111!^,,,,77ejj 77ejj(((1!((2B2BQFKNFXT&**77&&&hh--agg67q4s!DDrV   c                     [        [        R                  [        5      (       d   e[        U [        5      (       d   eU R
                  c   e[        R                  R                  U R
                  5       SU  SU S3$ )Nr$  r   r   r&  r'  s     rW   ltCppVecOverrides.lt  i    !((L1111!^,,,,ww"""(())!''231QCs1#Q??rV   c                     [        [        R                  [        5      (       d   e[        U [        5      (       d   eU R
                  c   e[        R                  R                  U R
                  5       SU  SU S3$ )Nr$  z > r   r&  r'  s     rW   gtCppVecOverrides.gt  r3  rV   c                     [        [        R                  [        5      (       d   e[        U [        5      (       d   eU R
                  c   e[        R                  R                  U R
                  5       SU  SU S3$ )Nr$   <= r   r&  r'  s     rW   leCppVecOverrides.le  r*  rV   c                     [        [        R                  [        5      (       d   e[        U [        5      (       d   eU R
                  c   e[        R                  R                  U R
                  5       SU  SU S3$ )Nr$   >= r   r&  r'  s     rW   geCppVecOverrides.ge  r*  rV   c                     U  SU 3$ Nr  rU   r'  s     rW   and_CppVecOverrides.and_  r  rV   c                     U  S3$ )Nz.rsqrt()rU   r  s    rW   r  CppVecOverrides.rsqrt      H~rV   c                     U  SU S3$ )Nz.pow(r   rU   r  s     rW   r  CppVecOverrides.pow  s    E!ArV   c                     U  S3$ )Nz.log()rU   r  s    rW   r  CppVecOverrides.log  r  rV   c                     U  S3$ )Nz.round()rU   r  s    rW   r  CppVecOverrides.round  rE  rV   c                     U  S3$ )Nz.floor()rU   r  s    rW   r  CppVecOverrides.floor  rE  rV   c                     U  S3$ )Nz.ceil()rU   r  s    rW   r  CppVecOverrides.ceil  r  rV   c                     U  S3$ )Nz.trunc()rU   r  s    rW   r  CppVecOverrides.trunc  rE  rV   c                     U  SU S3$ )Nz.fmod(r   rU   r  s     rW   r	  CppVecOverrides.fmod#  s    F1#QrV   c                     U  S3$ )Nz	.lgamma()rU   r  s    rW   r  CppVecOverrides.lgamma'  r   rV   c                 *    [        X5      u  pU  SU 3$ r@  rK   r  s     rW   r  CppVecOverrides.logical_and+      ,Q2Cs|rV   c                     SU  3$ N~rU   r  s    rW   r  CppVecOverrides.logical_not0  r  rV   c                 *    [        X5      u  pU  SU 3$ Nr  rW  r  s     rW   r  CppVecOverrides.logical_or4  rY  rV   c                 *    [        X5      u  pU  SU 3$ Nr   rW  r  s     rW   r  CppVecOverrides.logical_xor9  rY  rV   c                 *    [        X5      u  pU  SU 3$ r@  rW  r  s     rW   r  CppVecOverrides.bitwise_and>  rY  rV   c                     SU  3$ r[  rU   r  s    rW   r  CppVecOverrides.bitwise_notC  r  rV   c                 *    [        X5      u  pU  SU 3$ r_  rW  r  s     rW   r  CppVecOverrides.bitwise_orG  rY  rV   c                 *    [        X5      u  pU  SU 3$ rb  rW  r  s     rW   r  CppVecOverrides.bitwise_xorL  rY  rV   c                     U  SU 3$ )Nz << rU   r  s     rW   r  "CppVecOverrides.bitwise_left_shiftQ  r  rV   c                     U  SU 3$ )Nr  rU   r  s     rW   r  #CppVecOverrides.bitwise_right_shiftU  r  rV   c                     [        [        R                  [        5      (       d   e[        R                  R	                  X5       $ r  )r   r4   re  r  load)r   rC  s     rW   	load_seedCppVecOverrides.load_seedY  s/    !((L1111((---./rV   c                     [        [        R                  [        5      (       d   e[	        5       nSU  S3n[        XU5      $ )Nz)result[offset_idx] = normalized_rand_cpu(, offset[offset_idx]);r   r4   re  r  r7   rF   r  rC  r   rand_functions       rW   r  CppVecOverrides.rand^  sA    !((L1111~7v=ST 	 F-88rV   c                     [        [        R                  [        5      (       d   e[	        5       nSU  S3n[        XU5      $ )Nzresult[offset_idx] = randn_cpu(ru  rv  rw  s       rW   r  CppVecOverrides.randng  s<    !((L1111~9$?UVF-88rV   c                     [        [        R                  [        5      (       d   e[	        5       nSU  SU SU S3n[        XU[        R                  5      $ )Nz#result[offset_idx] = randint64_cpu(z, offset[offset_idx], r   r   )r   r4   re  r  r7   rF   r   r  )r  rC  r  r  r   rx  s         rW   r  CppVecOverrides.randint64n  sT    !((L1111~=dVCYZ]Y^^`ae`ffhiF-EErV   c                 ~    U R                   UR                   :X  d   S5       eU  S[        R                  X5       SU 3$ )Nz;remainder vec implementation expect the same inputs' dtype.z - (z) * )r   r  r  r  s     rW   	remainderCppVecOverrides.remainderu  sG    ww!''! 	
I	
! D11!78QC@@rV   c                     U  S3$ )Nz.tan()rU   r  s    rW   r  CppVecOverrides.tan|  r  rV   c                     U  S3$ )Nz.tanh()rU   r  s    rW   r  CppVecOverrides.tanh  r  rV   c                     U  S3$ )Nz.reciprocal()rU   r  s    rW   
reciprocalCppVecOverrides.reciprocal  s    M""rV   c                     U  S3$ )Nz.atan()rU   r  s    rW   r-  CppVecOverrides.atan  r  rV   c                     U  S3$ )Nz.acos()rU   r  s    rW   r  CppVecOverrides.acos  r  rV   c                     U  S3$ )Nz.asin()rU   r  s    rW   r!  CppVecOverrides.asin  r  rV   c                     U  S3$ )Nz.cosh()rU   r  s    rW   r  CppVecOverrides.cosh  r  rV   c                     U  S3$ )Nz.sinh()rU   r  s    rW   r  CppVecOverrides.sinh  r  rV   c                     U  S3$ )Nz.log10()rU   r  s    rW   rV  CppVecOverrides.log10  rE  rV   c                     U  S3$ )Nz.log2()rU   r  s    rW   rY  CppVecOverrides.log2  r  rV   c                     U  SU S3$ )Nz.nextafter(r   rU   r'  s     rW   r\  CppVecOverrides.nextafter  s    Ks!$$rV   c                     U  SU S3$ )Nz
.copysign(r   rU   r  s     rW   r3  CppVecOverrides.copysign  s    Jqc##rV   c                     U  SU S3$ )Nz.atan2(r   rU   r  s     rW   r)  CppVecOverrides.atan2      GA3a  rV   c                     U  SU S3$ )Nz.hypot(r   rU   r  s     rW   rS  CppVecOverrides.hypot  r  rV   c           
      <    SU  S3nSU  S3nU SU SU  SU SU  S3
$ )	Nr  r  z)(0.5)z * ((r  z)/(r  z)).log()rU   )r  r  vec_one_halfs      rW   r0  CppVecOverrides.atanh  sE     aS%"1#V,uWIS3wis1#XNNrV   c                     U  S3$ )Nz.asinh()rU   r  s    rW   r$  CppVecOverrides.asinh  rE  rV   c                     U  S3$ )Nz.acosh()rU   r  s    rW   r  CppVecOverrides.acosh  rE  rV   c                     [         R                  R                  nUS:X  a  gUS:X  a  U  S3$ US:X  a  U  SU  S3$ Uc	  SU  S	U  S
3$ [        SU< 35      e)Nr_  r`  ra  rb  r  r  r  zat::vec::clamp_min(rc  rd  re  rf  r  s     rW   rh  CppVecOverrides.relu  s|    jj55/!#O#S	?"JSQCt,,[(;qc?? I#Q rV   c                     SU  SU  SU  S3$ )Nr  z)(1)/(decltype(z)(1) + z.neg().exp())rU   r  s    rW   r  CppVecOverrides.sigmoid  s    1#_QCwqcGGrV   c                     U  S3$ )Nz.neg()rU   r  s    rW   r  CppVecOverrides.neg  r  rV   c                    [        U R                  5      (       a*  U R                  UR                  :X  d   S5       eSU  SU S3$ [        S X4 5       5      (       d   eSU  S3n[        R                  R                  UR                  5      S:  a,  U SS[        R                  R                  -  S-
   S	U S
U S3nU  SU 3nSU  SU SU S3nSU  SU SU SU S3	nU SU SU SU S
U SU S3$ )NzDdiv_floor_floating_vec implementation expect the same inputs' dtype.zdiv_floor_floating_vec(r   r   c              3   L   #    U  H  n[        UR                  5      v   M     g 7fr  )r   r   )r  items     rW   r  +CppVecOverrides.floordiv.<locals>.<genexpr>  s     G'

33   "$r  r5   ::blend<r  (1), r  r$  r  r  z(0))r  r   z	(0)) != (z(0)))z	::blendv(r  r  )r   r   r'  r4   re  _get_raw_num_vectorstiling_factor)r  r  _tr  has_remis_negs         rW   r  CppVecOverrides.floordiv  s5   !''""77agg% V% -QCr!A66GGGGGGQCq!Bxx,,QWW59d(A)?)?$?1#D"ERt5QRPSSTUSA3<D!Cs$rd$/G!Ct9QCs2$e<FT4&4&B4uWISPQRRrV   c                     [         R                  R                  UR                  5      S:  a2  SU S3nU SS[         R                  R                  -  S-
   SU SU S3nU  SU 3$ )Nr5   r  r   r  r  r  r  )r4   re  r  r   r  )r  r  r  s      rW   r  CppVecOverrides.truncdiv  sp     88((1A5QCq!B$hQXX%;%; ;q@AB4uQCqQACs|rV   c                     U R                   [        R                  :X  aN  UR                   [        R                  :X  d   e[        [        R
                  R                  X45      u  p#U SU 3$ SU  SU S3$ )Nr  at::vec::minimum(r   r   r   r   rk   rN   r4   re  r  r  r  a_castb_casts       rW   rk  CppVecOverrides.minimum  h    77ejj 77ejj(((1!((2B2BQFKNFXS))&qcA3a00rV   c                     U R                   [        R                  :X  aN  UR                   [        R                  :X  d   e[        [        R
                  R                  X45      u  p#U SU 3$ SU  SU S3$ )Nr  at::vec::maximum(r   r   r  r  s       rW   ro  CppVecOverrides.maximum  r  rV   c                     U  SU  3$ r  rU   r  s    rW   squareCppVecOverrides.square
  r  rV   c                    [        [        R                  [        5      (       d   eUR                  [
        R                  :X  aX  UR                  [
        R                  :X  d   e[        [        R                  R                  XU45      u  p4nSU SU SU SU S3	$ SU SU SU S[        R                  R                  XR                  5       S3	$ )Nr  
)::blendv(r   r   )
r   r4   re  r  r   r   rk   rN   r  _get_mask_cast)r  r  rt  blendv_ablendv_bblendv_cs         rW   ru  CppVecOverrides.where  s    !((L111177ejj 77ejj(((+?  1),(H xj
8*Bxj8*TUVVqcA3b2ahh6M6MaQXQX6Y5ZZ[\\rV   c                    [        5       nSU  S3nSU  S3nSU  SU SU SU SU  S3nSU  SU SU SU  SU S3nUR                  S5        UR                  5          UR                  S	U S
35        UR                  SU S
35        UR                  S5        S S S 5        UR                  S5        U$ ! , (       d  f       N!= f)Nr  r  r  r  r   r   r   r  r  r   r  r  r  r  )r  r   vec_zeror  blendv_lblendv_rs         rW   r  CppVecOverrides.sign  s    ~qc&aS%qcH:Ry8*CPQsRSTqcH:Ry1#S
RSTw[[]NN\(156NN]8*A67NN12  	t ]s   <B33
Cc           
         U[         R                  [         R                  [         R                  [         R                  [         R
                  [         R                  [         R                  [         R                  [         R                  4	;   d   [         SU 35       e[        U [        5      (       d   eU R                  n[        R                  R!                  XU5      n[        R                  R"                  R%                  [        R                  R&                  U5      nUR)                  SX4SU05        U[*        ;   a4  U[         R                  :X  a   [        R                  R-                  XXQ5        U$ )Nz does not support r  r   )r   rk   float64rn   bfloat16float16uint8int8rG  r  r  r   rG   r   r4   re  r  r  r  r  r  r   r  )r  r   r   use_compute_dtypesr  r  s         rW   r  CppVecOverrides.to_dtype)  s   JJMMKKNNMMKKJJKKKK

 

 
	2 Z)%1
	2 

 !^,,,,GG	xx))!I>&&qxx'7'7>j1*{I6NOM!i5;;&>HH((vErV   c                     [         R                  R                  nUS:X  a  U  SU  S3$ Uc  U  S3$ [        SU< 35      e)Nr  r  r  z.log1p()r  r  r  s     rW   r  CppVecOverrides.log1p?  sT    jj66*SQCt,,[S>! J3'R rV   c                 
  ^ [        [        R                  [        5      (       d   e[	        5       n[        R                  R
                  R                  5       n[        R                  R                  U 5       nUR                  SU S35        [        R                  R                  U5         UR                  5          U" 5       nUR                  SU S35        S S S 5        S S S 5        S S S 5        UR                  S5        [        R                  R                  R                  U5        WR                  mU S3nU4S jnUR                  (       a  Un	OU" U5      n	[        U[         T   5      n
U" U
5      n[        W["        5      (       d   U5       eUR                  (       Ga&  [	        5       nUR                  S5        [        R                  R                  U5         UR                  5          UR                  SU S	35        UR                  5          UR                  SU S35        S S S 5        UR                  S
5        UR                  5          [        R                  R
                  R%                  [        R                  R                  U	5      n[        R                  R
                  R%                  [        R                  R                  U5      n[        U["        5      (       d   U5       e[        U["        5      (       d   U5       eTUl        TUl        [        R                  R&                  nUR                  SUR)                  X\U5       S35        S S S 5        S S S 5        S S S 5        UR                  S5        [        R                  R
                  R%                  [        R                  R                  U5      nOUR                  (       aK  [        R                  R
                  R%                  [        R                  R                  U  SU	 SU 35      nOJ[        R                  R
                  R%                  [        R                  R                  U  SU SU
 35      nUR+                  SXX&40 5        U$ ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GNx= f! , (       d  f       GN= f! , (       d  f       GN= f)Nr   r  r  r   r  c                    > T[         R                  :X  a$  [        R                  R	                  5        SU  S3$ [        R                  R                  T5       SU  S3$ )N::from(r   r$  )r   rk   r4   re  r'  _get_vec_type)r   r   s    rW   maskify_or_vecify1CppVecOverrides.masked.<locals>.maskify_or_vecify[  s]     EJJ& 88**,-WTF!< ..u56avQ?rV   [&]if (z.all_zero())elserr  rs  r  )r   r4   re  r  r7   r  rF  r  r   r  r   r  rH  r   r   rO   rH   rG   r  	overridesru  r  )r  r  r  r   r   new_maskr   	body_coder  body_code_vecr  other_code_vecbody_vec_varother_vec_varr  r  r   s                   @rW   r  CppVecOverrides.maskedK  s   !((L1111~hhll!!#XX__T"hNNU3%v./&&t,dkkm23 /<, #
 	s	%e2J		 ==%M-i8M!%e)<=
*:6(N33=X=3???>DNN5!&&t,dkkmhZ|<=[[]NNW^,<A#>? #v&[[]#$88<<#8#8((%$L %&HHLL$9$9((&%M &lNCCQ\QC%m^DDSmSD).L&*/M'**  NN!)//(-"X!YYZ[# # /<,2 NN4 XX\\**  F ]]XX\\**  TF#m_C?O"PF XX\\**  TF#i[J<"HF
 	hU(CRHI /<m,, #"> #] #] /<m,,s   85R0-R>RR#R0#S84&S&S0)S&D
S#S&+S8
RR
R-	(R00
R?
SS&
S#S&&
S5	0S88
Tc                 `   [        [        R                  [        5      (       d   e[        R                  R	                  U 5      n[        R                  R
                  [        R                  R                     n[        R                  R                  X#5      nUS:X  a  [        R                  X5      $ Ub  [        R                  R                  R                  [        R                  R                  [        U5      [        U 5      S9n[        R                   " XQ5      n[        U["        5      (       a  UR$                  n[        R                  R'                  Xd5      nO9[        R                  R)                  S X![        R                  R                  5      nUR+                  SX40 5        U$ )Nr   r~  r  )r   r4   re  r  r  itervars
tiling_idx_try_get_const_strider  r  r  r  r  rD   r&   r2   r  r3   r  arange_load_or_store_non_contiguousr  )r  r   r   
tiling_varstriderZ  r  r  s           rW   r  CppVecOverrides.index_expr  s-   !((L1111((.XX&&qxx':':;
//BQ;**477((,,''  %,7LT7R ( C LL,E%**XX__U3FXX;;eAHH$4$4F 	lTM2>rV   c           
         SU  S3SU  S34n[        S U 5       5      (       a  [        S U 5       5      $ [        U R                     n[        R
                  R                  (       a  [        R
                  R                  O[        R
                  R                  n[        5       n[        R
                  R                  R                  [        R                  S9n[        R
                  R                  R                  U R                  S9nUR                  SU 40 S9  UR                  SU 40 S9  [        R
                  R                  U R                  5      nUS	:X  a  S
U S3OSU SU S3nUR                  US	:X  a  SU S3OSU SU S35        UR                  U SU S35        UR                  S5        UR!                  5          UR                  SU S[        R
                  R                   S35        UR                  U  S[#        U5       S35        UR                  S[        R
                  R                   S35        UR                  SU S[        R
                  R                   S35        UR                  S[#        U5       S35        UR!                  5          UR                  S5        S S S 5        UR                  US	:X  a  U S[#        U5       S3OU SU S [#        U5       S35        UR                  U S!U S"[#        U5       S35        S S S 5        UR                  S#5        [        R
                  R$                  R'                  U5        Xe4n	[)        X5       H.  u  p[        R
                  R                  R+                  X5        M0     Xe4$ ! , (       d  f       N= f! , (       d  f       N= f)$Nr6  r7  r8  c              3   x   #    U  H0  n[         R                  R                  R                  U5      S Lv   M2     g 7fr  r:  r<  s     rW   r  (CppVecOverrides.frexp.<locals>.<genexpr>  r?  r@  c              3   t   #    U  H.  n[         R                  R                  R                  U5      v   M0     g 7fr  r:  r<  s     rW   r  r    rB  rC  rD  rP  )r  r5   at::vec::Vectorized<r   at::vec::VectorizedN<r   zat::vec::Vectorized<int32_t> r   zat::vec::VectorizedN<int32_t, > r   r  __at_align__ std::array<	> tmpbuf;.store(tmpbuf.data(), r   z!__at_align__ std::array<int32_t, z> tmpbuf_exponent;z> tmpbuf_mantissa;r   r   z@tmpbuf_mantissa[i] = std::frexp(tmpbuf[i], &tmpbuf_exponent[i]);z? = at::vec::Vectorized<int32_t>::loadu(tmpbuf_exponent.data(), z! = at::vec::VectorizedN<int32_t, z!>::loadu(tmpbuf_exponent.data(), r   z ::loadu(tmpbuf_mantissa.data(), z();)r'  r   rH   r   r4   re  	tail_sizer  r7   r  rF  r   rG  r  _get_num_vectorsr   r   rE   r  rH  rI  rJ  )r  rK  r   rB  r   rL  rM  n_vec
mantissa_trN  r=  rO  s               rW   rP  CppVecOverrides.frexp  s_   aS%s$'77
WJWWWU*UUUagg&%&XX%7%7qxx!!QXX=S=S~88<<&&U[[&988<<&&QWW&5!b9!b9))!''2 z #6(!,(5'; 	
 	z ,H:Q71%8*AF	

 	*Qxj23w[[]NN*6("QXX5K5K4LIV NNaS 6{47H6ILMNN3AHH4J4J3KK]^ NN*6("QXX5K5K4LL^_ NN1+d2C1DFKLV  NNA: *[\ghl\m[nnpq z!B5'Ijkvw{k|j}}  A
 NN*C
|+KKX\L]K^^`a+ 0 	u	%'"%j";IHHLLY0 #<!!#  ]s&   
CN3&N"8A%N3"
N0	,N33
Oc                    ^ U4S jnU$ )Nc                  t  > U(       a   e[         R                  n[        U[        5      (       d   e[	        5       nUR                  S5        U S   R                  nUR                  U5      nUR                  (       a  UR                  OUR                  n/ n[        U   nTR                  S;   n	U	(       a  SOUn
TR                  S:X  a  [        U S      OU
n
UR                  5          [        U 5       H  u  p[        U[        5      (       a  UR                  (       d   eUR                  U:X  d   eUR                  SU SUR                   S	U S
35        UR                  U SU S[!        U5       S35        UR#                  SU S35        M  UR#                  U5        M     UR                  SU
 SUR                   S35        T" U6 nUR                  S[!        U5       S35        UR                  5          UR                  SU S
35        S S S 5        U	(       a  UR                  (       a   eSnSU SU S3nO$S[!        U5       3nUS:X  a  SU
 S3nO	SU
 SU S3nUR                  SU SU S35        S S S 5        UR                  S5        U$ ! , (       d  f       N= f! , (       d  f       N2= f) Nr  r   )r  r  r  rk   r  r   r   z> tmpbufr   z.store(tmpbufz	.data(), r   tmpbufz[i]z> tmpbuf_out;r   r   ztmpbuf_out[i] = ztmpbuf_out.data()at::vec::VecMask<,z>::fromztmpbuf_out.data(), r5   r  z>::loaduz at::vec::VectorizedN<r  r$  r  )r4   re  r   r  r7   r   r   r  r  r  rH   r  r   r   rG   r   rE   r  )r  r  re  r   	vec_dtyper  rB  scalar_argsr   output_maskoctypeargidxr  res	load_argsload_fnr  s                   rW   rd  )CppVecOverrides._scalarize.<locals>.inner  s   :XXFfl3333>DNN7#QI++I6E'-'7'76##V=Q=QDK!),F%.. 3 K
  +VF  ((,>> T"X& 
 #,T?KF!#~66"zz)z"yyI5556vhbAUAU@VV^_e^ffgh "e=	+dBSATTVW $**VF83+?@#**3/ $3 .vhb9M9M8Nm\ ";/!5k$6G5HOP[[]NN%5cU!#<= #%//// 3I 1&5'IG"5k$6G5H IIz$8"I$:6("UG8"T	9+R@A? @ NN4 K #]' s&   )DJ)=JA*J)
J&	"J))
J7rU   )r*  r  rd  s    ` rW   
_scalarizeCppVecOverrides._scalarize  s    7	r rV   c                    [        [        5      n[        [        5      R                  5        HY  u  p#[	        U[
        5      (       d  M  X!;  d  M#  U R                  UR                  5      nX$l        [        X[        U5      5        M[     g r  )
r  r  r  r  r   r  r  r  r  r  )r*  vec_varsr   r  r  s        rW   _initialize_scalarize%CppVecOverrides._initialize_scalarize%  s^    ( .446LD&,//D4H~~foo6 $<#56	 7rV   rU   r  )Yr  r  r  r  r  r  r  r  r   rV  r	  r  r  r  r  r  r  r  r  r  r  r(  r.  r1  r5  r9  r=  rA  r  r  r  r  r  r  r  r	  r  r  r  r  r  r  r  r  r  r  r  rr  r  r  r  r  r  r  r  r-  r  r!  r  r  rV  rY  r\  r3  r)  rS  r0  r$  r  rh  r  r  r  r  rk  ro  r  ru  r  r  r  r  r  rP  rk  r  r  r  rl  rm  s   @rW   r  r  :  sV   8[z                   ( (
         A A 	E 	E @ @ @ @ A A A A                                           0 0 9 9 9 9 F F A A     # #               % % $ $ ! ! ! ! O O        H H   S S$   1 1 1 1   	] 	]    * 	 	 J JX  , 6" 6"p : :x 7 7rV   r  cppvecc                   $    \ rS rSr\S 5       rSrg)CppTile2DOverridesi3  c                     [        [        R                  [        5      (       d   e[        R                  R	                  U 5      n [
        R                  X5      $ r  )r   r4   re  CppTile2DKerneltransform_indexingr  r  )r  r   s     rW   r  CppTile2DOverrides.index_expr4  s>    !((O4444xx**40))$66rV   rU   N)r  r  r  r  r  r  r  rU   rV   rW   r  r  3  s    7 7rV   r  c                     ^  \ rS rSr\r\rSrSr	U 4S jr
\\4S jrS rS0S\\   4S jjr\R&                  S	 5       r S1S
\R,                  4S jjrS
\R,                  S\4S jrS
\R,                  S\R4                  4S jrS
\R,                  S\R4                  4S jrS rS\R,                  S\R,                  S\S\4S jrS\S
\R,                  4S jr S0S jr!S\"\#\4   S\S\S\$RJ                  4S jr&S0S\\'   4S jjr(S  r)S! r*S" r+S# r,S$ r-S% r.\/S\4S& j5       r0S' r1\R&                  S( 5       r2S) r3S* r4S+ r5  S2S\S,\\   S-\\R4                     4S. jjr6S/r7U =r8$ )3	CppKerneli;  r   r   c                 Z  > [         TU ]  U5        0 U l        / U l        S U l        / U l        / U l        S U l        [        5       U l	        / U l
        [        5       U l        [        5       U l        [        5       U l        [        5       U l        [        5       U l        SU l        [        5       U l        [%        U R&                  U R(                  SS9U l        [%        U R&                  U R(                  SS9U l        [        5       U l        [        5       U l        X l        0 U l        / U l        g )NFtmp_acc)name_prefixwrecps)r2  r3  active_rangesinner_itervarscall_rangesrW  r  reduction_depthr=   reduction_prefixreduction_prefix_generatorsreduction_suffixparallel_reduction_prefixparallel_reduction_suffixlocal_reduction_initlocal_reduction_storesis_reductionnon_parallel_reduction_prefixr8   newvar_prefixsuffixreduction_cseweight_recps_csepreloads
poststoresnum_threadsreduction_omp_decreduction_var_names)r4  r  r=  r7  s      rW   r3  CppKernel.__init__A  s    HJ 35=A(*,.# . 0 <>( . 0)7)9&)7)9&$2$4!&4&6#!-;-=* !3!3T[[iX #!
 '((*&=?.0 rV   c                 J   [         R                  R                  (       a,  U R                  (       d  U R                  R	                  S5        U S3n[         R                  R                  (       a  SO	[        5       nU S3n	U R                  R	                  U SU SU" X45       S35        U R                  R                  [        UUUUUU5      5        U R                  R	                  U	 SU S35        U R                  R                  SU S	3S
SU SU" X1XS9 S3S/5        g )Nz(int max_threads = omp_get_max_threads();_localmax_threadsz	_arr[tid]r   r   r   zfor (int tid = 0; tid < z; tid++)r   r   r   r   )r   r  dynamic_threadsr1  r   r+   r3  rH  r   r4  r2  r   )
r4  r   r   r   r   reduction_combine_fnreduction_init_fn	acc_localr=  acc_local_in_arrays
             rW   _gen_parallel_reduction_buffers)CppKernel._gen_parallel_reduction_buffersf  s6    ::%%d.L.L**44: e6N	#ZZ77M=Q=S 	 !$uI.!!++j)C(9.(P'QQRS	
 	&&--"!		
 	##--1C0DC	{RS.TU&&11*;-x@se33NI[mnnop		
rV   c                 \    U R                    H  n[        U R                  X S35        M     g )NrB  )r?  r   stores)r4  var_names     rW   %update_stores_with_parallel_reduction/CppKernel.update_stores_with_parallel_reduction  s'    00HT[[(j4GH 1rV   r   c                    Ub   e[        5       n[        R                  " 5        n[        U S5      (       aK  UR	                  U R
                  5        U R                  U5        UR                  UR                  5       5        UR	                  U R                  5        UR	                  U R                  5        UR	                  U R                  5        S S S 5        [        U S5      (       a  UR	                  U R                  5        U R                  (       a4  U R                   H$  nU R                  U   u  pE[        XU S3XE5      nM&     U$ ! , (       d  f       N= f)Ncodegen_inner_loops_tail)r7   r   r   r   rH  r;  rR  r   r   loadsr  rM  r<  r+  r*  r   )r4  r   r   rZ  startends         rW   gen_bodyCppKernel.gen_body  s   ||~!!#ut233DMM*((.##DKKM2KK

#KK%KK$ $ 4.//KK(**!//4
1$uE]EW +  $#s   B.E
Ec              #   0  #    U R                   nU(       ac  [        R                  " X5      n[        U[        5      (       a8  UR
                  n[        U[        5      (       d   e[        R                  Ul	        Xl          Uv   X l         g! X l         f = f7f)z>Context manager to add an additional mask to loads and stores.N)

_load_maskr2   rA  r   r3   r  rG   r   rk   r   )r4  r  priors      rW   r  CppKernel.masked  sp      88D(D$))zz!$7777 #ZZ
	$J#OeOs   A=B B BBBr   c                 L    U R                   U   nXUU-  U-   0n[        X5      nU$ r  )r  r0   )r4  r   scaleitervar_idxrC  r   r   r   s           rW   scale_index_with_offset!CppKernel.scale_index_with_offset  s3     mmK(%K&01u2	rV   r   c                 6    [        U R                  U5      5      $ )z
Convert an index expr to a string that can be used in cpp code.
e.g. a sympy expression "s2" may actually appear as "ks1" in the cpp kernel.
)rD   r  r4  r   s     rW   index_to_strCppKernel.index_to_str  s    
 T))%011rV   itervarc                 F   ^ ^ [        UU 4S jUR                   5       5      $ )zM
Check if an index has free symbol CppCSEVariable that depends on `itervar`.
c              3   D  >#    U  H  nUR                   TR                  R                  ;   d  M)  [        TR                  R                  UR                      [        5      (       d  Ma  TR                  R                  UR                      R                  T5      v   M     g 7fr  )r   r  varname_mapr   rG   
depends_on)r  srf  r4  s     rW   r  6CppKernel.index_indirect_depends_on.<locals>.<genexpr>  su      
'vv--- = 488//7H =DHH  (33G<<'s   (B 4B '9B )rf   free_symbolsr4  r   rf  s   ` `rW   index_indirect_depends_on#CppKernel.index_indirect_depends_on  s%      
''
 
 	
rV   c                 N    X!R                   ;   =(       d    U R                  X5      $ r  )rm  ro  rn  s      rW   index_depends_onCppKernel.index_depends_on  s(    ,,, 
0N0N1
 	
rV   c                 T    [        [        U R                  U R                  5      5      $ r  )dictrI  r  rW  r:  s    rW   
var_rangesCppKernel.var_ranges  s    Ct{{344rV   r  rB  lowerupperc                 ,   U(       d  U(       d  g [        U[        R                  5      nU(       aJ  [        R                  " U[
        R                  5      R                  n[        R                  R                  nO[        R                  R                  n U R                  [        R                  l
        [        R                  " U[
        R                  5      R                  nU[        R                  l
        U R                  nU(       a.  [        R                  R                  U R                  U5      5      OS n	U R                  Xc(       a  SOS XR                  5      n
U R                   R#                  XzSS9  g ! U[        R                  l
        f = f)N0F)
assignment)r   r   TMPr2   r  r   r  r  r4   re  r  rT  sexprr  indirect_assertrZ  r  r  )r4  r  rB  rx  ry  indirectr  r   prior_computesize_strr   s              rW   check_boundsCppKernel.check_bounds  s    &tTXX6^^D%++6<<FXX%%F HH,,M1#':: ekk:@@#0 ZZFAF188>>$"6"6t"<=D##5CdHoo
 	&59 $1 s   AE< <Fr   c                     U R                   R                  U5      nU R                  U5      nU S[        U5       S3nU R                  R                  U R                  U5      nUR                  SXU40 5        U$ )N[]rq  )r  inputr  rE   r  r  rT  r  )r4  r   r   r   r   r  s         rW   rq  CppKernel.load  sr    iiood#$$U+aE*+1-""4::t4ft5&92>rV   c                    SU;   d   eU R                   R                  U5      nU R                  U5      nUc  U S[        U5       SU S3nOUS:X  a  [        R
                  R                  (       d%  U R                  S:X  a  U S[        U5       SU S3nOS[        R                  R                  U5      nS[        U    S	U S
3nSU S[        U5       SU S3nO[        SU 35      eU R                  R                  [        X5      5        g )Nbufr  ] = r   
atomic_addr5   z] += zstatic_cast<r  r   zatomic_add(&z], r   store mode=)r  outputr  rE   r   r  rE  r=  r4   graph	get_dtyperH   NotImplementedErrorrM  r   r;   )r4  r   r   r  moder   r   r   s           rW   storeCppKernel.store  s   }}iit$$$U+<U!K./tE7!<D\!::--$2B2Ba2GaE 235qA))$/&|E':&;2eWAF%cU!K,>+?s5'L%D6&:;;l467rV   r   r   rtyper   c                 F   ^^^^^ SS[         [           4UUUUU4S jjjnU$ )NrB  c                 N   > U c  T ST ST" TT5       S3$ [        TTTTU T5      $ )Nr   r   r   )r   )rB  r   r   r   r   r  s    rW   rd  .CppKernel._gen_reduction_prefix.<locals>.inner$  sH    |"1SEWUE-B,C1EE- rV   r  )r   rm   )r4  r   r   r  r   r   rd  s    ````` rW   _gen_reduction_prefixCppKernel._gen_reduction_prefix  s    	 	 	 rV   c                 l    U R                    H$  nU R                  R                  U" U5      5        M&     g r  )r/  r.  rH  )r4  rB  gen_fns      rW   finalize_reduction_prefix#CppKernel.finalize_reduction_prefix3  s*    66F!!((6 7rV   c           
      V   US;   nX#U4nX`R                   R                  ;   a  U R                   R                  U   $ U R                   R                  U R                  SU 3SS9nU R                  R                  U 5        SU l        U(       a  UOUn[        X85      n	U R                  R                  U R                  XyX8[        5      5        U R                  c   eU R                  U R                     n
[        U R                  S-   [        U R                  5      5       H$  nXR                  U   -  U R                  U   -   n
M&     U R                   R#                  U S[%        X7XJ5       S35        U R'                  XyX85        [)        X75      nXR                   R                  U'   U$ )	Nra   r`   
reduction FwriteTr5   r   r   )r9  reduction_cacher  rT  r?  r  r5  r   r/  r  r   r-  r  rQ  r   rW  rM  r   r   rJ  r   )r4  r   r   r   r  argmax_or_argminreduction_keyr   
init_dtyper   r   r   r   s                rW   	reductionCppKernel.reduction7  s   )-AA!58..>>>%%55mDD  ))JJ*]O4E * 
 	  ''3%1 "2Y
%nA((//&&~>	

 ##///d223t++a/T]]1CDAKKN*T]]1-==E Ee3(eKLAN	
 	,,SNW">7<B**=9rV   c                     U R                  U5      nU R                  R                  U5      nU R                  R	                  [        X S[        U5       SU S35      5        g )Nr  r  r   )r  r  r  r0  r   r;   rE   )r4  r   r   r  r   s        rW   store_reductionCppKernel.store_reductionV  s[    $$U+iit$''aE(:';4waHI	
rV   c                    U R                   (       al  U R                   [        U5      [        U5      -   :X  d+   U R                    S[        U5       S[        U5       35       eU R                  [        U5      :X  d   eO[        U5      [        U5      -   U l         U R                    Vs/ s H  o0R	                  U5      PM     snU l        [        [        U R
                  5      5       Vs/ s H  n[        [        R                  U5      PM     snU l
        [        U5      U l        U R                  S U R                   U R                  U R                  S  4$ s  snf s  snf )Nr%  r  )r,  r   r-  r   r  rW  rQ  r.   r   XBLOCKr  )r4  lengthsreduction_lengthsr  ns        rW   
set_rangesCppKernel.set_ranges]  s;   ##uW~>O8P'PP ##$Dw(8EBS<T;UVP ''3w<777$W~6G0HHD<@<L<LM<Lq//2<LMDK s4;;/00A /t{{A>0DM $'w<D MM0D001MM$..01
 	
 Ns   (E*$Ec                     U R                   c   e[        R                  R                  R	                  [        U R                   5      SS9$ )N    fallback)r,  r4   r  sizevars	size_hintr/   r:  s    rW   r  CppKernel.size_hintp  sF    +++ww))$**+d * 
 	
rV   c                   ^^^^^^^^^^ [        U [        5      (       d   e[        5       mU R                  c   e[        UR                  [
        5      (       a+  UR                  R                  UR                  5       T5      mO U R                  UR                  5       T5      mUR                  S L=(       a#    UR                  TR                     R                  m[        R                  " 5        nTR                  (       a;  T(       a  TR                  5         OTR                  T5        UR!                  T5        O:TS:  a4  TR#                  5       (       a  UR%                  TR'                  5       5        S[(        4UUU4S jjmSS jm SS[(        S[*        4UUUUUU4S jjjmSS[(        S[*        4UU4S jjjm  SS[(        S[*        S[,        4UU4S	 jjjmUR%                  TR'                  5       5        [        UR                  [
        5      (       Ga/  [        [.        R0                  [2        5      (       Ga  [.        R0                  R4                  (       a  [.        R0                  R4                  nUR7                  5        H  n[9        UR;                  5       R<                   Vs/ s H  nU R?                  U5      PM     sn5      n[@        UR;                  5       RB                     n	S
U	 S[E        U5       S3n
URG                  5       nTRI                  SU	 SU SU
 S35        TRI                  U	 SU SU S35        M     T" U5        S S S 5        g s  snf ! , (       d  f       g = f)Nr5   
_loop_nestc                   >^  U U4S jnT R                  5       n[        U[        5      (       a  UR                   H  m T" T 5        M     g [        U[        5      (       d   eT R
                  b  U" 5       (       a  UR                  5         [        R                  " 5        nUR                  TR                  5       5        UR                  T5        S S S 5        g ! , (       d  f       g = f)Nc                     > TR                   (       d   eTR                   TR                     n U R                  =(       a    U R                  $ r  )rK  r  r5  parallel)rootr  	par_depths    rW   is_parallel_reductionOCppKernel.codegen_loops_impl.<locals>.gen_kernel.<locals>.is_parallel_reduction  s=    %++++%++I,A,ABD,,>>rV   )
get_kernelr   rb  rd  CppKernelProxyrK  rO  r   r   r   r   rW  )r  r  re  r   r   gen_loop_nestr  s   `   rW   
gen_kernel0CppKernel.codegen_loops_impl.<locals>.gen_kernel  s    ?
 $..0f&:;;&,ll
%j1 '3 &fn====!''38M8O8ODDF#--/5++DKKM:- 0//s    1C
C(c                     U(       a$  U R                   nU(       a  U R                  U-   nU$ U R                  nU(       a  X@R                  -   nU$ X@R                  -   nU$ r  )r0  r2  r.  r1  r6  )re  r  	is_suffixr8  prefixs        rW   get_reduction_prefix_suffixACppKernel.codegen_loops_impl.<locals>.get_reduction_prefix_suffix  s`    #44F!'!A!AF!J!M#44F!'*J*J!J "M "(*N*N!N!MrV   depthc           	        > U R                  5       nU R                  (       d   eU R                  U   n[        R                  " 5        nUR                  (       aO  U(       dH  T	" X4R
                  SS9nU(       a  UR                  TR                  5       5        TR                  U5        T
(       aa  UR
                  (       aP  TR                  T5        UR                  (       a.  UR                  (       d   eTR                  UR                  5        T" X5        T
(       aM  UR
                  (       a<  UR                  (       a  TR                  UR                  5        TR                  5         UR                  (       a'  U(       d   TR                  T	" X4R
                  SS95        S S S 5        g ! , (       d  f       g = f)NF)r  T)r  rK  r   r   r5  r  r   r   rH  r3  r4  close)r  r  in_reductionre  loopstack_outerr.  r   gen_loop_atr  is_reduction_loopthreadsworksharings          rW   gen_loop_with_reduction=CppKernel.codegen_loops_impl.<locals>.gen_loop_with_reduction  s2    $..0!''''!''.))+{((+F"MMU,( ,'55dkkmD$45(T]]#,,W5!66#)#@#@@#@ KK(C(CD
2(T]]!88 KK(E(EF#))+((7 &+ ,++s   	EF//
F=c                 n  > [         R                  " 5        nU R                  (       d   eU R                  U   nUR                  5       nUc
   S S S 5        g TR	                  U5        UR                  TR                  5       5        T" XS-   UR                  5        S S S 5        g ! , (       d  f       g = fr   )r   r   rK  linesr   r   r   r5  )r  r  r   r  
loop_linesr   r  s        rW   r  1CppKernel.codegen_loops_impl.<locals>.gen_loop_at  s    ))+u%++++%++E2D!%J!) ,+ OOJ/''6!*ai9J9JK ,++s   7B&AB&&
B4r  c                 v   > U R                   b  U[        U R                   5      :X  a	  T" U 5        g T" XU5        g r  )rK  r   )r  r  r  r  r  s      rW   r  3CppKernel.codegen_loops_impl.<locals>.gen_loop_nest  s5    
 ##+uJ<L<L8M/Mz*+J|LrV   zstd::make_unique<z []>(r   zstd::unique_ptr<z	 []> buf_r   r   z* z = buf_z.get();)FF)r   F)r   )%r   r  r+   r,  re  rb  decide_parallel_depthmax_parallel_depthrK  r  r5  r   r   r  r  r  mark_parallelsingler   r   rP  rm   rk   r4   local_buffer_contextrJ   local_buffersvaluesr/   
get_layoutrB  r  rH   r   rD   get_namerH  )r4  rR  r   r  r   r  local_buffersize_vallocal_buf_sizelocal_buf_dtypeallocatelocal_buffer_namer  r  r  r  r  r  r  r  s     ``        @@@@@@@@rW   codegen_loops_implCppKernel.codegen_loops_implv  s!   $////&(+++i&&(<==!((>>,,.I 22,,.I
 OO4' D	 5 56CC 	 !!#u''$%%'((1''	21%%''''6.x . .$" DI$-0 B	L 	L 	L 	L %*M$MM #M M . 9++-ABBq557IJJ**88 !" 6 6 D D$1$8$8$:L%2 -9,C,C,E,J,J,J !00:,J&N '3<3J3J3L3R3R&SO!2?2C5~I^H__`aH(4(=(=(?%KK*?*;9EVDWWZ[cZddef KK*+2.?-@HYGZZab %;" )$y $#\] $#s    2F>M$0MBM$M$$
M2c                 R    [         R                  U 5      nU R                  X1U5        g r  )rP  buildr  )r4  r   r  rR  s       rW   codegen_loopsCppKernel.codegen_loops	  s     NN4(		=rV   c                 D    [         R                  R                  (       a  gg)NAOTI_TORCH_CHECKTORCH_CHECK)r4   r  aot_moder:  s    rW   assert_functionCppKernel.assert_function		  s    77% rV   c                    U R                   c   eU R                   UR                  UR                  UR                  -    nU R                  5       nSnSnU Hj  n[        R
                  R                  R                  USS9nUSU-  :  d  XR:X  a    O3XB-  [        R                  R                  :  a    OUS-  nXX-  nXH-  nMl     [        R                  R                  (       a  US:X  a  [        U5      S:  a  Sn[        XaR                  S9$ )Nr5   r   r  r  r   r  r  )r,  r  r  r  r4   r  r  r   r  min_chunk_sizerE  r   r  )	r4  r  r  rW  seqparr  r  hints	            rW   r  CppKernel.decide_parallel_depth	  s    +++!!**"..1C1R1RR

 nnD77##--dT-BDa'k!S^~

 9 99QJEKCKC  ::%%%1*VqE .L.L
 	
rV   c              #     #    U R                   U R                  U R                  U R                  4n[	        5       U l         [	        5       U l        [	        5       U l        U R                  R                  5       U l        S v   U R                  R                  U R                   5        U R                  R                  U R                  5        U R                  R                  U R                  5        Uu  U l         U l        U l        U l        g 7fr  )rT  r  rM  r  r=   cloner0  rH  )r4  r[  s     rW   write_to_suffixCppKernel.write_to_suffix-	  s     T\\4;;A#%
%'$&88>>#$$TZZ0$$T\\2$$T[[1<A9T\4;s   D
Dc                     [        U0 UD6$ r  )rG   )r4  r  r  s      rW   create_cse_varCppKernel.create_cse_var:	  s    t.v..rV   c                 "    S[         U    SU S3$ )Nzc10::convert<r  r   )rH   )r4  srcr   r   s       rW   r  CppKernel.get_to_dtype_expr=	  s    |E232cU!<<rV   c                 ^    U R                  X2U5      nU R                  R                  XQ5        g r  )r  r  rJ  )r4  dst	dst_dtyper  r   r  s         rW   r  CppKernel.cache_dtype_convert@	  s$    %%ci@TrV   r  r   c                   ^ ^
 Uc  SnT R                   (       d  g/ m
U
U 4S jnUb3  UT R                   ;   d   eT R                   U   u  pVU" XVU5      (       d  gO7T R                   R                  5        H  u  pxUu  pVU" XVU5      (       a  M    g   SR                  T
5      n	U	(       a  UR                  SU SU	 S35        gg)	NrR   Tc                 J  > X:X  a  gS n[        TR                  5       H  u  pEX%:X  d  M  Un  O   [        T5      [        :X  a"  U(       a  U S:X  a  UTR                  U   :X  a  SnTR                  U S[        U 5       35        TR                  U S[        U5       35        g)NFr   r5   r<  r   T)r   r  r  r%  rW  r  rE   )rU  rV  r   var_idr   _var
conditionsr4  s         rW   gen)CppKernel.codegen_conditions.<locals>.genP	  s    |F$T]]3;F 4
 T
i'QJ4;;v..T+e*<)=>?SS)9(:;<rV   Fr  zif(r$  r   )r*  r  joinr   )r4  r   r  r   r  rU  rV  r  _rangejoined_conditionsr  s   `         @rW   codegen_conditionsCppKernel.codegen_conditionsD	  s     >F!!
	& ?$,,,,,++C0JEu3'' ( !% 2 2 8 8 :#
5t,,  !; #KK
3NNS*;)<B?@rV   )rZ  r*  r,  r  r  r+  r5  r  rT  r3  r4  r6  r=  r1  r2  r<  r;  rW  r9  r-  r>  r.  r/  r0  r?  rM  r:  r  )r5   r   NN)9r  r  r  r  r  r  rD   r~  r7  r8  r3  r   r   rJ  rO  r   r7   rW  r   contextmanagerr  r   r  r`  ro   rd  r   ro  rr  rv  rk   r  rq  r  r	   r9   r   r   r  rm   r  r  r  r  r  r  r  propertyr  r  r  r  r  r  r  r  rl  rm  s   @rW   r%  r%  ;  s   IEMF#1V /('
RIXl3 ( $ $& BCZZ2%** 2 2	
uzz 	
ELL 	

ejj 
5<< 

5:jj: jj: 	:
 :@ UZZ 8$;#$  	
 {{:7hsm 7>

&
M%^> ! ! !
: 
B 
B/=  !%&*	.. . ell#	. .rV   r%  c                     ^  \ rS rSr\r S+U 4S jjrS\R                  S\R                  4S jr
S\R                  S\4S	 jrS\R                  S\4S
 jrS\R                  S\4S jr\R                  4S\R                  S\4S jjrS\S\R                  S\4S jr S+S\S\R                  S\R                  S\\   4S jjr   S,S\\   S\R                  S\R                  S\\   S\\\\4      S\S\\   4S jjrS\S\R                  4U 4S jjr S-S\\\4   S\S\R                  S\R                  S\4
S jjrS+S jrS rS r S\S\4S jr!S\S \R                  S\4S! jr"S" r#S# r$S+S$ jr%SSS\RL                  4S\\R                     S%\\   S&\\R                     4S' jjr'S+U 4S( jjr(U 4S) jr)S*r*U =r+$ ).r  iu	  Nc                    > [         TU ]  X5        [        R                  " 5       U l        U R                  (       d   eUS:  d   S5       eX0l        X@l        XPl        U(       a  XPl        g UU l        g )Nr   z0Expect pass in Non-Zero tiling_factor explicitly)	r2  r3  r   pick_vec_isavec_isar  r  r  	num_elems)r4  r  r=  r  r  r  r7  s         rW   r3  CppVecKernel.__init__x	  s_     	+"//1|||q T"TT *$"&/]rV   r   rf  c                   ^  T R                  X5      (       a  g U 4S jUR                   5        H-  n[        U[        5      (       d   eUR                  (       d  M-    g    [        XT R                  5      nUR                  (       a  U$ S $ )Nc              3      >#    U  HI  n[        U[        R                  5      (       d  M$  TR                  R                  UR
                     v   MK     g 7fr  r   r   r}  r  ri  r   r  rk  r4  s     rW   r  5CppVecKernel._try_get_const_stride.<locals>.<genexpr>	  s;      
'a* )DHH  ('
   #A*A)ro  rm  r   rG   r   r	  r  r  )r4  r   rf  indirect_varr  s   `    rW   r  "CppVecKernel._try_get_const_stride	  s|    ))%99
''
L
 lN;;;;"""
 %UT5G5GH))v3t3rV   r   r   c                     [         R                  " U R                  UR                  -  S-  U R                  R                  5       -  5      nUS:  d   eU$ )N   r5   )mathr  r  itemsizer$  	bit_widthr4  r   num_vectorss      rW   r  CppVecKernel._get_num_vectors	  sO    ii/!3dll6L6L6NN
 arV   c                 p    U R                   UR                  -  S-  U R                  R                  5       -  $ )Nr0  )r  r2  r$  r3  )r4  r   s     rW   r  !CppVecKernel._get_raw_num_vectors	  s0     !!ENN2Q69O9O9QQQrV   c                 j    U R                  U5      nUS:X  a  S[        U    S3$ S[        U    SU S3$ )Nr5   r  r   r  r  )r  rH   r4  s      rW   r  CppVecKernel._get_vec_type	  sJ    ++E2!),u*=)>a@@*<+>*?qQOOrV   c                 n    U[         R                  :X  a  gU R                  U5      nS[        U    SU S3$ )NrR   r  r  r   )r   rk   r  rH   r4  s      rW   r'  CppVecKernel._get_mask_type	  s<    EJJ++E2"<#6"7qQGGrV   r  c                     UR                   [        R                  :X  d   [        U5      5       eU R	                  U5      nU S[
        U    SU S3$ )Nz.template cast<r  r   )r   r   rk   reprr  rH   )r4  r  r   r5  s       rW   r  CppVecKernel._get_mask_cast	  sP    zzUZZ'3d3'++E2|E':&;1[MMMrV   r   	load_maskc                    [         U   nU R                  U5      nSnU(       aX  UR                  (       d&  U R                  [        R
                  5       SU S3nO!U R                  U[        R
                  5       nUS:w  a  U S[        U5       3OUnU[        R                  :X  a  U R                  5        SU S3n	U	$ U(       a  U SU SU SU S3O,U R                  U5       S	U S
[        U R                  5       S3n	U	$ )a  
Get a load line str that loads a vector from `var` at `index` of type `dtype`.
If `load_mask` is not None, we do a masked load accordingly.
Notes on the `dtype`:
1. We always load `self.tiling_factor` number of elements regardless of the `dtype`.
   It means we load half of the vector lanes for 16-bit data types and quarter of the
   vector lanes for 8-bit data types.
2. `torch.bool` and `torch.uint8` could mean masks and we load them as float mask vectors.
Nr  r   r   r  z.template loadu<r  r  ::loadu(r   )rH   r  r   r'  r   rn   r  rE   rk   r  r%  )
r4  r   r   r   r@  cpp_typer5  load_mask_strloadbufr   s
             rW   _get_vec_load_lineCppVecKernel._get_vec_load_line	  s      &++E2###'#6#6u{{#C"DGI;VW X#'#6#6y%++#N"O5:aZSE[/01SEJJ))+,GG9A>D  ! !/!1(1[MG9TUV**512(7)2kRVR`R`FaEbbcd 
 rV   Fr   store_value
accu_storec                 4	  ^ ^^^ U(       a
  Uc   S5       eU(       a	  U(       d   eTc  T R                   mS[        R                  S[        4U 4S jjmS[        R                  S[        4U 4S jjmS[        S[        4UUUU 4S jjn[        5       nUR                  S	5        UR                  5          T" U5      n	T" U5      n
S
[        U    SU
 S3nUR                  U5        U(       a   UR                  U S[        U	5       S35        [        T R                  T R                      S35      n0 nU 4S jUR                   5        H?  n[        U[        5      (       d   eUR                  (       d  M-  U" U5      nU SU S3X'   MA     T R!                  UT R                  US9nSnT R"                  b{  U(       a   S5       e[        T R"                  [        5      (       d   T R"                  5       eT R"                  R                  (       a  T R"                   SU S3nOT R"                   S3n[$        R&                  " 5       (       a  UR                  ST R(                   35        OUR                  ST R(                   35        UR                  SU S3U S[        T R*                  5       S3-   U S3-   5        UR                  5          [,        R.                  " 5        n[        U5      nU H$  n[0        R2                  " SU -   S-   X   U5      nM&     Ub  U SU S3OU nU(       a4  UR                  S U S35        UR5                  UR                  5       5        U(       a&  U(       a  S!OS"nUR                  U S#U S$U S%35        OUR                  S&U S'U S(35        SSS5        SSS5        U(       d(  T R7                  S)S*U5      nUR                  S+U S(35        SSS5        UR                  S,5        U(       a#  UR                  S(5        TR9                  U5        gT R:                  R=                  TU5      n[        U[        5      (       d   eS-Ul        U$ ! , (       d  f       N= f! , (       d  f       N= f! , (       d  f       N= f).a  
Load or store a vector in a non-contiguous way. The vector is initialized from an array that is
filled in an inner loop over the tiling factor.
:param var: buffer to load from or store to, i.e. `var[transformed(index)]`. If None, we load the index
            as index expression, i.e. `transformed(index)`.
:param index: index into the `var` or the index expression by its own if `var` is None.
              The `index` could contain indirect indexing or the tiling itervar. When used in
              the inner loop, the index is transformed as follows:
              1. the index is linearized along the tiling dim.
              2. the indirect indexing vector variables are transformed into arrays over the tiling dim.
:param dtype: data type of `var` or `index` if `var` is None.
:param buffer: the code buffer to write the generated code to. If None, we write to `self.loads`.
:param store_value: the value to store. If None, we load the vector.
:param accu_store: whether accumulate the store_value to store_ptr. If True, a store_value should be provided
:return: a CppCSEVariable that represents the loaded vector or None if it is a store.
Nzstore var must be providedr   r   c                 t   > U R                   S:  a  TR                  SU R                   -  -  $ TR                  $ N   )r2  r%  r   r4  s    rW   get_result_sizeCCppVecKernel._load_or_store_non_contiguous.<locals>.get_result_size	  s1    ~~!~~enn)<==~~%rV   c                 t   > U R                   S:  a  TR                  SU R                   -  -  $ TR                  $ rL  )r2  r  rN  s    rW   get_tiling_sizeCCppVecKernel._load_or_store_non_contiguous.<locals>.get_tiling_size	  s5    ~~!))Q%..-@AA)))rV   vec_varc                 n  > U R                   (       d   e[        5       nUR                  S5        UR                  5          U R                  nUc   eU[
        R                  :X  a  [
        R                  nT" U5      nT	" U5      nUR                  S[        U    SU S35        U  S[        U5       S3nUR                  U5        UR                  S5        S S S 5        UR                  S5        T
R                  R                  TU5      n[        U[        5      (       d   eU$ ! , (       d  f       NT= f)	Nr  r   r   r  r  r   zreturn tmpbuf;r  )r   r7   r   r   r   r   rk   rn   rH   rE   r  r  r   rG   )rT  r   r  result_sizetiling_sizer   r  r   rO  rR  r4  s          rW   vec_to_array@CppVecKernel._load_or_store_non_contiguous.<locals>.vec_to_array
  s   >>!>>DNN5!#MM	 ,,,

* %I-i8-i8.|I/F.Gr+V_` ""8[9Q8RRTUt$/0  NN4 XX&&vt4Ffn5555M! s    BD&&
D4r  r   r   r  r  r   rM  c              3      >#    U  HI  n[        U[        R                  5      (       d  M$  TR                  R                  UR
                     v   MK     g 7fr  r)  r*  s     rW   r  =CppVecKernel._load_or_store_non_contiguous.<locals>.<genexpr>,
  s;      !+A!!TXX. -$$QVV,+r,  r  r  r_  rC  zunexpected store with load maskz.is_masked(r   z != 0z#pragma GCC unroll z#pragma unroll 
for (long  = 0; r   r   r   r   r  +==r   z tmpbuf[r   ztmpbuf[r  r   ztmpbuf.data()r   r  r  T)rT  r   r   rm   rG   r7   r   r   rH   rE   r-   r  r  rm  r   r   r`  rZ  r   is_gccr  r%  r   r   r   r   r   rF  rH  r  r  )r4  r   r   r   r   rH  rI  rX  r   rV  rW  result_declareitervar_innerreplacementsr-  	array_varr@  r   index_crhsr   	load_liner  rO  rR  s   `   `                  @@rW   r  *CppVecKernel._load_or_store_non_contiguous	  s5   2 #/O3OO1;>ZZF	&5;; 	&3 	&	*5;; 	*3 	*	. 	^ 	 	, ~u[[])%0K)%0K*<+>*?r+iX  NN>*"m#9+k:R9SSUV /==12&9M L!++!
 ",????&&& ,\ :I4=;aa1PL.! 004??= 1 E I*&I(II!$//>BBSDOOSB??))#'??"3;}oQ OI#'??"35 9I!!##!4T5G5G4HIJ1C1C0DEFNN]O62"O3{4>>'B&C2FG"O3'(
 
 4 4 6%%e,$0L ff<.1E9$2G %1 .1_Qwiq)WINNT)A#67''6*4$#KNNcU!K=r#RSNNW]O4uA#FG! !7"  33OQN	156 @ 	tNN3MM$XX&&vt4Ffn5555 FMM; !7 6Y ]sE   3B?R	6ER	Q8%CQ')Q817R	'
Q51Q88
R	R		
Rr   c                 8  > U R                   R                  U5      nU R                  U5      n[        R                  R                  U5      nU R                  U R                     nU R                  X%5      nUS:X  a  [        T	U ])  X5      $ US:X  aC  U R                  X2X@R                  5      nU R                  R                  U R                  U5      nOU R!                  X2U5      n[#        U[$        5      (       d   eUR'                  SXU40 5        SUl        U$ )Nr   r5   rq  T)r  r  r  r4   r  r  r  r  r  r2  rq  rF  rZ  r  r  rT  r  r   rG   r  r   )
r4  r   r   r   r   r  r  r   r  r7  s
            rW   rq  CppVecKernel.loadh
  s    iiood#$$U+!!$']]4??3
++E>Q;7<,,q[**3uooNDXX&&tzz48F77EJF&.1111ft5&92>rV   r  c           	         [        U[        5      (       d-  [        U[        5      (       a  UR                  (       d   U5       eU R                  U R
                     nU S[        U5       3nU R                  X65      n[        5       n	US:X  a  U(       an  U[        R                  :X  a$  U R                  c  U R                  U5       SU S3O,U R                  U5       SU S[        U R                  5       S3n
SU SU
 S3nU[        R                  :X  a&  U R                  c  U	R                  U SU S35        U	$ U	R                  U SU S[        U R                  5       S35         U	$ U R                  X#XIXS	9  U	$ )
a  
Get a store line buffer that stores `value` into `var` at `index` of `dtype`. It handles
both contiguous and non-contiguous store cases.
:param value: Vectorized type templaterized on `dtype`.
:param var: buffer to store into.
:index: index into the `var`.
r  r5   rB  r   r   r$  .store(r   )r   rH  rI  )r   ro   rG   r   r  r  rE   r  r=   r   rn   r  r  r%  r   r  )r4  r  r   r   r   rI  r  var_exprr  r   rq  s              rW   _get_store_lineCppVecKernel._get_store_line|
  s   " %%%un--%,,		 
 ]]4??3
U#k%012++E>Q; +0F ))%01(1E ..u56hxj;W[WeWeKfJgghi 
 E7#dV1-#(>%z<=  gWXJbT^^1L0MRP  ..EE /  rV   c                 R  ^ ST;   d   e[        U[        5      (       d   U5       eUR                  (       d  U R                  U5      nU R                  R                  T5      nU R                  U5      n[        R                  R                  T5      nUcA  U R                  X5X&5      nU R                  R                  UR                  U4S j5      5        g US:X  Ga=  [        R                  R                   (       dS  U R"                  S:X  aC  U R                  U UUUSS9nU R                  R                  UR                  U4S j5      5        g U R%                  U5      nU R%                  [&        R(                  5      n	[*        U   n
[,        R.                  " U[&        R(                  5      R0                  n[        U[        5      (       a  UR                  (       d   eSU
 S	U	 S	U S
U S	U S	U S3nU R                  R3                  [5        TU5      5        g [7        SU 35      e)Nr  c                    > [        TU 5      $ r  r;   r  r   s    rW   <lambda>$CppVecKernel.store.<locals>.<lambda>
  s    ,tQ2GrV   r  r5   T)rI  c                    > [        TU 5      $ r  rs  rt  s    rW   ru  rv  
  s    l46KrV   zatomic_add_vec<r   r  r   r  )r   rG   r   r  r  r  r  r4   r  r  ro  rM  rH  mapr   r  rE  r=  r  r   r  rH   r2   r  r  r   r;   r  )r4  r   r   r  r  r   r   r   n_srcn_idxr   r   s    `          rW   r  CppVecKernel.store
  s   }}%007%70||NN5)Eiit$$$U+!!$'<''EADKKtxx(GHI\!::--$2B2Ba2G++g# ,  ""488,K#LM--e4--ekk:%e,uekk:@@!%88U\\II(5'E7"SEE7RTUZT[[]^%%l4&>?%D6&:;;rV   c                    U[         ;   d   eUS;   nU R                  U R                  :  nU(       a  UOUn[        U[        5      (       d   U5       eUR
                  (       d  U R                  U5      nX#U4nXR                  R                  ;   a  U R                  R                  U   $ Sn	U	 S[        U    S3n
[        X75      nU R                  X75      nU R                  R                  U R                  SU 3SS9n[        U[        5      (       d   eU S3nS	U 3nU =R                  U X/-  sl        S
U l        U R                   R#                  U R%                  XX7[&        5      5        U R                   R#                  U R%                  UUUUU R(                  5      5        [*        R,                  " S U R.                  U R                  S  5      nUS:X  Ga)  U R                  c   eU R                   R#                  U R%                  UUUUU R(                  5      5        [*        R,                  " S U R.                  U R                  S  5      nU R                  U R                  :  a  U R0                  OSn[3        UU5      U l        U R4                  U R6                  R                  ;  a  U R6                  R                  U R8                  SU R4                   3SS9U l        U R:                  U R6                  R                  U R4                  '   U R<                  R?                  U RA                  U5      5        [B        RD                  RF                  (       a  SO	[I        5       nU RJ                  R?                  U RA                  UU5      5        O(U R6                  R                  U R4                     U l        U RL                  (       a  UOUnU RN                  R?                  U SU RQ                  UUUS
5       S35        OU R                  c   eU RR                  U R                     n[U        U R                  S-   [W        U RR                  5      5       H%  nUU R.                  U   -  U RR                  U   -   nM'     UUUUS.nU RN                  R?                  U SU RP                  " X>40 UD6 S35        U RY                  UUUUU RP                  U R(                  S9  U RY                  UUUU[Z        [&        S9  US:X  a(  U RY                  UUUUU RP                  U R(                  S9  U[\        R^                  :H  nU(       GaE  [a        U5      (       aT  U Rc                  U5      S;   d   S5       eSU S3nSU S3nU Rd                  R?                  U S[[        X=U5       S35        OU(       a	  U SU S3nOU(       a  US;   a  SU S3nOUS:X  d   eU S3nOSU RQ                  USS 5      -   S!-   nU[\        R^                  :H  nU(       a  [\        Rf                  OUnS"[        U    S3n
S#[        U    S$U Rc                  U5       S3nU S%U
 S&U
 S'U S$U S3
nU Rd                  R?                  U S[[        X=UUS(9 S35        UnOCUn[a        U5      (       a1  S	U 3nU Rd                  R?                  U S[[        UUU5       S35        [i        UU5      nUU R                  R                  U'   U$ ))Nr  zat::vecz::Vectorized<r   r  Fr  _vecmasked_Tc                 
    X-  $ r  rU   r'  s     rW   ru  (CppVecKernel.reduction.<locals>.<lambda>
  s    rV   rg   c                 
    X-  $ r  rU   r'  s     rW   ru  r    s    QUrV   r5   rC  r   r   )r   r   horizontal_reductionr   )rF  rG  )r5   r   z4Welford reduction does not support VectorizedN (N>2)zwelford_vec_reduce_all(r   z_vec_reduce_all()rf   rc   r_   r  z.all_zero()r^   z.all_masked()z	{ return r  r(  z; }r  zat::vec::vec_reduce_all<r   z([](z& x, z& y) rD  )5VECTORIZABLE_RTYPESr  r-  r   rG   r   r  r9  r  rH   r   reduction_acc_type_vecr  rT  r?  r5  r/  r  r  r   reduction_init_vecrS  rT  rW  r  r   weight_recp_vec_ranger:  r  weight_recps_valr6  r   welford_weight_reciprocal_vecr   r  rE  r+   r3  r  rM  reduction_combine_vecr  rQ  r   rJ  r   r   rk   r*   r  r0  rn   r   ) r4  r   r   r   r  r  r  r  r  vec_nsvecr   acc_type_vecr   acc_vecmasked_acc_vecreduction_sizereduction_factorr=  acc_vec_r   r   r  r   r   masked_next_valuereduce_all_bodyr  vec_reduce_all_functmpvarmasked_tmpvarr   s                                    rW   r  CppVecKernel.reduction
  st    !4444)-AA#$2F2FF"2Y
%007%70||NN5)E!58..>>>%%55mDDl5&9%:!<%nA22>N  ))JJ*]O4E * 
 #~....E,"7),  uw$GG  ((//&&~>	

 	((//&&''	
 #))D,@,@,B C
 --''333,,33**" "++ '--"DKK0D0D0F$GN '+oo9M9M&M""ST  *2.BR)SD&))1F1F1V1VV(,(=(=(F(FLLJt/I/I.J"KSX )G )% )) %%55d6P6PQ 22<<66u= zz11 "-/ 
 ))3366ukJ )-(=(=(M(M..)% *.~WHKK!!*C : :>8UZ\` abbcd ''333MM$"6"67E4//!3S5GHA.q1AA I $(<&	F KK!!)3t99.\U[\]]^_ 	,,!%!;!;"55 	- 	
 	,,!2, 	- 	
 --00%)%?%?"&"9"9 1  5::%#N33,,U3 8  J JJ   7wiqA
&=n=MQ$O!%%//e30FWXYYZ[ " .//?yJ
! & 
 $%WI[!9J)U222$+9M!:J  00cJK  
  5::-+2EKK	,\)-D,EQG(@iAX@YY[\`\q\qr{\|[}}~&# 34DU3%u_L]]_`g_hhij
!!++%s,^*Xabccde FF#N33")& 2%%//hc"3NFM"Z![[\] #>6:<B**=9rV   c                 6  ^ U R                  U5      nU R                  R                  T5      n[        R                  R                  T5      nUR                  (       a&  U[        R                  :X  a  UO[        R                  O[        R                  n[        R                  R                  U5      n[        R                  R                  U5      n[        5       n	U R                  U R                  :  a.  U	R!                  U S[#        U5       S[$        U    SU S35        OXV:w  a  [$        U    SU 3n
U[        R&                  :X  a&  U SU R                  [        R&                  5       S3nO@Xs=:X  a  S:X  a  O  OS	[$        U    SU S
3nO S	[$        U    SU S[$        U    SU SU S
3nU	R!                  SU
 SU S35        U
nU	R)                  U R+                  X4X%5      5        U R,                  R)                  U	R/                  U4S j5      5        g )Nr  z] = static_cast<r  r   r   z.template cast<bool,r   r5   at::vec::convert<r   r  r   r   r   c                    > [        TU 5      $ r  rs  rt  s    rW   ru  .CppVecKernel.store_reduction.<locals>.<lambda>  s    T18MrV   )r  r  r  r4   r  r  is_floating_pointr   rj   rn   r  re  r  r=   r  r-  r   rE   rH   rk   rH  ro  r0  rx  )r4  r   r   r  r   	out_dtyper   out_num_vectorssrc_num_vectorsr   converted_valueconverts    `          rW   r  CppVecKernel.store_reduction  s   $$U+iit$GG%%d+	 ** $u||3Y 	
 ((33I>((33E:??d222NN%qU+,,<\)=T<UUWX]W^^`a
 !%1)%<$=Qug"F

*!&';D<Q<QRWR\R\<];^^abG&>Q>/Y0G/H5'QRS  
 0Y0G/H./qe1D0EQFWWYZ_Y``ac   &7s7)1EF'KK,,UJK$$TXX.M%NOrV   
scalar_varc                 .   UR                   (       a   eUR                  [        R                  :X  aE  U R                  R                  U R                  U R                  5        SUR                   S35      nO^UR                  c   eU R                  R                  U R                  U R                  UR                  5       SUR                   S35      n[        U[        5      (       d   eUR                  Ul        UR                  Ul        SUl         U$ )Nr  r   r$  T)r   r   r   rk   r  r  r  r'  r   r  r   rG   dependent_itervars)r4  r  rT  s      rW   r  CppVecKernel.broadcast  s    $$$$uzz)hh''!4!4!6 7wz>OqQG ##///hh''%%j&6&678*//9J!LG '>2222"((%/%B%B"rV   r  c           	      8   UR                   (       a   eUR                  c   eU R                  R                  U R                  U R                  UR                  5       SU SU S35      n[        U[        5      (       d   eUR                  Ul        SUl         U$ )Nz	::arange(r   r   T)r   r   r  r  r  r  r   rG   )r4  r   r  r  s       rW   r  CppVecKernel.arange  s    <<{{&&&""LL!!%++./yr&K
 &.1111{{rV   c                    [         U   nU R                  U5      n[        U5      (       a  SU S3$ US;   aa  [        U   nU R	                  X5      nUS:X  a  [        U5      (       a  SU S3OSU S3nO[        U5      (       a  SU S3OSU S	3nU S
U S3$ US:X  a  U R                  5        S3$ [        X5      nU S
U S3n	U[        R                  :X  a  US;   d   eU R                  5        SU S3$ U	$ )Nr   r   ry   r`   r|   r{   r~   rz   r}   r$  r   rf   z	::from(0))r^   r_   rc   r  )
r<   r  r*   rH   r  r   r'  r   r   rk   )
r4  r   r   r   vec_typer   r   rz  scalar_initvec_inits
             rW   r  CppVecKernel.reduction_init_vec  sF   07%%k2//hZs++11!+.F22>IH) &e,, +6(-@/xx@  &e,, ,F8=A/xx@ 
 ZqQ''U"))+,I66$^;ZqQ/EJJ!%::::))+,GK=BBrV   c                    [         U   nU R                  U5      n[        U5      (       a  SU S3$ US;   ax  U R                  U5      nU R                  [        R
                  5      nU[        R                  :X  a!  S[        [        R                      SU SU S3$ S[        U    SU SU S3$ U[        R                  :X  a  US;   d   eU R                  5        $ U$ )Nr   r   ry   zIndexValueVec<r   )r^   r_   rf   rc   )
r<   r  r*   r  r   r  rk   rH   rn   r'  )r4  r   r   r   r  ry  rz  s          rW   r  #CppVecKernel.reduction_acc_type_vec  s    07%%k2//hZq))11))+6E))%++6E

"'U[[(A'B"UG2eWTUVV#L$=#>br%PQRREJJ!%AAAA))+,-rV   c                     U(       a  [        U R                  U5      OU R                  n[        U5      nSU R                  U5       SU R                   SU S3$ )Nzstatic WeightRecp<r  r$  r   )r   r  rE   r  r  )r4  r   r=  vec_num_range_threadvec_num_range_thread_exprs        rW   r  *CppVecKernel.welford_weight_reciprocal_vec  sk      D..<++ 	
 %00D$E! !3!3E!: ;2d>S>S=T()	
rV   r  r   c                    U[         R                  :H  nUS:X  aG  U R                  (       a  SU SU S[        U R                  5       S3$ U(       a  U SU 3$ SU SU S3$ US:X  aG  U R                  (       a  SU SU S[        U R                  5       S3$ U(       a  U S	U 3$ S
U SU S3$ US:X  aE  U R                  (       a  SU SU S[        U R                  5       S3$ U(       a  SOSn	U SU	 SU 3$ US:X  a7  U R                  (       a  SU SU S[        U R                  5       S3$ U SU 3$ US:X  a7  U R                  (       a  SU SU S[        U R                  5       S3$ U SU 3$ US:X  a  U(       aS  U R                  (       a,  SU SU S[        U R                  5       SU R                   S3	$ SU SU SU R                   S3$ U R                  (       a  SU SU S[        U R                  5       S3$ SU SU S3$ US:X  an  [        U[        5      (       a  Uu  pnO[        X5      u  pnU R                  (       a%  SU SU
 SU SU S[        U R                  5       S3$ SU SU
 SU SU S3	$ US;   a  Uc   e[        U   nU[         R                  :X  a  [        [         R                     nU R                  U5      nU R                  [         R                  5      nSnSnUb&  Uc   eS[        U5      R                  5        3nSU 3nU R                  (       a.  U SU SU SU U S U SU U S[        U R                  5       S3$ U SU SU SU U S U SU U S3$ US!:X  ac  [        U[        5      (       aG  UR                  [         R                  :X  d   e[!        ["        R$                  R&                  U45      u  nU SU 3$ [(        e)"Nr_   zmax_masked_reduce(r   r   r  r  r^   zmin_masked_reduce(r  r  rc   zsum_masked_reduce(r   rZ   r   rd   zprod_masked_reduce(r   re   zxor_sum_masked_reduce(r   rg   r   rE  rh   r   z}, r   ry   rR   z_combine_vec<r  rf   )r   rk   r  rE   r  r   r   r   rH   rn   r  r  ro   rx  rG   r   rN   r4   re  r  r  )r4  r   r   r   use_weight_recpsr   r  r   r   r   r   r   r   r   ry  rz  t_extra	arg_extras                     rW   r  "CppVecKernel.reduction_combine_vec
  s    uzz)U"~~+C5:,bT^^A\@]]^__  e3zl+ -SEJ<qA
 u$~~+C5:,bT^^A\@]]^__  e3zl+ -SEJ<qA
 u$~~+C5:,bT^^A\@]]^__%,c#a}Aj\::v%~~,SEJ<r+dnnB]A^^_``c*..y(~~/uBzl"[QUQ_Q_E`Daabccc*..//>>-cU"ZL;t~~C^B__bcgcxcxbyyz{{-cU"ZLDDYDYCZZ[\\>>-cU"ZL;t~~C^B__`aa-cU"ZLBB00*e,,#- & $5^#P &~~)#d4&2$b[Y]YgYgMhLiijkk)#d4&2$bLL33(((!),FEJJ&%ekk2)))4E))%++6EGI +777s#78>>@AB L	~~%&mF82eWBuggY WuBzl9+RDNN8S7TTUW
 ))vhbr%QXPYY[\_[``bcmbnoxnyyz{{u$*n55!''5::555 4QXX5E5E
} UU#j\**%%rV   c           	         > [        U[        5      (       d   eUR                  c   eUR                  (       d<  [        U[        5      (       a  UR                  (       a  SU S3n[        T	U ]  XX45      $ UnUnU(       a!  U R                  UR                  5       SU S3nU(       a!  U R                  UR                  5       SU S3nU(       a!  U(       a  SU SU SU SU S3	nU SU SU 3nO-U(       a  U SU 3nU SU 3nOU(       d   eU SU 3nU SU 3nU R                  UR                  5       SU S3nU(       a;  UR                  (       d!  U R                  UR                  5       SU S3nSU SU S3nU R                  (       aS  U R                  UR                  5       SU R                  UR                  5       S	U S
[        U R                  5       S3nSU S3nU R                   SU SU S3$ )Nr$  z).all_masked()r   r8  z) & (r   z) | ~(z::set(z::from(1), (r   z, "index out of bounds: z"))r   rG   r   r   r2  r  r  r'  r  rE   r  )
r4  r   rx  ry  r  lower_scalarupper_scalarcond
cond_printr7  s
            rW   r  CppVecKernel.indirect_asserti  s&   #~....yy$$$zz$//DKK4&/7*3uCC))#))45QugQ?E))#))45QugQ?EUugT#eC5E7!<D(>cU#l^DJWD&D(>cU3JL5U#eW%D5L>2J%%cii014&:;;--cii894&BtfF4&*D>>&&syy12&9L9LSYY9W8X YV3{4>>:;1>  4&'&&'q.FzlRTUUrV   c                 F  > [        U[        5      (       d   eUR                  (       d  [        T	U ]  XU5      $ [
        U   nU R                  U5      n[
        U   nU R                  U5      nSU S3nU[        R                  :w  a3  U[        R                  :X  a  U R                  U5       SU SU SU S3nU$ U[        R                  :X  a!  U[        R                  :w  a  U SU SU S3nU$ X2:w  a,  XWs=:X  a  S:X  a  O  OS	U SU S3nU$ S	U SU SU SU SU S3nU$ )
Nr$  r   z::from<r  r  z.to<r   r5   r  )
r   rG   r   r2  r  rH   r  r   rk   r'  )
r4  r  r   r   src_cpp_typer  dst_cpp_typedst_num_vectorsr  r7  s
            rW   r  CppVecKernel.get_to_dtype_expr  sY   #~....zz7,SCC#I.//	:#E*//63%qz

"u

':)))45W\N!OK\\^_b^ccdeD  %**$%**)<U$|nAo->cBD  6Q6*<.3%qA  +<./9J!L>YZ[jZkkmnqmrrstrV   )r5  r%  r  r  r  r$  r  r  r  )NNF)F),r  r  r  r  r  r  r3  r   r  r   r  r   r   rm   r  rn   r  ro   r  r'  rG   r  r   rF  r=   r	   rk   r  rq  ro  r  r  r  r  r  r  r  r  r   r  r  r  r  rl  rm  s   @rW   r  r  u	  s   I C"45:: 4 4ekk c R%++ R% R
P5;; P3 P 38++ HEKK H# HN> N%++ N# N /3## zz# {{	#
 N+#T ,0<@ Lc]L zzL {{	L
 (L eC$789L L 
.	!L\ UZZ 4 !*S.()* * zz	*
 {{* *X<B{z$PLN ~ $
N 
ELL 
^ 
 D 
& (,/3+0==]& %]& 'tn]& EKK(]&~#VJ rV   r  c                      ^  \ rS rSrSr\r  SU 4S jjrS rS r	 SS jr
S\S\R                  4U 4S	 jjrSU 4S
 jjrS rU 4S jrS\R                  S\R                  4S jrSrU =r$ )r!  i  a  
A vector kernel that handles the 2d tiles with the tile size defined in `tiling_factor` on
the inner-most loop level and one of the outer loop level (`outer_tiling_idx`). When the data
tile is accessed in a contiguous way from the outer loop axis, a transposition is applied on the
tile to make the access contiguous from the inner-most loop axis. Then, the same vectorization
logic from its parent `CppVecKernel` is leveraged for load/store/compute. The transposed tile load
and store are generated into kernel.preloads and kernel.poststores buffers.

The loop structure looks like below:
for ...
  for i_outer ...
    for ...
      for inner_most ...
        // generated by CppTile2DKernel
        float tmp0[16*16]; at::vec::transpose_mxn<...>(tmp0, in_ptr0 + ..., ...); // into kernel.preloads
        float tmp1[16*16]; // into kernel.preloads
        for i_inner ... { // the kernel inner loop
          vectorized loads/compute/stores (e.g., load tmp0, store tmp1) // into kernel.loads/compute/stores
        }
        at::vec::transpose_mxn(out_ptr0 + ..., tmp1, ...) // into kernel.poststores
      for inner_most ... (tail)
        // generated by CppVecKernel
        ...
  for i_outer ... (tail)
    for ...
      for ...
        // generated by CppKernel
        ...
c                    > [         TU ]  UUUUS   U5        X@l        XPl        X`l        U(       a  UOUU l        U(       a  UOUU l        SU l        g )Nr5   T)r2  r3  tiling_indicesinner_tail_sizeouter_tail_sizeinner_num_elemsouter_num_elemsinner_is_tiling_idx)r4  r  r=  r  r  r  r  r7  s          rW   r3  CppTile2DKernel.__init__  sZ     	1	
 -..2A}2A}#' rV   c                 L    [        U R                  U R                      S35      $ )NrM  )r-   r  	outer_idxr:  s    rW   inner_itervarCppTile2DKernel.inner_itervar  s"    !T]]4>>%B$C6"JKKrV   c                    U R                   U R                     nU R                   U R                     n[        XU R                  5      n[        XU R                  5      nU R
                  S L =(       aW    US:H  =(       aK    UR                  U5      =(       a3    UR                  U5      (       + =(       a    UR                  U5      (       + $ r   )r  r  r  r	  r  rZ  r   )r4  r   	outer_var	inner_varouter_strideinner_strides         rW   need_vec_transpose"CppTile2DKernel.need_vec_transpose  s    MM$..1	MM$//2	*5T=O=OP*5T=O=OPOOt# 0!0		)$0 !$$Y//0 !$$Y//	
rV   c                    [         R                  R                  U5      nU R                  nU S[	        U5       3nSn	[	        [        X0R                  U R                     U R                  5      5       n
[	        U R                  5       nU(       a  XpXpSnU R                  U-  (       a  U R                  U R                  pOU R                  U R                  pU(       a  US:X  a  SOSn[        U[        R                  5      (       a  UR                  (       a0  [        U[        R                  5      (       aF  UR                  (       d5  S[         U    SU S	U S
U
 S
U	 S
U S
[	        U5       S
[	        U5       S3nO4S[         U    S[	        U5       S[	        U5       SU S	U S
U
 S
U	 S
U S3nU(       a  U R"                  R%                  5       nOcU R"                  R'                  U5      (       d&  U R"                  R)                  U R*                  USS9nOSnU R"                  R-                  U5      nU(       a>  [         U   nSU SU S3nU SU SU SU SU S3
nU R*                  R/                  U5        UR1                  S[3        U5      5      nU(       a'  U R4                  R/                  [7        UU5      5        U$ U R*                  R/                  U5        U$ )Nr  __place_holder__Tr  truefalseztranspose_mxn<r  r  r   r   Fr  zalignas(std::max(std::size_t(z), alignof(z)))r   r  r[   r   )r4   r  r  r  rE   r	  r  r  r%  r  r  r  r   r   r  r  rH   r  rF  containsr  r;  getr   r  ro   r<  r;   )r4  r   r   r   is_store
store_moder   factorr  r  ld_srcld_dstneed_defineMNr  load_or_storetile_var	cpp_dtypealignasdefine_lines                        rW   gen_transposed_tile_load_store.CppTile2DKernel.gen_transposed_tile_load_store  s    !!$'##SU+,-  3E==;Y[_[m[m nop/0#F##h.'')=)=q $$$$  !)jL.HVPW
q%**%%akkq%**%%akk !e!4 5Qzl C56("SEF82k!n5ERTUGWWY[  !e!4 5Q{1~6FaTUGWWXYcXd e56("SEF827  xx(H""=11xx((U(SHKxx||M2H$U+I 6fX[SVWG$IQyk8*AfXQvhbQKMM##K0%--.@#h-POO%%l4&GH  MM##M2rV   r   r   c                 \  > U R                   R                  U5      nU R                  U5      nU R                  5       nU R	                  U5      (       a  U R                  XUSS9nU S[        X@R                  -  5       3n[        R                  R                  U5      nU R                  USU5      nU R                  R                  U R                  U5      n	U	R                  SXU40 5        [!        U	["        5      (       d   eSU	l        U	$ U R'                  U5      n
[(        TU ]U  X5      $ )NF)r  r  r   rq  T)r  r  r  r  r  r  rE   r%  r4   r  r  rF  r  r  rT  r  r   rG   r   r"  r2  rq  )r4  r   r   r   rd  r  rE  r   r   r  r   r7  s              rW   rq  CppTile2DKernel.load'  s   iiood#$$U+""$""5))::55 ; H "
#k%..2H&I%JKGGG%%d+E**7Au=DXX&&tzz48F!!&4u*=rBfn5555 FMM//6I7<00rV   c                   > SU;   d   e[        U[        5      (       d   U5       eUR                  (       d  U R                  U5      nU R                  R                  U5      nU R                  5       nU R                  U5      nU R                  U5      (       a  U R                  XUSUS9nU S[        X`R                  -  5       3nU R                  (       dH  [        R                  R                  U5      [         ["        R$                  ["        R&                  /-   ;   a  U SU S[        U R                  5       S3n	OU SU S3n	U R(                  R+                  [-        X5      5        g U R/                  U5      n
[0        TU ]e  XX45        g )Nr  T)r  r  r  rm  r   r   )r   rG   r   r  r  r  r  r  r  r  rE   r%  r  r4   r  r  r   r   r  r  rM  r   r;   r"  r2  r  )r4  r   r   r  r  r   rd  r  storebufr   r   r7  s              rW   r  CppTile2DKernel.store=  s`   }}%007%70||NN5)Eiit$""$$$U+""5))::54D ; H #3{5>>3I'J&KLH~~!2!24!8M

M = "  zK4O3PPRSz4KK!!,t":;//6IGM$57rV   c                    U R                  5       nU R                  (       a2  UR                  SU SU S[        U R                  5       SU S3	5        g UR                  SU SU S[        U R
                  5       SU S3	5        g )Nr]  r^  r   r   r   )r  r  r   rE   r  r  )r4  r   rd  s      rW   rR  #CppTile2DKernel.codegen_inner_loopsZ  s    ""$##NNUG6%K@T@T4U3VVXY^X__bc NNUG6%K@T@T4U3VVXY^X__bcrV   c                   > [         TU ]  X5      nU R                  S   U R                  :  a  U R                  O[	        U R                  5      u  U l        U l        U R                  U R                  S   :X  a+  U R                  U l        U R                  U l
        SU l        U$ U R                  U l        U R                  U l
        SU l        U$ )Nr5   r   FT)r2  r  r  r-  reversedr  r  r  r  r  r%  r  r  r  )r4  groupreduction_groupr  r7  s       rW   r  CppTile2DKernel.set_rangese  s    w!%9 ""1%(<(<< $--. 	(
 ??d11!44!11DN!11DN',D$
  "11DN!11DN'+D$rV   r   c                 T    U R                  UU R                  U R                  5       S9$ )Nr\  )r`  r  r  rc  s     rW   r"  "CppTile2DKernel.transform_indexingw  s0    ++%%' , 
 	
rV   )
r  r  r  r%  r  r  r  r  r  r  r  r  )r  r  r  r  r  r  r  r3  r  r  r  ro   r   r  rq  r  rR  r  r"  r  rl  rm  s   @rW   r!  r!    sx    < #I (.L
 6::x1 1UZZ 1,8:	$


 
uzz 
 
rV   r!  _bodyc                    U R                   /[        U R                  R                  5       5      -   nSnSnU GH  nUR                  R
                   H  nUR                  S:X  d  UR                  S;   a  M%  UR                  S;  a  Sn[        US5      (       a  UR                  (       a  [        R                  UR                  ;   d   eUR                  [        R                     nUR                  (       a  UR                  [        ;  a  SnM  Ub)  X&R                  :w  a  [        R                  " S5        M  M  UR                  nM  SnM     GM     X#4$ )	z
Returns the low precision data type (torch.float16/torch.bfloat16) contained in the nodes
and if all the nodes can codegen with this data type without converting to float.
Otherwise returns None and True.
NFplaceholder)	get_indexr  )rq  r  r  r  r  Trx  z.bf16 and fp16 are mixed in the scheduler node.)
root_blockr(  	subblocksr  r  nodesoptargetr   rx  rA   rw  r   r   warningswarn)r  
sub_blocks_lowp_fp_type	_use_fp32	sub_blockr6  rs  s          rW   get_loop_body_lowp_fpr
    s     ""#d5??+A+A+C&DDJ+/MI	__**Exx=(ELL = -  || $  !	uf%%%***..%**<<</4zz:M:Q:Q/R}}](J $I".$5 &VW 6 %,MMM 	9 +  > ##rV   c                   V   ^  \ rS rSrSrU 4S jrS\\\   \\   4   4S jr	S r
SrU =r$ )TilingSelecti  z
Implement the heuristic to select the tiling factors and tiling indices.
In the future, we can implement advanced heuristic in a subclass.
c                 "   > [         TU ]  5         g r  )r2  r3  r4  r7  s    rW   r3  TilingSelect.__init__  s    rV   r   c           	      L	  ^# [        U5      n[        U5      nU(       d   e[        S U 5       5      (       a  / / 4$ [        R                  n[        US   5      S   m#T#(       a  [        U#4S jUSS   5       5      (       a  T#n[        R                  " 5       R                  US9nU R                  XU5      nU(       Ga  [        US S9u  p[        U5      [        U	5      -   n
[        R                  R                  (       Ga  S nS	 nS
 n[!        [#        U
5      5       Vs/ s H  n[%        [&        R(                  U5      PM     nn[#        U5      nUS U UUS  nn0 n0 nU GH  nUR*                  /[-        UR.                  R1                  5       5      -   nU GHS  nUR2                  R4                   GH4  nUR6                  S;   a  UR6                  S:X  a  SOSnUR8                  R;                  UU45      UR<                  U   R<                  S      nU" X5      (       a7  U" UXU5      nUR6                  S:X  a  Uc  OUS;  a  U" UR6                  U5        [?        UR6                  [@        5      (       d  M  UR6                  RC                  S5      (       a  M  UR6                  S;   a  M  UR6                  U;  a  SUUR6                  '   GM  UUR6                  ==   S-  ss'   GM7     GMV     GM     [E        UR1                  5       5      n[E        UR1                  5       5      nSnSnUU:  d  US:  a  UU-  U:  a  / / 4$ U	(       dD  U(       a=  [#        U5      S:X  a.  [G        XS      /5      (       d  XS      US-  :  a
  US:  a  / / 4$ U[H        ;   a  [        R                  " 5       R                  US9n U H  n!U!S:  a  U![#        U
5      -   n!U!S:  d  U![#        U
5      :  a  M.  [G        U
5      (       ad  [J        R2                  RL                  RO                  U
U!   SS9n"U"U :  a1  [J        R2                  RL                  RQ                  U"U 5        U S-  n  OM  U
U!   U :  d  M  U S-  n  O   [#        U5      S:X  a  U/U4$ [#        U5      S:X  a  Xf/U4$ / / 4$ s  snf )Nc              3   2   #    U  H  o[         ;  v   M     g 7fr  )rv   r  r   s     rW   r  -TilingSelect.select_tiling.<locals>.<genexpr>  s     HZE//Z   r   c              3   F   >#    U  H  n[        U5      S    T:H  v   M     g7f)r   N)r
  )r  	loop_body_lowp_fp_dtypes     rW   r  r    s'      "
,	 #9-a0NB,   !r5   rD  c                     [        U S   5      $ r   r   sizess    rW   ru  ,TilingSelect.select_tiling.<locals>.<lambda>  s    #eAh-rV   rw  c                 R    XS      n[        XU5      nUR                  (       a  U$ S $ r`  )r	  r  )r   r  r  r  rf  r  s         rW   _try_get_stride3TilingSelect.select_tiling.<locals>._try_get_stride  s0     'a'89G0OF%+%5%56?4?rV   c                 0    X;  a  SX'   g X==   S-  ss'   g r   rU   )	node_namenon_contig_indexing_op_counters     rW   _update_negative_op_count=TilingSelect.select_tiling.<locals>._update_negative_op_count  s!     !FDE6A6AQFArV   c                     [        U5      S:H  =(       a@    [        U 5      S:  =(       a+    US   S:  a  US   OUS   [        U 5      -   [        U 5      :  $ Nr5   r   r  )r  r  s     rW   _is_valid_indices5TilingSelect.select_tiling.<locals>._is_valid_indices  sb    
 N+q0 (MA-(  .a0A5 +1-!/!2S]!Bh-(	rV   )r  rq  r  r  r   r   r5   masked_subblock)r2   r  r{  r  gQ?#   rM  
   r  ))rC   rB   rf   r   rn   r
  r'  r   r#  	nelements_select_tiling_indicesr_   r   r   r  enable_tiling_heuristicsrQ  r   r.   r   r  r  r(  r   r  r  r  r  r  indexing_from_argsr  r   ro   
startswithrc   r(   r   r4   r  r  guard_lt)$r4  fn_listvar_sizes_listloop_bodies
all_dtypesr   r  r  r  r  r,  r   r%  r)  r  r  r-  r  reduction_vars
op_counterr$  r  r  r	  r6  arg_idxr   r  op_numnon_contig_indexing_op_numratio_thresholdquantity_thresholdfactor_lowptiling_indice
call_ranger  s$                                      @rW   select_tilingTilingSelect.select_tiling  s    %W-/<
zHZHHHr6M.{1~>qAc "
(_"
 
 
 #E#002<<5<I44]
 %($?&"E  ,)??Kzz222@G" #3{#344 34;;B4   #&e*-o._-. % .0
 BD.(E"'"2"2!3d5??;Q;Q;S6T!TJ%/	%.__%:%:E$||/NN/4|||/K!QR(1(I(I%)>$:)""'**W"5":":1"=)? $5X#N#N-<(-x.&F
 ,1<<<+G )/-36-A(A,1LL:X)*  *%,,<< % 7 78I J J#(<<#M$N $)<<z#A?@Ju||$<$.u||$<$A$<7 &; &0 )@ Z..01-0299;.* #'%'"-1CCQJ2V;N
 r6M (N+q0,!"34 
 Q/0=13DD r6M% *668BBBO%3M$q((5K8H(H$q(MS=M,M '44%&WW%5%5%?%?'6 &@ &
 &3GG,,55j+N,71,<M! 4 %]3kA(3q(8 &4" >"a'%66>"a'%5~EE2vQs   "$R!c           	         / n[        X5       Hd  u  pV[        R                  " U/UQ76 nU[        R                  " UR
                  UR                  5       Vs/ s H  oR                  PM     sn-  nMf     [        [           " 5       n	/ n
[        [           " 5       n[        [           " 5       nU GH  nUR                   GH  n[        R                  " SUR                  5      (       d  M,  [        XU5      nUS:X  a  M@  US:X  aP  U	R                  [        UR                  SS  5      5        U
R!                  [        UR                  SS  5      5        M  [#        S UR                   5       5      (       a)  UR                  [        UR                  SS  5      5        M  UR                  [        UR                  SS  5      5        GM
     GM     X-
  U-
  n[%        US S9u  nn['        U5      ['        U5      -   n['        U	5      S:X  a  US-
  /$ U(       a  [)        U5      SS  $ X-  U-
  n[)        U	5      n['        U5      S:X  a  US   U;   a  US   US-
  :X  a  U$ [)        UU
R*                  S9SS  $ s  snf )	Nz^d\d+$r   r5   c              3   V   #    U  H  n[        U[        R                  5      v   M!     g 7fr  )r   r   SIZEr  rk  s     rW   r  6TilingSelect._select_tiling_indices.<locals>.<genexpr>w  s      S?R!49955?R   ')c                     [        U S   5      $ r   r  r  s    rW   ru  5TilingSelect._select_tiling_indices.<locals>.<lambda>|      s5QR8}rV   r  r  r   )rI  r
   extract_read_writes	itertoolschainreadswritesr   r   rm   rm  r   searchr   r	  r  r  r'  r_   r   sortedcount)r4  r5  r6  r  	all_indexfn	var_sizesrwdepcontig_varscontig_vars_listnon_contig_stride_constnon_contig_stride_otherr   r   r  contig_onlyr  r  num_itervarscontig_and_const_stridecontig_vars_sorteds                         rW   r0  #TilingSelect._select_tiling_indices_  s?    	 9MB11"AyAByrxx/ST/S))/STTI : !o'",S/"3",S/"3E))yyCHH55,UGQ;q[OOC$56$++C,=>Sv?R?RSSS+//CHHQRL0AB+//CHHQRL0AB *  ";>UU!$^9T!U5zC$88{q  1$%%+&rs++1##$ $K0"#q("2&*AA"2&,*::%%(.>.D.DEbcJJK Us   I6rU   )r  r  r  r  r  r3  r   r(  rm   rC  r0  r  rl  rm  s   @rW   r  r    s=    
i 
tCy$s)#	$	iV.K .KrV   r  c                      ^  \ rS rSrU 4S jrS rS\4S jrS\4S jr	S r
S	 rS
 rS\\   4S jrS rS rSS\\   4S jjrS\S\S   4S jrSrU =r$ )r  i  c                    > [         TU ]  UR                  UR                  R                  5        Xl        S U l        S U l        [        R                  " 5       U l
        / U l        g r  )r2  r3  r  wsr=  ra  rR  r,  r   r#  picked_vec_isakernelsr4  ra  r7  s     rW   r3  CppKernelProxy.__init__  sO    **LOO,G,GH(2=2J2J2L(*rV   c                 p    U H0  n[        U[        5      (       d   e[        R                  " U5        M2     g r  )r   r$   r:   propagate_scheduler_node)r4  r  r6  s      rW   data_type_propagation$CppKernelProxy.data_type_propagation  s-    Ee]333388? rV   scheduler_nodec                     [        UR                  [        5      (       d  g[        R                  " U5        [        UR                  5      S   S L=(       a    [        UR                  5      S   (       + $ )NTr   r5   )r   r  r   r:   rl  r
  )r4  ro  s     rW   is_lowp_fp_scheduler#CppKernelProxy.is_lowp_fp_scheduler  s_    ...9944^D!."6"67:$F C).*>*>?BB	
rV   r  c                     S[         R                  R                  4S jnUR                  /[	        UR
                  R                  5       5      -   nU H  nU" UR                  5        M     g )N	sub_graphc                   ^^^^^^^^ S[         R                  R                  S[        [         R                     4S jmS[         R                  R                  S[        [         R                     4S jmS[         R                  R                  S[         R                  4U4S jjmS[         R                  R                  S[         R                  4U4S jjmS[         R                  R                  S[         R                  4UU4S jjn[        U R                  5      n/ mU GHm  nUR                  S	;   a  T" U5      =m[        ;   a  [        UU4S
 jUR                   5       5      (       a  MM  UR                  S   nU R                  U5         U R                  SXC[         R                  4S9mUR                  TU4S j5        [         =R"                  S-  sl        S S S 5        M  UR                  S:X  a  T" U5      =m[        ;   a  UR                  u  pEpgnU" UT5      (       a  GM
  [$        R&                  R)                  U5      mU R+                  U5         U R                  SXGT4S9mUR-                  UT5        [         =R"                  S-  sl        S S S 5        GM  UR                  S:X  a  UR                  u  nmnn	n
U[        ;   a{  T[         R                  [         R.                  [         R0                  [         R2                  4;   d   eUT[        ;   a  [         R                  OT[         R                  U	U
4Ul        GM*  GM-  UR                  S:X  ah  UR                  S   [        ;   aQ  UR                  u  pJm[        UU4S jUR                   5       5      (       a  GM  XJ[         R                  4Ul        GM  UR                  S:X  ay  UR                  S   [        ;   ab  UR                  u  pKm[        UU4S jUR                   5       5      (       a  GM  TR5                  U5        XK[         R                  4Ul        GM.  UR                  S:X  Ga.  UR                  u  pGmnU[        ;   ad  U" Xx5      (       dW  U R+                  U5         U R                  SXGU4S9mUR-                  UT5        [         =R"                  S-  sl        S S S 5        T[        ;   a  [        UU4S jUR                   5       5      (       d{  UR                  S   nU R                  U5         U R                  SXC[         R                  4S9mUR                  TU4S j5        [         =R"                  S-  sl        S S S 5        GMg  GMj  GMm  GMp     S[         R                  R6                  4U4S jjnU" U 5        g ! , (       d  f       GM  = f! , (       d  f       GM  = f! , (       d  f       GN= f! , (       d  f       GM  = f)Nr  r   c                 b   U R                   S:X  a,  [        R                  R                  U R                  S   5      $ U R                   S:X  a  U R                  S   $ U R                   S:X  aD  [        U R                  5      S:  a  U R                  S   $ U R                  R                  SS5      $ g)	z6Get input dtype for nodes that may consumes lowp fp dtr  r5   r  r  r  r   r   N)r  r4   r  r  r  r   r  r  r  s    rW   get_input_dtype]CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.get_input_dtype  s    ;;')77,,TYYq\::[[$6699R=([[J.499~)#yy|+#{{{DAArV   c                 .   U R                   S:X  aG  [        U R                  5      S:X  d   e[        R                  R                  U R                  S   5      $ U R                   S;   a  U R                  S   $ U R                   S:X  a  U R                  S   $ g)	z6Get output dtype for nodes that may produce lowp fp dtrq  r   r5   )r  r{  r  r  r  r   N)r  r   r  r4   r  r  rw  s    rW   get_output_dtype^CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.get_output_dtype  sz    ;;&(tyy>Q...77,,TYYq\::[[$JJ99R=([[$6699Q<'rV   dtc                 2   > U[         ;   d   eT" U 5      U:H  $ )z]Check if the given node produces output with expected low precision floating point data type.)r   )r  r}  r{  s     rW   is_lowp_fp_source_CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source  s!    ]***'-33rV   c                 f   > U[         ;   d   eT" U 5      =n(       a  X!:H  $ U R                  S:X  a  gg)zZCheck if the given node accept input with expected low precision floating point data type.r  TF)r   r  )r  r}  input_dtyperx  s      rW   is_lowp_fp_sink]CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_sink  s<    ]***"1$"77;7&,,[[J. rV   c                 f   >^ T" U T5      =(       a     [        UU4S jU R                   5       5      $ )zCheck if the node is a lowp fp sources which are all directly fed to ops that accepts lowp fp input
thus no need to promote to float
c              3   6   >#    U  H  nT" UT5      v   M     g 7fr  rU   r  userr}  r  s     rW   r  }CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source_no_promote.<locals>.<genexpr>  s      ;:D$OD"--*   r'  users)r  r}  r  r  s    `rW   is_lowp_fp_source_no_promotejCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source_no_promote  s1     )r2 s ;:>**; 8 rV   )rq  r  c              3   6   >#    U  H  nT" UT5      v   M     g 7fr  rU   r  s     rW   r  WCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>       M?444r  r   r  r  c                    > U TL$ r  rU   r  to_type_nodes    rW   ru  VCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<lambda>  s
    A\4IrV   r5   r  r  r{  r  c              3   6   >#    U  H  nT" UT5      v   M     g 7fr  rU   r  s     rW   r  r    r  r  c              3   6   >#    U  H  nT" UT5      v   M     g 7fr  rU   r  s     rW   r  r  "  r  r  r  c              3   6   >#    U  H  nT" UT5      v   M     g 7fr  rU   )r  r  r   r  s     rW   r  r  J  s     Ue < <r  c                    > U TL$ r  rU   r  s    rW   ru  r  R  s
    A\<QrV   rt  c                 Z   > S[         R                  R                  4U4S jjnU" U 5        g )Nrt  c                   >^ S[         R                  R                  4S jnU R                   Vs/ s H  o"R                  S:X  d  M  UPM     nnU Vs/ s H  o!" U5      (       d  M  X"R
                  0PM      nnU H  nUR                  5        H  u  mnTU R                  ;   d  M  [        U4S jU 5       5      (       d!  TT;   d  M:  [        S U 5       5      (       d  MS  TR                  S   nTR                  U5        U R                  T5        M     M     U R                  c  U R                  5         g g s  snf s  snf )Nto_nodec                 :    [        S U R                   5       5      $ )Nc              3   >   #    U  H  oR                   S :H  v   M     g7f)r  N)r  r  usrs     rW   r  ڮCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_to.<locals>.<genexpr>a  s     "U}::#;}s   r  )r  s    rW   _used_by_toڛCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_to`  s    ""Uw}}"UUUrV   r  c              3   `   >#    U  H#  oR                   S    TR                   S    :H  v   M%     g7fr  Nr  )r  r  r  s     rW   r  ڙCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<genexpr>l  s$     #SUcHHRLDIIbM$AUs   +.c              3   L   #    U  H  oR                   S    [        ;   v   M     g7fr  )r  r   r  s     rW   r  r  o  s      ,&QV#(EQVr  r  )r   fxNoder  r  r  r  r'  all_input_nodesreplace_all_uses_with
erase_nodeowning_modulelint)	rt  r  r  all_to_nodesall_to_nodes_and_users
node_usersr  val_nodeto_lowp_fp_legalized_nodess	     `     rW   _eliminate_duplicate_to_nodeچCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_nodeY  s-   VUXX]] V *3$)8KK:<U ! $ 8D.7Ct{SWGX*zz*| + . '=
+5+;+;+=KD%#y6 ##SU#S S S$(,F$F(+ ,&QV,& )& )&
 ,0+?+?+C $ : :8 D ) 4 4T : ,> '=, !..6!( 79$.s   D:D:D?'D?)r   r  Graph)rt  r  r  s     rW   eliminate_to_dtype`CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtypeX  s"    ')EHHNN ')R -Y7rV   )r   r  r  r   r   r(  r  r  r   r'  r  r  inserting_aftercall_methodrn   r  r   cpp_to_dtype_countr4   r  r  inserting_beforereplace_input_withr  r  r  r  r  )rt  r  sub_graph_nodesr6  r2   r   r   	value_varr   r   r  r  r  r}  r   rx  r{  r  r  r  r  s                @@@@@@@@rW   add_to_dtypeDCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype  s    ehhmm  8M  
 uxx}} 
 %++9N 
 4 45;; 4
	!ehhmm 	! 	!588== ekk   #9??3O)+&(LL$::/66=H MMMM **Q-C"2259'0'<'<&c%++-F (= ( 33(*I  22a72 :9 LLG+.u55-G16.CqQ3IrBB GG--d3E"33E:'0'<'<&ce-D (= ( 00LI22a72 ;: \\[0 

!& M1  %!KK!NN!MM!KK	)       +0M+AEKKu!KK*!&
 2& \\Z/EJJrNm4S',zz$SMMMM "%ekk!:EJ\\Z/EJJrNm4S#(::LSRMMMM  /55e<"%%++!6EJ\\%779>6SUI !M1  <IQQ!*!;!;E!B/8/D/D$.ci5P 0E 0" !& 8 8L Q ' : :a ? : "C -  UUUU"'**Q-C!*!:!:5!A/8/D/D$.c%++5N 0E 0" !& ; ;$02Q!" !( : :a ? : "B!A V . o )r*8ehhnn *8X y)y :9  ;:J "C!B  "B!As2   AW>W>W(1AW:
W	
W%	(
W7	:
X
	)r   r  r  r  r(  r   r  r  )r4  r  r  r  r	  s        rW   legalize_lowp_fp_dtype_loopbody.CppKernelProxy.legalize_lowp_fp_dtype_loopbody  sX    Z	*EHHNN Z	*x  **+d93F3F3M3M3O.PP
#I) $rV   c                   ^  [        U 4S jU 5       5      (       a  U H  nUR                  R                  /[        UR                  R                  R                  5       5      -   nU H  nUR                  R                   H{  nUR                  S;   d  M  UR                  (       d   e[        R                  UR                  ;   d   eUR                  [        R                     nUR                  [        ;   a  M{   e   M     M     g U Ho  n[        U[        5      (       d   e[        UR                  [         5      (       d   eUR                  nUR#                  5       (       a  M^  T R%                  U5        Mq     g )Nc              3   t   >#    U  H-  n[        U[        5      =(       a    TR                  U5      v   M/     g 7fr  )r   r$   rq  )r  r6  r4  s     rW   r  8CppKernelProxy.legalize_lowp_fp_dtype.<locals>.<genexpr>  s2      
 um,Q1J1J51QQs   58)rq  r  )r'  r  r  r(  r   r  r  r  r  rx  rA   rw  r   r   r   r$   r   is_memory_copyr  )r4  r  r6  r  r	  fx_noders  r  s   `       rW   legalize_lowp_fp_dtype%CppKernelProxy.legalize_lowp_fp_dtype  s8    

 
 

 #kk445KK))0029 
 ",I#,??#8#8">>->>#*<</<#6#:#:gll#JJ#J;B<< 3 7 7<G $+==M#AA#A $9 ",	  Ee]3333ekk84444"[[D&&((44T: rV   c           	        ^^^ ^!^"^# [        T5      [        T5      :X  d   eU R                  m![        TS S9u  m m"U R                  T T"5        U!U#4S jnUU U"U4S jm#U" [        5      n[
        R                  =R                  UR                  -  sl        [
        R                  =R                  UR                  -  sl        [        R                  U5      U l        U R                  (       a  U R                  (       d6  U/U l        U R                  SS 5        U R                  R!                  U 5        g ["        R$                  R&                  R)                  SS9   [+        5       nUR-                  TT5      u  pg[        U5      [        U5      :X  d   eSn[/        [1        T5      5      n	[3        S U	 5       5      (       a  SnSn
S nU(       a  SnUS	   nUS
-   n[        U R                  R4                  5      U:  aV  U R                  R4                  U   R6                  nU R                  R4                  U   R6                  nU=(       a    U(       + n
[        U5      S
:X  Ga  [8        =R:                  S
-  sl        U R                  R=                  US	   US	   S9nU" [>        US	   US	   5      nUR@                  URB                  -
  nURD                  S	URB                  40Ul#        [&        RH                  RJ                  (       a  U(       a  U" [>        US	   US	   U5      nOUnURD                  /Ul&        URD                  URB                  UR@                  40Ul#        UU/U l        UnGO[        U5      S:X  Ga  US
   [        U R                  5      S
-
  :X  a  US	   US
   :X  d   e[8        =R:                  S-  sl        U R                  R=                  US	   US	   S9nS	URB                  4URB                  UR@                  4S.nUR@                  URB                  -
  nU R                  R=                  US
   US	   S9nS	URB                  4URB                  UR@                  4S.nUR@                  URB                  -
  nU" [N        US	   U5      nURD                  US   URD                  US   0Ul#        / n[&        RH                  RJ                  (       aq  U(       aj  S Hc  u  nnUS:X  a  UOS nUS:X  a  UOS nU" [N        US	   UUU5      nURD                  UU   URD                  UU   0Ul#        URQ                  U5        Me     OU" [>        US	   US	   5      nURD                  US   URD                  US   0Ul#        URD                  /Ul&        URQ                  U5        URD                  US   URD                  S	UR@                  40Ul#        URD                  URD                  /Ul&        URQ                  U5        U/U-   U l        UnOU/U l        U R                  X5        U R                  R!                  U 5        S S S 5        g ! , (       d  f       g = f)Nc                     [        U S   5      $ r   r  r  s    rW   ru  2CppKernelProxy.codegen_functions.<locals>.<lambda>  rM  rV   r  c                    > TR                   " U /UQ76  n[        =R                  S-  sl        T" U5        UsS S S 5        $ ! , (       d  f       g = fr   )
new_kernelr   generated_kernel_count)r*  r  re  ra  runs      rW   codegen_kernel8CppKernelProxy.codegen_functions.<locals>.codegen_kernel  sA    ((4t4 ..!3.F 544s   #A
Ac           	      r  > U R                  TT5      u  pSn[        TT	5       H~  u  pEUTT4[        [        R                  " TT5      5      S44;   a  U(       a   eU" X5        MB  SnUTS4:X  d   SU ST ST 35       eU R                  5          U" US5        S S S 5        M     g ! , (       d  f       M  = f)NFrU   Tzunexpected group: r  r   )r  rI  r   rO  rP  r  )
re  r  r9  	in_suffixrW  rX  r5  r  r  r6  s
         rW   r  -CppKernelProxy.codegen_functions.<locals>.run  s    #)#4#4UO#L DI!$Wn!=O,9??5/BCRH!   )(=t, $I$)  V ,I;d5'OCTUV 
  //14 21 "> 21s   
B''
B6	Finplace_buffersTc              3   2   #    U  H  o[         ;  v   M     g 7fr  )rw   r  s     rW   r  3CppKernelProxy.codegen_functions.<locals>.<genexpr>  s     S
u ::
r  r   r5   )r  r   maintailr  )r  )r  r  )r  r  r  ))r   ra  r_   r  r%  r4   r  removed_buffersinplaced_to_removerP  r  rR  rg  r  rh  aggregate_reduction_buffers
set_kernelr   	_inductorr   patchr  rC  rB   rC   rf   rK  r5  r   generated_cpp_vec_kernel_counttiler  rB  
tiled_sizer   r*  r  enable_loop_tail_vecr+  r!  r  )$r4  r5  r6  r  scalar_kerneltiling_selecttiling_factorsr  could_masked_vecr8  _inner_loop_reduction_outer_not_outer_loopinner_loop_reductionouter_loop_levelinner_loop_levelouter_loop_reductionr  
vec_kernelr  tail_kernel
outer_loopr\  r  
inner_loopinner_rangesr  tile2d_kernelouter_rinner_r_inner_tail_size_outer_tail_sizere  r  ra  r  r  s$    ``                             @@@@rW   codegen_functions CppKernelProxy.codegen_functions  s   7|s>2222((!$^9T!U/		% 	%( 'y1	=#@#@@	""m&F&FF"!6""$--)?DL,,UD9NN%%d+ __##))%)@(NM-:-H-H.*N ~&#n*====#3N74KLJS
SSS#( .3+K',$#1!#4 #3a#7 t~~++,/??+/>>+?+?(,"l ) ,0>>+?+?(,"l ) -I5I1I 4 >"a'66!;6~~**>!+<^TUEV*W+ ."3^A5F
 !II7	,0HHq$//6J+K
(::227G"0$&q)&q)!	#K #0K48HH:M0-1XX7S,T) *K8"^$)"1%T]]);a)??&q)^A->>? 66!;6!^^00"1%nQ.? 1 
 
 5 56'22JOOD  #-//J4I4I"I!^^00"1%nQ.? 1 
 
 5 56'22JOOD  #-//J4I4I"I .#"1%"! NNL$8NNL$8/+ !::227G-( 07&/@Od ) 07&/@Od ) "0+*1-*,," 'NNL,A&NNL,A0, $**62--0 "0$nQ&79J"J #V(<"V(<0J, 2<0@J-&&z2"V(<"JOO(<3M/ 5?NNJNN3SM0&&}5 -<( -,,/ NN%%d+G A@@s   S+Y
Yc                     U H*  nU R                  U5        [        R                  " U5        M,     U R                  X5        g r  )r  r:   propagate_loopbodyr  )r4  r7  r6  r  s       rW   codegen_loop_bodies"CppKernelProxy.codegen_loop_bodies`  s8    D0062248   	{;rV   r  c                    U R                  U5        U R                  U5        [        U5      S:  d   eS nU Vs/ s H  n[        R                  " X#5      PM     nn[        [        R                  [        5      (       a9  [        R                  R                  (       a  S nU Vs/ s H
  o%" U5      PM     nnU Vs/ s H  o3R                  S   PM     nnU R                  XF5        g s  snf s  snf s  snf )Nr5   c                     U R                  5         U R                  5         [        [        R                  [
        5      (       a  U R                  " U6 $ U R                  U5      $ r  )decide_inplace_updatemark_runr   r4   re  r1   r  codegen)r  
index_varss     rW   rW  (CppKernelProxy.codegen_nodes.<locals>.fnl  sG    &&(MMO!(($566zz:..||J//rV   c                 P    [         R                  R                  U 5      nXl        U$ r  )r4   r  localize_functionoriginal_fn)rW  
wrapped_fns     rW   wrap_fn-CppKernelProxy.codegen_nodes.<locals>.wrap_fn{  s)    33EE
 *,&!!rV   )r  rm  r   rS  partialr   r4   r  rJ   r  r  r  )r4  r  rW  r  r5  r  r6  s          rW   codegen_nodesCppKernelProxy.codegen_nodesf  s    ##E*""5)5zQ	0 <AA549$$R.5A q--/ABB&&44" .55Wrwr{WG549:ED**Q-E:w7# B 6:s    C)&C.=C3c                 <    U R                  U R                  X5        g r  )r  rR  )r4  r   r  s      rW   r  CppKernelProxy.codegen_loops  s    BrV   c                 J    U R                    H  nUR                  5         M     g r  )rh  rO  r4  re  s     rW   rO  4CppKernelProxy.update_stores_with_parallel_reduction  s    llF88: #rV   r   c                 D   Uc   eSnU R                    Hw  n[        R                  " 5        nUR                  X5      (       a@  SnUR	                  UR                  5       5        UR                  UR                  5       5        S S S 5        My     g ! , (       d  f       M  = f)N
C10_LIKELYC10_UNLIKELY)rh  r   r   r  r   r   rH  rW  )r4  r   	if_prefixre  r   s        rW   rW  CppKernelProxy.gen_body  s{     	llF%%'5,,T== .I''6KK 12	 (' #''s   AB
B	inner_loop_reduction_outer_notr  	LoopLevelc                   ^  SU 4S jjnT R                   S   nU(       a  U(       d   eU" U5        OZUR                  5         T R                  R                  UR                  5        T R                  R                  UR                  5        T R
                  R                  UR
                  5        T R                  R                  UR                  5        T R                  R                  UR                  5        T R                  R                  UR                  5        T R                  R                  UR                  5        g )Nc           
        > [        TR                  5      S:  d   eTR                  S   nTR                  S   n[        U[        5      (       d   e[	        U5      [
        :X  a^  UR                  UR                  5        UR                  5         TR                  R                  UR                  UR                  -   5        O5UR                  5         TR                  R                  UR                  5        [        5       n[        R                  " 5        nUR                  USU R                  5      (       a:  UR                  UR!                  5       5        UR                  UR"                  5        S S S 5        [        R                  " 5        nUR                  USU R                  5      (       Ga
  UR                  UR!                  5       5        [	        U5      [
        :X  a  UR$                  nU HW  nU SU R                   S['        U R(                  5       S3n[+        UR,                  Xg5        [+        UR"                  Xg5        MY     UR                  [/        UR"                  U R                  U R                   S	3U R(                  U R0                  5      5        OUR                  UR"                  5        S S S 5        UTl        g ! , (       d  f       GNb= f! , (       d  f       N(= f)
Nr   r   r  r  r  r   z_tail - r  rS  )r   rh  r   r  r  r%  r  r  r.  rH  r7   r   r   r  r   r   r   r0  r?  rE   r  r   rM  r   rB  )	r  main_loop_kerneltail_loop_kernel
suffix_bufr   r9  r   r   r4  s	           rW   !aggregate_reduction_prefix_suffixUCppKernelProxy.aggregate_reduction_buffers.<locals>.aggregate_reduction_prefix_suffix  sv   t||$)))#||A#||B/.==== $%2 !::$22 !::<%%,,$55&778
 !::<%%,,-=-N-NO &J%%'5#66jnn  ''
(9(9(;<%%&6&G&GH ( %%'5#66
  ''
(9(9(;<,-:)9)M)M$2D*.uZ^^4DH[YcYnYnMoLppq'rH,-=-D-DdU, 0 A A4 %3 #))6 0 A A *#->>"2% 8 * 5 5 * #))*:*K*KL/ (0 %/D!= (' ('s   AKD.K!
K!
K/r   )r  r  )
rh  r  r.  rH  r0  r1  r2  r3  r4  r6  )r4  r  r  r$  main_kernels   `    rW   r  *CppKernelProxy.aggregate_reduction_buffers  s    6	/p ll1o):-j9113!!(()E)EF!!(()E)EF&&--k.S.ST&&--k.S.ST!!(()I)IJ##**;+M+MN**1155	
rV   )r,  ra  rh  rR  rg  r0  r  )r  r  r  r  r3  rm  r$   rq  r   r  r  r  r  r(  r  r  rO  r   r7   rW  rk   r  r  rl  rm  s   @rW   r  r    s    +@
= 
_* _*B;<v,p<84#6 8BC;3Xl3 3L
.2L
@H@UL
 L
rV   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )rb  i  c                 p   > [         TU ]  UR                  UR                  R                  5        / U l        g r  )r2  r3  r  rf  r=  rd  ri  s     rW   r3  OuterLoopFusedKernel.__init__  s)    **LOO,G,GH%'
rV   c           
         / nU R                    Vs/ s H  oDR                  5       PM     nnU Hg  nUR                  nUc   eUR                  UR	                  [        [        U5      UR                  -
  UR                  S9U5      R                  5        Mi     [        [        UR                  [        U5      5      UR                  S9$ s  snf )Nr  )rd  r  r,  r  r  r  r   r  r  r^   r_   )r4  r  r  kernels_parallel_depthrR  nested_kernelsre  r,  s           rW   r  *OuterLoopFusedKernel.decide_parallel_depth  s    !#48JJ+
4>y  "J 	 +
 %F !,,K***")),,!,/A/M/MM$6$B$B	  !.
 %  "1137M3N +66	
 	
)+
s   C
)rd  )r  r  r  r  r3  r  r  rl  rm  s   @rW   rb  rb    s    (
 
rV   rb  c                        \ rS rSrSrSrSrSrg)ReasonFusedNodesi
  same_vars_reducecompatible_reductioncompatible_ranges_no_reductionrU   N)r  r  r  r  SAME_VARS_REDUCECOMPATIBLE_REDUCTIONCOMPATIBLE_RANGES_NO_REDUCTIONr  rU   rV   rW   r0  r0  
  s    )1%E"rV   r0  c                     ^  \ rS rSrSr\" \R                  \R                  /5      r	\
S\R                  S\\   4S j5       rU 4S jrS\4S jrS	 rS
 rS rS\\   4S jrS rS rS rS\S\S\4S jrS rS rS rS rS\ \!   4S jr"S\#4S jr$S\%\#\&\!4   4S jr'S\S\4S jr(S\S\)\   S\)\   4S  jr*S! r+S" r,S# r-S'S$ jr.S% r/S&r0U =r1$ )(CppSchedulingi  i  devicer   c                     U R                   $ r  )backend_features)r*  r9  s     rW   get_backend_features"CppScheduling.get_backend_features  s    ###rV   c                 `   > [         TU ]  U5        U(       a  U R                  5         SU l        g NF)r2  r3  reset_kernel_group_ready_to_flush)r4  r&  r7  s     rW   r3  CppScheduling.__init__   s'    ###%$rV   statusc                     Xl         g r  rA  )r4  rC  s     rW   _set_flush_statusCppScheduling._set_flush_status&  s    %rV   c                 &    [        S U 5       5      $ )Nc              3      #    U  H<  n[        [        [        R                  R                  R
                  U5      5      v   M>     g 7fr  )r   rx  r4   r  r  r   rH  s     rW   r  )CppScheduling.group_fn.<locals>.<genexpr>*  s/     Mu!U3qww//88!<==us   AA)r   )r4  r  s     rW   group_fnCppScheduling.group_fn)  s    MuMMMrV   c                 "    [        5       U l        g r  )KernelGroupra  r:  s    rW   r@   CppScheduling.reset_kernel_group,  s    'MrV   c                   ^ UR                  5       (       d  UR                  5       (       a  [        R                  " X5      $ UR                  5       (       a-  UR                  5       (       a   e[        R                  " X5      $ U R                  X5      [        R                  :X  Ga  [        U[        [        45      (       d   e[        U[        [        45      (       d   eUR                  u  nu  pEUR                  u  nu  pgUS:X  a  US:X  d   XW45       eU4S jm[        U5      [        U5      :  a  UOUn[        U[        5      (       d   e[        U5      [        U5      :  a  UOUn	T" U	5      n
UR                  U
S9  UR                  u  nu  pCUR                  u  nu  pcXF:X  a  [        R                  " X5      $ T" U5      n[        U	[        5      (       a  U	R                  US9  Op[        U	[        5      (       d   eU	R                   H)  n[        U[        5      (       d   eUR                  US9  M+     [	        U	R                  U	R                  5      n	UR                  u  nu  pCUR                  u  nu  pcXF:X  d   XF45       e[        R                  " X5      $ U R                  X5      (       a%  [         R                  XU R#                  X5      5      $ [        R                  " X5      $ )NrU   c                 f  > [        U [        5      (       a  [        U R                  5      S:  d   U R                  5       eS n[        [
           " 5       nU R                   H;  nT	" U5      u  pEUc  UnX:X  d   XU R                  45       eUR                  U5        M=     U[        U5      4$ [        U [        5      (       d   eU R                  n[        U[        R                  5      (       d   eUR                  5       u  pxnUR                  [        UR                  R                  5       5      4$ r`  )r   r"   r   snodesr   r   updater(  r$   r  r   ComputedBufferget_default_sizes_bodyrv  indexing_exprsr  )
r  rv  rV  snodevexprscomp_bufferr   r  get_indexing_ranges_exprss
            rW   r[  5CppScheduling.fuse.<locals>.get_indexing_ranges_exprsA  s	   !$(:;;"4;;/!3@T[[@3%)
)3C):%)[[E'@'GHA)1-.
#-?PZDKK4PP?*11%8 &1  *4+???)$>>>>&*ii)+r7H7HIIII%0%G%G%I
#T5H5H5O5O5Q0RRRrV   )extra_indexing_constraints)
is_foreachr!   r,  is_templater"   _why_fuse_nodesr0  r6  r   r$   r  r   recompute_size_and_bodyrR  r&  can_fuse_vertical_outer_loopr  _get_outer_loop_fusion_depth)r4  r  r  r   vars1reduce1vars2reduce2node_to_recompref_noderef_indexing_constraints#node_to_recomp_indexing_constraintsrW  r[  s                @rW   r,  CppScheduling.fuse/  s   !1!1!3!3-225@@  ((****%**588 $$U2#BBC "%-9K)LMMMM!%-9K)LMMMM&+kk##E&+kk##E"}BJ8JJ6S& +.e*s5z*Au!.-@@@@$'JU$;5+DX+N(66/G 7  !&:E %:E>-225@@ 7P"73 h66443V 5  &h0BCCCC!))%????557Z 6  "1
  2(2D2DhooVH %:E %:E~5~5~)..u<<225@@277$"C"CE"Q  *..u<<rV   c                    UR                   u  nu  pEUR                   u  nu  pgXF:X  a  XW:X  a  [        R                  $ US:X  a  XFU-   :X  a  [        R                  $ U R	                  X5      (       a  [        R
                  $ g )NrU   )r  r0  r4  r5  &_can_fuse_nodes_with_compatible_rangesr6  )r4  r  r  r   rd  re  rf  rg  s           rW   r`  CppScheduling._why_fuse_nodes  sv    #kkE#kkE>g0#444b=Ugo5#88866uDD#BBBrV   c                    UR                   u  nu  pEUR                   u  nu  pgUS:H  =(       a    US:H  n[        R                  " U5      [        R                  " U5      :H  n	[        U5      S:H  =(       d    [        U5      S:H  n
U(       a  U	(       a  U
(       d  g[        U5      [        U5      :  a  UOUn[        U5      [        U5      :  a  UOUn[	        U[
        5      (       a  g[	        U[        5      (       d   e[	        UR                  [        R                  5      (       a  g[	        UR                  [        R                  5      (       d   eUR                  R                  R                  5       nS n[	        U[
        5      (       a  [        [        [        S4      " 5       nUR                    H  n[	        UR                  [        R                  5      (       a    Ok[	        UR                  [        R                  5      (       d   eUR#                  [        UR                  R                  R                  5       5      5        M     [        U5      S:w  a  g[%        ['        [)        U5      5      5      nOf[	        U[        5      (       d   e[	        UR                  [        R                  5      (       d   eUR                  R                  R                  5       nX:w  a  gg)NrU   r5   F.T)r  r1  rd   r   r   r"   r$   r  r   TemplateBufferrT  dataget_sizer   r   r   rR  r  r(  nextiter)r4  r  r  r   rd  re  rf  rg  c1c2c3rh  ri  ranges2ranges1
ranges_setrW  s                    rW   rn  4CppScheduling._can_fuse_nodes_with_compatible_ranges  s+    $kkE#kkE],w"}YYu5!11Z1_/E
arb"%e*s5z"9uJU35 n&899 .-8888n))2+<+<==.--r/@/@AAAA !%%**335h 233#E#s(O46J!ejj"*;*;<<!%**b.?.?@@@@uUZZ__%=%=%?@A	 ) :!#4Z 012Gh6666hmmR->->????mm((113GrV   c                     [        U[        [        45      (       d   e[        U[        [        45      (       d   e[        S X4 5       5      (       a  gU R	                  X5      S L$ )Nc              3   B   #    U  H  n[        U[        5      v   M     g 7fr  )r   r  r  s     rW   r  :CppScheduling._can_fuse_horizontal_impl.<locals>.<genexpr>  s      
FTdJt899ns   F)r   r"   r$   rf   r`  r4  r  r  s      rW   _can_fuse_horizontal_impl'CppScheduling._can_fuse_horizontal_impl  sk    %"4m!DEEEE%"4m!DEEEE 
GLn
 
 
 ##E1==rV   c                    UR                  5       (       d  UR                  5       (       a  g[        UR                  5       5      [        UR                  5       5      -   [        R                  R
                  :  a  gU R                  X5      $ r?  )r_  r   r1  r   r  max_horizontal_fusion_sizer  r  s      rW   can_fuse_horizontal!CppScheduling.can_fuse_horizontal  sj    %"3"3"5"5!"S):%;;jj334 --e;;rV   r  r  c                    UR                  5       =n(       a  [        UR                  [        R                  5      =(       a    [        UR
                  [        R                  5      =(       a]    [        UR
                  R                  5      S:H  =(       a4    UR
                  R                  S   R                  5       UR                  :H  $ g)Nr5   r   F)get_template_noder   layoutr   MultiOutputLayoutr  MultiOutputr   inputsr  r   )r4  r  r  template_bufs       rW   can_fuse_multi_outputs_template-CppScheduling.can_fuse_multi_outputs_template  s     !2244<4<..0D0DE Iuzz2>>:I

))*a/I JJ%%a(113|7H7HH	 rV   c                 :   Sn[        S X4 5       5      (       d  U$ [        U[        5      (       a  UR                  5       S   OUn[        U[        [
        45      (       d   e[        U[        5      (       a  UR                  5       S   OUn[        U[        [
        45      (       d   eUR                  u  nu  pxUR                  u  nu  pUS:X  a  U	S:X  a  US:w  a  U
S:w  a  U$ [        S X4 5       5      (       a(  UR                  UR                  :X  a  UR                  $ U$ [        [        U5      [        U	5      5      nUS:  aP  US U U	S U :X  aD  [        S X4 5       5      (       a*  [        U5      [        L a  UOUnUR                  U:X  a  U$ U$ U$ U$ )Nr   c              3   \   #    U  H"  n[        U5      [        [        [        4;   v   M$     g 7fr  )r  r  r"   r$   r  s     rW   r  =CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>  s.      
 ' J+-?OP&r!  r  rU   c              3   D   #    U  H  n[        U5      [        L v   M     g 7fr  r#  r  s     rW   r  r    r$  r%  r5   c              3   D   #    U  H  n[        U5      [        L v   M     g 7fr  r#  r  s     rW   r  r    s      FTdT
99nr%  )r'  r   r  r)  r"   r$   r  r+  r^   r   rf   r  )r4  r  r  DISABLE_OUTER_LOOP_FUSION_node1_node2r   rd  re  rf  rg  r+  _compare_nodes                rW   rc  *CppScheduling._get_outer_loop_fusion_depth  s   $%! 
 
 
 

 -, %!<== !!#B' 	
 &#5}"EFFFF %!<== !!#A& 	
 &#5}"EFFFF$llE$llEB;5B;7b=W],,Te^TTT 00E4Q4QQ -- /
 #&c%j#e*"=#q(../59Q:Q3RR GLn   "%[,GGEU  !88<SS2244 /.((rV   c                 T   UR                  5       (       + =(       a    UR                  5       (       + =(       aq    UR                  5       UR                  -  =(       aM    U R                  X5      =(       a    UR	                  5       (       + (       + =(       a    U R                  X5      S:  $ r   )r_  get_operation_names	ancestorsr  r5  rc  r  s      rW   rb  *CppScheduling.can_fuse_vertical_outer_loop  s    !!## E%%''E))+eoo=E ..u< -**,,E 11%?1D		
rV   c                 2    U R                  X5      (       a  ggr(  )rb  r  s      rW   get_fusion_pair_priority&CppScheduling.get_fusion_pair_priority(  s    ,,U::rV   c                 :   UR                  5       (       a  gUR                  5       (       a,  [        X/5      u  p4UR                  5       (       + =(       a    U$ U R                  X5      =(       a    UR                  5       (       + =(       d    U R	                  X5      $ r?  )r_  rM   r5  r  rb  )r4  r  r  template_fusion_supportedr   s        rW   can_fuse_verticalCppScheduling.can_fuse_vertical/  s    +Sw,(% ))++I0II**58UASASAU=U=..u<	=rV   r  c                 N  ^^^^^ [        S U 5       5      (       a  U$ SmSmSnSnSnSnU GH  n[        UR                  [        R                  5      (       d   eUR                  R                  5       u  pxnUR                  R                  5        GH6  u  mn	U	R                  [        5       GH  m[        U4S jUR                   5       5      (       a  TU:w  a  TnUS-  nUS:  a  Us  s  s  $ [        TR                  S   [        R                  R                  R                  5      (       d  M  TR                  S   UR                  ;   d  M  Tc  M  [!        UU4S jUR                  R                  5        5       5      (       d  M  TR                  S   S:  d  M  TR                  S   mTR                  S   mS	nUnGM     GM9     GM     U(       d  U$ SmUUU4S
 jn
U H  nXe:X  d  M
  UR#                  U
S9  M     U H  nXe:w  d  M
  UR#                  TU
S9  M     U$ )a  
Apply loop split optimization.
When one of the indexing_exprs contains a division, we eliminate the division by splitting the loop
to avoid non-contiguous loads, subject to the following conditions:
    1. No reduction and no mudular index for all nodes.
    2. The indexing_exprs of all nodes contain only one (or more, but all the same) division,
       where the divisor is an integer and not too small (the divisor > 8), the dividend is
       one of the iter_vars, and this var, i.e. the dimension that needs to be split, is
       contiguous in all other indexing_exprs.

For example, if the node's var_ranges: {z0: 2, z1: 9216, z2: 960} and indexing_exprs:
{'index0': 8847360*z0 + 960*z1 + z2, 'index1': 32*z0 + (z2//30), 'index2': z2},
we will split z2 -> 30*z2 + z3, then the node's var_ranges will be changed to
{z0: 2, z1: 9216, z2: 32, z3: 30} and indexing_exprs will be changed to
{'index0': 8847360*z0 + 960*z1 + 30*z2 + z3, 'index1': 32*z0 + z2, 'index2': 30*z2 + z3}.
c              3      #    U  H^  n[        UR                  S    S    5      S:g  =(       d4    [        S UR                  R                  R                  5        5       5      v   M`     g7f)r5   r   c              3   J   #    U  H  oR                  [        5      v   M     g 7fr  )r   r   )r  r  s     rW   r  9CppScheduling.try_loop_split.<locals>.<genexpr>.<genexpr>Q  s      6Xd))6Xs   !#N)r   r  rf   r  rV  r  r  s     rW   r  /CppScheduling.try_loop_split.<locals>.<genexpr>O  sd      

 	 

1a !Q&  6:jj6O6O6V6V6X  s   A&A(Nr   Fc              3   F   >#    U  H  nTR                  U5      v   M     g 7fr  )r   )r  r   div_exprs     rW   r  r  e  s     Q9P#HLL--9Pr  r5   c              3   p   >#    U  H+  u  pUT:w  d  M  [        UTR                  S    5      S;   v   M-     g7f)r   r+  N)r	  r  )r  name_expr_r  r   s      rW   r  r  p  s:       0T$} S/x}}Q7GHFR0Ts   6#6r0  Tc                   > U u  p4Uu  pVUR                  T5      nUR                  5       nX7   T-  X'   UR                  US-   T5        [        R                  " XSS9u  u  pnU	R                  5       nUR                  US-   5      nTX   -  U-   X'   [        R                  " XU/XU5      nT(       d/  UR                  [        UR                  R                  5       5      4mX4UX44$ )Nr5   r(  )r  )r   copyinsertr
   index_vars_no_squeezepopr   r   rv  r(  rV  r  )r  r  r  
index_sizereduce_sizer  reduce_vars	split_idxnew_index_sizenew_index_varsr   rv  	iter_varsdivisor_varr]  split_number	split_vars                 rW   
loop_split0CppScheduling.try_loop_split.<locals>.loop_split  s   &+#J&*#J"((3I'__.N(2(=(MN%!!)a->.:.P.PC/+^ '++-I#--	A6K#/)2F#F#TI ;;+.
KD .OO,,3356.*
  -- rV   )recompute_sizes_body_func)r]  r  )rf   r   r  r   rT  rU  rV  r  findr   r  r  r   corenumbersrX  r'  ra  )r4  r  num_div	div_expr_	match_divmatched_noder  r   original_bodyr  r  r  r]  r   r  r  s              @@@@@rW   try_loop_splitCppScheduling.try_loop_split<  s   &  

 
 
 
 L			Ddii):):;;;;"&))"B"B"DAa+::@@B
d $		( 3HQ9P9PQQQ$	1$,	1{$"8==#3UZZ5G5G5O5OPP$MM!,0G0GG ,  0=0L0L0R0R0T   
 %MM!,q0$,MM!$4	'/}}Q'7$(	'+/ !4 C < L%)"	: D#,,z,R  D#,,/I.8 -   rV   r  c                 n  ^^^	 U R                   m[        R                  n/ m/ m	[        U[        5      (       d   eS[        4UUU	4S jjnU" U5      (       d  U[        l        TR                  5         T	R                  5         [        R                  R                  R                  SS9   UR                  5        H]  n[        U[        [        45      (       d   eUR                  5       n[        T5      nUR                  U5        TR!                  Xe5        M_     SSS5        gg! , (       d  f       g= f)z
Generate the code for the outer loop fused scheduler node.
1. Codegen with fused outer loop: depends on the analysis of
    the outer loop fused scheduler node, with or without the local buffer.
2. If failed, fallback to standard codegen.
r  c           	      	  >^ ^^^^^ [        T [        5      (       d   eTR                  5         TR                  5         S[        4S jm/ n0 m[	        UU 4S jT R                  5        5       5      (       Ga  [        [           " 5       mT R                  5        GH  m[        T[        5      (       d   eTR                  TR                  5       5        TR                  5       (       d  [        TR                  5       5      S:w  a  Mn  TR                  5       S   m[	        U 4S jTR                   5       5      (       d  M  TR                   n[        U["        R$                  5      (       d   eUR'                  5       nT R(                  [        T" T5      5      -
  nUU4S jnUR+                  5       (       a  U" 5       (       d  GM-  ["        R,                  " UR.                  UR0                  UR2                  US UR4                  US 5      nUU4S	 jnS
nU" Xa5      n	U	(       dC  ["        R6                  " U S[        U5       3US9n	UR9                  U	5        / TU	R:                  '   TU	R:                     R9                  U5        GM     [=        TR>                  5       n
[        U5      S:  a7  U H1  nUR:                  c   eU
RA                  UTUR:                     5        M3     T R                  5        Hz  n[        U[B        [        45      (       d   e[E        T5      nURG                  UR                  5       5        TR9                  U5        TR9                  UR                  5       5        M|     T RI                  TT R(                  5      (       dF  U
RJ                   H,  n[L        RN                  RJ                  RQ                  U5        M.      SSS5        g[R        RT                  R9                  [R        RV                  " [        T5      [        U
RX                  5      S95        T R[                  T5      nTR]                  U/ [^        R`                  Rc                  T5      Q5        SSS5        g! , (       d  f       g= f)z6
Codegen code with fused outer loop and local Buffer.
r  c                     [        U [        [        45      (       d   eU R                  5       n[	        US S9R
                  u  nu  p4[        U5      [        U5      -   nU$ )Nc                 4    [        U R                  5       5      $ r  )rm   r5  r  s    rW   ru  ~CppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.get_call_ranges.<locals>.<lambda>  s    Q^^-=)>rV   r  )r   r$   r"   r1  r_   r  r   )r  r  r   r  r  r,  s         rW   get_call_rangeslCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.get_call_ranges  sa    !$8J(KLLLL-1^^-=.1>/% ,+E $ElU?-CC""rV   c              3   f   >#    U  H&  n[        T" U5      5      TR                  S -   :H  v   M(     g7f)r5   N)r   r+  )r  r6  r  r  s     rW   r  fCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.<genexpr>  s2      3E OE*+t/K/Ka/OO3s   .1r5   r   c              3   \   >#    U  H!  oR                   TR                  5       ;   v   M#     g 7fr  )r  r1  )r  r  r  s     rW   r  r    s#      BX$		T^^%55BXs   ),c                  N  >^^ SmSn [        TR                  R                  R                  5       5       H  u  pTX-  -  mX-  n M     TR                  R	                  TR                  5       5      nU4S jmT" U5      =(       a     [        UU4S jTR                   5       5      $ )Nr   r5   c                    > U T:H  $ r  rU   )r  contiguous_index_exprs    rW   is_contiguous_indexږCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous.<locals>.is_contiguous_index  s    '(,A'A ArV   c              3      >#    U  H_  n[        UR                  [        5      =(       a9    T" UR                  R                  R	                  TR                  5       5      5      v   Ma     g 7fr  )r   r  r$   r  get_read_exprr  )r  r  r  scheduler_buffers     rW   r  ڌCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous.<locals>.<genexpr>  s^      Q -CD !+499m D !"$7$(IIOO$A$A(8(A(A(C%&%"!" -Cs   A'A*)r  r  rv  r  get_write_exprr  r'  r  )r  r   rQ  write_index_exprr  r  r  ro  s       @@rW   is_all_write_read_contiguousyCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous  s    451%&F.6 . 4 4 ? ? E E G/
 !6 E 5 &	/
 0>/C/C/R/R 0 9 9 ;0,B $77G#H $S Q -=,B,BQ N rV   Nc                    > U H?  nXR                   :X  d  M  [        U4S jTUR                      5       5      (       d  M=  Us  $    g )Nc              3      >#    U  Hb  nUR                   c  M  [        U4S j[        R                  R                  R
                  UR                      R                   5       5      v   Md     g 7f)Nc              3   ^   >#    U  H"  nUR                   R                  5       T;   v   M$     g 7fr  )r  r  )r  r  visited_scheduler_nodess     rW   r  ڐCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer.<locals>.<genexpr>.<genexpr>  s/      (&50D )-		(:(:(<@W(W50s   *-)r   r'  r4   r  r&  name_to_bufr  )r  global_bufferr  s     rW   r  چCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer.<locals>.<genexpr>  si      S":& (5'9'9%&C (&45GG4E4E4Q4Q,9,>,>5**/%50(& %& %&:&s   A-AA-)r  r'  r   )local_buffer_layoutr  	local_buflocal_to_global_buffersr  s      rW   try_share_local_buffersCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer  sR    -:	#6:J:J#Js S" :Q(1:&S" P" P" ,5$4 .; $(rV   local_buffer_datar   )r   r  F)local_buffer_numberT)2r   r  clearr   r'  r)  r   ro   r1  r$   r  r  r5  r   get_outputsr  r  r   rT  r  r+  is_contiguousFixedLayoutr9  r   rB  r  Bufferr  r   rJ   r  add_local_bufferr"   r  r  r]  r  r4   r  remover   !cpp_outer_loop_fused_inner_countsCppOuterLoopFusedCountr  ri  finalize_kernelrO  rP  from_iterable)r  r  r  global_buffer_layoutsize_offsetr  r  r  local_buf_prefixlocal_buffer_usedscoper  r6  r[  removed_bufferouter_fusion_cpp_kernel_proxyr  r  r  ro  r  rY  ra  
nodes_lists   `               @@@@@rW   $try_outer_loop_fusion_with_local_bufSCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf  s    d$?@@@@!'')#&7 # .0MBD# !113   +5S/*;'&*nn&6N%nmDDDD+//0G0G0IJ&3355~99;<A '5'A'A'CA'F$ BRBXBX   )9(=(=)-9J9JKKKK/</G/G/I,&*&B&BS+N;F '4 1>>@@ < > >$.0nn077066055klC077E	/+(" ,?(,B/-)  102		(8'93};M:N%O':1- *001BCNP34E4J4JK/0A0F0FGNN)s '7z $L$5$56%}%)(5+00<<<..(*A,BSBS*T )6
 "113E%e.@--PQQQQ'5l'C$$225??3DE)001AB%%eoo&78 4 >>)4+G+G  +0*?*? //66~F +@ !+ 76, 99@@2212,/0C0C,D 150O0O)1- ,,1@ioo33J?@? 7H I 76H s   !D3Q6BQ66
RFr  N)ra  r   r  r   r  r  r   r  r   r  r)  r"   r$   r1  r  r  r  )
r4  r  r  r  r6  _nodesr[  rY  ra  r  s
          @@@rW   codegen_outer_loop_node%CppScheduling.codegen_outer_loop_node  s    (()0)O)O&6802
$ ;<<<<_	7R _	 _	B 4D995SG2!'') ''--e-D!113E%e.@--PQQQQ27//2CF'5l'C$$226: 001AJ 4 ED : EDs   *A2D&&
D4c                 r   U R                   n[        U[        5      (       a  U R                  U5        ONUR	                  5       nU R                  U5      n[        U5      nUR                  U5        UR                  XC5        U R                  5       nU[        R                  :  a  U R                  S5        gg)z3
Turn an set of pre-fused nodes into a C++ kernel.
TN)ra  r   r  r  r1  r  r  r  r  _get_scheduled_num_argsr8  MAX_FUSED_KERNEL_ARGS_NUMrF  )r4  r  ra  r  r[  args_nums         rW   codegen_nodeCppScheduling.codegen_nodel  s     ((d788((.)-)9E''.E-l;**51(()9A//1m===""4( >rV   c                 x    [        U[        5      =(       a$    [        UR                  [        R                  5      $ r  )r   r$   r  r   CppTemplateBuffer)r4  r  s     rW   is_cpp_templateCppScheduling.is_cpp_template  s,    $. 
:IIr++4
 	
rV   template_nodeepilogue_nodesprologue_nodesc                 <   U(       a   eU Vs/ s H"  n[        U[        [        45      (       d  M   UPM$     nn[        S   S==   S-  ss'   [        S   S==   [	        U5      -  ss'   U R                  U5      (       d   S5       e[        [        U5      nUR                  u  nu  pVUS:X  d   e[        [        R                  UR                  5      nU Vs/ s H  oR                  PM     n	n[        S U	 5       5      (       d   S5       eS	 n
U
" XqR                  U	5      nUR                  UUU	S
9u  pU   [        UR                  5      (       d  UR                  5         U H  nUR                  5         M     U" 5       nSSS5        [         R"                  " U5         U/UQnU R%                  WUUR&                  5      nSSS5        [        UR                  5      (       a  [	        UR(                  5      S:X  d   S5       eUR(                  S   R*                   H}  n[        UR                  [,        5      (       d   S5       e[        UR                  R                  [        R.                  5      (       d   S5       eUR                  R                  5         M     UR1                  WU5        [         R2                  =R4                  UR4                  -  sl        U R7                  5         gs  snf s  snf ! , (       d  f       GN~= f! , (       d  f       GNO= f)z7
Codegen a CPP template, possibly with fused epilogues
inductorcpp_templated_kernel_counterr5   cpp_epilogue_fusion_counterzlTemplate node passed to CppScheduler.codegen_template must be a SchedulerNode that wraps a CppTemplateBufferrU   c              3   V   #    U  H  n[        U[        R                  5      v   M!     g 7fr  )r   r   rT  )r  r  s     rW   r  1CppScheduling.codegen_template.<locals>.<genexpr>  s"     O=N:a!2!233=NrJ  z9Epilogue nodes must all be instances of ir.ComputedBufferc                    ^ T(       d  gU R                  5       U;   d   eXR                  5          R                  n[        U4S jU 5       5      (       + $ )NFc              3      >#    U  H?  n[        UR                  [        5      =(       a    UR                  R                  T;   v   MA     g 7fr  )r   r  r   )r  r  r  s     rW   r  ZCppScheduling.codegen_template.<locals>.template_buffer_has_other_users.<locals>.<genexpr>  s@       "D 499&78 5IINNn45!s   AA
)r  r  r'  )template_bufferoutputs_by_namer  r  s     ` rW   template_buffer_has_other_usersGCppScheduling.codegen_template.<locals>.template_buffer_has_other_users  sZ     ""++-@@@#$<$<$>?EEE  "   rV   )$flag_template_buffer_has_other_usersr  NzSMulti outputs template should be with 1 output template buffer of MultiOutputLayoutr   z?Multi outputs template should be with ExternKernelSchedulerNodez7Multi outputs template has multi users with MultiOutput)r   r$   r"   r   r   r  r   r  r   r  r  r'  r   make_kernel_renderr)   r  r4   set_kernel_handlerdefine_kernelr  outputsr  r    r  call_kernelr  r  free_buffers_in_scheduler)r4  r  r  r  epilogue_noder   rnumelctbr  epilogue_ir_nodesr!  r#  re  renderr  src_codenode_schedulekernel_namer  s                      rW   codegen_templateCppScheduling.codegen_template  s    "!!
 "0
!/--9K)LM !/ 	 
 	;<A<:;s>?RR;##M22 	
z	
2 ]M:&,,;A||$()=)=}?Q?Q$R*;
*qFFN 	 ;
 O=NOOO 	
G	
O	 0O..0A0
, //1U, 0 

 ,]-?-?@@&&(& 'xH  !!&)*<^<M,,X}fkkRK * %]%7%788 },,-2 e2 &--a066!$))-FGG UG "$)).."..AA MA 		""$ 7 	;,	6#9#99&&(S
 ;
: V *)s)   K0K0K58AK:!#L:
L	
Lc                 6    U R                   R                  5       $ r  )ra  get_num_argsr:  s    rW   r
  %CppScheduling._get_scheduled_num_args  s      --//rV   c                     U R                   $ r  rE  r:  s    rW   ready_to_flushCppScheduling.ready_to_flush  s    ###rV   c                     g r  rU   r:  s    rW   codegen_syncCppScheduling.codegen_sync  s    rV   c                    [         R                  R                  n[        R                  R
                  (       a$  [        U[        R                  R
                  5      OSnSR                  SXTR                  5       /5      n[         R                  R                  (       a  UOSnUR                  [        [        R                  5      U5      nUR                  [        [        R                  5      U5      nUR                  SS5      nUR                  S5      nUR!                  SU5      n	XU	S	-     S
3n
[#        5       nUc  U R$                  R&                  OUnUR)                  5       u    p[         R                  R                  (       d  UR+                  SU< S35        UR-                  USS9  [         R                  R                  (       d  UR+                  S5        UR/                  UUR1                  5       SU
S9  U$ )NrR   r   r  re  z#pragma CMTz//z
extern "C"r   r5   z;
zasync_compile.cpp_pybinding(z, '''T)stripz''')F)gpucpp_definition)r4   r  wrapper_coder   r  descriptive_namesr'   r  next_kernel_suffixcpp_wrapperr  ro   r,   KERNEL_NAMEDESCRIPTIVE_NAMErfindr  r=   ra  r  cpp_argdefsr   rH  r&  getvalue)r4  r/  r  kernel_argsr  
fused_namer1  kernel_decl_name
first_char	last_charkernel_definitioncompile_wrapperr  r   	arg_typess                  rW   r&  CppScheduling.define_kernel  s   ''&& zz++ "%)E)EF 	
 hhz3M3M3OPQ*+''*=*=;8##C(?(?$@BRS##C(D(D$E{S ##M48 ^^L1
MM#z2	'Y]CDCH(*)4)<t  %%+**,1ww""%%(DYMQV&WXxt4ww""%%f-$$&,	 	 	
 rV   c                 :   U R                   R                  5       nU(       aY  U R                  XR                   R                  5      nU R                   R	                  [
        R                  R                  U5        U R                  5         U R                  S5        g r?  )
ra  codegen_groupr&  scheduled_nodesr(  r4   r  rA  r@  rF  )r4  r/  r1  s      rW   flushCppScheduling.flush  st    $$224,,++;;K ))!''*>*>L!u%rV   )rA  ra  r  )2r  r  r  r  r  r   r6   INPLACE_BUFFERSREDUCE_TO_SINGLE_ELEMENTr;  rk  r   r9  r<  r3  rk   rF  rK  r@  r,  r   r0  r`  rn  r  r  r   r  rc  rb  r  r  r(  r$   r  r  r  r	   r"   r  r  r   r2  r
  r8  r;  r&  rV  r  rl  rm  s   @rW   r8  r8    su    !$!**33	
 $%,, $:n;U $ $%& &N*P=dx8H/I 6p>	<
&
/@
	
4)l

=mD$7 m^K)KB)/1C]RS),
$5 
$ 

U)(U) !!23U) !!23	U)n0$#J& &rV   r8  c                   R   ^  \ rS rSrU 4S jrS rS rS rS
S\4S jjr	S r
S	rU =r$ )rN  i  c                   > [         TU ]  5         [        5       U l        [	        5       U l        [        U R
                  5      U l        [        R                  " 5       U l
        U R                  R                  U R                  5        / U l        g r  )r2  r3  r?   r  r7   
loops_codeWorkSharingrf  r   r   r   r   rU  r  s    rW   r3  KernelGroup.__init__  s^    L	&.doo.))+


  )!rV   c                 :    U" U R                   [        5       /UQ76 $ r  )r  r+   )r4  r*  r  s      rW   r  KernelGroup.new_kernel!  s    49924<t<<rV   c                     U =R                   U-  sl         U R                  nU R                  nUR                  X45        g r  )rU  r\  rf  r  )r4  r  r  r   rf  s        rW   r  KernelGroup.finalize_kernel$  s3    %WW  *rV   c                 V    U R                   R                  5       u  pn[        U5      nU$ r  )r  rH  r   )r4  arg_defs
_call_args
_arg_typesr  s        rW   r5  KernelGroup.get_num_args*  s'    +/99+@+@+B(jx=rV   r   c           	      j   U R                   R                  5         U R                  (       d  g[        5       n[        R
                  R                  =(       a    [        R                  S;   nU(       a  UR                  S/5        UR                  [        R                  " 5       5        Uc  [        [        R                  5      OUnUc  [        [        R                   5      OUnU R"                  R%                  5       u  n  nSR'                  S5      R)                  U5      n[+        5       nUR                  SU SU SU S	35        UR-                  5          U(       aH  [.        R0                  R2                  n	U	b  S
[        U	5      -   S-   OSn
UR                  SX-    S3/5        U R"                  R5                  5        H  u  pUR                  SU SU S35        M     UR7                  U R8                  5        S S S 5        UR;                  5       $ ! , (       d  f       UR;                  5       $ = f)NrR   )linuxrP   z!#include <ATen/record_function.h>z,
   zextern "C" z void r$  r   graph_r   zRECORD_FUNCTION("z#", c10::ArrayRef<c10::IValue>({}));r   r   r   )r   r  rU  r7   r   r  enable_kernel_profilesysplatformr   r   r   
cpp_prefixro   r,   rE  rF  r  rH  ljustr  rX   r   r4   r  graph_idaliasesrH  r\  rI  )r4  r   r   rl  rL  r1  rd  r   func_export_declrq  r  oldnews                rW   rT  KernelGroup.codegen_group/  s   

##~ !'

 @ @ !
S\\ V
 F
 !OO@ABy++-. <@<3{667T;?<c+667T..0!Q;;r?''113*+62B1C1XJaP	

 [[]$77++;C;OCM1C7UW+F,@+AAfg
 !II--/se3se156 0KK(  }} ] }}s   B&H
H2c                 `    U R                   R                  5       u  p4nUR                  X$SUS9  g )NF)tritonrQ  )r  rH  generate_kernel_call)r4  r  r1  r   	call_argsrQ  s         rW   r(  KernelGroup.call_kernelW  s3    "&))"7"7"9i$$5I 	% 	
rV   )r  r\  rU  r   rf  r  )r  r  r  r  r3  r  r  r5  ro   rT  r(  r  rl  rm  s   @rW   rN  rN    s.    "=+
&# &P
 
rV   rN  c                   8    \ rS rSrS rS rS rS rS rS r	Sr
g	)
r]  i^  c                 `    Xl         SU l        S U l        [        R                  " 5       U l        g r?  )r   in_parallelr=  r   r   r   )r4  r   s     rW   r3  WorkSharing.__init___  s'    	 ))+
rV   c                    U R                   (       a  XR                  :w  a  U R                  5         U R                   (       d  Xl        SU l         [        R                  R
                  (       a  U R                  R                  S5        OU R                  R                  SU S35        U R                  R                  U R                  R                  5       5        U R                  R                  S5        g g )NTz#pragma omp parallelz!#pragma omp parallel num_threads(r   zint tid = omp_get_thread_num();)r~  r=  r  r   r  rE  r   r   r   r   r   )r4  r  s     rW   r  WorkSharing.parallele  s    +;+; ;JJL&#Dzz))		##$:;		##&GyPQ$RSJJ$$TYY%5%5%78II1  rV   c                 r    U R                   (       a  U R                  R                  S5        U R                   $ )Nz#pragma omp single)r~  r   r   r:  s    rW   r  WorkSharing.singleu  s*    II 45rV   c                 F    U R                   R                  5         SU l        g r?  )r   r  r~  r:  s    rW   r  WorkSharing.closez  s    

 rV   c                 :    U R                   R                  5         U $ r  )r   rz  r:  s    rW   rz  WorkSharing.__enter__~  s    

rV   c                 <    U R                   R                  XU5        g r  )r   r  r}  s       rW   r  WorkSharing.__exit__  s    

Hv6rV   )r   r~  r=  r   N)r  r  r  r  r3  r  r  r  rz  r  r  rU   rV   rW   r]  r]  ^  s     ,  
!7rV   r]  c                      \ rS rSr% Sr\\R                     \S'   Sr	\\R                     \S'   \R                  R                  r\R                  \S'   \R                  R                  r\R                  \S'   \R                  R                  r\R                  \S'   Sr\\S	'   S
r\\S'   S
r\\S'   S
r\\S'   S
r\\S'   S rS rS rSrg)r  i  Nr   rB  rC  r  rD  r   r  Fsimd_ompsimd_vec	collapsedr5  c                 v    [         R                  " 5       nU(       a  UR                  5       U l        g SU l        g r`  )r   r#  r/  simd_nelements)r4  rg  s     rW   __post_init__LoopLevel.__post_init__  s-     .9-E-E-GAO>#;#;#=UVrV   c                    [         R                  " U5      n[        U R                  U R                  5      nX#l        SUl        [        UR                  U5      U-  Ul        U R                  Ul	        SUl
        U R                  Ul        U$ )NTF)r   rX  r  r   rB  rD  r  r   r  r  r  r5  )r4  r  sympy_factorr  s       rW   r  LoopLevel.tile  sl    }}V,499-!
"499l;lJ --rV   c                    [        U R                  5      n[        U R                  5      n[        R                  R
                  (       a  X:X  a  g U R                  (       a   U R                  S:  a  SU R                   S3OSnU R                  (       aL  SnU R                  S:  a  USU R                   S3-  nU R                  (       a  UR                  SSU 35      nO[U R                  (       a  SnOGU R                  (       a  S	U 3nO0U R                  (       d  [        R                  " 5       (       a  S
nOSn[         SU R                   SU 3nU R                   SU 3nU R                   R"                  (       a%  U R                   S[        U R                   5       3nO;U R                   S[        U R                   5       S[        U R                   5       S3nSU SU SU S3nU R$                  (       d  U(       d  U/$ XH/$ )Nr5   zsimd simdlen(z) rR   z#pragma omp forz
 collapse(r   z for z#pragma omp z#pragma GCC ivdepr   r`  <r_  z+=(z == 0 ? 1 : zfor(r   )rE   rC  rB  r   r  no_redundant_loopsr  r  r  r  r  r5  r   ra  rI   r   rD  r  r  )	r4  offset_expr	size_exprsimdline1
offset_strr  	steps_strline2s	            rW   r  LoopLevel.lines  s   !$++.		*	::(([-E }}!4!4q!8 D//03 	
 ==%E}}q :dmm_A66}}gtf~>]]E]]"4&)E""{'9'9';';'EE"|1TXXJa}=
hhZq,::88*B{4::'>&?@I
 88*CDJJ 78 9"4::./q2  zl"XJb1=>>7N~rV   )r  )r  r  r  r  r   r   r   r  r  rB  r   r   rC  r  OnerD  r  rm   r  rk   r  r  r5  r  r  r  r  rU   rV   rW   r  r    s     $C%**	$!%D(5::
%FEJJ% #WW\\J

)E5::#HcHdHdItL$
W	'rV   r  c                       \ rS rSr% SrSr\\\      \	S'   Sr
\\   \	S'   \S\4S j5       rS r\S 5       rS	 rS
 rS\4S jrS rS\4S jrSrg)rP  i  a>  
A loop-nest-like structure. It is built with the `build` method
as a loop nest and then will perform loop-tiling at some depth.

A typical case is for vectorization, where we typically do loop-tiling
at the innermost loop level. A more complicated case is when we do
2D tiling at both the innermost and outer levels.
NrK  re  c                 0   U R                   nU R                  nU R                  nUc   eSn[        [	        X5      5       HG  u  nu  pg[        Xg5      nU(       d  U/nOUR                  U5        XS:  d  M6  U R                  Ul        MI     [        U5      n	U	$ )z4Build a LoopNest with the given `kernel` as the leafN)	r  rW  r-  r   rI  r  r  r5  rP  )
re  r  rW  r-  rK  loop_idxr   rB  r  rR  s
             rW   r  LoopNest.build  s     ?? 00***+/%.s8/D%E!HksS'DT"*$*$7$7! &F UO	rV   c                 ,    [        U R                  5      $ r  )rk   rK  r:  s    rW   __bool__LoopNest.__bool__  s    DJJrV   c                    U R                   c
  [        SSS9$ SnSnU R                   S   R                  n[        R                  " S5      nU R                    H(  nUR                  U:w  a    OXER
                  -  nUS-  nM*     U[        U R                   5      :  a  [        U[        R                  5      (       a  [        U R                   U   R
                  [        R                  5      (       a  US-  U R                   U   R
                  :  ag  UnSnU R                   U   R                  n[        U[        U R                   5      5       H'  nU R                   U   R                  U:w  a    O	US-  nM)     [        X!S9$ )aY  
Maximal allowed depth for parallelism: All reduction or non-reduction levels.
When the range of the first inner loop beyond the maximum parallel depth is much
larger than the range of all outer loops within the maximum parallel depth,
change the starting depth of parallelism to the first inner loop and recalculate
the maximum parallel depth.
r   r  r5   rO  )	rK  r  r5  r   rX  rB  r   r   rQ  )r4  r  	max_depthr5  
loop_sizesr  r   s          rW   r  LoopNest.max_parallel_depth  s?    :: qAA	zz!}11]]1%
JJD  L0#ii/JNI	  DJJ':u}}554::i055u}}EES 4::i#8#=#==#KI::k2??L;DJJ8::a=--=Q	 9 IOOrV   c                    UR                   U R                  5       R                   ::  d   S5       eU R                  c   e[        U R                  5      UR                   :  d   eU R                  UR                     nUR                   Ul        UR                  (       a  [        =R                  S-  sl        [        UR                  S-   UR                   5       H  nSU R                  U   l
        M     g )Nz?Parallel depth cannot exceed the maximal allowed parallel depthr5   T)r  r  rK  r   r  r  r5  r   parallel_reduction_countrQ  r  )r4  r  r  r   s       rW   r  LoopNest.mark_parallel)  s    ''4+B+B+D+S+SS 	
M	
S zz%%%4::)":"::::zz)//0!00,,1,y,,q0)2J2JKA&*DJJqM# LrV   c                     U R                   (       d   eU R                   U   R                  U5      U R                   U'   U R                   U   $ )z
Do loop-tiling at the `depth` level with `factor`.
    for (x0 = 0; x0 < x0_end; x0++)
    ->
    for (x0 = 0; x0 < x0_end; x0 += factor)
See details in Note [tiled_size].
)rK  r  )r4  r  r  s      rW   r  LoopNest.tile6  sA     zzz JJu-226:

5zz%  rV   r   c                 @    U R                   (       d   eU R                   $ r  re  r:  s    rW   r  LoopNest.get_kernelB  s    {{{{{rV   c                     Xl         g r  r  r  s     rW   r  LoopNest.set_kernelF  s    rV   levelc                     U R                   (       d   e[        U R                   5      U:  d   eU[        U R                   5      :X  a  S OU R                   US  n[        X R                  5      $ r  )rK  r   rP  re  )r4  r  rK  s      rW   rc  LoopNest.from_loop_levelI  sT    zzz4::%'''TZZ0djj6H{{++rV   r  )r  r  r  r  r  rK  r   r(  r  r  re  r%  r  r  r  r%   r  r  r  r  r  rm   rc  r  rU   rV   rW   rP  rP    s     (,E8DO$+"&FHY&i  (  $P $PL+
!I ,S ,rV   rP  r  r  )r   dataclassesrS  rO  r1  rU  r   rm  r  collections.abcr   enumr   typingr   r   r   r   r	   r   r   torch.fxtorch._inductorr
   torch._prims_commonr   r   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   _dynamo.utilsr   rR   r   r   r   r   r   r   r  r   r&  r   r   r    r!   r"   r#   r$   utilsr%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   virtualizedr1   r2   r3   r4   commonr6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   	cpp_utilsrB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rn  rT   	lru_cacherX   _logginggetArtifactLoggerr  schedule_logNATIVE_OMP_RTYPESRTYPE_TO_CPPr  PYTHON_TO_CPPCONTAINER_PYTHON_TO_CPPr  r  r   r  rn   rk   r  r  rG  r  rv   r(  r   r  rw   r   r   r   r   r   r  ro   r   rm   r   r   r   r  r	  	dataclassr  r  ro  r  _initialize_pointwise_overridesr  r  r  r%  r  r!  r   r
  r  r  rb  r0  r8  rN  r]  r  rP  rU   rV   rW   <module>r     s-         	 
  $  7 7    ( @ / K K O O % G G        > =      $ llg% T: : ~~//*EBC   !   #&   
NN	MM 
MM	KK	NN	MM	JJ	KK	JJ	KK	KK
* T%++& 
 
KK	NN	MM	KK	JJ1 D- )D %)+) ELL!	+)\-
-jj- - 

	-
 jj- -`3#$  ;;	
 
sCxBV^ V3 V# V -UZZ -ell - - ;uzz ;

 ;PS ; ;| FJ!::!!LL!6>sm! !   ]!"4 ]!@! !Bk; k\  , ,U 3r7l r7j  / / 9  % % '7 7w wtk9 k\Y
l Y
x)$ )$eHU[[4I44O.P )$XbK bKJT	
Y T	
n 
9  
FFt FD&N D&ND
 D
N%7 %7P R R Rj p, p, p,rV   