
    sh#                   6   S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKJr  S SKJrJrJrJrJr  S SKrS SKJr  S SKrS SKrS SKJs  Jr  S SKJr  S SKJ r J!r!  S SK"J#r#  S S	K$J%r%  S S
K&J'r'  S SK(J)r)J*r*J+r+J,r,J-r-  S SK.J/r/  S SK0J1r1  S SK2J3r3  S SK4J5r5J6r6  SSK7J8r8J9r9J:r:  SSK;J<r<  SSK:J=r=J>r>  SSK?J@r@  SSKAJBrB  SSKJCrCJDrDJErEJFrFJGrGJHrHJIrI  SSKJJKrK  SSKLJMrMJNrNJOrOJPrPJQrQJRrRJSrS  SSKTJUrU  SSKVJWrWJXrXJYrY  \(       a  S SKZJ[r[J\r\  S SK]r]SSK^J_r_  \Q" 5       R                  ra\b\R                  \R:                  \d4   re\\:R                  \R4   rgSHS jrhSIS jriSJS  jrjSKS! jrkSLS" jrl\m\d\n4   ro\\b\\n\R,                  4   S#4   \\o/\b\nS#4   4   4   rp SM         SNS$ jjrqSOS% jrr\R                   " S& S'5      5       rt " S( S)5      ru " S* S+5      rv\R                   " S, S-\v5      5       rw\R                   " S. S/\v5      5       rx\R                   " S0 S1\v5      5       ry " S2 S3\v5      rz\R                   " S4 S5\v5      5       r{\R                   " S6 S7\{5      5       r|\R                   " S8 S9\{5      5       r}\R                   " S: S;\{5      5       r~ " S< S=\{5      r\R                   " S> S?\v5      5       r\R                   " S@ SA\5      5       r\R                   " SB SC\5      5       r\dr " SD SE\N5      r " SF SG\5      rg)P    )annotationsN)count)AnyCallableOptionalTYPE_CHECKINGUnion)Expr)dtype)countersdynamo_timed)DebugPrinterManager)MultiKernelState)	cache_dir)CallMethodKeyConvertIntKeyDivideByKeyresolve_unbacked_bindingsSymTypes)_get_qualified_name)
OrderedSet)SingletonInt)symbol_is_typeSymT   )async_compileconfigir)output_code_log)IRNodeReinterpretView)triton_heuristics)DeviceProperties)cache_on_selfget_benchmark_nameLineContextsympy_product	sympy_str
sympy_substriton_version_uses_attrs_dict)V   )ArgNameCodeGenDeferredLineIndentedBufferPythonPrinterWorkspaceArgWorkspaceZeroMode)cexpr)	config_ofshould_unwrap_unspec_argsignature_to_meta)IteratorSequence)GraphLoweringc                    [         R                  R                  U 5      nU R                  5       U R	                  5       [        [         R                  R                  R                  U5      5      4$ N)r+   graphget_allocation_storage_sizeget_device_or_error	get_dtyper(   sizevarssimplify)nodestorage_sizes     s/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/_inductor/codegen/wrapper.pybuffer_reuse_keyrF   T   sU    7766t<L  " 	!''""++L9:     c                   U R                  5       UR                  5       :w  a  gU R                  5       UR                  5       :w  a  g[        R                  R                  R                  [        R                  R                  U 5      5      n[        R                  R                  R                  [        R                  R                  U5      5      n[        U5      [        U5      :X  d`  [        R                  R                  R                  USU-  5      (       a/  [        R                  R                  R                  X25      (       a  gg)NFgffffff?T)
r?   r@   r+   r=   rA   rB   r>   r(   statically_known_geqstatically_known_leq)	input_buf
output_buf
input_sizeoutput_sizes       rE   can_match_buffer_sizerO   `   s     $$&**H*H*JJ
 4 4 66!!**	++I6J ''""++	++J7K 	*;!77 	
--k4*;LMMGG11+JJrG   c                   SSK JnJn  [        U R                  5      nUS:X  a4  U R
                  b!  U R
                  R                  (       a  SU S3$ SU S3$ X2;   a  X#   nU$ UR                  5        HS  u  pV[        R                  " US-   U5      n[        U5      S:X  d  M0  US   nX;   d   S	U S
U 35       eX(   n	U SU	 S3s  $    [        SU 35      e)Nr,   )CONTAINER_PYTHON_TO_CPPPYTHON_TO_CPPTensorzat::&z const&z\[([a-zA-Z_]+)]r   zunsupported z type in convert_arg_type: <>zunsupport python_type: )cpprQ   rR   repr	real_type
alias_infois_writeitemsrefindalllenAssertionError)
argrQ   rR   python_typecpp_typepy_containercpp_containercontainer_matchcontained_typecpp_contained_types
             rE   convert_arg_typeri      s   ; s}}%Kh>>%#..*A*A+a((+g..# - (?'D'D'F#**\4F%FT1$,Q/N!2 |n,GGWX2 "/!>#_A&8%9;; (G 2;-@
AArG   c                    [        U R                  5      nSSS.nUR                  US 5      nUc
   SU 35       eUS:X  a  U R                  b  US-  nU$ )Nz
at::Tensorzstd::vector<at::Tensor>)rS   zList[Tensor]zNYI return type: rS   rT   )rX   rY   getrZ   )retrb   python_to_cpprc   s       rE   convert_return_typern      sk    s}}%K1M
   d3HB#4[M!BB h3>>#=COrG   c                   U R                   R                  nU R                   R                  n[        U5      nUS:  d   S5       eUS:X  a  [	        US   5      nO7US:  a1  SR                  U Vs/ s H  n[	        U5      PM     sn5      nSU S3nU Vs/ s H  n[        U5       SUR                   3PM     nnW SSR                  U5       S	3$ s  snf s  snf )
Nr   z#must have at least one return valuer,   , zstd::tuple<rV    ())_schema	argumentsreturnsr_   rn   joinri   name)	kernelargsrv   num_returnscpp_return_valuertuple_returnsra   cpp_arg_types	            rE   get_cpp_op_schemar      s    >>##Dnn$$Gg,K?AAA?a.wqz:	q		7"K7a#6q#97"KL(q9EIJTc',-Qsxxj9TLJq<!8 9;;	 #L Ks   /C$C.c                  ^ ^^^ [        5       mSS jm  SUU4S jjnSSU UU4S jjjnST  3nU" SU S35        T(       a9  [        R                  R                  (       a  TR                  R                  5       O[        R                  " 5       nTR                  5          U   [        U5      S:X  a  U" US   5      u  pU" S	U 3S	U	 35        O[        U5      S:  d   e[        U5      [        U5      :X  d   e[        [           " 5       n
[        [        X!5      S
 SS9 H  u  pUR                  (       aF  UR                  R                  5        V Vs/ s H  u  pSU  SU 3PM     nn nSR                  U5      nOSnU" U5      u  pSU SU 3nX;   a  Mw  U
R!                  U5        U" USU SU	 35        M     S S S 5        S S S 5        UTR#                  5       4$ s  snn f ! , (       d  f       N.= f! , (       d  f       N7= f)Nc                p    [        U [        R                  5      (       a  U $ [        R                  " U 5      $ r<   )
isinstancesympyr
   Integer)items    rE   _convert_to_sympy_expr@user_defined_kernel_grid_fn_code.<locals>._convert_to_sympy_expr   s&    !$

33tLt9LLrG   c                  > Tb  [        U 5      (       a  X 4$ [        U4S jU  5       5      nTR                  U5      [        R                  R
                  (       a%  TR                  [        U4S jU 5       5      5      4$ S4$ )z
This function return a tuple of two values: the first one is for the real grid
which is used in the generated code; the second one is an example grid with
concreate values which is used in the autotune block to run the generated
kernels at compile time.
Nc              3  4   >#    U  H  nT" U5      v   M     g 7fr<    ).0gr   s     rE   	<genexpr>Kuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>   s     Cd1!44ds   c              3  Z   >#    U  H   nTR                  U[        U5      5      v   M"     g 7fr<   generate_example_arg_valuetype)r   r   wrappers     rE   r   r      s,      !+A  ::1d1gFF!+   (+)callabletuplecodegen_python_shape_tupler   tritonautotune_at_compile_time)grid
sympy_gridr   r   s     rE   determine_grid8user_defined_kernel_grid_fn_code.<locals>.determine_grid   s     ?htnn:CdCC
..z: ==99 22 !+ 
 	
 
 	
rG   c                   > TR                  U 5        T(       aV  [        R                  R                  (       a6  TTR                  ;  a%  TR
                  R                  U=(       d    U 5        g g g g r<   )	writeliner   r   r   kernel_autotune_nameskernel_autotune_calls)lineexample_gridrx   outputr   s     rE   r   3user_defined_kernel_grid_fn_code.<locals>.writeline   sW    66G999))33L4HDI : 7 rG   grid_wrapper_for_def z(meta):r,   r   zreturn c                2    [        U S   R                  5      $ Nr,   r_   kwargsxs    rE   <lambda>2user_defined_kernel_grid_fn_code.<locals>.<lambda>      3qt{{3CrG   Tkeyreversezmeta['z'] == z and Trueif z	: return )r   Union[int, sympy.Expr]return
sympy.Expr)r   
TritonGridr<   )r   strr   Optional[str])r0   r   r   r   r   indent
contextlibnullcontextr_   r   r   sortedzipr   r\   rw   addgetvalue)rx   configsgridsr   r   r   fn_namekernel_autotune_calls_indentr   r   seencvalguards	statementr   r   s   `  `           @@rE    user_defined_kernel_grid_fn_coder      s    FM

 
8J J "$(GWIW%& v}}== 	%%,,.##% !
 
6u:?!/a!9Dv&',(@Au:>!>u:W---c?$D "E#)CT 88DEHHNNDTDTyt&fSE2DT   %\\&1F#F%3D%9"!&4&9	$#)s6()L>%JK 7< FOO%%%# 76s8   !G7$B2G&G *AG&>G7 G&&
G4	0G77
Hc                   ^^^^^ [        5       mTR                  U R                  SS9  SSKJm  SSKJm  [        U R                  /5      mUUUUU4S jmT" U 5        TR                  5       $ )z[
Given a triton kernel function pointer collect the transitive closure of
its dependencies
Tstripr   )JITFunction)	constexprc           	     .  > [        S [        R                  " U R                  5       5       5      nU R                  R                  R                  S0 5      nU R                  R                  R                   GH  nUT;   a  M  X0R                  R                  ;   d  M'  U R                  R                  U   n[        UT5      (       aV  T	R                  5         T	R                  S5        T	R                  UR                  SS9  TR                  U5        T" U5        M  [        U[        [        [         T
45      (       a  T	R                  5         [        UT
5      (       a  SUR"                  < S3nOU< nUR                  U5      =n(       aQ  [        U[$        5      (       a  SUR&                   S	UR(                   3nOSU< 3nT	R                  U U S
U 35        OT	R                  U S
U 35        TR                  U5        GM  X1;   d  GM  US:w  d  GM  [+        US5      (       d  GM  UR&                  R-                  S5      (       d  GM  T	R                  SUR&                   SUR(                   SU 35        TR                  U5        GM     g )Nc              3  ^   #    U  H#  nUR                   S :X  d  M  UR                  v   M%     g7f)LOAD_GLOBALN)opnameargval)r   insts     rE   r   ^user_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse.<locals>.<genexpr>4  s)      '
3{{m+ DKK3s   --__annotations__z@triton.jitTr   ztl.constexpr(rs   : . = tl
__module__r   zfrom z import z as )r   disBytecodefn__globals__rk   __code__co_namesr   newliner   splicesrcr   intr   boolvaluer   r   __name__hasattr
startswith)
cur_kernelunqualified_loadsglobal_annotationssymbol_namesymbol
symbol_str
annotationannotation_coder   compile_wrapperr   symbols_includedtraverses           rE   r   Kuser_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse/  sG   
 ' '
Z]]3'
 

 (]]66::;LbQ%==11::K..mm777#22;?fk22#++-#--m<#**6::T*B$((5V$c4(CDD#++-!&)44'4V\\4DA%F
(.z
%7%;%;K%HHzH%j$77"$Z%:%:$;1Z=P=P<Q R , 13:..AO'11*mO+<C
|L (11[MZL2QR$((54#t+55 ))44X>>
 $-- 1 12(6??:K4P[}] %((5[ ;rG   )
r0   r   r   r   r   triton.languager   r   r   r   )ry   r   r   r   r   r   s    @@@@@rE   9user_defined_triton_kernel_transitive_closure_source_coder      sd    
 %&O6::T2 #) "6??"3486 86t V##%%rG   c                  0    \ rS rSr% S\S'   S\S'   S rSrg)	SymbolicCallArgim  r   innerr   
inner_exprc                ,    [        U R                  5      $ r<   )r   r   selfs    rE   __str__SymbolicCallArg.__str__s  s    4::rG   r   N)r   r   __qualname____firstlineno__r   r  __static_attributes__r   rG   rE   r   r   m  s    JrG   r   c                  F   ^  \ rS rSrU 4S jrSS jrSS jrS	S jrSrU =r	$ )
MemoryPlanningStateiw  c                n   > [         TU ]  5         [        R                  " [        5      U l        SU l        g Nr   )super__init__collectionsdefaultdictlist
reuse_pooltotal_allocated_buffer_size)r  	__class__s    rE   r  MemoryPlanningState.__init__x  s-    ##D) 	 12(rG   c                L    [        U R                  R                  US 5      5      $ r<   )r   r  rk   )r  r   s     rE   __contains__ MemoryPlanningState.__contains__  s    DOO''T233rG   c                f    U R                   U   R                  5       nUR                  (       a   eU$ r<   )r  pop	is_reusedr  r   r   s      rE   r  MemoryPlanningState.pop  s+    s#'')>>!!rG   c                f    UR                   (       a   eU R                  U   R                  U5        g r<   )r  r  appendr  s      rE   pushMemoryPlanningState.push  s&    >>!!##D)rG   )r  r  )r   ReuseKeyr   r   )r   r"  r   FreeIfNotReusedLine)r   r"  r   r#  r   None)
r   r   r  r  r  r  r  r   r  __classcell__r  s   @rE   r
  r
  w  s    24
* *rG   r
  c                      \ rS rSrSrg)WrapperLinei  r   Nr   r   r  r  r  r   rG   rE   r(  r(        rG   r(  c                  >    \ rS rSr% S\S'   S\S'   S
S jrSS jrSrg	)EnterSubgraphLinei  PythonWrapperCodegenr   r:   r=   c                b    U R                   R                  U R                   R                  5        g r<   )r   push_computed_sizescomputed_sizesr  s    rE   __post_init__EnterSubgraphLine.__post_init__  s    (()D)DErG   c                n    U R                   R                  U R                  5        UR                  5         g r<   )r   push_codegened_graphr=   	do_indentr  codes     rE   codegenEnterSubgraphLine.codegen  s"    ))$**5rG   r   Nr   r$  r7  r0   r   r$  r   r   r  r  r   r1  r8  r  r   rG   rE   r,  r,    s    !!FrG   r,  c                  4    \ rS rSr% S\S'   SS jrS	S jrSrg)
ExitSubgraphLinei  r-  r   c                V    U R                   R                  5       U R                   l        g r<   )r   pop_computed_sizesr0  r  s    rE   r1  ExitSubgraphLine.__post_init__  s    &*ll&E&E&G#rG   c                X    U R                   R                  5         UR                  5         g r<   )r   pop_codegened_graphdo_unindentr6  s     rE   r8  ExitSubgraphLine.codegen  s    ((*rG   r   Nr:  r;  r<  r   rG   rE   r>  r>    s    !!HrG   r>  c                  4    \ rS rSr% S\S'   S\S'   S	S jrSrg)
EnterDeviceContextManagerLinei  r   
device_idxzOptional[int]last_seen_device_guard_indexc                   [         R                  R                  (       Ga  UR                  S5        [         R                  R                  (       aj  U R
                  c;  UR                  [         R                  R                  R                  5        S35        g U R
                  U R                  :X  d   S5       eg U R
                  cH  UR                  [         R                  R                  R                  5        SU R                   S35        g UR                  SU R                   S35        g UR                  S[         R                  R                  R                  U R                  5       S35        UR                  5         UR                  [         R                  R                  R                  U R                  5      5        g )	N
z) stream_guard(stream, this->device_idx_);z4AOTInductor only supports running on one CUDA devicez device_guard(z);zdevice_guard.set_index(with :)r+   r=   cpp_wrapperr   aot_moderI  
device_opscpp_aoti_stream_guardrH  cpp_aoti_device_guarddevice_guardr5  
set_devicer6  s     rE   r8  %EnterDeviceContextManagerLine.codegen  sM   77NN4 ww 44<NN77--CCEFFop  <<O NO 44<NN77--CCEFnUYUdUdTeegh NN%<T__<MR#PQ NNU177#5#5#B#B4??#S"TTUVWNNNN177--88IJrG   r   Nr;  )r   r   r  r  r   r8  r  r   rG   rE   rG  rG    s    O"//KrG   rG  c                      \ rS rSrSS jrSrg)ExitDeviceContextManagerLinei  c                d    [         R                  R                  (       d  UR                  5         g g r<   )r+   r=   rN  rD  r6  s     rE   r8  $ExitDeviceContextManagerLine.codegen  s     ww"" #rG   r   Nr;  r   r   r  r  r8  r  r   rG   rE   rW  rW    s    rG   rW  c                  >    \ rS rSr% S\S'   S	S jrS
S jrSS jrSrg)MemoryPlanningLinei  r-  r   c                    U $ )zFirst pass to find reuser   r  states     rE   planMemoryPlanningLine.plan  s    rG   c                    g)zSecond pass to output codeNr   r6  s     rE   r8  MemoryPlanningLine.codegen  s    rG   c                |   / n[         R                  " U 5       Hw  nUR                  S:X  a  M  [        XR                  5      nUR	                  UR                   SUR
                  [        R                  L a  UR                  5       OU 35        My     [        U 5      R                   SSR                  U5       S3$ )z6
Emits a string representation that fits on one line.
r   =rr   rp   rs   )dataclassesfieldsrx   getattrr  r   r   Bufferget_namer   rw   )r  rz   fieldr   s       rE   r  MemoryPlanningLine.__str__  s      ''-EzzY&$

+CKK::,a%**		2IsST	 . t*%%&a		$'8::rG   r   Nr_  r
  r   r\  r;  r   r   )	r   r   r  r  r   r`  r8  r  r  r   rG   rE   r\  r\    s    !!);rG   r\  c                  4    \ rS rSr% S\S'   SS jrS	S jrSrg)
AllocateLinei  
BufferLikerC   c           	        U R                   R                  5       [        R                  R                  ;   a  [        U R                  5      $ [        U R                   5      n[        R                  (       aH  X!;   aC  UR                  U5      nSUl        [        U R                  UR                   U R                   5      $ U R                   R                  5       R                  S:X  aj  U R                  R                  U R                   5      nUbB  U=R                   [#        [$        R&                  " [(        R*                  US5      5      -  sl        U $ )NTcpur,   )rC   rj  r+   r=   removed_buffersNullLiner   rF   r   allow_buffer_reuser  r  	ReuseLiner?   r   static_shape_for_buffer_or_noner  r   	functoolsreduceoperatormul)r  r_  r   	free_linestatic_shapes        rE   r`  AllocateLine.plan  s    99177#:#::DLL)) tyy)$$		#I"&IT\\9>>499EE99((*//58<<GG		RL'11S$$X\\<C6 1 rG   c                    U R                   R                  5       [        R                  R                  ;  d   eU R
                  R                  U R                   5      nUR                  U5        g r<   )rC   rj  r+   r=   rt  r   make_buffer_allocationr   r  r7  r   s      rE   r8  AllocateLine.codegen  sK    yy!!#177+B+BBBB||22499=trG   r   Nrm  r;  )r   r   r  r  r   r`  r8  r  r   rG   rE   rp  rp    s    
(rG   rp  c                  B    \ rS rSr% S\S'   SrS\S'   SS jrSS jrS	rg
)r#  i
  rq  rC   Fr   r  c                   [        U R                  R                  5       5      S:  a  U $ [        U R                  R                  [
        R                  5      (       a  U $ U R                  (       a   eU R                  R                  5       [        R                  R                  ;   a  [        U R                  5      $ [        R                  (       a%  UR!                  [#        U R                  5      U 5        U $ r  )r_   rC   get_inputs_that_alias_outputr   layoutr   MultiOutputLayoutr  rj  r+   r=   rt  ru  r   r   rv  r   rF   r^  s     rE   r`  FreeIfNotReusedLine.plan  s    tyy55781<Kdii&&(<(<==K>>!!99177#:#::DLL))$$JJ'		2D9rG   c                    U R                   R                  5       [        R                  R                  ;  d   eU R
                  (       d5  UR                  U R                  R                  U R                   5      5        g g r<   )	rC   rj  r+   r=   rt  r  r   r   make_buffer_freer6  s     rE   r8  FreeIfNotReusedLine.codegen  sR    yy!!#177+B+BBBB~~NN4<<88CD rG   r   Nrm  r;  )	r   r   r  r  r   r  r`  r8  r  r   rG   rE   r#  r#  
  s    
It
ErG   r#  c                  L    \ rS rSr% S\S'   S\S'   SrS\S'   SS jrSS	 jrS
rg)rw  i!  rq  rC   	reused_asTr   
delete_oldc                |   U R                   R                  5       [        R                  R                  ;   aM  U R
                  R                  5       [        R                  R                  ;   d   e[        U R                  5      $ U R
                  R                  5       [        R                  R                  ;  d   eU $ r<   )rC   rj  r+   r=   rt  r  ru  r   r^  s     rE   r`  ReuseLine.plan'  s    99177#:#::>>**,0G0GGGGDLL))~~&&(0G0GGGGrG   c                x   U R                   R                  5       [        R                  R                  ;  d   eU R
                  R                  5       [        R                  R                  ;  d   eUR                  U R                  R                  U R                   U R
                  U R                  5      5        g r<   )
rC   rj  r+   r=   rt  r  r   r   make_buffer_reuser  r6  s     rE   r8  ReuseLine.codegen.  sz    yy!!#177+B+BBBB~~&&(0G0GGGGLL**499dnndooV	
rG   r   Nrm  r;  )	r   r   r  r  r   r  r`  r8  r  r   rG   rE   rw  rw  !  s"    
J
rG   rw  c                      \ rS rSrSrg)ru  i6  r   Nr)  r   rG   rE   ru  ru  6  r*  rG   ru  c                  f    \ rS rSr% S\S'   S\S'   \SS j5       r\SS j5       r\SS j5       rS	r	g
)CommBufferLinei:  r-  r   	ir.BufferrC   c                    SSK Jn  U R                  R                  5       nU R                  R	                  5       nU" U5      (       a  [        SU R                   35      e[        U5      UR                  -  $ )Nr   )is_symbolicz-The size of a comm buffer can't be symbolic: )torch._inductor.utilsr  rC   	get_numelr@   r`   r   itemsize)r  r  numelr   s       rE   sizeCommBufferLine.size?  sd    5		##%		##%u ?		{K  5zENN**rG   c                    U R                   R                  5       n[        U[        R                  5      (       d   eUR
                  $ r<   )rC   get_output_specr   r   CommBufferLayoutcomm_buffer_typer  r  s     rE   r  CommBufferLine.comm_buffer_typeK  s9    **,&""5"56666&&&rG   c                    U R                   R                  5       n[        U[        R                  5      (       d   eUR
                  $ r<   )rC   r  r   r   r  
group_namer  s     rE   r  CommBufferLine.group_nameQ  s9    **,&""5"56666   rG   r   Nr   r   )r   zir.CommBufferTypern  )
r   r   r  r  r   propertyr  r  r  r  r   rG   rE   r  r  :  sG    !!
O	+ 	+ ' '
 ! !rG   r  c                  .    \ rS rSrSS jr\S 5       rSrg)CommBufferAllocateLineiX  c                &   U R                   R                  5       [        R                  R                  ;  d   eU R                   R                  5       nU R                   R                  5       nU R                   R                  5       n[        U R                   R                  5       5      n[        U R                   R                  5       5      nUR                  U R                  U R                  U R                  U R                  UUUUU5      5        g r<   )rC   rj  r+   r=   rt  
get_devicer@   r   get_size
get_strider   make_allocation_liner  r  r   )r  r7  rx   devicer   shapestrides          rE   r8  CommBufferAllocateLine.codegenZ  s    yy!!#177+B+BBBByy!!#%%'		##%dii((*+tyy++-.%%%%		
rG   c                   U [         R                  R                  :X  aT  U SUR                  U5       SUR                  U5       SU SUR                   SU S[
        R                  " SS5       S3$ [        S	U  35      e)
Nz = empty_strided_p2p(rp   z, torch.device("cuda:z"), group_name="z", alloc_id=r   l    rs   zUnsupported comm buffer type: )r   CommBufferTypeSYMM_MEMcodegen_shape_tupleindexrandomrandintNotImplementedError)r  r  r   rx   r  r   r  r  s           rE   r  +CommBufferAllocateLine.make_allocation_linen  s     r00999&-..u56b..v67r' &&,ll^ 4)l +"NN1i89< &01A0BC rG   r   Nr;  )r   r   r  r  r8  staticmethodr  r  r   rG   rE   r  r  X  s    
(  rG   r  c                      \ rS rSrSS jrSrg)CommBufferFreeLinei  c                    U R                   R                  U R                  5      nUR                  U SU R                  R
                   S35        g )Nz # z buffer free)r   r  rC   r   r  r   r  s      rE   r8  CommBufferFreeLine.codegen  s@    ||,,TYY7$s4#8#8#>#>"?|LMrG   r   Nr;  rZ  r   rG   rE   r  r    s    NrG   r  c                    ^  \ rS rSrSrU 4S jr\ S|       S}S jj5       rS~S jrSS jr	S~S jr
SS	 jrS~S
 jr\S~S j5       r\S~S j5       rSS jr\SS j5       rS~S jr  SS jrSS jrS~S jrS~S jrS~S jrSS jrSS jrSS jrS~S jrS~S jrS|SS jjrS rS rS r S r!S r"SS  jr#SS! jr$S~S" jr%SS# jr&SS$ jr'SS% jr(SS& jr)S' r*S( r+            SS) jr,SS* jr-S+ r.S, r/S- r0   S         SS. jjr1S/ r2SS0 jr3S1 r4S2 r5S3 r6S4 r7      SS5 jr8S6 r9SS7 jr:S8 r;S9S:.SS; jjr<S9S:.SS< jjr=SS= jr>SS> jr?SS? jr@SS@ jrASSA jrB S|   SSB jjrCSSC jrDSD rESE rFSF rGSG rH   S         SSH jjrISSI jrJ  SSJ jrKS|SSK jjrLSSL jrMSSM jrNSN rOSO rPSP rQSQ rRSR rSSS rTST rUSU rVSSV jrWSW rXSS9SSSSX. SSY jjrYSZ rZS[ r[S\ r\S|S] jr]SS^ jr^ S|S_ jr_SS` jr`SSa jraSSb jrbSSc jrcSSd jrdSSe jreSSf jrfSg rgS|Sh jrhSi riSSj jrjSk rk        SSl jrlSm rmSn rn    SSo jroSSp jrpSq rqSr rrSs rsSt rtSu ru\Sv 5       rv\Sw 5       rw\Sx 5       rx\Sy 5       ry\Sz 5       rzS{r{U =r|$ )r-  i  z:
Generate outer wrapper in Python that calls the kernels.
c                  >^  [         TT ]  5         [        5       T l        [	        5       T l        [	        5       T l        [	        5       T l        [	        5       T l        [	        5       T l	        [	        5       T l
        [	        5       T l        [	        5       T l        [	        5       T l        [        [           " 5       T l        0 T l        [        5       T l        / T l        ST l        ST l        ST l        ST l        ST l        [2        R4                  R6                  (       a  SOST l        [2        R4                  R6                  (       a  SOST l        S T l        ST l        0 T l         [        [           " 5       T l!        [        5       T l"        S T l#        T RI                  5         / T l%        / T l&        T RO                  5         T RQ                  5         T RS                  5         [2        R4                  RT                  (       dB  [2        R4                  RV                  RY                  5        H  u  pT R[                  X5        M     [        [\           " 5       T l/        [        [\           " 5       T l0        0 T l1        [d        Rf                  " S 5      " T Rh                  5      T l4        [d        Rf                  " S 5      S	U 4S jj5       nUT l5        0 T l6        [        [           " 5       T l7        [q        5       T l9        [        [           " 5       T l:        0 T l;        [y        [z        R|                  R~                  [z        R|                  R                  S9T lA        / T lB        g )
N #r$  z
std::move(rs   Tc                   > TR                   R                  U 5        [        R                  R                  (       a  TR
                  R                  U 5        g g r<   )importsr   r   r   r   r   )r   r  s    rE   add_import_once6PythonWrapperCodegen.__init__.<locals>.add_import_once  s;    LL""4(}}55**44T: 6rG   )debug_printer_leveluse_array_ref)r   r   r   r$  )Cr  r  r   _names_iterr0   r  headerprefixsuffixkernel_declarationswrapper_callkernel_autotune_defsr   subgraph_definitionsr   r   r   src_to_kernelkernel_numel_exprlinesdeclaredeclare_maybe_referenceendingcommentnone_strr+   r=   rN  
move_beginmove_endrI  supports_intermediate_hooksuser_defined_kernel_cacheunbacked_symbol_declsr0  launcher_fn_nameset_launcher_fn_namecodegened_graph_stackcomputed_sizes_stackwrite_headerwrite_prefix!write_kernel_autotune_defs_headerrO  constant_reprsr\   write_constant
BufferName	allocatedfreedreusesry  	lru_cachewrite_get_raw_streamr  _metas
_meta_varsr   multi_kernel_statealready_codegened_subgraphsallocated_workspacesr   r   aot_inductor debug_intermediate_value_printerallow_stack_allocationdebug_printeradditional_files)r  rx   hashedr  r  s   `   rE   r  PythonWrapperCodegen.__init__  s   */'%'$&$&$&#1#3 *,$2$4!%3%5"$2$4!%/_%6" .0HRCE
')$*+''*=*=,2 ww22;?)+/(QS&%/_%6"8B $!!# &("$&!..0ww ! 6 6 < < >##D1 !? $J/1
+-
 57$-$7$7$=%%%
! 
		T	"	; 
#	;
  /&($S/+"2"4+5c?+<(46! 1 & 3 3 T T --DD
 !#rG   Nc                P    U (       a  Uc   eUc   e[        XU5      $ [        5       $ r<   )SubgraphPythonWrapperCodegenr-  )is_subgraphsubgraph_nameparent_wrapperpartition_signaturess       rE   createPythonWrapperCodegen.create  s=      ,,,!---//C  $%%rG   c                    SU l         g )Ncall)r  r  s    rE   r  )PythonWrapperCodegen.set_launcher_fn_name  s
     &rG   c                D    U R                   R                  U SU 35        g )Nz = None  # )r  r   )r  rx   r  s      rE   r  #PythonWrapperCodegen.write_constant  s    k&:;rG   c           	     T   [         R                  R                  R                  5       nSnUb  UR                  b  SUR                   3nSn[        [        R                  R                  5      S:  a  SnU R                  R                  SU S[        R                   SU S3S	S
9  U R                  R                  SS	S
9   SSKJn  U R                  R                  SS	S
9  [        R$                  (       a  U R                  R'                  S5        g g ! [         ["        4 a     NDf = f)Nr  z
# AOT ID: r   zRfrom torch._inductor.codegen.debug_utils import _print_debugging_tensor_value_infoz
                aH  
                from ctypes import c_void_p, c_long, c_int
                import torch
                import math
                import random
                import os
                import tempfile
                from math import inf, nan
                from cmath import nanj
                from torch._inductor.hooks import run_intermediate_hooks
                from torch._inductor.utils import maybe_profile
                from torch._inductor.codegen.memory_planning import _align as align
                from torch import device, empty_strided
                from z import AsyncCompile
                from torch._inductor.select_algorithm import extern_kernels
                from torch._inductor.codegen.multi_kernel import MultiKernelCall
                z
            Tr   a  
                aten = torch.ops.aten
                inductor_ops = torch.ops.inductor
                _quantized = torch.ops._quantized
                assert_size_stride = torch._C._dynamo.guards.assert_size_stride
                empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
                reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
                alloc_from_pool = torch.ops.inductor._alloc_from_pool
                async_compile = AsyncCompile()
            )_SymmetricMemoryzs
                empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
                zfrom torch.cuda import nvtx)torch_guardsTracingContexttry_getaot_graph_namer   r   r  r  r  r   r   r   r  torch._C._distributed_c10dr  AttributeErrorImportErrorannotate_trainingr   )r  contextaot_config_commentaot_inductor_debug_utilsr  s        rE   r  !PythonWrapperCodegen.write_header  sF   --..6687#9#9#E#-g.D.D-E!F#% v""CCDqH'{$#$ % $,,- . ** +#& ) 	 	
, 	  	 	
	 DKK 	   ##KK!!"?@ $ , 		s    D D'&D'c                    g r<   r   )r  r  s     rE   include_extra_header)PythonWrapperCodegen.include_extra_header5      rG   c                ^    U R                   R                  S[        R                   S35        g )Na	  
                import torch
                from torch._dynamo.testing import rand_strided
                from torch._dynamo.utils import preserve_rng_state
                from torch._inductor.select_algorithm import AlgorithmSelectorCache
                from aH   import AsyncCompile

                async_compile = AsyncCompile()
                generate_example_value = AlgorithmSelectorCache.generate_example_value
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
            )r  r   r   r   r  s    rE   r  6PythonWrapperCodegen.write_kernel_autotune_defs_header8  s3    !!((
 $,,- .	
rG   c                   S[         R                   S3n[        R                  R                  (       a]  U R
                  R                  U5        U R
                  R                  [        R                  R                  R                  S5      5        [        R                  R                  (       d]  U R                  R                  USS9  U R                  R                  [        R                  R                  R                  S5      5        g g )NzU
            import triton
            import triton.language as tl
            from z+ import start_graph, end_graph
            get_raw_streamTr   )r"   r   r   r   r   r   r   r   r+   r=   rP  import_get_raw_stream_asrN  r  )r  
import_strs     rE   write_triton_header_once-PythonWrapperCodegen.write_triton_header_onceH  s     $,,- .

 ==11&&--j9&&00"";;<LM ww""LL
$7LL"""";;<LM #rG   c                   [         R                  R                  (       aB  U R                  R	                  [
        R                  R                  R                  S5      5        [
        R                  R                  (       dC  U R                  R	                  [
        R                  R                  R                  S5      5        g g )Nr"  )r   r   r   r   r   r+   r=   rP  r#  rN  r  r  s    rE    write_get_raw_stream_header_once5PythonWrapperCodegen.write_get_raw_stream_header_onceZ  s{    ==11&&00"";;<LM ww""LL"""";;<LM #rG   c                   [        U5      nXR                  ;  a  S[        U R                  5       3nX R                  U'   U R                  R	                  U SU 35        [
        R                  R                  (       a;  U R                  R	                  U SU 35        U R                  R                  U5        U R                  U   $ )Nmetar   )rX   r  r_   r  r   r   r   r   r   r  r   )r  r+  vars      rE   add_meta_once"PythonWrapperCodegen.add_meta_oncee  s    Dz{{"T[[)*+C #KKKK!!SETF"34}}55**44uCv5FG##C({{4  rG   c                ~    U R                  5        Vs/ s H  oR                  U R                  5      PM     sn$ s  snf r<   )get_graph_outputscodegen_referencer  r  r   s     rE   get_output_refs$PythonWrapperCodegen.get_output_refsp  s?     =A<R<R<T
<Tq 1 12<T
 	
 
s   $:c                    g r<   r   r  s    rE   mark_output_type%PythonWrapperCodegen.mark_output_typev      rG   c                6    [         R                  R                  $ r<   )r+   r=   graph_inputsr  s    rE   get_graph_inputs%PythonWrapperCodegen.get_graph_inputsy  s     ww###rG   c                6    [         R                  R                  $ r<   )r+   r=   graph_outputsr  s    rE   r0  &PythonWrapperCodegen.get_graph_outputs~  s    ww$$$rG   c           
     6   U R                  5       R                  5        H  u  p[        U[        R                  [
        R                  45      (       a  M6  U[        R                  R                  ;  d  [        U[
        R                  5      (       a  Mu  [        UR                  5       5      S:X  a  M  U R                  UR                  5       5      nU R                  UR                  5       5      nU R                  R!                  SU SU SU S35        M     g )Nr   zassert_size_stride(rp   rs   )r;  r\   r   r   r
   r   TorchBindObjectr+   r=   graph_input_namesGeneratorStater'   r  r   r  r  r   )r  rx   bufr  r  s        rE   codegen_input_size_asserts/PythonWrapperCodegen.codegen_input_size_asserts  s    ..0668ID#

B,>,>?@@ 177444
R&&9 9  S\\^,1223<<>BD44S^^5EFFKK!!$7vRvRxq"QR 9rG   c                n   U R                   R                  S5        U R                  5       R                  5        Hx  u  p[	        U[
        R                  [        R                  45      (       a  M6  SU S3nU R                   R                  U5        SU S3nU R                   R                  U5        Mz     g )Nz(# make sure graph inputs are not nan/infzassert not z.isnan().any().item()z.isinf().any().item())	r  r   r;  r\   r   r   r
   r   rA  )r  rx   rD  r   s       rE   codegen_input_nan_asserts.PythonWrapperCodegen.codegen_input_nan_asserts  s    HI..0668ID#

B,>,>?@@ &;<DKK!!$' &;<DKK!!$' 9rG   c                :    U R                   R                  S5        g )NzV

            async_compile.wait(globals())
            del async_compile
            )r  r   r  s    rE   write_async_compile_wait-PythonWrapperCodegen.write_async_compile_wait  s    	
rG   c                    SR                  U5      n[        U5      S:X  a  US-  nU R                  R                  U S35        U R                  R                  S5        g )Nrp   r,   ,z = argszargs.clear())rw   r_   r  r   )r  input_nameslhss      rE   
write_argsPythonWrapperCodegen.write_args  sP    ii${q 3JCWo.n-rG   c                    [         R                  (       a  U R                  R                  S5        SnU$ U R                  R                  SU R                   S35        SnU$ )Na  
                class Runner:
                    def __init__(self, partitions):
                        self.partitions = partitions

                    def recursively_apply_fns(self, fns):
                        new_callables = []
                        for fn, c in zip(fns, self.partitions):
                            new_callables.append(fn(c))
                        self.partitions = new_callables

                    def call(self, args):
                r   z
                def z(args):
                r,   )r   graph_partitionr  r   r  r  prefix_indents     rE   !write_launcher_fn_call_get_indent6PythonWrapperCodegen.write_launcher_fn_call_get_indent  sm    !!KK M  KK**+ ,
 MrG   c                6    [         R                  R                  $ r<   )r+   r=   rB  r  s    rE   get_graph_input_names*PythonWrapperCodegen.get_graph_input_names  s    ww(((rG   c                   U R                   c   eU R                  5         U R                  5       nU R                  R	                  U5         [
        R                  R                  (       aA  U R                  R                  [        R                  R                  R                  5       5        [        R                  R                  5       n[
        R                  (       a  U R                  R                  SU S35        U R                  5       =n(       a  U R!                  U5        U R#                  5         U R%                  5         S S S 5        g ! , (       d  f       g = f)Nz0training_annotation = nvtx._device_range_start(''))r  rK  rW  r  r   r   r   debug_sync_graphr   r+   r=   rP  synchronizeget_training_phaser  rZ  rQ  codegen_inputs"codegen_input_size_and_nan_asserts)r  rV  phaserB  s       rE   r  !PythonWrapperCodegen.write_prefix  s    $$000%%'>>@[[.}}--%%agg&8&8&D&D&FGGG..0E''%%FugRP %)$>$>$@@ @ 12!335 /..s   C;E
Ec                    [         R                  (       a  U R                  5         [         R                  (       a  U R	                  5         g g r<   )r   size_assertsrE  nan_assertsrH  r  s    rE   rb  7PythonWrapperCodegen.codegen_input_size_and_nan_asserts  s1    ++-**, rG   c                    U R                  5         SU 3n[        R                  R                  (       aB  U R                  R                  U SU S35        [        R                  R                  (       a  U$ U R                  U SU S35        U$ )Nstream = get_raw_stream(rs   )	r(  r   r   r   r   r   r+   r=   rN  )r  rH  r=   rx   s       rE   r  )PythonWrapperCodegen.write_get_raw_stream  s    --/
|$==11&&00&*:,a8 ww""$1*Q?@rG   c                     U R                   S   $ )N)r  r  s    rE   get_codegened_graph(PythonWrapperCodegen.get_codegened_graph  s    ))"--rG   c                :    U R                   R                  U5        g r<   )r  r  )r  r=   s     rE   r4  )PythonWrapperCodegen.push_codegened_graph  s    ""))%0rG   c                6    U R                   R                  5       $ r<   )r  r  r  s    rE   rC  (PythonWrapperCodegen.pop_codegened_graph  s    ))--//rG   c                P    SSK Jn  U R                  R                  U" U5      5      $ )Nr   )deepcopy)copyrv  r  r  )r  r0  rv  s      rE   r/  (PythonWrapperCodegen.push_computed_sizes   s!    !((//0HIIrG   c                6    U R                   R                  5       $ r<   )r  r  r  s    rE   r@  'PythonWrapperCodegen.pop_computed_sizes  s    ((,,..rG   c                .    [        U R                  5       $ r<   )nextr  r  s    rE   next_kernel_suffix'PythonWrapperCodegen.next_kernel_suffix  s    t''()*rG   c                >   U R                  [        XR                  5      5        [        R                  R
                  (       a  U R                  5         U R                  R                  S[        R                  R                  R                  U5       S35        U R                  R                  5         U R                  R                  [        R                  R                  R                  U5      5        U R                  R                  SU SU S35        Xl        g )NrL  rM  rj  rk  rs   )r   rG  rI  r   r   r   r%  r   r+   r=   rP  rS  r5  rT  )r  rH  s     rE   codegen_device_guard_enter/PythonWrapperCodegen.codegen_device_guard_enter  s    )*6W6WX	
 ==11))+&&00**77
CDAF &&002&&00""--j9 &&00$6zl!D -7)rG   c                    U R                  [        5       5        [        R                  R                  (       a  U R
                  R                  5         g g r<   )r   rW  r   r   r   r   rD  r  s    rE   codegen_device_guard_exit.PythonWrapperCodegen.codegen_device_guard_exit  s6    356==11&&224 2rG   c                    U(       a1  U R                   R                  SSR                  U5      -   S-   5        g U R                   R                  S5        g )Nzreturn (rp   , )z	return ())r  r   rw   )r  output_refss     rE   generate_return$PythonWrapperCodegen.generate_return#  s@    ''
TYY{5K(Ke(ST''4rG   c                    g r<   r   r  results     rE   generate_before_suffix+PythonWrapperCodegen.generate_before_suffix)  r8  rG   c                    [         R                  (       aO  SR                  U R                  5      [	        U R                  5      S:X  a  SOS-   nUR                  SU S35        g g )Nrp   r,   rN  r  z-
                runner = Runner(partitions=[z{])
                call = runner.call
                recursively_apply_fns = runner.recursively_apply_fns
                )r   rT  rw   all_partition_namesr_   r   )r  r  all_partition_name_lists      rE   generate_after_suffix*PythonWrapperCodegen.generate_after_suffix,  se    !!&*ii0H0H&I43349r'# MM--D,E F "rG   c                    g r<   r   r  s     rE   generate_end!PythonWrapperCodegen.generate_end:  r8  rG   c                &    U R                  X5        g r<   )generate_extern_kernel_alloc)r  fallback_kernelrz   s      rE   generate_fallback_kernel-PythonWrapperCodegen.generate_fallback_kernel=  s    ))/@rG   c           
        [        UR                  [        R                  5      nUR	                  5       nUR                  5       nUR                  5       nU R                  n[        R                  (       a  SU;   a  SU 3nU(       a5  U R                  U R                   U SSR                  U5       SU 35        g U R                  U R                   U SU SSR                  U5       SU 35        U R                  (       aR  [        R                  (       a<  Ub8  [        S   S==   S	-  ss'   U R                  S
UR                   < SU S35        g g g g )Nview_as_complexz.clone()rr   rp   rs   r   inductorintermediate_hooksr,   zrun_intermediate_hooks()r   r  r   
NoneLayoutrj  get_origin_nodeget_kernel_namer  r   memory_planningr   r  rw   r  generate_intermediate_hooksr   rx   )r  extern_kernelrz   	no_returnoutput_nameorigin_nodekernel_namer  s           rE   r  1PythonWrapperCodegen.generate_extern_kernel_alloc@  s;    }33R]]C	#,,.#335#335!!&7;&F  x(FNNdll^K=$))D/9J!F8TUNN<<.SQtyy>OqQWPXY 0066+$%9:a?:-k.>.>-AK=PQR , 7 1rG   c                2   [         R                  R                  R                  nUR	                  XAS S S5        UR                  SU(       a  UOU 35        U   U R                  U SSR                  U5       S35        S S S 5        g ! , (       d  f       g = f)Nexternzout=rr   rp   rs   )r+   r=   wrapper_coder  set_printer_argsr  r   rw   )r  ry   outout_viewrz   r  debug_printer_managers          rE   generate_extern_kernel_out/PythonWrapperCodegen.generate_extern_kernel_out]  sw     !" 4 4 B B..tT4Rdx8S9:;"NNfXQtyy&7q9: #""s   'B
Bc                  ^  UR                   nUR                  nU(       a$  [        S U 5       5      n[        S U 5       5      nUR                  R	                  5        S3nSR                  U 4S jU 5       5      nSR                  U 4S jU 5       5      n[        R                  T UR                  5      nSnU SUR                   S	3nU SU SU SU 3n	U S
U	 S3n
U
$ )Nc              3  t   #    U  H.  n[         R                  R                  R                  U5      v   M0     g 7fr<   r+   r=   rA   atomically_apply_size_hintr   ds     rE   r   EPythonWrapperCodegen._generate_tma_descriptor_call.<locals>.<genexpr>p  s*     VQUA))DDQGGQU   68c              3  t   #    U  H.  n[         R                  R                  R                  U5      v   M0     g 7fr<   r  r  s     rE   r   r  q  s+      HR1  ;;A>>
r  z.data_ptr()rp   c              3  P   >#    U  H  n[         R                  TU5      v   M     g 7fr<   r-  val_to_arg_strr   dimr  s     rE   r   r  w  s$     XSWC-<<T3GGSW   #&c              3  P   >#    U  H  n[         R                  TU5      v   M     g 7fr<   r  r  s     rE   r   r  x  s%      
FPs //c::jr  z$triton.tools.experimental_descriptorz.create_d_tma_descriptorrr   rs   )
dims
block_dimsr   tensorr1  rw   r-  r  element_sizerank)r  descapply_size_hintsr  r  ptrr  r  r   rz   r  s   `          rE   _generate_tma_descriptor_call2PythonWrapperCodegen._generate_tma_descriptor_calll  s    yy__
VQUVVD HR J ..01=yyXSWXXYY 
FP
 

 ,::4ARARS7xx		{*:;bbB|n=QtfArG   c                    U R                  U5      nUR                   SU U R                   3nU R                  U5        g Nr   )r  rx   r  r   )r  r  r  r   s       rE   generate_tma_descriptor,PythonWrapperCodegen.generate_tma_descriptor  s:    11$7))Cvdkk]3trG   c                   U SSR                  [        [        U5      5       3nUR                  S5      (       a  USR                  S/U-   5      -  nOU(       a  US[	        U5       3-  nUS-  nU R                  U5        g )Nrr   rN  zaten.scatter_reducerp   r  z	, reduce=rs   )rw   mapr   r   rX   r   )	r  r   inputscpp_kernel_namepython_kernel_namesrc_is_tensorrz  r   r   s	            rE   generate_scatter_fallback.PythonWrapperCodegen.generate_scatter_fallback  s~     %%QsxxC0@'A&BC(()>??DIIrdVm,,D)DL>22trG   c                v    SSR                  U5       S3nX&XE/nU R                  U R                  X5      5        g )N[rp   ])rw   r   wrap_kernel_call)r  ry   r   indicesvalues
accumulateindices_strrz   s           rE   generate_index_put_fallback0PythonWrapperCodegen.generate_index_put_fallback  s;    $))G,-Q/3t,,V:;rG   c           	     V    U R                  U SU SSR                  U5       S35        g )Nr   rr   rp   rs   )r   rw   )r  buf_namer  r  codegen_argsop_overloadraw_argsoutputss           rE   ,generate_fallback_kernel_with_runtime_lookupAPythonWrapperCodegen.generate_fallback_kernel_with_runtime_lookup  s0     	(3'9&:!DIIl<S;TTUVWrG   c                p    [        S5         U R                  U5      sS S S 5        $ ! , (       d  f       g = f)NzPythonWrapperCodegen.generate)r   	_generate)r  is_inferences     rE   generatePythonWrapperCodegen.generate  s#    9:>>,/ ;::s   '
5c                0    [         R                  (       a  gg)Nr   r,   )r   rT  r  s    rE   get_wrapper_call_indent,PythonWrapperCodegen.get_wrapper_call_indent  s    !!rG   c                
   [         R                  (       a  U R                  5         [        5       nUR	                  U R
                  5        UR                  S5        UR	                  U R                  5        [        R                  R                  (       aH  [        R                  R                  (       a)  [        R                  R                  (       a
  [        5       nUR	                  U R                  5        [        R                  " 5        nUR!                  U R"                  R%                  5       5        [         R&                  (       a  U R)                  U5        [         R                  (       a  U R+                  5         U(       a&  [         R,                  (       a  U R/                  5         OU R1                  5         [         R2                  R4                  (       a/  [         R2                  R6                  (       d  U R9                  5         U R:                   HP  n[=        U[>        5      (       a  URA                  U R"                  5        M5  U R"                  R                  U5        MR     U RC                  5       nU RE                  5         [         R2                  RF                  (       aA  U R"                  R                  [        R                  RH                  RK                  5       5        [         R                  (       a  U RM                  5         [         R2                  R4                  (       a/  [         R2                  R6                  (       d  U RO                  5         [         R2                  R6                  (       a  U RQ                  5         [         RR                  (       a0  [         R                  (       d  U R"                  R                  S5        U RU                  U5        S S S 5        U RW                  5         UR	                  U RX                  5        U R[                  5       nUR%                  U5         UR	                  U R"                  5        S S S 5        U R]                  U5        UR	                  U R^                  5        U Ra                  U5        U Rc                  U5        U Re                  U5        URg                  5       U Rh                  Rg                  5       4$ ! , (       d  f       GN= f! , (       d  f       N= f)Nr  z+nvtx._device_range_end(training_annotation))5r   profile_bandwidthr%  r0   r   r  r   r  r+   r=   rO  rN  is_const_graphr  r   	ExitStackenter_contextr  r   profiler_mark_wrapper_call#generate_profiler_mark_wrapper_callgenerate_start_graphr  memory_planmemory_plan_reuser   store_cubinr   !generate_reset_kernel_saved_flagsr  r   r(  r8  r3  r6  r^  rP  r_  generate_end_graph generate_save_uncompiled_kernelsgenerate_and_run_autotune_blockr  r  finalize_prefixr  r  r  r  r  r  add_benchmark_harnessgetvaluewithlinemapr  )r  r  r  stackr   r  wrapper_call_indents          rE   r  PythonWrapperCodegen._generate  s/   ##))+!dll#dkk" 77 3 38N8N#%F 	d//0!!#u 1 1 8 8 :;0088?''))+  6 6  "&&(}}((1W1W668

dK00LL!2!23%%//5	 # ..0K!!#}}--!!++AGG,>,>,J,J,LM'''')}}((1W1W557}}55446 ''0B0B!!++A   -S $V 	dkk""::<]]./MM$++, 0 	##F+dkk"""6*&!""6* &&($$88:
 	
w $#` 0/s   KS':S9'
S69
Tc                B   U R                   R                  S5        0 nU R                   R                  5       S-   U R                  R                  5       -   n[        R
                  [        R                  :X  aj  [        R                  " [        5       SSS9 nUR                  UR                  S5      5        UR                  nSSS5        [        R                  " SW5         [        X!5        g! , (       d  f       N2= f! [          a  n[#        S	U 35      UeSnAff = f)
z
Compose self.kernel_autotune_defs and self.kernel_autotune_calls into a single block of
code and execute it to trigger Triton kernel compilation and auto-tuning
zQ
            async_compile.wait(globals())
            del async_compile
        rK  z.pyF)dirr  deletezutf-8NzAuto-tuning code written to %sz%Failed to run autotuning code block: )r  r   r   r   r   levelloggingDEBUGtempfileNamedTemporaryFiler   writeencoderx   debugexec	ExceptionRuntimeError)r  scopetuning_codef	file_pathes         rE   r  4PythonWrapperCodegen.generate_and_run_autotune_block  s   
 	!!((	
 %%..0((1134 	
   GMM1 ,,Ke**734FF		
 !!0
	S$   	S!FqcJKQRR	Ss$   -C/#D  /
C= 
D
DDc                \    SSK Jn  U" U 5      R                  U R                  5      U l        g )Nr,   )MemoryPlanner)r  r  r`  r  )r  r  s     rE   r   PythonWrapperCodegen.memory_plan'  s     2"4(--djj9
rG   c                "   [         R                  R                  5       nU R                  (       a  [	        U R                  S   [
        5      (       a  U R                  S   R                  R                  U;  av  U R                  R                  5         U R                  (       aK  [	        U R                  S   [
        5      (       a)  U R                  S   R                  R                  U;  a  Mv  [        5       /n/ n[        [        U R                  5      5       H  nU R                  U   n[	        U[
        5      (       a#  UR                  US   5      U R                  U'   MJ  [	        U[        5      (       a  UR                  [        5       5        Mz  [	        U[        5      (       d  M  UR                  UR                  5       5        M     UR                  UR                  5       5        [        U5      S:X  d   e[!        S U 5       5      ng )Nrn  r   c              3  8   #    U  H  oR                   v   M     g 7fr<   )r  )r   ss     rE   r   9PythonWrapperCodegen.memory_plan_reuse.<locals>.<genexpr>I  s      +
3Ga))3Gs   )r+   r=   get_output_namesr  r   r\  rC   rx   r  r
  ranger_   r`  r,  r  r>  sum)r  	out_namesplanning_statespast_planning_statesir   _total_allocated_buffer_sizes          rE   r  &PythonWrapperCodegen.memory_plan_reuse,  s   GG,,.	 JJ4::b>+=>>

2##((	9 JJNN JJ4::b>+=>>

2##((	9 /01!s4::'A::a=D$ 233 $		/"*= >

1D"344&&':'<=D"233$++O,?,?,AB ( 	##O$7$7$9:?#q(((
 (+ +
3G+
 (
$rG   c           	       ^	 U R                   m	[        R                  " S 5      U	4S j5       n[        R                  " S 5      U	4S j5       n[        U[        R
                  5      (       aM  [        U[        R                  5      (       a  X#;   a  g T	R                  U SU 35        UR                  U5        g [        U[        R                  5      (       a  [        UR                  5       5       H^  u  pg[        U[        R                  5      (       d  M&  Xs;  d  M-  T	R                  U SU" U5       SU S35        UR                  U5        M`     [        UR                  5       5       H^  u  ph[        U[        R                  5      (       d  M&  X;  d  M-  T	R                  U SU" U5       SU S35        UR                  U5        M`     g [        U[        R                  5      (       a  g [        U[        R                  5      (       a  g [         R"                  R$                  R&                  (       a  g [)        S[+        U5       35      e)Nc                <   > TR                  U  SU  S35        U  S3$ )Nz_size = z.size()_sizer   rx   r7  s    rE   sizeofDPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.sizeofU  s(    NNdV8D69:V5>!rG   c                <   > TR                  U  SU  S35        U  S3$ )Nz
_stride = z	.stride()_strider1  r2  s    rE   strideofFPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.strideofZ  s)    NNdV:dV9=>V7##rG   r   r  r  zUnknown value type: )r  ry  r  r   r   r
   Symbolr   r   r   	TensorBox	enumerater  r  rA  rC  r  	_inductorr   rT  r`   r   )
r  rx   r   
bound_varsr3  r7  r  r  r  r7  s
            @rE   codegen_input_symbol_assignment4PythonWrapperCodegen.codegen_input_symbol_assignmentM  s    {{			T	"	" 
#	" 
		T	"	$ 
#	$ eUZZ((eU\\22e6INNeWCv./NN5!r||,,&u~~'78	dELL11d6LNNdV3vd|nAcU!#DENN4( 9  ))9)9);<fell338PNNfXS$0@#a#HINN6*  = r1122r0011%%55$';DK=%IJJrG   c           	        [         [        R                     " 5       nU R                  5       nUR	                  5        VVs/ s H)  u  p4[        U[        R                  5      (       d  M&  X44PM+     snnUR	                  5        VVs/ s H)  u  p4[        U[        R                  5      (       a  M&  X44PM+     snn-   nU H  u  pgU R                  XgU5        M     gs  snnf s  snnf )z$Assign all symbolic shapes to localsN)r   r   r9  r;  r\   r   r>  )r  r=  r:  kvr  rx   r   s           rE   ra  #PythonWrapperCodegen.codegen_inputsw  s    -/
 ,,.+113
3tqz!U\\7RFQF3
 , 2 2 4X 4Jq%,,<WVaV 4XY "KD00jI "
Xs    %C)C%C /C c                f   [        U[        R                  5      (       a  [        U[        R
                  5      (       ar  XR                  ;   a  g U R                  R                  U5        [        R                  R                  R                  U   nU R                  U S[        U5       35        g g g r  )r   r   r9  r   r   PRECOMPUTED_SIZEr0  r   r+   r=   rA   inv_precomputed_replacementsr   pexpr)r  symexprs      rE   ensure_size_computed)PythonWrapperCodegen.ensure_size_computed  s    c5<<((^CAVAV-W-W)))##C(77##@@EDNNcU#eDk]34 .X(rG   c                    g r<   r   r  s    rE   r  $PythonWrapperCodegen.finalize_prefix  r  rG   TrB   c                   [        S5      e)Nz8codegen_cpp_sizevar is only implemented for cpp_wrapper!)r  r  r   rB   s      rE   codegen_cpp_sizevar(PythonWrapperCodegen.codegen_cpp_sizevar  s    UVVrG   c                   [        XS9$ )NrN  )rG  rP  s      rE   codegen_python_sizevar+PythonWrapperCodegen.codegen_python_sizevar  s    Q**rG   c                $    U R                  U5      $ r<   )rT  r2  s     rE   codegen_sizevar$PythonWrapperCodegen.codegen_sizevar  s    **1--rG   c                    U SU S3$ )Nr  r  r   )r  basenamerx   r  s       rE   codegen_tuple_access)PythonWrapperCodegen.codegen_tuple_access  s    1UG1%%rG   c                    / [        U R                  U5      Qn[        U5      S:X  a  g[        U5      S:X  a	  SUS    S3$ SSR                  U5       S3$ )Nr   ()r,   rr   r  rp   rs   )r  rT  r_   rw   )r  r  partss      rE   r   /PythonWrapperCodegen.codegen_python_shape_tuple  s^    :#d1159:u:?u:?uQxj$$499U#$A&&rG   c                $    U R                  U5      $ r<   )r   )r  r  s     rE   r  (PythonWrapperCodegen.codegen_shape_tuple  s    ..u55rG   c                    SR                  SR                  U[        U5      [        U5      U R	                  U5      U R	                  U5      /5      5      $ )Nzalloc_from_pool({})rp   )formatrw   rG  r   r   )r  rx   offsetr   r  r  s         rE   codegen_alloc_from_pool,PythonWrapperCodegen.codegen_alloc_from_pool  sS    $++II&MJ33E:33F;

 
	
rG   c                   X!R                   R                  :X  al  X1R                   R                  :X  aS  XAR                   R                  :X  a:  Ub&  XaR                  :w  a  SUR                  5        SU S3$ UR                  5        $ U R                  U5      nU R                  U5      nU R                  U5      nUb/  XaR                  :w  a   SUR                  5        SU SU SU SU S3$ SUR                  5        SU SU SU S3	$ )Nzaten.view.dtype(rp   rs   z#aten.view.dtype(reinterpret_tensor(z), zreinterpret_tensor()r  r  r  re  r   rj  r   rW  )r  datar  r  re  r   r   s          rE   codegen_reinterpret_view-PythonWrapperCodegen.codegen_reinterpret_view  s    KK$$$++,,,++,,, Ujj%8)$--/):"UG1EE--/*+2248D44V<F))&1F Ujj%8<T]]_<MRPTvUWX^W__abhaiilmrlsstuu *$--/):"TF"VHBvhVWXrG   c                8    U R                  U SU SU S35        g )Nz.copy_(rp   rs   r1  )r  r   dstnon_blockings       rE   codegen_device_copy(PythonWrapperCodegen.codegen_device_copy  s!    #gcU"\N!<=rG   c                `    U R                  U R                   U SU U R                   35        g r  )r   r  r  )r  rx   r   s      rE   codegen_multi_output)PythonWrapperCodegen.codegen_multi_output  s)    $,,vS}EFrG   c                   S UR                    5       u  n[        UR                  5      S:X  a#  U R                  UR                   SU S35        GOw[        UR                  5      S:X  aE  [        UR                  S   [        5      (       a#  U R                  UR                   SU S35        GO[        UR                  5      S:X  a  [        UR                  S   [        5      (       a  U R                  UR                   SU S35        U R                  S	UR                   S
UR                  S   R                   SUR                   SUR                  S   R                   S3	5        U R                  UR                   SUR                   SUR                  S   R                   35        O[        SUR                   35      eU R                  UR                  5        S35        g )Nc              3  @   #    U  H  oR                  5       v   M     g 7fr<   )r1  )r   ts     rE   r   >PythonWrapperCodegen.codegen_dynamic_scalar.<locals>.<genexpr>  s     >+Q&&((+s   r   r   .item()r,   z = 1 if z.item() else 0z_undivided = zassert z_undivided % z
 == 0, f'{z_undivided} not divisible by 'z_undivided // unrecognized keypath z = None)r  r_   keypathr   rH  r   r   r   divisorr`   rj  )r  rC   ri  s      rE   codegen_dynamic_scalar+PythonWrapperCodegen.codegen_dynamic_scalar  s   >$++>t||!NNdhhZs4&89!#
4<<?M(R(RNNdhhZxv^DE!#
4<<?K(P(PNNdhhZ}TF'BCNN$((=a1H1H0I Jxxj >t||A?V?V>WWXZ NN88*CzQ8O8O7PQ !#8!GHH 	$--/*'23rG   c           
     r  ^ ^ UU 4S jnU4S jnU4S jnTR                  / SQ5        TR                  5          TR                  SSS9  [        R                  R
                  R                  5        HT  u  pVTR                  SU 35        U" XVR                  5       UR                  5       UR                  UR                  5        MV     [        [        R                  R                  5      S	:  a^  TR                  S
5        [        R                  R                  R                  5        H!  u  pWTR                  SU 35        U" XW5        M#     [        R                  R                  R                  5        GH9  u  pV[        U[         R"                  5      (       aI  [        [        R                  R$                  R&                  R)                  US 5      [*        5      (       a  Mn  [        U[,        R.                  5      (       ad  [        [        R                  R                  5      S	:X  a  TR                  S
5        TR                  SU 35        U" XVR1                  5       5        M  [        U[         R2                  5      (       a2  U" U[        R                  R$                  R5                  USS95        GMB  [        U[,        R6                  5      (       a$  U" USUR                  R8                   S35        GM  UR;                  5        Vs/ s H+  n[        R                  R$                  R5                  USS9PM-     n	nUR=                  5        Vs/ s H+  n[        R                  R$                  R5                  USS9PM-     n
nU" UU	U
UR?                  5       URA                  5       5        GM<     SSRC                  [        R                  R                  RE                  5       5       S3nTR                  SU 35        TR                  S5        S S S 5        g s  snf s  snf ! , (       d  f       g = f)Nc                   > TR                  U  STR                  U5       STR                  U5       SU SU S3
5        g )Nz = rand_strided(rp   
, device='	', dtype=rs   )r   r   )rx   r  r  r  r   r   r  s        rE   add_fake_inputFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_fake_input  sT    &(2259:"226:; <!()E7!5rG   c                2   > TR                  U  SU 35        g r  r1  )rx   r   r   s     rE   add_expr_inputFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_expr_input  s    vS./rG   c                   > SS K n[        U[        R                  5      (       d   eTR	                  U  SUR                  U5      < S35        g )Nr   z = pickle.loads(rs   )pickler   r  ScriptObjectr   dumps)rx   r   r  r   s      rE   add_torchbind_inputKPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_torchbind_input  sE    eU%7%78888v%5fll56I5LANOrG   )r  r  z3def benchmark_compiled_module(times=10, repeat=10):z
                from torch._dynamo.testing import rand_strided
                from torch._inductor.utils import print_performance
                Tr   zglobal r   zimport pickle*   fallbackztorch.cuda.default_generators[z].graphsafe_get_state()zcall([rp   z])zfn = lambda: z8return print_performance(fn, times=times, repeat=repeat))#
writelinesr   r   r+   r=   	constantsr\   r   r  r  r  r   r_   torchbind_constantsr:  r   r   r9  rA   
var_to_valrk   r   r   rA  get_real_objr
   	size_hintrC  r  r  r  r  r@   rw   keys)r  r   r  r  r  rx   r   torchbind_objr   r  r  call_strs   ``          rE   benchmark_compiled_module.PythonWrapperCodegen.benchmark_compiled_module  sa   		0	P 	K	
 ]]_MM     !ww00668   74&!12**,ekk	  9 177../!3  1+,77+F+F+L+L+N'D $$wtf%56'<	 ,O  !ww3399;eU\\22zGG$$//33E4@,8 8 eR%7%7881776671<((9$$wtf%56'.@.@.BCuzz22
 #4)9)9)C)CETV)C)WXr'8'899"89K9K8LLcd "'!1!1A ((221r2B!1   "'!1!1!3!3A ((221r2B!3   #((*)E  <T  		!''*>*>*C*C*E FGrJH}XJ78WXK _fo _s+   KP(2P P(2P#BP(
P((
P6c                
   [         R                  (       d  gU R                  U5        UR                  / SQ5        UR	                  5          UR                  SS[        5        S3/5        SSS5        g! , (       d  f       g= f)z<
Append a benchmark harness to generated code for debugging
N)r  r  zif __name__ == "__main__":zBfrom torch._inductor.wrapper_benchmark import compiled_module_mainzcompiled_module_main('z', benchmark_compiled_module))r   benchmark_harnessr  r  r   r%   r  r   s     rE   r  *PythonWrapperCodegen.add_benchmark_harnessL  sh     ''&&v.@A]]_X,-?-A,BB_` __s    A44
Bc                .   [         R                  R                  (       aC  SU SU 3nU R                  R	                  U5        [
        R                  R                  (       a  g U(       a  U S3OSnSU U SU 3nU R                  R	                  U5        g )Nz

r   rK  r  )	r   r   r   r  r   r+   r=   rN  r  )r  r  kernel_bodymetadatagpucpp_definitionbodymetadata_comments           rE   define_kernel"PythonWrapperCodegen.define_kernel^  s     ==11+c+7D%%,,T2ww"".6hZr?B&'}C}E4 rG   c                :    U R                   R                  U5        g r<   )r  r   )r  fn_codes     rE   define_subgraph_launcher_fn0PythonWrapperCodegen.define_subgraph_launcher_fnq  s    !!((1rG   c                R  ^^*^+^,^-^.^/ SSK Jn  SSKJnJn	Jn
  SSKJm*JnJ	nJ
nJn  SSKJnJn  U" 5         UR                  n/ m/0 m-/ m,/ nU,U/4S jm+S)U*U+U-U4S	 jjn[!        UR"                  5       GH  u  nnUUR$                  ;   a  U" UT*" US
9SS9  M%  UT;  a  M-  TU   nTU   c  U" UT*" US
9SS9  MF  ['        U[(        R*                  5      (       a  U" UU" US
95        Mt  ['        U[(        R,                  5      (       a-  U" UU" UUR/                  5       UR1                  5       S95        M  ['        U[(        R2                  5      (       aM  U" UU" UUR4                  R/                  5       UR1                  5       UR6                  R8                  S95        GM,  ['        U[:        [<        R>                  45      =(       a*    [@        RB                  RD                  RG                  US5      nU" UU" UU5      US9  GM     [I        T/S T,UR"                   Vs/ s H  n[K        U5      PM     snS9nU[L        RN                  " [@        RB                  RQ                  5       5      0 T-E[R        RU                  US5      E[W        T/T,S9/S.nU(       a  [Y        U5      US'   U(       a  [Y        U5      US'   [[        U5      S:X  a0  U	R]                  5       n/ [_        [<        R`                  US   5      QnOS*U.4S jjn0 m.U Vs/ s H  n/ [_        UU5      QPM     nnU(       a  [[        U5      [[        U5      :X  d   e/ n[c        [e        Xb5      S SS9 HA  u  nn URg                  U" U 5      / [_        [h        U5      Q/ [_        [j        U5      QS.5        MC     U
R                  U/ [_        [l        T.Ro                  5       5      QS.n/ T.Rq                  5       Qn[s        URt                  5      /n![[        U5      S:  aY  TRo                  5        HE  n['        U[(        R,                  [(        R2                  45      (       a  M4  U!Rg                  U5        MG     U!Rg                  [m        U5      5        U!Rw                  [m        U5      5        [Y        U!5      n!U!U Rx                  ;   a  / U Rx                  U!   QUP7$ U S[[        U Rx                  5       3n"[{        5       n#[|        R                  R~                  (       a  U#R                  SU"< S35        OU#R                  SU< S35        U"US'   UR                  UR                  5       5        U#R                  U" 5       5        U#R                  S/ [_        X5      Q< S U< S!U< S"35        [        U5      n$[|        R                  R~                  (       a  U$R                  S#U S$3S#U" S$35      n$U#R                  U$5        [@        RB                  RQ                  5       n%U#R                  S%U%R                   S&35        [        R                  " URt                  5      u  n&n'[        R                  " URt                  5      n(S'U( S(U' 3n)U R                  U"U#R                  5       U)5        U"U4U Rx                  U!'   U"UU4$ s  snf s  snf )+Nr   )patch_triton_dtype_reprr   )config_to_dict	FixedGridPrecomputedGridr,   )ConstexprArgKernelArgTypeSizeArg	TensorArgTMADescriptorArg)gen_common_triton_importsTritonKernelc                J   > TR                  U5        TR                  U 5        g r<   )r  )idxra   arg_indices	signatures     rE   add_to_signaturePPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_to_signature  s    S!s#rG   c                  > U(       aE  [        5       (       a  T" X5        UR                  T;   a  TUR                     TUR                  '   g g UR                  T;   d   eU(       a?  [        5       (       a  T" U T" UR                  S95        OT" X5        STUR                  '   g U(       a6  [        5       (       a  T" U T" UR                  S95        S TUR                  '   g T" X5        g )Nrx   r,   )r*   rx   )	r  ra   is_constexprequals_1equals_noner  r  r  r   s	        rE   add_argGPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_arg  s    133 %S.88v% +1*:Ichh' & xx6)))577
 )l.IJ(2*+Ichh' 577 )l.IJ*.Ichh'$S.rG   r  T)r  )r  )rx   bufferr   )rx   r  r   re  )r  )
size_dtyper  argdefs)r  )r  r  r  r   restore_valuereset_to_zeroc                t  > [        U [        R                  5      (       al  / U R                  QnU(       d  U $ UR	                  [
        S9  U H0  nUT;   a  M  [        R                  " S[        T5       35      TU'   M2     [        U T5      $ [        U [        5      (       d   e[        R                  " U 5      $ )N)r   _launcher_s)r   r   r
   free_symbolssortr   r9  r_   r)   r   r   )rI  symbolsrH  extra_launcher_argss      rE   rename_sizes_for_launcherYPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.rename_sizes_for_launcher  s    dEJJ//2 1 12G"#LLSL)&"55$38<<)#.A*B)CD4+C0  ' &d,?@@!$,,,,}}T**rG   c                2    [        U S   R                  5      $ r   r   r   s    rE   r   HPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.<lambda>-  r   rG   r   )r   pythonrW   )	grid_typeprecomputed_gridsr  _zasync_compile.triton(z, '''r  zG
            @triton_heuristics.user_autotune(
                configs=z ,
                inductor_meta=z,
                triton_meta=z{,
                filename=__file__,
                custom_kernel=True,
            )
            @triton.jit
            r   rr   z''', device_str='r]  z# Original path: rM  )FFF)rI  r   r   r   )Ltorch.utils._tritonr  runtime.triton_heuristicsr  r  r  commonr  r  r  r  r  r   r  r  r   r;  	arg_names
constexprsr   r   TMADescriptorri  rj  r@   r!   ri  r  re  r   r   r   r+   r=   rA   statically_known_equalsr7   r-   r#   r  get_current_device_or_throwdictfromkeysr5   r   r_   setup_grid_as_argsr  sympifyr   r   r  rG  r4   r   r  r  idr   extendr  r0   r   unique_user_kernel_namesr   updateinductor_meta_commonr   r   replacer   inspectgetsourcelinesgetsourcefiler  r   )0r  ry   r   r   restore_value_argsreset_to_zero_argsr   r  r  r  r  r  r  r  r  r  r  original_nameequal_to_1_argsr  r  r   ra   r  r   triton_signaturetriton_metainductor_metaextra_launcher_call_argsr  r   r  cfg	cache_keyrx   r   
kernel_srccurrent_devicer  linenosrcfiler  r  r  r  r  r  r  s0      `                                      @@@@@@rE   !define_user_defined_triton_kernel6PythonWrapperCodegen.define_user_defined_triton_kernelt  sM    	@	
 	

	
 	
 	D!)+	$&	!#%'	$"	/ "	/H "&"2"23HCf'''\s3$G& +Cc{"\s3Fc2#3#344(!$  RYY//!!$#&<<>"%--/  R%7%788 !!$#&88#4#4#6"%--/#&::#4#4	  *c5==1   ''**BB  Cc!2XFc 4f -)/)9)9:)9AWQZ)9:	
 *&--agg.Q.Q.ST--3
 ''
, +01C+DK(+01C+DK(u:?,5,H,H,JM'FU]]E!H)E'F$+  EGINO<s4d;<EOSZ3w<777 "#E#)CT	c "(("0"5"5Ct$4"52UD!12 -55%6'PS2E2L2L2N)O'PM
 (E)<)A)A)C'D$ VYY-	w<!}}!#		23E3E'FGG$$S) ' 	[)*]+,)$	666//	:( 
  #d&D&D"E!FG(*==11%%(=dXU&KL%%(=m=Ne&TU'+m$\>>@A8:;83~78; <,/ 0(O ,			
 OvV
==11#++d=/,CtD6QR^TJz*<<>!!$5n6I6I5J""MN**6995	6''		2&wiq9$$&	
 6:;4G&&y1[":::I ;j Ps   Z Z$c                    U SUR                    S3nUb  USU 3-  nU R                  U S[        UR                  5       35        [	        XBR                  5      $ )Nr  r  r   )r  r   rG  r  r   )r  r  treer  rI  s        rE   generate_numel_expr(PythonWrapperCodegen.generate_numel_expry  s`    a}E2axL D$s5#4"567 tZZ00rG   c                   UR                  5       n[        X5      nUR                  [        R                  :X  a  U R                  U5        GOBUR                  [        R                  :X  a2  U R                  U5        U R                  U R                  U5      5        OUR                  [        R                  :X  a  U R                  R                  U5      nU(       a]  [        U[        5      (       a  [        UR                  [        5      (       d   e[        R                  " UR                  U5      Ul        OUU R                  U5        U R                  U R                  U5      5        X0R                  U'   O[        UR                  5      e[         R"                  R$                  (       a  U R&                  R                  [(        R+                  U UUR,                  UR.                  [0        R2                  R4                  R7                  UR8                  5      4SS95        UR                  [        R                  :w  a/  U R&                  R                  [(        R                  X5      5        g g g )N)r,   )r  r  )rj  rp  	zero_moder3   UNINITIALIZEDr   ZERO_ON_CALLmake_zero_bufferZERO_PER_GRAPHr  rk   r   rC   r2   maximumr`   r   r   r   r   r-  make_allocationr  r   r+   r=   rA   r  r   )r  wsrx   r   priors        rE   generate_workspace_allocation2PythonWrapperCodegen.generate_workspace_allocation  s   {{}D%<<,:::NN4 \\.;;;NN4 NN40067\\.===--11$7E!%66:JJ< <   *11%**bA
t$t44T:;26))$/ ..==11&&00$44IIHH77++55bhh?A 5 	 ||0>>>**44(99$E ? 2rG   c                v    UR                   [        R                  :w  a  U R                  [	        X5      5        g g r<   )r  r3   r  r   r#  )r  r
  s     rE   generate_workspace_deallocation4PythonWrapperCodegen.generate_workspace_deallocation  s,    <<,;;;NN.t89 <rG   c                $    U SU R                    3$ )Nz.zero_())r  )r  rx   s     rE   r  %PythonWrapperCodegen.make_zero_buffer  s    x}--rG   c                H    U SSR                  U5       SU R                   3$ )Nrr   rp   rs   )rw   r  )r  rx   	call_argss      rE   r  %PythonWrapperCodegen.wrap_kernel_call  s'    q9-.a}==rG   c                    U R                   R                  S5        U R                   R                  S[        R                  R                   S35        UR                  U R                   R                  5       5        g )Nz*from torch.profiler import record_functionzwith record_function('graph_z_inductor_wrapper_call'):)r  r   r+   r=   graph_idr  r   )r  r  s     rE   r  8PythonWrapperCodegen.generate_profiler_mark_wrapper_call  sb    ##$PQ##*177+;+;*<<UV	
 	D--4467rG   c                :    U R                   R                  S5        g )Nzstart_graph())r  r   r  s    rE   r  )PythonWrapperCodegen.generate_start_graph  s    ##O4rG   c                `    U R                   R                  S[        R                  < S35        g )Nz
end_graph(rs   )r  r   r   profile_bandwidth_outputr  s    rE   r  'PythonWrapperCodegen.generate_end_graph  s'    ##j1P1P0SST$UVrG   c                ^    U R                   R                  S[        R                   S35        g )NU
            for kernel in globals().values():
                if isinstance(kernel, zU.CachingAutotuner):
                    kernel.cuda_kernel_saved = False
            r  r   r"   r   r  s    rE   r   6PythonWrapperCodegen.generate_reset_kernel_saved_flags  s2      ''8'A'A&B C	
rG   c                ^    U R                   R                  S[        R                   S35        g)a  
Precompile and save the CUBINs of the Triton kernels that haven't
been precompiled and saved as a side effect of running the generated
JIT model (Python wrapper). This can happen when the model contains
control flow: only one pass through the control flow operators covers
the kernels that are saved, the remaining kernels are not launched,
hence not saved. The main purpose of this codegen is to compile and
save the Triton kernels outside the active control flow path for
subsequent AOTInductor code generation and compilation.
r  a  .CachingAutotuner):
                    if not kernel.cuda_kernel_saved:
                        if len(kernel.launchers) == 0:
                            kernel.precompile()
                        kernel.save_gpu_kernel(
                            grid=(0, 0, 0),   # use dummy grid
                            stream="stream",  # use dummy stream
                            launcher=kernel.launchers[0],
                        )
            Nr   r  s    rE   r  5PythonWrapperCodegen.generate_save_uncompiled_kernels  s4     	  ''8'A'A&B 	C	
rG   c                B    S nU Vs/ s H
  o2" U5      PM     sn$ s  snf )Nc                   [        U [        5      (       a  [        U 5      (       a  U S-   $ U $ [        U [        [        [
        [        45      (       a  [        U 5      $ [        [        R                  R                  R                  U 5      5      $ )Nrx  )r   r   r6   r   floatr   r   rG  r+   r=   rA   rB   )ra   s    rE   wrap_argAPythonWrapperCodegen.prepare_triton_kernel_call.<locals>.wrap_arg  sg    #s##*B3*G*GsYPSPC#udO!DEE3xQWW--66s;<<rG   r   )r  r  r'  ra   s       rE   prepare_triton_kernel_call/PythonWrapperCodegen.prepare_triton_kernel_call  s%    	= *33#333s   c                h  ^  [        U[        5      (       Ga  [        U[        R                  5      (       a:  UR                  R                  5       n[        R                  R                  U5      nOS[        R                  R                  U5      b"  Un[        R                  R                  U5      nOUc   S5       eSU 3nUn[        S UR                  5        5       5      n[        S [        R                  R                  U5       5       5      n[        S UR                  5        5       5      n	UR                  5       n
UR                  5       n[        R                  R                   R#                  UR%                  5       R&                  [(        R*                  S9nSU SU	 S	U
 S
U SU SU S3nT R,                  R/                  U SU 35        [        U[        R                  5      (       a2  T R1                  USS9nUnT R,                  R/                  U SU 35        U$ [3        U[4        R6                  5      (       d  [        U[8        5      (       a  [        U[:        5      (       a  UT R<                  ;   a  U$ Uc  gUn[        U[8        5      (       a  UR>                  nU[        R                  R                   R@                  ;   a'  [        R                  R                   R@                  U   n[;        [        R                  R                   RC                  U[(        R*                  S95      $ [        U[:        [D        [F        [H        45      (       a  [;        U5      $ [        U[J        5      (       a  SSRM                  U 4S jU 5       5       S3$ [O        S[Q        U5       35      e)NzBV.graph.get_buffer(arg) and raw_arg can't be None at the same timetmp_arg_c              3     #    U  H;  n[         R                  R                  R                  U[        R
                  S 9v   M=     g7fr  Nr+   r=   rA   r  r   unbacked_symint_fallbackr   r  s     rE   r   BPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>  sA      
 (A	   ;;#<< <  (   AAc              3     #    U  H;  n[         R                  R                  R                  U[        R
                  S 9v   M=     g7fr.  r/  r1  s     rE   r   r2    sA      $
 :A	   ;;#<< <  :r3  c              3     #    U  H;  n[         R                  R                  R                  U[        R
                  S 9v   M=     g7fr.  r/  r1  s     rE   r   r2    sA      
 *A	   ;;#<< <  *r3  r  zgenerate_example_value(rp   z, 'z', rs   r   T)r  r  r$  r  c              3  Z   >#    U  H   nTR                  U[        U5      5      v   M"     g 7fr<   r   )r   ar  s     rE   r   r2  C  s(      ZVYQR!@!@DG!L!LVYr   r  zUnsupported type ))r   torch_dtyper   r  r  rj  r+   r=   
get_buffertry_get_bufferr   r  get_allocation_sizer  r  r@   rA   r  
get_layoutre  r   r0  r   r   r  
issubclassr   Basicr   r   r  r   rF  r  r   r&  r   r  rw   r  r   )r  ra   arg_typeraw_argr  r  rD  r  allocation_sizer  r  r   re  r   s   `             rE   r   /PythonWrapperCodegen.generate_example_arg_value  s6   h,,'2#3#344">>224gg((2'',8gg((-* X* &eW- 
  D $ $
 44S9$ O  
 ) F ^^%FMMOEWW%%// ''88 0 F .dV2fXSE7RTU[T\\^_n^oopqE&&00H:S1HI'2#3#344 :: %) ;  **44zUG5LMO%++..*S/2R2R#s##$//)J?!#//nnagg&&CCCgg&&CCCH  ;;&"A"A <   c3t455s8OT""tyy ZVY ZZ[[\]]%(9$s)&EFFrG   c                   ^  [        U[        5      (       a!  SSR                  U 4S jU 5       5      -   S-   $ [        U5      $ )Nr  rp   c              3  F   >#    U  H  nTR                  U5      v   M     g 7fr<   )_grid_dim_str)r   r   r  s     rE   r   5PythonWrapperCodegen._grid_dim_str.<locals>.<genexpr>J  s     R\T 2 24 8 8\s   !r  )r   r  rw   rG  )r  grid_per_dims   ` rE   rE  "PythonWrapperCodegen._grid_dim_strG  s?    lD))diiR\RRRUXX &&rG   )r  r   	arg_typesr  r  c          
        U=(       d    [         R                  R                  5       nU(       d1  UR                  S:w  d!  U R	                  U R                  X5      5        gU R                  U5      nSR                  U5      n[        R                  XR                  [         R                  5      n	U(       d$  SU	 S3n
U R	                  U SU SU SU
 S35        gU R                  5         [        R                  R                  (       Ga  XR                  ;  Ga  Ub  [!        U5      [!        U5      :X  d   S5       e0 n/ nUc  S/[!        U5      -  nO[!        U5      [!        U5      :X  d   S	5       e[#        [%        X%U5      5       H  u  nu  pnSn['        U[(        5      (       a#  S
[)        U5      ;   a  UR+                  S
5      u  nn['        U[,        5      (       aE  [.        R0                  " SU5      (       a  UnUX'   O5X;  a  U R3                  XUU5      nUX'   OX   nOU R3                  XUU5      nUR5                  Uc  UOU S
U 35        M     U R6                  R	                  U SSR                  U5       SU	 S35        U R6                  R	                  SSR                  S UR9                  5        5       5       S35        U R                  R;                  U5        [         R                  R<                  (       a  g[         R                  R>                  R@                  nURC                  X!US5        U   U R	                  U SU SU	 S35        SSS5        g! , (       d  f       g= f)z
Generates kernel call code.

triton: Defines whether the backend uses Triton for codegen. Otherwise it uses the CUDA language when gpu=True,
        and C++ when gpu=False.
rs  Nrp   z	c_void_p(rs   r   rr   z$call_args and arg_types do not matchz#call_args and raw_args do not matchre  z^(workspace|semaphore)z.run(z	, stream=del c              3  $   #    U  H  ov   M     g 7fr<   r   )r   ra   s     rE   r   <PythonWrapperCodegen.generate_kernel_call.<locals>.<genexpr>  s      E0D0D   rK  )"r+   r=   r  r   r   r  r)  rw   r-  r  r  r%  r   r   r   r   r_   r;  r   r   r   splitr8  r]   matchr   r  r   r  r   rN  r  r  r  )r  r  r  r  r   rI  r  r  call_args_strstream_name
stream_ptrtensor_argsall_argsr+  ra   r?  r@  r   arg_strr  s                       rE   generate_kernel_call)PythonWrapperCodegen.generate_kernel_callO  s%   " @177>>@&++.NN400HI77	B		-0*??,,
 $[M3JNN-qQ}oR
|1M %%' MM222#=#== (S^s9~-M 6M KH 6C	N28}I6 96 09I(30++C7 c3''C3s8O"yy~HCh44 xx 93??"%+2(/"&"A"A7A# ,3("-"2"==cWVWXG3;se1WI<NO/02 &&00-uTYYx%8$9;-qQ &&00tyy E0B0B0D EEFbI &&**;7ww"" !" 4 4 B B..yyRVW"NNk]%i}TUVW #""s   M%%
M3c                :    U R                   R                  U5        g r<   )r  r  )r  r   s     rE   r   PythonWrapperCodegen.writeline  s    

$rG   c                8    U H  nU R                  U5        M     g r<   r1  )r  r  r   s      rE   r  PythonWrapperCodegen.writelines  s    DNN4  rG   c                L    U R                   R                  [        U5      5        g r<   )r  r  r&   )r  ctxs     rE   r  "PythonWrapperCodegen.enter_context  s    

+c*+rG   c                  ^ ^ SSK JnJn  U" 5       (       a  SS Kn[	        U[
        5      (       a  [        UR                  R                  5      $ [	        U[        R                  5      (       a  [        U5      $ [	        U[        [        45      (       aB  [        R                   " S S5      5       m[        [!        U5      " UU 4S jU 5       5      5      $ [	        U["        R$                  R&                  5      (       a  [)        U5      $ [	        U[*        R,                  [*        R.                  [0        45      (       a  UR3                  5       $ U" 5       (       a-  [	        UWR4                  R6                  5      (       a  U" U5      $ [	        U[*        R8                  5      (       a  UR3                  5       $ [        U5      $ )Nr   )dtype_to_stringhas_triton_packagec                  &    \ rS rSr% S\S'   S rSrg)1PythonWrapperCodegen.val_to_arg_str.<locals>.Shimi  r   refc                    U R                   $ r<   )re  r  s    rE   __repr__:PythonWrapperCodegen.val_to_arg_str.<locals>.Shim.__repr__  s    88OrG   r   N)r   r   r  r  r   rg  r  r   rG   rE   Shimrd    s    $rG   ri  c              3  \   >#    U  H!  nT" [         R                  TU5      5      v   M#     g 7fr<   r  )r   r7  ri  r  s     rE   r   6PythonWrapperCodegen.val_to_arg_str.<locals>.<genexpr>  s)     VTUq1@@qIJJTUs   ),)r  ra  rb  r   r   r   rG  rC   rI  r   r
   r   r  rf  	dataclassrX   r   r  _ops
OpOverloadr   r   ri  
MutableBoxr!   r1  languager   rC  )r  r#  type_ra  rb  r   ri  s   `     @rE   r  #PythonWrapperCodegen.val_to_arg_str  s@   Ka""%%5::&&8OE4=))""$ $ #$ QVTUVV  5::0011&q))BIIr}}oFGG&&((!!jFOO4I4I&J&J"1%%2,,--&&((7NrG   c                :   UR                  5       nUR                  5       n[        UR                  5       5      n[        [        R
                  R                  U5      5      n[        UR                  5       5      nU R                  UR                  5       X#XFU5      $ r<   )
r  r@   r   r  r+   r=   r;  r  r	  rj  )r  r  r  r   r  allocation_shaper  s          rE   r  +PythonWrapperCodegen.make_buffer_allocation  s    ""$  "foo'( !<!<V!DEv((*+##OOve=M
 	
rG   c           
     &   Uc  UnU R                  U5      nU R                  U5      nU R                  U5      n	UR                  S;   a  U SUR                   SU SU	 SU S3
n
OU SU SU	 SUR                   SU S3
n
Xx:w  a  U
S	U SU	 S3-   n
U
$ )
N)rs  cudaxpuz = empty_strided_rr   rp   rs   z = empty_strided(r  r  z.as_strided()r   r   )r  rx   r  r   r  r  rt  r  codegen_allocation_shape_tuplecodegen_stride_tupler  s              rE   r	  $PythonWrapperCodegen.make_allocation  s     #$"==eD)-)H)H*
&  $>>vF;;00 &)&++a12"'('  &)12"'( )!;;-yq:  @,':&;2>R=SSTUUC
rG   c           	     `    U R                    U SU U R                   SU R                   SU 3	$ )Nr     rq   )r  r  r  )r  new_nameold_namer  s       rE   make_tensor_alias&PythonWrapperCodegen.make_tensor_alias	  s6    ,,zXJt{{m2dll^STU\T]^^rG   c                (    SUR                  5        3$ )NrK  )rj  )r  r  s     rE   r  %PythonWrapperCodegen.make_buffer_free
	  s    foo'())rG   c                8    SSR                  S U 5       5       3$ )NrK  rp   c              3  $   #    U  H  ov   M     g 7fr<   r   )r   rx   s     rE   r   :PythonWrapperCodegen.make_free_by_names.<locals>.<genexpr>	  s     >rN  )rw   )r  names_to_dels     rE   make_free_by_names'PythonWrapperCodegen.make_free_by_names	  s    dii>>>?@@rG   c           	     `    U R                    U SU U U R                   SU R                   S3	$ )Nr   r}   reuse)r  r  r  )r  r  r~  del_lines       rE   codegen_exact_buffer_reuse/PythonWrapperCodegen.codegen_exact_buffer_reuse	  s@    ../zXJxjQUQ\Q\P]]_`d`l`l_mmsttrG   c                   UR                  5       UR                  5       :X  d   eUR                  5       nUR                  5       nSnU[        R                  R	                  5       ;  a  U(       a  SU R                  U5       3nUR                  5       UR                  5       :X  a4  UR                  5       UR                  5       :X  a  U R                  XEU5      $ U R                  XR                  5       UR                  5       SU R                  R                  5      nU R                   U SU U SU R                   S3$ )N;z; r   r   r}  r  )r@   rj  r+   r=   r%  r  r  r  r  rj  r  r   r  r  )r  oldnewr  r  r~  r  reinterpret_views           rE   r  &PythonWrapperCodegen.make_buffer_reuse	  s   }}#--/111<<><<>1773355*D11#678H<<>S\\^+0@CNNDT0T228xPP88!11d6G6G6Q6Q
 ,,z-=,>xj4<<.X^__rG   c                    U R                  [        UU R                   U SUR                  5        U R                   SU R
                   S35      5        g )Nr   r}  z alias)r   r/   r  r1  r  r  )r  rx   views      rE   codegen_deferred_allocation0PythonWrapperCodegen.codegen_deferred_allocation#	  sS    <<.c$*@*@*B)CDKK=PRSWS_S_R``fg	
rG   c                   UR                  5       nU[        R                  R                  ;   d.  X R                  ;   d  [        U[        R                  5      (       a  g U R                  R                  U5        [        UR                  5       [        R                  [        R                  45      (       a  UR                  5       (       d  g UR                  5       n[        U[        R                  5      (       a  g [        U[        R                  5      (       a  g [        U[        R                   5      (       GaV  [        UR"                  [        R$                  5      (       d*   S['        UR"                  5       SUR"                   35       e[        UR"                  R(                  [        R*                  5      (       d$   ['        UR"                  R(                  5      5       e[        UR"                  R(                  R(                  [        R,                  5      (       d$   ['        UR"                  R(                  5      5       eU R/                  UR"                  R(                  R(                  5        U R1                  X#R"                  5        g [        U[        R2                  5      (       a  U R5                  [7        X5      5        g U R5                  [9        X5      5        g )Nzunexpected r   )rj  r+   r=   rt  r  r   r   DonatedBufferr   get_defining_opExternKernelAllocMultiOutputshould_allocater  MutationLayoutSHOULDREMOVEr  NonOwningLayoutr  r!   r   ri  
StorageBoxri  codegen_allocationr  r  r   r  rp  )r  r  rx   r  s       rE   r  'PythonWrapperCodegen.codegen_allocation+	  s     AGG+++~~%&""2"2334 &&(%%r~~6  **,,'')fb;;<<fbmm,,fb0011fkk2+=+=>> d6;;/06;;-@> fkk..>>VV[[EUEU@VV>fkk..33RYY??WfkkFVFVAWW?##FKK$4$4$9$9:,,T;;?fb1122NN1$?@|D12rG   c                   UR                  5       n[        U[        R                  [        R                  45      (       a!  U R                  U R                  U5      5        g [        UR                  5       [        R                  5      (       a  U R                  [        X5      5        g U R                  U5      (       d  g U R                  R                  U5        U R                  [        X5      5        g r<   )rj  r   r   InputBufferrA  r   r  r  r  r  	can_reuser  r   r#  )r  r  rx   s      rE   codegen_free!PythonWrapperCodegen.codegen_freeS	  s      fr~~r/A/ABCCNN40089f,,.0C0CDD NN-d;<~~f%%

t*489rG   c                2   UR                  5       nU[        R                  R                  ;   =(       d    U[        R                  R                  ;   =(       a:    [        [        R                  R                  U   [        R                  5      (       + =(       dz    U[        R                  R                  ;   =(       dV    U[        R                  R                  ;   =(       d2    U[        R                  R                  ;   =(       d    X0R                  ;   (       + $ r<   )rj  r+   r=   rt  r:  r   graph_inputs_originalr   r  r  r  never_reuse_buffersr  )r  input_bufferoutput_bufferrx   s       rE   r  PythonWrapperCodegen.can_reuseg	  s    $$&AGG+++ 
",,, "GG11$79I9I 
" qww(((
" qww222
" qww222
" zz!
 	
rG   c                    UR                  5       U R                  ;   =(       a.    U R                  UR                  5          UR                  5       :H  $ r<   )rj  r  )r  r  reused_buffers      rE   	did_reusePythonWrapperCodegen.did_reusew	  sC     OO, KFOO-.-2H2H2JJ	
rG   c                z   [        X5      (       d   eU R                  U5        U R                  R                  UR	                  5       5        U R
                  R                  UR	                  5       5        UR	                  5       U R                  UR	                  5       '   U R                  [        XU5      5        g r<   )	rO   r  r  r   rj  r  r  r   rw  )r  r  r  s      rE   codegen_inplace_reuse*PythonWrapperCodegen.codegen_inplace_reuse	  s    $\AAAA-

|,,./=11340<0E0E0GM**,-y]CDrG   c                    [        U5      nX R                  ;   a  U$ U R                  R                  U5        U R                  U-   $ r<   )r   r  r   r  )r  r   rx   s      rE   codegen_unbacked_symbol_decl1PythonWrapperCodegen.codegen_unbacked_symbol_decl	  sA    6{---K &&**40<<$&&rG   c                :  ^^^^ [        [        R                  R                  R                  U5      nU(       d  g UR                  5        HM  u  nmSU4S jjmUUUU4S jnU R                  U R                  U5       SU" 5        U R                   35        MO     g )Nc                  > US:X  a  U $ [        U5      S:  ai  [        US   [        5      (       aQ  [        US   [        R                  5      (       a/  T" U  SUS   R
                   SUS   R                   S3USS  5      $ [        US   [        5      (       a  T" U  SUS   R
                   S3USS  5      $ [        US   [        R                  5      (       a^  [        R                  R                  (       a   T" S	US   R                   S
U  S3USS  5      $ T" U  SUS   R                   S3USS  5      $ [        US   [        5      (       a  T" U  SUS   R                   S3USS  5      $ [        SU 35      e)Nr   r   r   r,   r   rr   rs   r^  z	std::get<z>(r  r  z.__floordiv__(rz  )r_   r   r   pytreeSequenceKeyrx   r  r+   r=   rN  r   r|  r`   )rI  r{  gos     rE   r  IPythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs.<locals>.go	  s   b=K LA%"71:}=="71:v/A/ABB&'!*//!2!GAJNN3C1Ewqr{   
M::a
'8;WQR[II
F,>,>?? 77.. Ywqz~~&6ba@'!"+N  4&'!*..)9 ;WQR[I
  
K88 nWQZ5G5G4HJGTUTVKXX(+@	)JKKrG   c                   > [         R                  R                  (       a  [        T5      S:X  a`  TS   n T" TS   R	                  5       [        U [        R                  5      (       a"  [        U R                  5      S:w  a	  TSS  5      $ T5      $ [        TS   [        R                  5      (       d   eT" TTS   R                     R	                  5       TSS  5      $ T" TT5      $ )Nr,   r   )r+   r=   rN  r_   rj  r   r   r  r  r  r  r  )r  r  r{  r  r  s    rE   go_outerOPythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs.<locals>.go_outer	  s    77&&
 7|q(%aj  "#AJ//1)#r~~>>3s{{CSWXCX $ABK   ")	    *'!*f6H6HIIII!''!*.."9"B"B"DgabkRRk733rG   r   )rI  r   r{  zpytree.KeyPath)	r   r+   r=   rA   	shape_envr\   r   r  r  )r  r  r  unbacked_bindingsr#  r  r  r{  s    ``   @@rE   (codegen_unbacked_symbol_defs_for_outputs=PythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs	  s     6GG&&(9
 ! ,113JAw
L<4 4. NN44Q78HJ<}Uu 4rG   c                  ^ ^^^ UU U4S jnUU U4S jn T R                  TR                  5        T R                  T R                   STR                   35        U" 5         [
        R                  n[
        R                  " TR                  5         TR                  R                  US9  S S S 5        U" 5         T R                  5         g ! , (       d  f       N&= f! T R                  5         f = f)Nc                   > [        TR                  R                  5      [        T5      :X  d   e[        TR                  R                  T5       H3  u  pTR	                  TR
                   U  SU TR                   35        M5     g r  )r_   r=   r:  r   r   r  r  )inner_inputouter_inputouter_inputsr  subgraphs     rE   _codegen_subgraph_prefixSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_prefix	  sr    x~~223s<7HHHH,/++\-( ||n[M[M$++O-rG   c                   > [        TR                  R                  5      [        T5      :X  d   e[        TR                  R                  T5       H5  u  pTR	                  U SU R                  5        TR                   35        M7     g r  )r_   r=   r>  r   r   r1  r  )inner_outputouter_outputouter_outputsr  r  s     rE   _codegen_subgraph_suffixSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_suffix	  st    x~~334M8JJJJ.1,,m/* #nC(F(F(H'I$++W/rG    subgraph: )parent_graph)	r4  r=   r   r  rx   r+   set_graph_handlercodegen_subgraphrC  )r  r  r  r  r  r  r  s   ````   rE   codegen_subgraph_by_inlining1PythonWrapperCodegen.codegen_subgraph_by_inlining	  s    			'%%hnn5NNdll^;x}}oFG$&77L$$X^^4//!- 0  5 %&$$& 54 $$&s$   A<C C,C 
CC C/c                L   [        U5      [        UR                  R                  5      :X  d!   SUR                  R                   SU 35       e[        UR                  R                  U5       H3  u  pEU R	                  U R
                   U SU U R                   35        M5     g )Nzgraph_input_names:z, outer_inputs: r   )r_   r=   rB  r   r   r  r  )r  r  r  r  r  r  s         rE   codegen_subgraph_prefix,PythonWrapperCodegen.codegen_subgraph_prefix
  s     < C(H(H$II 	
 !A!A BBRS_R`a	
I ),NN,,l)
$K NNdll^K=K=VW)
rG   c           	     v   UR                   nUR                  nSR                  UR                  5       5      [	        U5      S:X  a  SOS-   nU Vs/ s H  ofR                  5       PM     nnSR                  U5      [	        U5      S:X  a  SOS-   nU R                  SU SU S35        UR                  5        V	V
s/ s H  u  pU
(       d  M  U	PM     nn	n
U(       a#  U R                  SSR                  U5       35        U R                  S	U S
U SU S35        U R                  SU S35        gs  snf s  sn
n	f )z'Generate code to call a graph partitionrp   r,   rN  r  	partition	_args = [r  rK  rr   z) = self.partitions[z](partition_args)zdel partition_argsN)input_deallocationoutput_nodesrw   r  r_   rj  r   r\   )r  partition_idr  r  r  r  rC   output_namesr  rx   
deallocater  s               rE   codegen_partition_call+PythonWrapperCodegen.codegen_partition_call
  s=    2DD+88-2245)*a/CR
 5AALDLA))L)C4E4JSPRS 	<.	&CD *<)A)A)C
)C%TzD)C 	 
 NNT$))L"9!:;< 	y,\N+l^SYZ	
 	|nE:;! B
s   D0;D5D5c                V    [        U5       Vs/ s H  nSU 3PM
     snU l        g s  snf )N
partition_)r&  r  )r  num_partitionsr  s      rE   set_all_partition_names,PythonWrapperCodegen.set_all_partition_names7
  s*    BGBW#XBW3j$6BW#X #Xs   &c           	        UR                   R                  nSR                  U5      n[        U5      S:X  a  US-  nSR                  U5      [        U5      S:X  a  SOS-   nU R	                  UR                   R
                   SU S35        US [        U5        H  nU R	                  SU 35        M     U R	                  SU S	UR                   R
                   SUR                   R
                   S
35        g )Nrp   r,   rN  r  r  r  rK  rr   z) = r  )r=   rB  rw   r_   r   rx   )r  r  r  r  rO  inner_inputsouter_output_namesr  s           rE   codegen_subgraph_call*PythonWrapperCodegen.codegen_subgraph_call:
  s    nn66yy-{q CL!YY}5}%*C

 	(..--.i~QGH&':\):;KNNT+/0 < 	"#4(;(;'<Ahnn>Q>Q=RRXY	
rG   c                   [         R                  R                  (       a  U R                  XU5        g U R	                  UR                  5        U R                  S5        U R                  U R                   SUR                   35        U R                  XU5        [         R                  nUR                  UR                  l	        UR                  R                  U R                  ;  a  [         R                  " UR                  5         [        R                  " SS5         UR                  R                  5       u  pVS S S 5        S S S 5        U R                  R                  UR                  R                  5        U R!                  WR"                  5        U R%                  XU5        g ! , (       d  f       Ns= f! , (       d  f       N|= f)Nr  r  rT  F)r+   r=   rO  r  r4  r   r  rx   r  rN  r  r  r   patchr8  r   r  r   r  )r  r  r  r  r  subgraph_coder  s          rE   r  %PythonWrapperCodegen.codegen_subgraphO
  s<    77--hmT!!(..1r$,,{8==/BC$$X]Kww%1%=%=">>d&F&FF $$X^^4\\"3U;'/~~'='='?$M < 5
 ,,001D1DE,,]-@-@A""8=I <; 54s$   
F="F,?F=,
F:	6F==
Gc                z   UR                  5       nU R                  U S[        UR                  5       35        UR                   Vs/ s H  o3R                  5       PM     nn[        [        UR                  5      5       Vs/ s H
  oR SU S3PM     nnU R                  UR                  XF5        g s  snf s  snf )N = [None] * r  r  )	rj  r   r_   r  r  r1  r&  r  r  )r  invoke_subgraphrx   rD  r  r+  r  s          rE   codegen_invoke_subgraph,PythonWrapperCodegen.codegen_invoke_subgraphk
  s    '')$|C0G0G,H+IJK;J;Q;QR;QC--/;QR16s?;R;R7S1TU1TA61#Q1TUo66T SUs   B3B8c                   UR                  5       nUR                   Vs/ s H  o3R                  5       PM     nn[        [	        UR
                  5      5       Vs/ s H
  oR SU S3PM     nnUR                  R                  5       n[        UR                  [        R                  5      (       d  U S3nU R                  U S[	        UR
                  5       35        U R                  SU S35        U R                  [        XR                  R                  5      5        U R                  UR                  XF5        U R                  [        U 5      5        U R                  S5        U R                  [        XR                   R                  5      5        U R                  UR                   XF5        U R                  [        U 5      5        g s  snf s  snf )Nr  r  rx  r  r   rM  zelse:)rj  operandsr1  r&  r_   r  	predicater   r   ShapeAsConstantBufferr   r,  true_subgraphr=   r  r>  false_subgraph)r  conditionalrx   rD  r  r+  r  r  s           rE   codegen_conditional(PythonWrapperCodegen.codegen_conditionals
  su   ##%;F;O;OP;OC--/;OP16s;;N;N7O1PQ1PA61#Q1PQ));;=	+//1I1IJJ$+W-I$|C0C0C,D+EFGYKq)*(/H/H/N/NOPk77U'-.w(/I/I/O/OPQk88,V'-.! QQs   GGc                   UR                  5       nUR                   Vs/ s H  o3R                  5       PM     nnUR                   Vs/ s H  o3R                  5       PM     nnU R	                  U S[        U5       35        [        U5       H  u  pgU R	                  U SU SU 35        M      / [        [        U5      5       Vs/ s H
  ob SU S3PM     snQUQnU S3/n	[        U5      n
U
S [        U5       nU R	                  S5        U R	                  [        XR                  R                  5      5        U R                  UR                  X5        U R	                  SU	S    S	35        U R	                  [        U 5      5        U R	                  [        XR                  R                  5      5        U R                  UR                  X5        U R	                  [        U 5      5        g s  snf s  snf s  snf )
Nr  r  z] = r  _cond_resultzwhile True:zif not r   z: break)rj  carried_inputsr1  additional_inputsr   r_   r;  r&  r  r,  cond_subgraphr=   r  r>  body_subgraph)r  
while_looprx   rD  outer_carried_inputsouter_additional_inputsr+  inpcond_outer_inputscond_outer_outputsbody_outer_inputsbody_outer_outputss               rE   codegen_while_loop'PythonWrapperCodegen.codegen_while_loop
  s   ""$/9/H/H 
/H!!#/H 	  
 0:/K/K#
/K!!#/K 	  #
 	$|C0D,E+FGH 45FANNdV1QCtC512 6
&+C0D,E&FG&Fas!n&FG
$
 "&l34 
 //J5I1JK}%(/G/G/M/MNO$$&7	
 	(+,G4	
 	'-.(/G/G/M/MNO$$&7	
 	'-.M 
#
 Hs   G.G3G8c                     [        U SS 5      (       a  g [        U [        5      (       a  U $ [        R                  R
                  R                  U 5      nUc  U$ [        U5      $ ! [         a     g f = f)Nr  )rh  r   r   r+   r=   
_shape_env_maybe_evaluate_staticr  )r   r   s     rE   statically_known_int_or_none1PythonWrapperCodegen.statically_known_int_or_none
  sl    	q.$// !S!!''$$;;A>C{
s8O 		s!   A% A% -A% 
A% %
A21A2c                r    / nU  H.  n[         R                  U5      nUc    g UR                  U5        M0     U$ r<   )r-  r  r  )lstr  r   nums       rE   %statically_known_list_of_ints_or_none:PythonWrapperCodegen.statically_known_list_of_ints_or_none
  s<    A&CCAFC{MM#	 
 rG   c                0    [         R                  U 5      S L$ r<   )r-  r  )r  s    rE    is_statically_known_list_of_ints5PythonWrapperCodegen.is_statically_known_list_of_ints
  s     !FFsKSWW	
rG   c                H    [         R                  U R                  5       5      $ r<   )r-  r  r  r  s    rE   rx  4PythonWrapperCodegen.static_shape_for_buffer_or_none
  s    #IIOO
 	
rG   c                0    [         R                  U 5      S L$ r<   )r-  rx  r#  s    rE   !can_prove_buffer_has_static_shape6PythonWrapperCodegen.can_prove_buffer_has_static_shape
  s    #CCFKSWWWrG   )*r  r  r  r  r  r  r  r  r  r  r  r0  r  r  r  r  r  r  r  r  r   r  r   r  r  rI  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r<   )r  r   r  r   r  Optional[PythonWrapperCodegen]r  $Optional[ir.GraphPartitionSignature]r:  )rx   r   r  r   r   r$  )r  r   )r+  TritonMetaParamsr   r   r   	list[str]r   z>dict[str, Union[ir.TensorBox, ir.TorchBindObject, sympy.Expr]]r   zlist[IRNode])rO  r,  r  )rH  r   r   r   rn  )rH  r   r   r$  )r  r,  r   r$  r  r0   r   r$  )ry   r   r  r   r  r   rz   r,  r  r   r   r$  )F)NNN)
r  r   r  r   r  r   r  r,  r  zOptional[torch._ops.OpOverload])rx   r   r   zir.TensorBoxr=  zOrderedSet[sympy.Symbol])rH  zsympy.Symbol)r   r
   rB   r   r   r   )r   r
   r   r   )rZ  r   rx   r   r  r   r   r   )r  zSequence[Expr]r   r   )r   zCallable[..., None]r   r   )rn  r   )NTN)
r  r   r  r   r  r   r  r   r  r   )r  r   )r   z"list[list[Union[int, sympy.Expr]]])r  r   r  r   )r
  r2   )NN)r  r   )r  rq  )r  )r  z%Union[BufferLike, ir.TorchBindObject])r  r,  )r  r   r~  r   r  r   )r  rq  r  rq  r  r   )rx   r   r  zir.ReinterpretViewr   r$  r  r  )r  r  r  r  )r  r   r  r   r  z,Optional[dict[sympy.Symbol, pytree.KeyPath]]r   r$  )r  r   r  zir.GraphPartitionSignature)r  r   )}r   r   r  r  __doc__r  r  r  r  r  r  r  r  r$   r%  r(  r-  r3  r6  r;  r0  rE  rH  rK  rQ  rW  rZ  r  rb  r  ro  r4  rC  r/  r@  r}  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r>  ra  rJ  r  rQ  rT  rW  r[  r   r  rf  rj  ro  rr  r}  r  r  r  r  r  r   r  r  r  r  r  r  r  r   r  r)  r   rE  rW  r   r  r  r  r  r	  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   rx  r&  r  r%  r&  s   @rE   r-  r-    s   P#d 
 FJ	&&$& 7& C	& &'<;Az
   "  	! 
 

$	G$
%S$	(
.8)6(-.10J
/+7&5
5A:;; ;  	;
 ; ; 
;,
&< 8<
X
X  
X 	
X
  
X 5
X0M
^!SF:

B(K(K (K -	(KTJ"5 @D W CG +.&'6
(  ' 
:>G4*[Yz, #'(,!! !  	!
 ! &!&2C; 2C;J1%N:.>85W

4
4QGf' _X_XB !, F
 DH@_*Au` 
&3P:(
 
E'NN N H	N
 
N`+'Z	X<< 9<>Y
*J8U/*(/T     
 

 
 

 X XrG   r-  c                     ^  \ rS rSrSr S     SU 4S jjjrSS jrSS jrS rS r	S r
SS	 jrSS
 jrSS jrSS jr  SS jrSS jrSS jrSU 4S jjr\SS j5       r\SS j5       rSrU =r$ )r   i
  z
A wrapper codegen that generates code for a subgraph. For most of the
methods, we rely on the implementation in the PythonWrapperCodegen. But we
override a few functions to produce cleaner code (like avoiding writing
imports twice in the output code)
c                F   > Xl         X l        X0l        [        TU ]  5         g r<   )r  r  r  r  r  )r  r  r  r  r  s       rE   r  %SubgraphPythonWrapperCodegen.__init__
  s"     +,$8!rG   c                &    U R                   U l        g r<   )r  r  r  s    rE   r  1SubgraphPythonWrapperCodegen.set_launcher_fn_name
  s     !% 2 2rG   c                    g r<   r   r  s    rE   r  )SubgraphPythonWrapperCodegen.write_header
  r  rG   c                    g r<   r   r  s     rE   r  2SubgraphPythonWrapperCodegen.add_benchmark_harness
  r  rG   c                    g r<   r   r  s     rE   r  6SubgraphPythonWrapperCodegen.benchmark_compiled_module
  r  rG   c                    g r<   r   r  s    rE   rK  5SubgraphPythonWrapperCodegen.write_async_compile_wait  r  rG   c                6    U R                   R                  5       $ r<   )r  r}  r  s    rE   r}  /SubgraphPythonWrapperCodegen.next_kernel_suffix  s    ""5577rG   c                    g r<   r   r  s     rE   r  2SubgraphPythonWrapperCodegen.generate_after_suffix  r8  rG   c                \    U R                   R                  SU R                   S35        SnU$ )Nz
            def z(args):
            r,   )r  r   r  rU  s     rE   rW  >SubgraphPythonWrapperCodegen.write_launcher_fn_call_get_indent  s<    &&' (	

 rG   c                    gr   r   r  s    rE   r  4SubgraphPythonWrapperCodegen.get_wrapper_call_indent  s    rG   c                |    U R                   =n(       a  UR                  nU$ [        R                  R                  nU$ r<   )r  input_nodesr+   r=   r:  )r  r  r  s      rE   r;  -SubgraphPythonWrapperCodegen.get_graph_inputs  s=     11191**F  WW))FrG   c                    U R                   =n(       a%  [        UR                  R                  5       5      nU$ [        R
                  R                  nU$ r<   )r  r  rH  r  r+   r=   rB  )r  r  namess      rE   rZ  2SubgraphPythonWrapperCodegen.get_graph_input_names   sI    11191..3356E  GG--ErG   c                |    U R                   =n(       a  UR                  nU$ [        R                  R                  nU$ r<   )r  r  r+   r=   r>  )r  r  r  s      rE   r0  .SubgraphPythonWrapperCodegen.get_graph_outputs'  s;    11191,,G  gg++GrG   c                   > UR                  5       nU R                  =n(       a  X#R                  ;   a  g [        TU ]  U5        g r<   )rj  r  rH  r  r  )r  r  rx   r  r  s       rE   r  /SubgraphPythonWrapperCodegen.codegen_allocation.  s=     222I2@U@U8U "6*rG   c                8    U R                   R                  5         g r<   )r  r%  r  s    rE   r%  5SubgraphPythonWrapperCodegen.write_triton_header_once8  s     	446rG   c                8    U R                   R                  5         g r<   )r  r(  r  s    rE   r(  =SubgraphPythonWrapperCodegen.write_get_raw_stream_header_onceA  s     	<<>rG   )r  r  r  r  r<   )r  r   r  r-  r  r)  r:  rn  r/  r  r-  r+  r.  r0  )r   r   r  r  r1  r  r  r  r  r  rK  r}  r  rW  r  r;  rZ  r0  r  r$   r%  r(  r  r%  r&  s   @rE   r   r   
  s     FJ	 - C	 3
8	G+ 7 7 ? ?rG   r   )rC   rq  r   r"  )rK   rq  rL   rq  )ra   torch.Argumentr   r   )rl   rU  r   r   )ry   ztorch._ops.OpOverloadr   r   r<   )
rx   r   r   zlist[triton.Config]r   zlist[TritonGrid]r   r(  r   ztuple[str, str]rn  )
__future__r   r  r   rf  r   ry  r  r  r{  r  r]   r  	itertoolsr   typingr   r   r   r   r	   r   r
   r  
torch._opstorch.utils._pytreeutils_pytreer  r   r8  torch._dynamo.utilsr   r   #torch._inductor.codegen.debug_utilsr   $torch._inductor.codegen.multi_kernelr   %torch._inductor.runtime.runtime_utilsr   %torch.fx.experimental.symbolic_shapesr   r   r   r   r   torch.fx.noder   torch.utils._ordered_setr    torch.utils._sympy.singleton_intr   torch.utils._sympy.symbolr   r   r  r   r   r   	codecacher   r    r!   runtimer"   runtime.hintsr#   r$   r%   r&   r'   r(   r)   r*   virtualizedr+   r  r-   r.   r/   r0   r1   r2   r3   	cpp_utilsr4   triton_utilsr5   r6   r7   collections.abcr8   r9   r   r=   r:   doprintrG  r   r  r   r"  ri  rq  rF   rO   ri   rn   r   r  r   r*  r   r   r   rl  r   r
  r(  r,  r>  rG  rW  r\  rp  r#  rw  ru  r  r  r  r  r-  r   r   rG   rE   <module>rn     sr   "    
      	   @ @     $ $ & 6 C A ;  . / 9 : ( ( ' ( ' ,       P P 2% 	 u{{C/0299l*+
	>B>$<$ S> 	%UZZ
 #
%&2B1CU3PS8_1T(UU
 /3	U&
U& U& U& ,	U&
 U&pJ&Z   * **	 	 	 	 	 {   KK K KD;  ; ; ;2 %  : E, E E, 
" 
 
(	! 	 ![ ! !: &^ & &R N N N 
N!X7 N!XbBl?#7 l?rG   