
    shi                       S SK Jr  S SKrS SKrS SKJrJrJr  S SKrS SK	J
r
  S SKrS SKJr  S SKJr  SSKJrJr  SS	KJrJrJr  S
SKJrJrJrJrJrJr  S
SKJ r J!r!J"r"  \(       a  S SKJ#r#  SSK$J%r%J&r&  SSK'J(r(J)r)  S
SKJ*r*  \RV                  S\RX                  S\RZ                  S\R\                  S\R^                  S\R`                  S\Rb                  S\Rd                  S\Rf                  S0	r4S%S jr5 " S S\5      r6 " S S\5      r7\7Rq                  S5         " S  S!\!5      r9\Rt                  S&S" j5       r; " S# S$\"5      r<g)'    )annotationsN)AnyOptionalTYPE_CHECKING)
PRECEDENCE)ExprPrinter)ValueRanges   )get_bounds_index_exprget_kernel_metadata)ops
OpsWrapperV   )CSEVariableDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferOpOverridesPythonPrinter)IterationRangesEntry
SIMDKernelSIMDScheduling)Union)ReductionType	StoreMode)	SchedulerSchedulerNode)OpVarTboolcharshortintlongucharfloathalfbfloatc                    [        U [        5      (       a<  U [        R                  :X  a  gU [        R                  * :X  a  gX :w  a  g[	        U 5      $ [        U [
        5      (       a  U (       a  S$ S$ [	        U 5      $ )N	HUGE_VALFz
-HUGE_VALFNANtruefalse)
isinstancer&   torchinfstrr    )vals    o/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/_inductor/codegen/mps.pyvalue_to_metalr4   2   sc    #u%))UYYJZ3x	C		v)')s8O    c                  d    \ rS rSrSS jrSS jrSS jrSS jrSS jrSS jr	SS jr
SS	 jrS
rg)MetalExprPrinter@   c                    UR                   u  p#U R                  U5      nU R                  U5      nUR                  (       a	  SU SU S3$ SU SU S3$ )N() / ()metal::floor(argsdoprint
is_integer)selfexprxdivs       r3   _print_FloorDiv MetalExprPrinter._print_FloorDivA   sY    LLOll3??qcse1%%qcse1--r5   c                    UR                   u  p#nU R                  U5      nUS:w  a5  U R                  U5      nUR                  (       a
  SU SU S3nO	SU SU S3nU R                  U5      nSU SU S3$ )Nr   r:   r;   r<   r=   z) % (r>   )rB   rC   rD   rE   mods        r3   _print_ModularIndexing'MetalExprPrinter._print_ModularIndexingI   s    iiLLO!8,,s#Cs%uA&#A3eC52ll31#U3%q!!r5   c                    [        UR                  5      S:w  a  [        S5      eSSR                  [	        U R
                  UR                  5      5       S3$ )Nr
   z$metal::min only supported for 2 argszmetal::min(, r<   lenr?   RuntimeErrorjoinmap_printrB   rC   s     r3   
_print_MinMetalExprPrinter._print_MinU   F    tyy>QEFFTYYs4;;		'BCDAFFr5   c                    [        UR                  5      S:w  a  [        S5      eSSR                  [	        U R
                  UR                  5      5       S3$ )Nr
   z$metal::max only supported for 2 argszmetal::max(rM   r<   rN   rT   s     r3   
_print_MaxMetalExprPrinter._print_MaxZ   rW   r5   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr   metal::abs(r   r<   rO   r?   rS   rT   s     r3   
_print_AbsMetalExprPrinter._print_Abs_   s9    499~"""T[[167q99r5   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr   zstatic_cast<long>(metal::rint(r   z))r]   rT   s     r3   _print_RoundToInt"MetalExprPrinter._print_RoundToIntc   s9    499~"""/DIIaL0I/J"MMr5   c                    [        UR                  5      S:X  d   eUR                  u  p#UR                  (       a  US:  d   e[        SU S35      eU R	                  U[
        S   5      nSU SU SU*  S	3$ )
Nr
   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .Mulz!static_cast<float>(metal::rint(1e * z) * 1er<   )rO   r?   rA   
ValueErrorparenthesizer   )rB   rC   numberndigits
number_strs        r3   _print_RoundDecimal$MetalExprPrinter._print_RoundDecimalg   s    499~"""))Q;;abiajjkl  &&vz%/@A
27)3zl&RYQYPZZ[\\r5   c                l    UR                   u  p#SU R                  U5       SU R                  U5       S3$ )Nstatic_cast<float>(z) / static_cast<float>(r<   )r?   rS   )rB   rC   lhsrhss       r3   _print_IntTrueDiv"MetalExprPrinter._print_IntTrueDivs   s9    99$T[[%5$66MdkkZ]N^M__`aar5    N)rC   
sympy.Exprreturnr1   )__name__
__module____qualname____firstlineno__rF   rJ   rU   rY   r^   ra   rl   rr   __static_attributes__rt   r5   r3   r7   r7   @   s/    .
"G
G
:N
]br5   r7   c                  N   \ rS rSr\  S7         S8S jj5       r\        S9S j5       r\S:S j5       r\S;S j5       r\S<S j5       r	\S=S j5       r
\S>S	 j5       r\S?S
 j5       r\S?S j5       r\S?S j5       r\S?S j5       r\S@S j5       r\S@S j5       r\S@S j5       r\S@S j5       r\S@S j5       r\S@S j5       r\S@S j5       r\S@S j5       r\S@S j5       r\S@S j5       r\S@S j5       r\S@S j5       r\S@S j5       r\S@S j5       r\SAS j5       r\S@S j5       r\S@S j5       r \S@S j5       r!\S@S  j5       r"\S@S! j5       r#\S@S" j5       r$\S@S# j5       r%\S@S$ j5       r&\S@S% j5       r'\S?S& j5       r(\S@S' j5       r)\S@S( j5       r*\S?S) j5       r+\S@S* j5       r,\S?S+ j5       r-\S@S, j5       r.\SBS- j5       r/\SBS. j5       r0\          SCS/ j5       r1\S@S0 j5       r2\S?S1 j5       r3\S?S2 j5       r4\S@S3 j5       r5\S@S4 j5       r6\S@S5 j5       r7S6r8g)DMetalOverridesy   Nc                "    S[         U    SU  S3$ )Nzstatic_cast<>(r<   DTYPE_TO_METAL)rD   dtype	src_dtypeuse_compute_typess       r3   to_dtypeMetalOverrides.to_dtypez   s     nU34Bqc;;r5   c                "    S[         U    SU  S3$ )Nz*reinterpret_cast<thread z*>(&r<   r   )rD   r   r   s      r3   to_dtype_bitcastMetalOverrides.to_dtype_bitcast   s     +>%+@*AaSJJr5   c                    [        U 5      $ Nr4   )r2   r   s     r3   constantMetalOverrides.constant   s    c""r5   c                :   [         R                  R                  [         R                  R                  U 5      5      n[         R                  R                  R                  [         R                  R                  U[        U 5      S9n[        R                  " X15      $ )N)bounds)
r   kernelindex_to_strprepare_indexingcsegeneratecomputer   r   r   )rC   r   idx_strvars       r3   
index_exprMetalOverrides.index_expr   sj    ((''(A(A$(GHhhll##HHg.CD.I $ 
 ||C''r5   c                    [         R                  R                  X5       nU" 5       nS S S 5        WR                  R                  (       a  [        U5      n[        R                  " WXB5      $ ! , (       d  f       NK= fr   )r   r   
mask_loadsr   is_boolr    r   where)maskbodyothernew_maskresults        r3   maskedMetalOverrides.masked   sV     XX  -VF . ==  KEyy611 .-s   A--
A;c                (    U  SU S[        U5       3$ )Nz ? z : r   )abcs      r3   r   MetalOverrides.where   s    Cs#nQ/011r5   c                   [        U[        5      (       a/  UR                  b"  UR                  R                  (       d  U  SU 3$ [        U [        5      (       a$  U R                  [        R
                  :w  a  SU  S3OU n[        U[        5      (       a$  UR                  [        R
                  :w  a  SU S3OUnU SU SU SU S3$ )N % ro   r<   z - z * metal::floor( / )r.   r   r   is_floating_pointr/   r&   r   r   float_afloat_bs       r3   	remainderMetalOverrides.remainder   s     q+&&#GG--SA3< ![))agg.D "!A& 	 ![))agg.D "!A& 	
 #gY&6wis7)1MMr5   c                D    SU  SU SU  S3nSU  SU SU S3nSU SU S3$ )Nstatic_cast<decltype(+)>(r<   zc10::metal::max(rM   rt   r   r   
typecast_a
typecast_bs       r3   maximumMetalOverrides.maximum   K    ,QCq3qc;
,QCq3qc;
!*R
|1==r5   c                D    SU  SU SU  S3nSU  SU SU S3nSU SU S3$ )Nr   r   r   r<   zc10::metal::min(rM   rt   r   s       r3   minimumMetalOverrides.minimum   r   r5   c                    U  SU 3$ )Nz || rt   r   r   s     r3   
logical_orMetalOverrides.logical_or       D}r5   c                    U  SU 3$ )Nz && rt   r   s     r3   logical_andMetalOverrides.logical_and   r   r5   c                    SU  S3$ )Nzmetal::isnan(r<   rt   rD   s    r3   isnanMetalOverrides.isnan       qc##r5   c                    SU  S3$ )Nzmetal::isinf(r<   rt   r   s    r3   isinfMetalOverrides.isinf   r   r5   c                    SU  S3$ )Nzmetal::log(r<   rt   r   s    r3   logMetalOverrides.log       QCq!!r5   c                    SU  S3$ )Nzmetal::exp(r<   rt   r   s    r3   expMetalOverrides.exp   r   r5   c                    SU  S3$ )Nr\   r<   rt   r   s    r3   absMetalOverrides.abs   r   r5   c                    SU  S3$ )Nzmetal::signbit(r<   rt   r   s    r3   signbitMetalOverrides.signbit        1%%r5   c                    SU  S3$ )Nzmetal::precise::sin(r<   rt   r   s    r3   sinMetalOverrides.sin       %aS**r5   c                    SU  S3$ )Nzc10::metal::sinc(r<   rt   r   s    r3   sincMetalOverrides.sinc       "1#Q''r5   c                    SU  S3$ )Nzmetal::precise::cos(r<   rt   r   s    r3   cosMetalOverrides.cos   r   r5   c                    SU  S3$ )Nzc10::metal::i0(r<   rt   r   s    r3   i0MetalOverrides.i0   r   r5   c                    SU  S3$ )Nzc10::metal::i1(r<   rt   r   s    r3   i1MetalOverrides.i1   r   r5   c                    SU  S3$ )Nzc10::metal::erf(r<   rt   r   s    r3   erfMetalOverrides.erf   s    !!A&&r5   c                    SU  S3$ )Nzc10::metal::erfinv(r<   rt   r   s    r3   erfinvMetalOverrides.erfinv   s    $QCq))r5   c                    SU  S3$ )Nzc10::metal::log_gamma(r<   rt   r   s    r3   lgammaMetalOverrides.lgamma  s    's!,,r5   c                    SU  SU S3$ )Nzc10::metal::polygamma(rM   r<   rt   )rD   ys     r3   	polygammaMetalOverrides.polygamma  s    's"QCq11r5   c                    SU  S3$ )Nzc10::metal::digamma(r<   rt   r   s    r3   digammaMetalOverrides.digamma	  r   r5   c                    SU  S3$ )Nzmetal::tan(r<   rt   r   s    r3   tanMetalOverrides.tan  r   r5   c                    SU  S3$ )Nzmetal::asin(r<   rt   r   s    r3   asinMetalOverrides.asin      aS""r5   c                    SU  S3$ )Nzmetal::acos(r<   rt   r   s    r3   acosMetalOverrides.acos  r  r5   c                    SU  S3$ )Nzmetal::atan(r<   rt   r   s    r3   atanMetalOverrides.atan  r  r5   c                    SU  S3$ )Nzmetal::sqrt(r<   rt   r   s    r3   sqrtMetalOverrides.sqrt  r  r5   c                    SU  S3$ )Nzmetal::rsqrt(r<   rt   r   s    r3   rsqrtMetalOverrides.rsqrt!  r   r5   c                    SU  S3$ )Nzmetal::tanh(r<   rt   r   s    r3   tanhMetalOverrides.tanh%  r  r5   c                    SU  S3$ )Nzmetal::atanh(r<   rt   r   s    r3   atanhMetalOverrides.atanh)  r   r5   c                H    U  SU 3nU  SU 3nSU  SU SU SU SU SU S	3$ )
Nr   r   z((z
 < 0) != (z	 < 0) ? (z != 0 ? z - 1 : z) : r<   rt   )r   r   quotrems       r3   floordivMetalOverrides.floordiv-  sR     Cs|3qclA3j9SE$wtfDQUPVVWXXr5   c                    SU  S3$ )Nr=   r<   rt   r   s    r3   floorMetalOverrides.floor4  r   r5   c                    SU  S3$ )Nzmetal::sign(r<   rt   r   s    r3   signMetalOverrides.sign8  r  r5   c                D    SU  SU SU  S3nSU  SU SU S3nSU SU S3$ )Nr   r   r   r<   zmetal::fmod(rM   rt   r   s       r3   fmodMetalOverrides.fmod<  sK    ,QCq3qc;
,QCq3qc;
j\J<q99r5   c                    SU  S3$ )Nmetal::trunc(r<   rt   r   s    r3   truncMetalOverrides.truncB  r   r5   c                    U R                   [        R                  :w  a  SU  S3OU nUR                   [        R                  :w  a  SU S3OUnSU SU S3$ )Nro   r<   r)  /)r   r/   r&   r   s       r3   truncdivMetalOverrides.truncdivF  sY     125;;0F's!,A015;;0F's!,Awiq	33r5   c                    SU  S3$ )Nzmetal::ceil(r<   rt   r   s    r3   ceilMetalOverrides.ceilN  r  r5   c                    SU  SU S3$ )Nzc10::metal::rand(rM   r<   rt   seedoffsets     r3   randMetalOverrides.randR  s    "4&6(!44r5   c                    SU  SU S3$ )Nzc10::metal::randn(rM   r<   rt   r4  s     r3   randnMetalOverrides.randnV  s    #D6F8155r5   c           	          SU  SU SU SU S3	$ )Nzc10::metal::randint64(rM   r<   rt   )r5  r6  lowhighs       r3   	randint64MetalOverrides.randint64Z  s%     (vRxr#baHHr5   c                    SU  S3$ )Nzmetal::round(r<   rt   r   s    r3   roundMetalOverrides.round`  r   r5   c                D    SU  SU SU  S3nSU  SU SU S3nSU SU S3$ )Nr   r   r   r<   zmetal::pow(rM   rt   )r   r   cast_acast_bs       r3   powMetalOverrides.powd  sK    (1QCs1#Q7(1QCs1#Q7VHBvha00r5   c                    SU  SU S3$ )Nzc10::metal::zeta(rM   r<   rt   r   s     r3   zetaMetalOverrides.zetaj  s    "1#Rs!,,r5   c                    SU  S3$ )Nz c10::metal::spherical_bessel_j0(r<   rt   r   s    r3   spherical_bessel_j0"MetalOverrides.spherical_bessel_j0n  s    1!A66r5   c                    SU  S3$ )Nzc10::metal::xlog1py(r<   rt   r   s    r3   xlog1pyMetalOverrides.xlog1pyr  r   r5   c                    SU  S3$ )Nzc10::metal::entr(r<   rt   r   s    r3   entrMetalOverrides.entrv  r   r5   rt   )NT)
rD   r   r   torch.dtyper   zOptional[torch.dtype]r   r    rv   r1   )rD   r   r   rU  r   rU  rv   r1   )r2   zUnion[bool, float, int]r   rU  rv   r1   )rC   ru   r   rU  rv   r1   )r   r   r   ru   r   r   rv   r1   )r   r   r   r   r   r   rv   r1   )r   r   r   r   rv   r1   )r   r   r   r   rv   r1   )rD   r   rv   r1   )rD   r   r   r   rv   r1   )r5  r   r6  r   rv   r1   )
r5  r   r6  r   r=  r   r>  r   rv   r1   )9rw   rx   ry   rz   staticmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r	  r  r  r  r  r  r  r   r#  r&  r*  r.  r1  r7  r:  r?  rB  rG  rJ  rM  rP  rS  r{   rt   r5   r3   r}   r}   y   s    ,0"&	<<< )<  	<
 
< < KK*K7BK	K K
 # # ( ( 2 2 2 2 N N( > >
 > >
     $ $ $ $ " " " " " " & & + + ( ( + + & & & & ' ' * * - - 2 2 + + " " # # # # # # # # $ $ # # $ $ Y Y $ $ # # : :
 $ $ 4 4 # # 5 5 6 6 II#.I5@IHSI	I I
 $ $ 1 1
 - - 7 7 + + ( (r5   r}   mpsc                  p  ^  \ rS rSr\rSrSrSr\	" 5       R                  r\" 5       R                  r\r      SU 4S jjrSS jrSS jr S         SS	 jjrS\R(                  " 5       4       SS
 jjr          SS jrSS jrSS jrSSS jjrSSS jjr          SS jrSrU =r$ )MetalKerneli~  ;auto i   c                j   > [         TU ]  " U40 UD6  [        R                  " 5       U l        SU l        g )NF)super__init__	itertoolscountacc_var_idsmultistage_reduction)rB   tilingkwargs	__class__s      r3   r^  MetalKernel.__init__  s.    
 	*6*$??,$)!r5   c                    [         U   $ r   r   )rB   r   s     r3   dtype_to_strMetalKernel.dtype_to_str  s    e$$r5   c                   U R                   R                  U5      nU R                  U5      nU SU R                  U5       S3nU R                  R                  U R                  U[        R                  R                  U5      S9$ )z"Codegen a load from an InputBuffer[]r   )
r?   inputr   r   r   r   loadsr   graph	get_dtype)rB   nameindexr   lines        r3   loadMetalKernel.load  sq    iiood#%%e,a))%013xx  T9J9J49P QQr5   Nc                   U R                   R                  U5      nU R                  U5      nU R                  [        R
                  R                  U5      5      nU SU R                  U5       SU SU S3nU R                  (       a%  U R                  R                  [        X5      5        g U R                  R                  [        X5      5        g )Nrk  z] = static_cast<r   );)r?   outputr   rh  r   rp  rq  r   inside_reductionr   	writeliner   stores)rB   rr  rs  valuemoder   	dtype_strrt  s           r3   storeMetalKernel.store  s     iit$%%e,%%agg&7&7&=>	a))%011A)BugUWX  LL""<#;<KK!!,t":;r5   c           	     R   S[        U R                  5       3n[        R                  R	                  XCU5      nU(       a6  U R
                  R                  SU R                  U5       SU SU S35        U$ U R
                  R                  SU R                  U5       SU S35        U$ )Ntmp_acc_zthreadgroup  rk  z];rZ  )nextra  r   r   create_cse_varindexing_coder{  rh  )rB   r   
elem_countr   var_namer   s         r3   _new_accvarMetalKernel._new_accvar  s     d4#3#3456hh%%h>((t0078(1ZLPRS 
 ((t0078(1E 
r5   c                	   [        S U R                   5       5      n[        UR                  U R                  5      nUS:X  a  U R                  U5      nU R                  R                  U S35        U R                  R                  S5        U R                  R                  SU SU S35        U R                  R                  S5        U$ US;   a  U R                  X&5      nU R                  (       al  US	:X  a  S
OSu  pU R                  R                  U SUR                   SU	 S35        U R                  R                  U SUR                   SU
 SU S35        O.U R                  R                  U SUR                   SU S35        U R                  R                  U R                  SU SU SU S3[        U   S9$ US;   Ga*  U R                  X&5      nU SUR                   S3n[         U   nU R                  (       dS  U R                  R                  U SU SU S35        U R                  R                  U R                  SU SU SU S3US9$ UR#                  S5      (       a  SOSnU R                  R                  U SU SU S35        UR%                  S 5      (       a  [        S! U R&                  R)                  5        5       5      nU R                  [*        R,                  U5      nUS":X  a  S#OS$nU SUR                   S3nU R                  R                  U S%35        U R                  R                  SU S&U S&U SU S'U S(U S'UR                   S)35        U R                  R                  U R                  U S*U SU SU S+3US9$ U R                  R                  U S,U SU SU S35        U R                  R                  U R                  SU SU SU S3US9$ US-:X  a  U R                  (       a
   S.U 35       eU R                  X&5      nU R                  R                  U SUR                   SU S35        U R                  R                  U R                  SU SU SU S35      n[.        R0                  " U S/3U S03U R2                  R4                  45      $ [7        U5      e)1zCodegen a reduction operationc              3  J   #    U  H  oR                   (       d  M  Uv   M     g 7fr   is_reduction.0ts     r3   	<genexpr>(MetalKernel.reduction.<locals>.<genexpr>  s     K(81NNQQ(8   #	#anyz	 = false;z7threadgroup_barrier(metal::mem_flags::mem_threadgroup);z
                if (z) {
                    z' = true;
                }
            )prodsumr  )r   r   )r   *rk  z] = rZ  z] z= zc10::metal::threadgroup_r:   rM   r<   rm  )maxminargminargmaxrl  z = static_cast<r   rx  r  lowestz = ::metal::numeric_limits<z>::z();argc              3  J   #    U  H  oR                   (       d  M  Uv   M     g 7fr   r  r  s     r3   r  r    s      =!AA=r  r  ><z = -1;r   = z;
                    z$;
                }
                z[c10::metal::threadgroup_z)]z = ::c10::metal::welford_reducez+Multistage reduction not yet supported for z.xz.y)r  range_treesr  numelmax_threadgroup_sizer  r  r{  r   splicer|  rb  rr  r   r   r   r   endswith
startswithrange_tree_nodesvaluesr/   r$   r   _unwrapfeaturesreduction_numelNotImplementedError)rB   r   r   reduction_typer}  reduction_dimacc_buf_sizeaccacc_bufdefault_valreduction_opacc_thread_varsrc_metal_typelim_fnidx_varidx_acc_bufcmp_opidx_thread_varwf_ress                      r3   	reductionMetalKernel.reduction  sk    K(8(8KK=..0I0IJU"""5)C((C5	):;((I LLG E  KK!!I J_,&&y?G(( .% 7HX * "",,iq!3!3 4DQG ##iq!3!3 4B|nBugQO ##wiq1C1C0DDq$QR88$$*>*:!G9B|nTUV07 %  
 ??&&y?G 'y-*<*<)=Q?N+I6N,,##%&on5ERwbQ xx((KK.~.>ay<.XYZ )  
 "0!8!8!?!?XUF((!""=n=MSQWPXX[\ ((// #44;;=  #..uzz<H .( :$/=-2D2D1EQ!G""))^,<F*CD## )G1VHAn%5 6#$Cw /#$C~ 6%  xx((KK"m#<^<LAgYVXYeXffhi )  
 LL""!""3N3C1^DTTVW\V]]_` 88$$*>*:!G9B|nTUV %  
 --00 =n=MN0 &&y?GLL7)1]-?-?,@UG1 MNXX&&*>*:!G9B|nTUVF %%826("t}}/L/LM  ".11r5   c                :   U R                  UR                  5      nU R                  U5      nUR                  (       a(  UR                  R
                  U R                  :  U l        UR                  (       a  U R                  (       d9  U R                  R                  U R                   SUR                   SU S35        g UR                  R
                  U R                  -   S-
  U R                  -  nU R                  R                  SUR                   SUR                   SU SUR                   S	3	5        U R                  R                  5          U R                  R                  U R                   SUR                   SU S
U SUR                   S3
5        X@R                  -  UR                  R
                  :w  a@  U R                  R                  SUR                   SUR                  R
                   S35        S S S 5        g ! , (       d  f       g = f)Nr  r  rZ  r   z	for(auto z
_cnt = 0; z_cnt < z; ++z_cnt) {rf   z + z_cnt;if ( >= z) break;)rename_indexingrC   sexprr  rootr  r  rb  r  r{  index_dtyperr  r   indent)rB   entryr   	index_str	loop_sizes        r3   codegen_iteration_ranges_entry*MetalKernel.codegen_iteration_ranges_entry!  s   ))%**5
JJz*	(-

(8(84;T;T(TD%!!)B)B((##$Aejj\YKqA 
 JJt8881<&&'	 			

|:ejj\4PUPZPZ|[cd	
 YYII##$Aejj\YKs9+SQVQ[Q[P\\ab 444

8H8HH		##d5::,d5::;K;K:LH$UV  s   B/H
Hc                   U R                   (       a  U R                  R                  5          U R                  R                  U R                  5        U R                  R                  U R
                  5        SSS5        U R                  R                  S5        SU l         OJU R                  R                  U R                  5        U R                  R                  U R
                  5        U R                  R                  U R                  5        U R                  R                  5         U R
                  R                  5         U R                  R                  5         g! , (       d  f       N= f)z
Concat output code from index_code, loads, compute, stores,
suffix into self.body.

For pointwise kernels, this is called just once at the end.

For reduction kernels, this generates a loop over the reduction
axis.
N}F)	rb  r   r  r  ro  r   r{  r|  clear)rB   s    r3   codegen_bodyMetalKernel.codegen_body<  s     $$!!#		  ,		  . $ II$(-D%IITZZ(IIT\\*		%

 $#s   AE  
E.c                P   U R                  5         [        5       nUR                  S5        U R                  5       nUR	                  5          UR                  SSS9  U R                  (       a  UR                  S5        UR                  S5        UR	                  5          U R                  R                  R                  5        H\  u  pEX@R                  ;   a  M  U R                  [        R                  R                  U5      5      nUR                  SU SU S	35        M^     U R                  R                  R                  5        HK  u  pEU R                  [        R                  R                  U5      5      nUR                  S
U SU S	35        MM     U R                  R                   R                  5        H  u  pEUR                  SU S	35        M     [#        U5      S:  d   S5       e[#        U5      S:  a  S[#        U5       3OSn[#        U5      S:X  a  US   R$                  OSnU R                  (       a  S	OSn	UR                  U SU SU	 35        U R                  (       a  UR                  U S35        SSS5        UR                  S5        UR	                  5          [#        U5      S:  aB  ['        U5       H3  u  pUR                  SUR$                   S[)        SU
-   5       S35        M5     UR                  U R*                  5        UR                  U R,                  5        SSS5        UR                  S5        SSS5        UR                  S5        UR/                  5       $ ! , (       d  f       N= f! , (       d  f       NY= f! , (       d  f       NQ= f)z3Called at the end to generate a final kernel stringzcompile_mps_shader("""z
            #include <c10/metal/random.h>
            #include <c10/metal/special_math.h>
            #include <c10/metal/utils.h>
            T)stripz&#include <c10/metal/reduction_utils.h>zkernel void generated_kernel(zdevice z* ,z	constant zconstant long&    z%Up to 3 index variables are supportedr   uintr   
thread_pos r  z [[thread_position_in_grid]]z- group_pos [[thread_position_in_threadgroup]]Nz) {r[  z = thread_pos.x   rZ  r  z"""))r  r   r{  active_range_treesr  r  rz  r?   output_buffersitemsremoved_buffersrh  r   rp  rq  input_bufferssizevarsrO   rr  	enumeratechrr  r   getvalue)rB   rr  codeidx_varsouterinnerr  thread_pos_dtypethread_pos_var_namethread_pos_suffixidxr   s               r3   codegen_kernelMetalKernel.codegen_kernelT  s   /0**,[[]KK
    $$GHNN:;$(II$<$<$B$B$DLE 4 44  $ 1 1!''2C2CE2J KINNWYKr%#BC	 %E
 %)II$;$;$A$A$CLE $ 1 1!''2C2CE2J KINNYykE7!#DE %D %)II$6$6$<$<$>LENN_UG1#=> %?8}q(Q*QQ(.1(ma.?d3x=/*V ! ),H(:HQK$$ $ ,0+@+@Cb!'(*=)>>Z[lZmn ((NN+,,YZ/ 4 NN5!x=1$$-h$7#CHH:^Cc	N;K1M %8 D../DII&  NN3_ ` 	v}}M 6 O ]s?   AN G"M5)N+BN3N5
N	?N
N	N
N%c           	        [         R                  R                  n/ U R                  R                  R                  5       QU R                  R                  R                  5       QnU Vs/ s H  oUU R                  ;  d  M  UPM     nnX@R                  R                  R                  5        Vs/ s H  n[        U5      PM     sn-  n[        U R                  5       5      S:  a  U R                  5        Vs/ s HZ  nU R                  UR                  (       a+  [        R                  " UR                   U R"                  5      OUR                   5      PM\     nnUSSR%                  U5       S3/-  nU R&                  (       a  U R                  5        Vs/ s HP  nUR                  (       a:  U R                  [        R                  " UR                   U R"                  5      5      OSPMR     nnUSSR%                  U5       S3/-  nUR)                  UU[*        R,                  " S5      SS	9  g
s  snf s  snf s  snf s  snf )zCodegen a call to this kernelr   z	threads=[rM   rl  1zgroup_size=[cpuF)devicetritonN)r   rp  wrapper_coder?   r  keysr  r  r  r1   rO   r  pexprr  sympyMinr  r  rQ   rz  generate_kernel_callr/   r  )rB   rr  nodewrapperr?   r  vthreadss           r3   call_kernelMetalKernel.call_kernel  s   ''&&R))..0R4993J3J3O3O3QR#Gt$2F2F'FtG!3!3!8!8!:;!:AQ!:;; t&&()A- 002 3A 

~~ IIaggt'@'@A
 3   y7!3 4A677D  
 002	 3A >> 

599QWWd.G.GHI 3	   |DIIg$6#7q9::D$$<<&	 	% 	
/ H;s    )H/ H/.H47A!H9AH>c                (   U(       d  U(       d  g U R                  U5      nU(       a  U S3OSnU(       a  U SU R                  U5       3OSnU(       a  U(       a
  SU SU S3nOSU U S3nU R                  R                  U R                  US	S
9  g )Nz < 0r  r  zif ((z) && (z	)) returnr  z) returnF)
assignment)r   r   r   r   )	rB   rC   sizelowerupperexpr_str
lower_expr
upper_exprrt  s	            r3   check_boundsMetalKernel.check_bounds  s      $$T**/z&R
CHzd&7&7&=%>?b
U:,fZL	BD*j\:D$,,?r5   )ra  rb  )rc  zdict[str, sympy.Expr]rd  r   rv   None)r   rU  rv   r1   )rr  r1   rs  ru   rv   r   r   )
rr  r1   rs  ru   r}  r   r~  r   rv   r
  )r   rU  r  zOptional[int]r   zValueRanges[Any]rv   r   )
r   rU  r   rU  r  r   r}  +Union[CSEVariable, tuple[CSEVariable, ...]]rv   r  )r  r   rv   r
  rv   r
  )rr  zOptional[str]rv   r1   )rr  r1   r  r   rv   r
  )
rC   ru   r  ru   r  r    r  r    rv   r
  )rw   rx   ry   rz   r}   	overridessuffixnewvar_prefixr  r   r@   r  r7   r  kexprr^  rh  ru  r  r	   unknownr  r  r  r  r  r  r  r{   __classcell__re  s   @r3   rY  rY  ~  s`   IFMO##E&&EE*%* * 
	*%R SW
<
< *
<3>
<FO
<	
< %)#.#6#6#8	 " !	
 
$g2g2 g2 &	g2
 ;g2 
5g2RW608t 
D@@&0@9=@FJ@	@ @r5   rY  c                 ,    SS K n U R                  SSS9  g )Nr   ztorch.compile for Metal is an early protoype and might not work as expected. For details see https://github.com/pytorch/pytorch/issues/150121r
   )
stacklevel)warningswarn)r  s    r3   _warn_prototyper    s    MM	L  r5   c                  J   ^  \ rS rSr\rSU 4S jjr        SS jrSrU =r	$ )MetalSchedulingi  c                   > [         TU ]  U5        [        5         [        R                  R
                  nUb  UR                  R                  S5        g g )NzDfrom torch._inductor.runtime.runtime_utils import compile_mps_shader)r]  r^  r  r   rp  r  headerr  )rB   	schedulerr  re  s      r3   r^  MetalScheduling.__init__  sE    #''&&NN!!V r5   c                   [         R                  R                  nXR                  ;   a  UR                  U   nU$ SUR	                  5        3nU S3nXTR                  U'   [        X$5      u  pxU SU 3n	UR                  XaU	5        U$ )Nmps_lib_z.generated_kernel
)r   rp  r  src_to_kernelnext_kernel_suffixr   define_kernel)
rB   src_codenode_scheduler   r  kernel_namemps_lib_nameoriginsdetailed_originsmetadata_comments
             r3   r$  MetalScheduling.define_kernel  s     ''&&,,,!//9K  &g&@&@&B%CDL)N*;<K.9!!(+(;M(S%G")"-=,>?!!,:JKr5   rt   )r  zOptional[Scheduler]rv   r
  )r%  r1   r&  zlist[SchedulerNode]r   rY  rv   r1   )
rw   rx   ry   rz   rY  kernel_typer^  r$  r{   r  r  s   @r3   r  r    s7    K,?IT	 r5   r  )r2   z)Union[float, int, bool, str, CSEVariable]rv   r1   r  )=
__future__r   	functoolsr_  typingr   r   r   r  sympy.printing.precedencer   r/   torch.utils._sympy.printersr   ExprPrinter_torch.utils._sympy.value_rangesr	   utilsr   r   virtualizedr   r   r   commonr   r   r   r   r   r   simdr   r   r   r   ops_handlerr   r   r  r   r   r   r    int8int16int32int64uint8r&   r'   bfloat16r   r4   r7   r}   _initialize_pointwise_overridesrY  cacher  r  rt   r5   r3   <module>rB     s   #   / /  0  C 7 > , ,  C B 64 
JJ	JJ	KK	KK	KK	KK	KK	JJ	NNH
6b| 6br([ (D  . .u 5@@* @@F
  n r5   