
    sh                    h   S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKJr  S SK	J
r
  S SKJrJrJrJrJrJr  S SKrS SKrS SKrSSKJr  SSKJr  SS	KJrJr  SS
KJrJr  SSKJ r   SSK!J"r"J#r#  SSK$J%r%  SSK#J&r&  SSK'J(r(J)r)  SSK*J+r+  SSK,J-r-J.r.  SSK/J0r0J1r1J2r2J3r3J4r4  SSK5J6r7J8r8  SSK9J:r:J;r;J<r<J=r=J>r>J?r?J@r@JArAJBrB  SSKCJDrD  SSKEJFrF  SSKGJHrHJIrIJJrJ  \(       a  S SKKJLrL  SSK*JMrMJNrN  \R                  " \P5      rQS rR " S S\S5      rT " S S \@5      rU\U" 5       R                  rW\@" 5       R                  rX\R                  S!\R                  S"\R                  S#\R                  S$\R                  S%\R                  S&\R                  S'\R                  S(\R                  S)\R                  S*\R                  S+\R                  S,\R                  S-0rfS. rgS/ rh " S0 S1\?5      ri\iR                  S25         " S3 S4\;5      rk\R                   " S5 S65      5       rmS7 rnS8 ro " S9 S:\I5      rp " S; S<\J5      rqg)=    )annotationsN)defaultdict)inf)AnyCallablecastOptionalTYPE_CHECKINGUnion   )is_integer_dtype)
OrderedSet)FloorDivModularIndexing)symbol_is_typeSymT)ValueRanges   )configir)HalideCodeCache)get_reduction_combine_fn)is_metric_table_enabledlog_kernel_metadata)AddParenHandler)HalideInputSpec
HalideMeta)get_bounds_index_exprget_kernel_metadataparallel_num_threadssympy_index_symbol
sympy_subs)_opsV   )	BackendFeatureCSEVariableDeferredLineIndentedBufferKernelArgTypeOpOverridesPythonPrinterSizeArg	TensorArg)DTYPE_TO_CPP)cexpr)constant_repr
SIMDKernelSIMDScheduling)Sequence)ReductionType	StoreModec                >   [        U [        5      (       aZ  SU s=::  a  S::  dM  O  [        R                  " [        R                  5      nXR
                  :X  a  gXR                  :X  a  gSU < S3$ [        U [        5      (       a  S[        U 5       S3$ [        U 5      $ )Ni   izhl.Int(64).min()zhl.Int(64).max()zhl.i64()zhl.f64()

isinstanceinttorchiinfoint64minmaxfloatr1   repr)valinfos     r/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/_inductor/codegen/halide.pyhalide_constantrE   =   s    #s[C%E:%E{{5;;'((?%((?%q!!#us+,A..9    c                  ,   ^  \ rS rSrSU 4S jjrSrU =r$ )UnsupportedJ   c                *   > [         TU ]  SU 35        g )Nz!halide backend does not support: )super__init__)selfthing	__class__s     rD   rL   Unsupported.__init__K   s    <UGDErF    returnNone)__name__
__module____qualname____firstlineno__rL   __static_attributes____classcell__rO   s   @rD   rH   rH   J   s    F FrF   rH   c                     ^  \ rS rSr\S 5       r\S 5       rS rS rS r	S r
\
rS rS	 rS
 rS rS rS rS rS rS rS rS rS rS rS rS rU 4S jrS r\rS rS rSr U =r!$ )HalidePrinterO   c                D    S[         R                  R                   SU  S3$ )Nhl.cast(, r8   )r$   kernelindex_dtypeexprs    rD   
cast_indexHalidePrinter.cast_indexP   s"    !((../r$q99rF   c                    SU  S3$ )Nhl.cast(hl.Float(32), r8   rQ   rd   s    rD   
cast_floatHalidePrinter.cast_floatT   s    'vQ//rF   c                    SU S3$ )Nhl.f32(r8   rQ   rM   re   s     rD   _print_FloatHalidePrinter._print_FloatX   s    a  rF   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr%   rm   r   r8   lenargs_printrn   s     rD   _print_ToFloatHalidePrinter._print_ToFloat[   s9    499~"""TYYq\23155rF   c                    [        UR                  5      S:X  d   eU R                  SU R                  UR                  S   5       S35      $ )Nr%   	hl.floor(r   r8   rs   rt   rf   ru   rn   s     rD   _print_floorHalidePrinter._print_floor_   B    499~"""4;;tyy|+D*EQGHHrF   c                    [        UR                  5      S:X  d   eU R                  SU R                  UR                  S   5       S35      $ )Nr%   	hl.trunc(r   r8   rz   rn   s     rD   _print_TruncHalidePrinter._print_Truncc   r}   rF   c                    [        UR                  5      S:X  d   eU R                  SU R                  UR                  S   5       S35      $ )Nr%   hl.ceil(r   r8   rz   rn   s     rD   _print_ceilingHalidePrinter._print_ceilingi   sB    499~"""$++diil*C)DAFGGrF   c                J    SU R                  U R                  U5      5       S3$ Nzhl.sqrt(r8   )rj   ru   rn   s     rD   _helper_sqrtHalidePrinter._helper_sqrtm   s$    $//$++d*;<=Q??rF   c                    U R                  UR                  S   5      nU R                  UR                  S   5      nU R                  UR                  S   5      nSU SU SU S3$ )Nr   r%   r   
hl.select(ra   r8   )doprintrt   )rM   re   cpqs        rD   _print_WhereHalidePrinter._print_Wherep   s_    LL1&LL1&LL1&A3b2aS**rF   c                n   [        UR                  5      S:X  a  U R                  UR                  S   5      $ [        UR                  5      S-  nU R                  [        R                  " UR                  S U 6 5      nU R                  [        R                  " UR                  US  6 5      nSU SU S3$ )Nr%   r   r   hl.min(ra   r8   )rs   rt   ru   sympyMinrM   re   midabs        rD   
_print_MinHalidePrinter._print_Minv   s    tyy>Q;;tyy|,,$))n!KK		499Tc?34KK		499ST?342aS""rF   c                n   [        UR                  5      S:X  a  U R                  UR                  S   5      $ [        UR                  5      S-  nU R                  [        R                  " UR                  S U 6 5      nU R                  [        R                  " UR                  US  6 5      nSU SU S3$ )Nr%   r   r   hl.max(ra   r8   )rs   rt   ru   r   Maxr   s        rD   
_print_MaxHalidePrinter._print_Max   s    tyy>Q;;tyy|,,$))n!KK		499Tc?34KK		499ST?342aS""rF   c                    [        UR                  5      S:X  d   eU R                  SU R                  UR                  S   5       S35      $ )Nr%   hl.abs(r   r8   rz   rn   s     rD   
_print_AbsHalidePrinter._print_Abs   sB    499~"""TYYq\)B(C1EFFrF   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr%   zhl.cos((r   r8   rr   rn   s     rD   _print_OpaqueUnaryFn_cos&HalidePrinter._print_OpaqueUnaryFn_cos   9    499~"""$++diil34A66rF   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr%   z	hl.cosh((r   r8   rr   rn   s     rD   _print_OpaqueUnaryFn_cosh'HalidePrinter._print_OpaqueUnaryFn_cosh   9    499~"""4;;tyy|45Q77rF   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr%   z	hl.acos((r   r8   rr   rn   s     rD   _print_OpaqueUnaryFn_acos'HalidePrinter._print_OpaqueUnaryFn_acos   r   rF   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr%   zhl.sin((r   r8   rr   rn   s     rD   _print_OpaqueUnaryFn_sin&HalidePrinter._print_OpaqueUnaryFn_sin   r   rF   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr%   z	hl.sinh((r   r8   rr   rn   s     rD   _print_OpaqueUnaryFn_sinh'HalidePrinter._print_OpaqueUnaryFn_sinh   r   rF   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr%   z	hl.asin((r   r8   rr   rn   s     rD   _print_OpaqueUnaryFn_asin'HalidePrinter._print_OpaqueUnaryFn_asin   r   rF   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr%   zhl.tan((r   r8   rr   rn   s     rD   _print_OpaqueUnaryFn_tan&HalidePrinter._print_OpaqueUnaryFn_tan   r   rF   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr%   z	hl.tanh((r   r8   rr   rn   s     rD   _print_OpaqueUnaryFn_tanh'HalidePrinter._print_OpaqueUnaryFn_tanh   r   rF   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr%   z	hl.atan((r   r8   rr   rn   s     rD   _print_OpaqueUnaryFn_atan'HalidePrinter._print_OpaqueUnaryFn_atan   r   rF   c                  > UR                   (       a  [        TU ]	  U5      $ UR                  u  p#U R	                  U R                  U5      5      nU R	                  U R                  U5      5      nU R                  SU SU S35      $ )Nry   z / r8   )
is_integerrK   _print_FloorDivrt   rj   r   rf   )rM   re   xdivrO   s       rD   r   HalidePrinter._print_FloorDiv   sp    ??7*400OODLLO,oodll3/01#SQ788rF   c                    [        UR                  5      S:X  d   eU R                  SU R                  UR                  S   5       S35      $ )Nr%   	hl.round(r   r8   rz   rn   s     rD   _print_RoundHalidePrinter._print_Round   r}   rF   c                0    UR                   u  p#SU SU S3$ )N() / (z+hl.f32(0)))rt   )rM   re   r   r   s       rD   _print_IntTrueDivHalidePrinter._print_IntTrueDiv   s"    yy1#U1#[))rF   c                    UR                   u  p#U R                  U5      n[        U5      nSSU* -  < SU SSU-  < S3$ )Nrm   g      $@z)*hl.round((z	)*hl.f32()))rt   ru   r:   )rM   re   rB   ns       rD   _print_RoundDecimal!HalidePrinter._print_RoundDecimal   sJ    kk#F1"(SE47+RPPrF   rQ   )"rU   rV   rW   rX   staticmethodrf   rj   ro   rv   r{   r   _print_TruncToIntr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   _print_RoundToIntr   r   rY   rZ   r[   s   @rD   r]   r]   O   s    : : 0 0!6II %H@+##G7887887889I %*
Q QrF   r]   z	hl.Bool()zhl.BFloat(16)zhl.Float(16)zhl.Float(32)zhl.Float(64)z	hl.Int(8)z
hl.Int(16)z
hl.Int(32)z
hl.Int(64)z
hl.UInt(8)zhl.UInt(16)zhl.UInt(32)zhl.UInt(64)c                    [         U    $ N)_halide_typedtypes    rD   halide_typer      s    rF   c                
   [        U 5      (       a5  U R                  (       a$  U [        R                  :w  a  [        R                  n U [        R
                  [        R                  4;   a  [        R                  n [        U 5      $ r   )	r   	is_signedr;   r=   int32float16bfloat16float32r   r   s    rD   halide_acc_typer      sP    5??u7K//urF   c                  .   \ rS rSr\  SD   SES jj5       r\SFS j5       r\S 5       r\S 5       r	\S 5       r
\S 5       r\S	 5       r\S
 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r \S 5       r!\S 5       r"\S  5       r#\S! 5       r$\S" 5       r%\S# 5       r&\S$ 5       r'\S% 5       r(\S& 5       r)\S' 5       r*\S( 5       r+\S) 5       r,\S* 5       r-\S+ 5       r.\S, 5       r/\S- 5       r0\S. 5       r1\S/ 5       r2\S0 5       r3\S1 5       r4\S2 5       r5\S3 5       r6\S4 5       r7\S5 5       r8\S6 5       r9\S7 5       r:\S8 5       r;\S9 5       r<\S: 5       r=\S; 5       r>\S< 5       r?\S= 5       r@\S> 5       rA\SGS? j5       rB\S@ 5       rC\SA 5       rD\SB 5       rESCrFg)HHalideOverrides   Nc                Z    U[         R                  :X  a  SU  S3$ S[        U5       SU  S3$ )Nr   z != 0)r`   ra   r8   )r;   boolr   )r   r   	src_dtypeuse_compute_typess       rD   to_dtypeHalideOverrides.to_dtype   s9     EJJqc= +e,-Rs!44rF   c                    U[         R                  [         R                  4;   a  S[        U5       SU  S3n S[        U5       SU  S3nU[         R                  [         R                  4;   a  SU S3nU$ )Nr`   ra   r8   zhl.reinterpret(ri   )r;   r   r   r   )r   r   r   lines       rD   to_dtype_bitcast HalideOverrides.to_dtype_bitcast   ss    77;y12"QCq9A U!3 4Bqc;U]]ENN33+D63DrF   c                8    U R                  [        U5      U5      $ r   )r   rE   )clsvaluer   s      rD   constantHalideOverrides.constant  s    ||OE2E::rF   c                    SU  S3$ )Nr   r8   rQ   r   s    rD   absHalideOverrides.abs      1~rF   c                \    [        U S5      (       d  SU  S3$ SU  SU R                   SU  S3$ )Nnamehl.exp(r8   z"hl.fast_exp(hl.cast(hl.Float(32), z)) if z!.type().bits() <= 32 else hl.exp(hasattrr   r   s    rD   expHalideOverrides.exp
  s@    q&!!QCq>!3A3fQVVHDefgehhijjrF   c                    SU  S3$ )Nr   r8   rQ   r   s    rD   libdevice_expHalideOverrides.libdevice_exp  r   rF   c                    SU  S3$ r   rQ   r   s    rD   sqrtHalideOverrides.sqrt      !ArF   c                    [        U S5      (       d	  SU  SU S3$ SU R                   SU S3nSU  SU S	U  S
U  SU SU R                   SU  SU S3$ )Nr   r   ra   r8   r`   	.type(), hl.select((<)|hl.is_nan(), ) if z.type().is_float() else hl.min(r   r   r   s     rD   minimumHalideOverrides.minimum       q&!!QCr!A&&qvvhis!,QCq<s#aS1#U166(JijkillnopnqqrssrF   c                    [        U S5      (       d	  SU  SU S3$ SU R                   SU S3nSU  SU S	U  S
U  SU SU R                   SU  SU S3$ )Nr   r   ra   r8   r`   r  r  >r  r  r  z.type().is_float() else hl.max(r   r  s     rD   maximumHalideOverrides.maximum   r  rF   c                b    [        US5      (       a  SUR                   SU S3nSU  SU SU S3$ )Nr   r`   r  r8   r   ra   r   )r   r   r   s      rD   whereHalideOverrides.where(  sB    1f166()A3a0AA3b2aS**rF   c                    SU  S3$ )Nzhl.cos(r8   rQ   r   s    rD   cosHalideOverrides.cos.  r   rF   c                    SU  S3$ )Nzhl.sin(r8   rQ   r   s    rD   sinHalideOverrides.sin2  r   rF   c                    [        S5      e)NlgammarH   r   s    rD   r$  HalideOverrides.lgamma6      (##rF   c                    SU  S3$ )Nzhl.erf(r8   rQ   r   s    rD   erfHalideOverrides.erf:  r   rF   c                    SU  S3$ )Nzhl.cosh(r8   rQ   r   s    rD   coshHalideOverrides.cosh>  r
  rF   c                    SU  S3$ )Nzhl.sinh(r8   rQ   r   s    rD   sinhHalideOverrides.sinhB  r
  rF   c                    SU  S3$ )Nzhl.acos(r8   rQ   r   s    rD   acosHalideOverrides.acosF  r
  rF   c                    SU  S3$ )Nz	hl.acosh(r8   rQ   r   s    rD   acoshHalideOverrides.acoshJ      1#QrF   c                    SU  S3$ )Nzhl.asin(r8   rQ   r   s    rD   asinHalideOverrides.asinN  r
  rF   c                    SU  S3$ )Nz	hl.asinh(r8   rQ   r   s    rD   asinhHalideOverrides.asinhR  r7  rF   c                    SU  SU S3$ )Nz	hl.atan2(ra   r8   rQ   r   ys     rD   atan2HalideOverrides.atan2V      1#Rs!$$rF   c                    SU  S3$ )Nzhl.atan(r8   rQ   r   s    rD   atanHalideOverrides.atanZ  r
  rF   c                    SU  S3$ )Nz	hl.atanh(r8   rQ   r   s    rD   atanhHalideOverrides.atanh^  r7  rF   c                    [        S5      e)Ncopysignr%  r?  s     rD   rK  HalideOverrides.copysignb  s    *%%rF   c                    [        S5      e)Nerfinvr%  r   s    rD   rN  HalideOverrides.erfinvf  r'  rF   c                    SU  SU S3$ )Nz	hl.hypot(ra   r8   rQ   r?  s     rD   hypotHalideOverrides.hypotj  rC  rF   c                    [        S5      e)N	nextafterr%  r?  s     rD   rT  HalideOverrides.nextaftern  s    +&&rF   c                    U  SU 3$ Nz & rQ   r  s     rD   logical_andHalideOverrides.logical_andr      Cs|rF   c                    U  S3$ )Nz == 0rQ   r   s    rD   logical_notHalideOverrides.logical_notv  s    E{rF   c                    U  SU 3$ Nz | rQ   r  s     rD   
logical_orHalideOverrides.logical_orz  rZ  rF   c                    SU  SU S3$ )Nr    ^ r8   rQ   r  s     rD   logical_xorHalideOverrides.logical_xor~  s    1#S1~rF   c                    U  SU 3$ rW  rQ   r  s     rD   bitwise_andHalideOverrides.bitwise_and  rZ  rF   c                    SU  3$ )N~rQ   r\  s    rD   bitwise_notHalideOverrides.bitwise_not  s    1#wrF   c                    U  SU 3$ r`  rQ   r  s     rD   
bitwise_orHalideOverrides.bitwise_or  rZ  rF   c                    U  SU 3$ )Nrd  rQ   r  s     rD   bitwise_xorHalideOverrides.bitwise_xor  rZ  rF   c                    U  SU 3$ )Nz << rQ   r  s     rD   bitwise_left_shift"HalideOverrides.bitwise_left_shift      D}rF   c                    U  SU 3$ )Nz >> rQ   r  s     rD   bitwise_right_shift#HalideOverrides.bitwise_right_shift  rw  rF   c                    SU  SU S3$ )Nzhalide_helpers.rand(ra   r8   rQ   seedoffsets     rD   randHalideOverrides.rand  s    %dV2fXQ77rF   c                    SU  SU S3$ )Nzhalide_helpers.randn(ra   r8   rQ   r|  s     rD   randnHalideOverrides.randn  s    &tfBvha88rF   c           	          SU  SU SU SU S3	$ )Nzhalide_helpers.randint64(ra   r8   rQ   )r}  r~  lowhighs       rD   	randint64HalideOverrides.randint64  s#    *4&6("SED6KKrF   c                    [         R                  " U S5       S[        R                  R                  R                  SU5       3$ )Nr    + load_seed_offset)opsloadr$   rb   rt   seed_offset)r   r~  s     rD   	load_seedHalideOverrides.load_seed  s7    ((4#$C(A(ABTV\(]'^__rF   c                    SU  S3$ )Nz1./hl.sqrt(r8   rQ   r   s    rD   rsqrtHalideOverrides.rsqrt  s     QCq!!rF   c                    SU  S3$ )Nzhl.tan(r8   rQ   r   s    rD   tanHalideOverrides.tan  r   rF   c                    SU  S3$ )Nzhl.tanh(r8   rQ   r   s    rD   tanhHalideOverrides.tanh  r
  rF   c                    SU  S3$ )Nz3(hl.reinterpret(hl.UInt(32), hl.cast(hl.Float(32), z)) >> 31) != 0rQ   r   s    rD   signbitHalideOverrides.signbit  s    DQC~VVrF   c                    U  SU  SU SU 3$ )Nz - hl.trunc(/z)*rQ   r  s     rD   fmodHalideOverrides.fmod  s!     L1QCr!--rF   c                    SU  SU S3$ )Nzhl.pow(ra   r8   rQ   r  s     rD   powHalideOverrides.pow  s    2aS""rF   c                    SU  S3$ )Nzhl.log(r8   rQ   r   s    rD   logHalideOverrides.log  r   rF   c                    SU  S3$ )Nz hl.is_inf(hl.cast(hl.Float(32), r   rQ   r   s    rD   isinfHalideOverrides.isinf       2!B77rF   c                    SU  S3$ )Nz hl.is_nan(hl.cast(hl.Float(32), r   rQ   r   s    rD   isnanHalideOverrides.isnan  r  rF   c                    SU  S3$ )Nr   r8   rQ   r   s    rD   roundHalideOverrides.round  r7  rF   c                    SU  S3$ )Nry   r8   rQ   r   s    rD   floorHalideOverrides.floor  r7  rF   c                    SU  SU S3$ )Nr   r   z + hl.f32(0))rQ   r  s     rD   int_truedivHalideOverrides.int_truediv  s    1#U1#]++rF   c                .    SU R                    SU  SU S3$ )Nz"hl.floor(hl.cast(hl.Float(max(32, .type().bits())), ) / r8   r   r  s     rD   floordivHalideOverrides.floordiv  s)     18J1#TRSQTTUV	
rF   c                <   [         R                  " [         R                  " SU5      [        R                  5      n[         R                  " [         R                  " US5      [        R                  5      n[         R
                  " X#5      nSUR                   SU S3$ )N0r`   r  r8   )r  r   ltr;   int8subr   )r   r   leftrightr  s        rD   signHalideOverrides.sign  se    ||CFF3NEJJ7SVVAs^UZZ8ggd"!&&3%q11rF   c                    SU  S3$ )Nr   r8   rQ   r   s    rD   truncHalideOverrides.trunc  r7  rF   c                .    SU R                    SU  SU S3$ )Nz"hl.trunc(hl.cast(hl.Float(max(32, r  r  r8   r  r  s     rD   truncdivHalideOverrides.truncdiv  s)    
 18J1#TRSQTTUV	
rF   c                    SU  S3$ )Nr   r8   rQ   r   s    rD   ceilHalideOverrides.ceil  r
  rF   c                    SU  S3$ )Nr   z, 0)rQ   r   s    rD   reluHalideOverrides.relu  s    4  rF   c                |   [         R                  R                  U5      n[         R                  R                  [         R                  R	                  U5      [         R                  R                  U5      [        U5      S9nU[        R                  [        R                  4;  a  [        R                  " XB5      $ U$ Nbounds)r$   rb   prepare_indexinggenfuncindex_to_strused_dims_from_indexr   r;   r   r=   r  r   )r   re   r   indexvars        rD   
index_exprHalideOverrides.index_expr   s    ))$/hhHH!!%(HH))%0(.  

 ekk22<<++
rF   c                    [         R                  " U[        R                  5      n[         R                  " XU5      nX!l        [        [        U5      5      $ r   )r  r   r;   r   halide_clampindirect_indexing_sizer!   str)r   	index_varsizecheckwrap_negs        rD   indirect_indexing!HalideOverrides.indirect_indexing  s?     LLEKK8	$$Ye<	+/(!#i.11rF   c                   [         R                  R                  [         R                  R                  U5      S-
  5      n[	        U[
        [        R                  45      (       d  SUR                   SU S3nSU SU S3$ )Nr%   r`   r  r8   z	hl.clamp(z, 0, )	r$   rb   kexprrename_indexingr9   r:   r   Integerr   )r   r   r  r  ends        rD   r  HalideOverrides.halide_clamp  sm    hhnnQXX55d;a?@$emm 455UZZL	#a8C 5'se1--rF   c                   [         R                  R                  X5       nU" 5       nS S S 5        WR                  R                  (       a  [        U5      n[         R                  R                  SUR                   S[        U5       S3/ [        R                  " U5      S9n[        R                  " WXB5      $ ! , (       d  f       N= f)Nr`   r  r8   r  )r$   rb   
mask_loadsr  is_boolr   r  r   rE   r   wrapr  r  )maskbodyothernew_maskresults        rD   maskedHalideOverrides.masked  s    XX  -VF . ==  KE   v{{m9_U-C,DAF##E* ! 
 yy611 .-s   B::
Cc                    [        S5      e)Nfrexp)NotImplementedErrorr   s    rD   r  HalideOverrides.frexp.  s    !'**rF   rQ   )NT)r   torch.dtyper   Optional[torch.dtype])r   r  r   r  )TT)GrU   rV   rW   rX   r   r   r   classmethodr   r   r  r  r  r  r  r  r  r!  r$  r)  r,  r/  r2  r5  r9  r<  rA  rE  rH  rK  rN  rQ  rT  rX  r]  ra  re  rh  rl  ro  rr  ru  ry  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rY   rQ   rF   rD   r   r      s&    ,0	55 )5 5   ; ;   k k
     t t t t + +
     $ $                   % %       & & $ $ % % ' '                     8 8 9 9 L L ` ` " "     W W . . # #   8 8 8 8         , , 
 
 2 2     
 
   ! ! 	 	 2 2 . . 2 2  + +rF   r   halidec                  |   ^  \ rS rSr\R
                  " S5      r S	     S
U 4S jjjrS rS r	SS jr
S rSrU =r$ )HalideCSEVariablei6  z\b(tmp\d+)\[\?\]c                4   > [         TU ]  XU5        S U l        g r   )rK   rL   	used_dims)rM   r   r  r   rO   s       rD   rL   HalideCSEVariable.__init__9  s     	u-7;rF   c                n   [        U R                  =(       d    S5      n[        R                  " X#R	                  5       5       HK  n[        U[        5      (       d  M  UR                  c	   XU45       eUR                  UR                  5        MM     [        R                  R                  U5      U l        g )NrQ   )r   r  	itertoolschainvaluesr9   r  updater$   rb   sort_used_dims)rM   r   rt   kwargsusedargs         rD   update_on_args HalideCSEVariable.update_on_argsB  s~    $...B/??49C#011}}0C4d2CC0CMM* : 006rF   c                    [        U5      S:X  a  U R                   S3$ U R                   SSR                  [        [        U5      5       S3$ )Nr   z[()][ra   ])rs   r   joinmapr  )rM   dimss     rD   	index_strHalideCSEVariable.index_strJ  sE    t9>ii[%%))AdiiC78::rF   c                p    U R                   c  U R                   S3$ U R                  U R                   5      $ )Nz[?])r  r   r  )rM   s    rD   __str__HalideCSEVariable.__str__P  s0    >>!ii[$$~~dnn--rF   c           	         U R                   b!  [        S U R                    5       5      (       d   eU R                  U R                    Vs/ s H  o!R                  X"5      PM     sn5      $ s  snf )Nc              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7fr   r9   r   Expr.0r   s     rD   	<genexpr>-HalideCSEVariable.subs_str.<locals>.<genexpr>W  s!      2
/=!Jq%**%%~   '))r  allr  get)rM   replacementsr   s      rD   subs_strHalideCSEVariable.subs_strV  sc    ~~)c 2
/3~~2
 /
 /
 	
 
 ~~t~~N~!//5~NOONs   
A+)r  r   )r  zValueRanges[Any]r   r  rS   rT   )rS   r  )rU   rV   rW   rX   recompileundefined_rerL   r  r  r  r(  rY   rZ   r[   s   @rD   r  r  6  sZ    ::12L (,	< !< %	<
 
< <7;.P PrF   r  c                  V   ^  \ rS rSr% S\S'   S\S'   S\S'   S
U 4S jjrSS jrS	rU =r$ )DimensionInfoi]  zOptional[sympy.Expr]re   
sympy.Exprr  stridec                   > [         TU ]  5         [        R                  R                  R                  US5      (       a  U* nU* nXl        X l        X0l        g Nr   )	rK   rL   r$   graphsizevarsstatically_known_ltre   r  r0  )rM   re   r  r0  rO   s       rD   rL   DimensionInfo.__init__c  sH    77//::WF5D		rF   c                    U R                   c   eU R                   nU(       a  US:X  a  gU(       a  0 UEnUR                   H  n[        U[        R                  5      (       d  M$  [        U[        R                  5      (       d   e[        R                  R                  UR                  5      n[        U[        5      (       d   e[        UR                  U5      5      X'   M     [        X15      n[        R                  R!                  U5      $ )Nr   hl.Var())re   free_symbolsr   r   TMPr9   r   Symbolr$   rb   lookup_cse_varr   r  r!   r(  r"   r  )rM   r'  	zero_varsre   symr  s         rD   r  DimensionInfo.index_strl  s    yy$$$yy+l+L((!#txx00%c5<<8888((11#((;C%c+<====(:3<<;U(VL% ) d1Dxx$$T**rF   )re   r  r0  rR   NF)	rU   rV   rW   rX   __annotations__rL   r  rY   rZ   r[   s   @rD   r.  r.  ]  s$    

+ +rF   r.  c                   [         R                  R                  R                  X5      (       a  g [         R                  R                  R	                  U 5      n[         R                  R                  R	                  U5      nX#:X  a)  [         R                  R                  R                  X5        X#:H  $ ! [
         a     gf = fNTF)r$   r3  r4  statically_known_equals	size_hint	TypeErrorguard_equals)r  r  r   r   s       rD   eqrH  }  s    ww//<<GG&&t,GG&&u- 	v	%%d26M	  s   AB5 5
CCc                   [         R                  R                  R                  X5      (       a  g [         R                  R                  R	                  U 5      n[         R                  R                  R	                  U5      nX#:  a)  [         R                  R                  R                  X5        X#:  $ ! [
         a$    [        R                  " X5      nX@:X  a  X:g  s $  gf = frC  )	r$   r3  r4  r5  rE  rF  r   gcdguard_lt)r  r  r   r   rJ  s        rD   r  r    s    ww++D88GG&&t,GG&&u- 	u	!!$.5L  ii$;= 	s   AB5 5)C#"C#c                    ^  \ rS rSr% \r\rS\S'       S$U 4S jjr	S%S jr
S&S jrS'U 4S jjrS rS	 r  S(U 4S
 jjrS rS)S jrS rS rS(S jrS rS*S jrS+S jrS,S jr S-         S.S jjr          S/S jrS r        S0S jr\R<                  " 5       S. S1S jjrS1S jr S,S jr!S r"S2S jr#S-S jr$\%S 5       r&S-S,S  jjr'S! r(        S3S" jr)S#r*U =r+$ )4HalideKerneli  zCallable[[sympy.Expr], str]r  c                z  > [         TU ]  " U40 UD6  U R                  U l        U R                  U l        U R                  U l        [        5       U l        U R                  U l	        U R                  U l
        0 U l        0 U l        0 U l        0 U l        0 U l        0 U l        [#        [$        5      U l        SU l        g r@  )rK   rL   r  computeloadsstoresr)   indexing_code_dominside_reductionneeds_dom_indexinghas_reductionbuffer_dimensionsbuffer_offsetshalide_varsindex_replacementsreduction_renamesdom_renamesr   listbuffer_aliaseshas_indirect_indexing)rM   tilingr  rO   s      rD   rL   HalideKernel.__init__  s    
 	*6*yyYY
ii!/!1"&"7"7!22AC57;=@BCEHJ4?4E%*"rF   c                    [        U5      $ r   )r   )rM   r   s     rD   dtype_to_strHalideKernel.dtype_to_str  s    5!!rF   c                ^    U R                   R                  U SU< S35        [        XU5      $ )Nz = hl.Func(r8   )r  	writeliner  )rM   r   r  r   s       rD   create_cse_varHalideKernel.create_cse_var  s.    		tfKxq9: u55rF   c           
     J  >^^^ U R                   (       d"  U R                  (       d  U R                  (       a   e[        R                  " [
        R                  R                  R                  [        S9m[        R                  [        [        T U ]8  U5      5      n[        [            " 5       m["        R$                  R'                  U R(                   Vs/ s H  o"R*                  R-                  5       PM     sn5       Vs0 s H  nUR/                  5       U_M     snmS nUU4S jnUU4S jnU H  nUR1                  [2        5      (       aY  UR5                  [3        [6        R8                  " S5      [6        R8                  " S5      [6        R8                  " S5      5      U5        UR1                  [:        5      (       aD  UR5                  [;        [6        R8                  " S5      [6        R8                  " S5      5      U5        TR=                  [        T U ]9  U5      R>                  5        M     [A        S T 5       5      U l!        S	n[E        U R(                  5       GH  nUR*                  R-                  5        Vs/ s H  o3R/                  5       T;   d  M  UPM     n	nU	RG                  U4S
 jS9  U	(       d+  U	RI                  URK                  SURL                  5      5        Sn
[6        RN                  RP                  n/ nU
[S        U	5      :  Ga  [U        URL                  U5      (       Gd  U	 Vs/ s H2  n[U        URV                  U5      (       d  M   U" URX                  5      PM4     nnU
[S        U5      -  n
U(       d   U	5       eU[        RZ                  " [
        R                  R                  R\                  U5      -  nUR_                  U	 Vs/ s HQ  n[a        XRV                  5      (       d  M  [a        URV                  U5      (       d  M<  U" URV                  U-  5      PMS     sn5        U(       Ga  [        RZ                  " [6        Rb                  U5      n[U        US5      (       a7  U" URL                  U-  5      n[U        US5      (       a   e/ n[S        U	5      n
Sn[e        S[S        U R                  5       35      nURf                  (       a.  [e        S[S        U R                  5       35      U R                  U'   XR                  U'   URI                  UU45        X-  nU	 Vs/ s H,  n[U        URV                  U5      (       d  M   URX                  PM.     nnU
[S        U5      -  n
[S        U5      nU Vs/ s H/  n[U        UU5      (       a  M  [6        Rh                  " UU-  5      PM1     nn[S        U5      U:  d  US:X  d   eUR_                  U5        U(       a  GM  U
[S        U	5      :  a  [U        URL                  U5      (       d  GM  U	 H  n SnSn[U        URV                  U5      (       d/  UU   u  nnUS-  nUU-  n[U        URV                  U5      (       d  M/  Sn[6        RN                  Rj                  n[U        URX                  U5      (       d7  UU   u  nnUS-  nUUU-  -  nUU-  n[U        URX                  U5      (       d  M7  UU R                   UR/                  5       '   M     GM     U R                   H/  nU Rp                  Rs                  U SURt                  < S35        M1     U R                  (       aO  U Rw                  SU R                  Ry                  5        VVs0 s H  u  nnUU R                  U   _M     snn5        ggs  snf s  snf s  snf s  snf s  snf s  snf s  snf ! [l         a    U(       d   e[6        RN                  Rj                  n[6        RN                  RP                  nU H  u  nnUUU-  -  nUU-  nM     [
        R                  R                  Ro                  [3        UURV                  URX                  5      U R                  5      U R                   UR/                  5       '    GM{  f = fs  snnf )a  
Hook called right before codegen with every index that will be
used in the fused kernel.

This populates self.halide_vars/index_replacements/reduction_renames which is an alternate indexing
scheme that avoids using divide and modulus.  Instead of xindex/yindex/rindex
we base indexing on a larger number of vars whose product combines to those.

This function populates self.halide_vars, self.index_replacements, and self.reduction_renames
fallbackc                |    [         R                  " [        R                  R                  R                  U 5      5      $ r   )r   simplifyr$   r3  r4  remove_precomputed_replacementsrd   s    rD   rl  0HalideKernel.finalize_indexing.<locals>.simplify  s+    >>  @@F rF   c                "  > U T;   a  TU    nTR                  UR                  R                  UR                  U-  [        R
                  R                  R                  U[        UR                  U5      5      5      R                  5       5        g g r   )addrootlookupdivisorr$   r3  r4  evaluate_minr   lengthsymbol)basers  modulusnodeall_used_symbolssym_to_nodes       rD   visit_modular_indexing>HalideKernel.finalize_indexing.<locals>.visit_modular_indexing  sw    {""4( $$II$$w.((55#Xdkk7%C
 fh #rF   c           	        > U T;   a`  TU    nTR                  UR                  R                  UR                  U-  [	        UR
                  U5      5      R                  5       5        g g r   )rp  rq  rr  rs  r   ru  rv  )rw  rs  ry  rz  r{  s      rD   visit_floor_div7HalideKernel.finalize_indexing.<locals>.visit_floor_div  s]    {""4( $$II$$w. g6 fh	 #rF   rw  rs  rx  c              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7fr   )r   r   INDIRECT)r!  r>  s     rD   r"  1HalideKernel.finalize_indexing.<locals>.<genexpr>  s"      )
:J3N3..:Jr$  Fc                (   > T" U R                   5      $ r   )rs  )r   rE  s    rD   <lambda>0HalideKernel.finalize_indexing.<locals>.<lambda>  s    Yqyy%9rF   keyr%   r   Thhrz
 = hl.Var(r8   rdomN)=rY  rX  rZ  	functoolspartialr$   r3  r4  rE  r   dictfromkeysr  rK   r  r   r   r  r  from_iterablerange_treesnodesr	  rv  hasr   replacer   Wildr   r
  r9  anyr^  reversedsortappendrr  numelSOners   rH  rs  ru  reduceevaluate_maxextendr  rJ  r!   is_reductionrl  Zero
IndexErrorsimplify_with_rangesindexing_codere  r   codegen_rdomitems)!rM   indicestreer   rl  r|  r  r  had_fallbackr  handled_countrs  added_sym_sizesizes_to_addr  	next_sizer>  	new_sizes	prior_lensry  idxr  ru  re   
full_indexr0  vrvrz  rE  r{  rO   s!                                @@@rD   finalize_indexingHalideKernel.finalize_indexing  s&    ##t'7'74;Q;Q	
 
 %%agg&6&6&@&@3O	--EG$<g FG%c?, __22151A1AB1A""$1AB
 HHJM
	

		 Eyy))#

6*

9-

9-
 + yy""

6*

9- $ ##EG$<U$C$P$PQ% ( &) )
:J)
 &
" T--.D $

 1 1 3V 31xxzEU7UQ 3EVJJ9J:T[[DJJ78MggkkGN#e*,R

G5L5L05 051AIIw9O&HQXX&    \!22#*U*|	 0 0GG$$11<!  ## "'!&Agyy1 668C6H 6W!45!& # ) 0 0L II)Q'' %-TZZ'-A$B	#%i#3#333')(+E
'+,qT5E5E1F0G-HIC((6H T%5%5!6 787..s3 -6$$S)"))3	*:;(G38 S5aBqyy'<R5I S!S^3M #L 1I ".$!-A!!Y/ 6q9}5!- ! $
 |,y8INJJ ''	27 #l!  #e*,R

G5L5L\ CG w77$23$7	Tq4 !w77 F 77<<D f55$23$7	Tq,$	 !f55
 >BD++DKKM: o /l ##C((C5
388,a)HI $!!6:6L6L6R6R6TU6TUQT%%a((6TU "c C
z W 0 !T$2 " ''<!&J"WW[[F%3	T"fsl2
$ &4 ((==+JdkkR ,, ++DKKM:( Vsu   #^&^+ ^0^06^5^5^:
:^:
^:
5^?^?_ _$A_	2A,_	 _	?b	Cbbc           
     >   U R                   (       a  SOSnXR                  ;   a  U R                  U   $ 0 nU R                  R                  5        Hp  nU R                   (       d  X0R                  ;   a  M%  [
        R                  " SUR                  5      nU(       d   e[        SU UR                  S5       35      X#'   Mr     U R                  U S3UR                  5        VVs0 s H  u  pVX`R                  U   _M     snn5        X R                  U'   U$ s  snnf )zCRDom based indexing uses explicit iteration ranges for Func updatesioz^h(\d+)$r  r%   dom)rS  r[  rX  keysrZ  r*  matchr   r!   groupr  r  )rM   prefixrenamesr  mr  r  s          rD   setup_dom_indexingHalideKernel.setup_dom_indexingk  s    --3%%%##F++##((*C((S4J4J-Jchh/AH1-&!''!*.FGGL + 	hcN'--/R/R!1!1!!44/R	
 $+ 	 Ss   &Dc           	     ~   UR                  5        Vs/ s H'  nSU R                  U R                  U5      5       S3PM)     nnU R                  R	                  U SSR                  U5       S35        [        UR                  5       5       H)  u  pVU R                  R	                  U SU SU S35        M+     g s  snf )	Nhl.Range(0, r8   z = hl.RDom([ra   ]) = r  r  )r	  r  r  r  re  r  	enumerater  )rM   r   varsr  rsizesr  rsyms          rD   r  HalideKernel.codegen_rdom  s     
% 4::d&:&:4&@AB!D% 	 
 	$$v\$))F:K9LB%OP -GA((D6TF!A3a)@A .
s   .B:c                   > [         TU ]  U5      n[        XR                  5      n[        R
                  R                  R                  XR                  5      $ r   )	rK   r  r"   rY  r$   r3  r4  r  rX  )rM   r  rO   s     rD   r  HalideKernel.prepare_indexing  sE     (/5"9"9:ww44U<L<LMMrF   c                    [        U[        R                  5      (       a%  U R                  UR                  5      R
                  $ U R                  U   $ )zThe size of an index symbol)r   r   r:  r<  r   r  rX  )rM   r>  s     rD   sym_sizeHalideKernel.sym_size  s?    #txx((&&sxx0GGG$$rF   c           	     	  ^ ^^ / m[        UR                  S S9 H  n[        U[        R                  [        R
                  45      (       a  TR                  U5        ME  [        U[        R                  [        R                  [        R                  45      (       a  M   U5       e   [        R                  R                  nT Vs0 s H  of[        R                  R                  _M     nn/ n[        R                  " T R                  U5      5      n[        U[        R                   5      (       a  UR"                  OU/ H  n	U	R                   V
s/ s H  oU;   d  M
  U
PM     nn
[%        U5      S:X  a  XY-  nM;  [%        U5      S:X  a  X{S   ==   U	-  ss'   M[  / n['        [%        U5      5       Hm  nX   c   eX   u  p[)        U5      [)        U5      -  (       a/  UR+                  U V
s/ s H  oU;  d  M
  U
PM     sn
5        X-  n	M[  UR                  X45        Mo     / UQX4PnM     UU U4S jn/ nU H;  u  nnU H  n
UUR-                  U
5      -  nM     UR                  U" UU5      5        M=     UR/                  5        H  u  nnUR                  U" UU/5      5        M!     UR1                  S S9  U(       dF  T R2                  (       a4  UR                  [5        [        R                  R                  SS5      5        O[6        R8                  R:                  R=                  US   R>                  S5      (       dK  URA                  S[5        [        R                  R                  T(       a  SOUS   R>                  S5      5        U(       a  T(       d  UT RB                  ;   an  [6        R8                  R:                  RE                  UT RB                  U   5      (       a2  T RG                  UUT RB                  U   -
  5        T RB                  U   nOC[6        R8                  R:                  RI                  US5      (       a  T RG                  UU5        SnUn[J        RL                  " 5        He  nT RO                  UUUT5      (       a  UU4s  $ T(       a   eU SU 3nUT RP                  U   ;  d  MG  T RP                  U   R                  U5        Mg     gs  snf s  sn
f s  sn
f )	zEConvert address-based indexing into dimensions using self.halide_varsc                    U R                   $ r   r  r   s    rD   r  5HalideKernel.indexing_to_dimensions.<locals>.<lambda>  s    AFFrF   r  r   r%   Nc                ,  > [         R                  " U 5      n [        U5      S:X  aV  [         R                  " ST
S9nU R	                  X!S   -  5      nU(       a$  [        US   T	R                  US   5      X2   5      $ T(       a   U 5       e[         R                  " [        X Vs0 s H  oDT	R                  U5      S-
  _M     sn5      S-   5      n[         R                  R                  n[        U [         R                  5      (       a|  U R                   Hl  n[        U[         R                  5      (       d  M$  Xg-  n[         R                  " X-  5      n [         R                  " [         R                  " XW-  5      5      nMn     [        XU5      $ s  snf )Nr%   wild)excluder   )r   factorrs   r  r  r.  r  rl  r"   r  r  r9   Mulrt   r  ceiling)re   symsstride_wildr  r>  ru  r0  termis_storerM   symbolss           rD   expr_to_dimension>HalideKernel.indexing_to_dimensions.<locals>.expr_to_dimension  s<   <<%D4yA~#jjAJJ{!W45(QtAw!7   %%<^^4!N#t}}S'9A'="=!NORSSF WW[[F$		** IID!$66$~~dk:!&fm0L!M	 &
 !v66 "Os   $Fc                n    [         R                  R                  R                  U R                  [
        S9$ )Nri  )r$   r3  r4  rE  r0  r   )ds    rD   r  r    s"     0 0 : :188c : RrF   _view))sortedr9  r   r   HALIDEr:  r  UNBACKED_INTSIZEPRECOMPUTED_SIZEr   r  r  expandr  r9   Addrt   rs   ranger   r  popr  r  r^  r.  r$   r3  r4  rD  r0  insertrW  statically_known_geqapply_offset_to_dimensionstatically_known_gtr  countinstall_dimsr]  )rM   r  r  r  r>  r~  r  
split_exprsplit_failedpartr  	part_varsnew_split_failedr  
other_vars
other_partr  r  r  re   orig_varr  s   `  `                 @rD   indexing_to_dimensions#HalideKernel.indexing_to_dimensions  s!   %,,2BCCcDKK#:;;s#%))		--    	 D /67w!ow
7DFT11%89",UEII">">EJJUGKD$($5$5I$5qj$5II9~"Y1$Q<(D0(#% s<01A'?666-9_*J!*-
90EE!((Z)VZICU!Z)VW*(//0HI 2  F!1EI3DE! L$	7. &JD$
q)) KK)$56 ' $))+ICKK)$67 ,		R	S))M%'',,1=>!!99$q'..!LLKK=Hq$q'..RST d)))agg.>.>.S.S++C0/ / ..tVd>Q>QRU>V5VW,,S1!!55  ..tV<"A  dFH==Dy <JeA3'C$--h77##H-44S9 #] 8 J *Ws   $S!	S& S&.	S+;S+c                |   XR                   ;  a  X R                   U'   X0R                  U'   gU R                  U   U:w  d%  [        U R                   U   5      [        U5      :w  a  gU(       a  U R                   U   U:H  $ [        U R                   U   U5       H  u  pVUR                  UR                  :w  a    gUR
                  UR
                  :w  d  UR                  UR                  :w  d  MW  [        R                  R                  R                  UR
                  UR
                  5      Ul        SUl        M     g)z>Try to set self.buffer_dimensions[var], return True on successTFN)rV  rW  rs   zipr0  r  re   r$   r3  r4  r  )rM   r  r  r~  r  oldnews          rD   r  HalideKernel.install_dims  s   ,,,*.""3''-$s#v-""3'2
Y2 ))#.$66D2237>HCzzSZZ'xx388#sxx388';77++88388L ? rF   c                   US:X  a  g [        [        [        U5      5      5       H  nX   R                  S:X  d=  [        R
                  R                  R                  X!U   R                  5      (       d  MR  [        X!U   R                  5      nX$X   R                  -  -  nX   =R                  U-  sl	        M     US:X  d   eg )Nr   r%   )
r  r  rs   r0  r$   r3  r4  r  r   re   )rM   r  r~  r  r  s        rD   r  &HalideKernel.apply_offset_to_dimension  s    Q;%D	*+Aw~~"agg&6&6&K&KQ' '  Q7//$ , {{rF   c                   [         [        R                     " 5       nUR                   GH-  n[	        U[        R                  5      (       d   e[        U[        R                  5      (       a\  U R                  UR                  5      n[	        U[        5      (       a  UR                  c   eUR                  UR                  5        M  [        U[        R                  5      (       a  UR                  U5        M  [        U[        R                  [        R                   [        R"                  [        R$                  45      (       a  GM"  ['        SU 35      e   U R)                  U5      $ )zIDetect which range trees are used to populate HalideCSEVariable.used_dimszunhandled symbol )r   r   r;  r9  r9   r   r   r:  r<  r   r  r  r
  r  rp  r  r  r  INDEXr  r  )rM   r  r  r>  cse_vars        rD   r  !HalideKernel.used_dims_from_index  s   u||,.	%%Cc5<<0000c488,,--chh7w(9::))56   !2!23T[[11c"d''D4I4I4::V  ),=cU*CDD# &$ ""9--rF   c                   [        S U 5       5      (       d   e[        R                  " U R                  U R                  R                  5       5       Vs/ s H  nX!;   d  M
  UPM     nn[        U5      [        U5      :X  d   eU$ s  snf )Nc              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7fr   r  r   s     rD   r"  .HalideKernel.sort_used_dims.<locals>.<genexpr>7  s     @i:a,,ir$  )r%  r  r  rX  rZ  r	  rs   )rM   r  r>  ordereds       rD   r  HalideKernel.sort_used_dims6  s    @i@@@@@ !  $"8"8"?"?"A
 	  	 
 7|s9~---
s   	B"Bc                   ^^ SR                  UU4S jU 5       5      n[        U5      S:X  a  SnU$ [        U5      S:X  a  U S3nU$ )Nra   c              3  F   >#    U  H  oR                  TT5      v   M     g 7fr   )r  )r!  r  r'  r=  s     rD   r"  .HalideKernel.make_index_str.<locals>.<genexpr>C  s     QDqkk,	BBDs   !r   ()r%   ,)r  rs   )rM   r  r'  r=  r  s     `` rD   make_index_strHalideKernel.make_index_strB  sM    IIQDQQ	t9>I  Y!^$+QIrF   c                J   U R                   R                  U5      nU R                  U5      nU R                  X2S5      u  p4U SU R	                  U5       S3n[
        R                  R                  U5      nU[        R                  [        R                  4;   a  [        R                  nSU S3nU R                  (       Ga  [        U R                  [        5      (       a  U R                  R                  c   e[!        / U R#                  U5      QU R                  R                  Q75      nU R%                  U R'                  U5      5      nUR                  (       a  U R(                  R+                  UR,                   S35        U R(                  R+                  UR,                   SU R                   S35        U R/                  U R0                  =(       d    S5      n	U R(                  R+                  U S	[3        U5       S
U	 S35        U R(                  R+                  U SU S[3        U5       S
UR,                   S35        U$ U R(                  R+                  U SU R                   S
U S[3        U5       S35        U$ U R5                  XPR#                  U5      5      $ )z"Codegen a load from an InputBufferFr  r  ri   r8   z!_mask = hl.RDom([hl.Range(0, 1)])z_mask.where(r   z = hl.cast(ra   r  z + hl.cast(z_mask)z = hl.select(z
, hl.cast(z, 0)))rt   inputr  r  r  r$   r3  	get_dtyper;   r   r   r   
_load_maskr9   r  r  r   r  newfuncr  r  re  r   r  _load_otherr   r  )
rM   r   r  r  r  r   r   r  r  r  s
             rD   r  HalideKernel.loadK  sK   iiood#%%e,//EB	a++D12!4!!$'U]]ENN33MME+D63D???4??,=>>OO--9: #O$++E2OT__5N5NOI \\$"5"5i"@AF		##v{{m3T$UV		##v{{m<?PPQ$RS

4#3#3#8q9		##hk+e*<)=RwaH 		##hc${;u3E2FbU[\ M 		##hmDOO+<BtfJ{[`OaNbbgh M<<&?&?&FGGrF   c                `    U R                   R                  [        R                  " SSU5         $ )Nz\[.* )csevarname_mapr*  r  rM   r   s     rD   r<  HalideKernel.lookup_cse_varr  s$    xx##BFF7B$=>>rF   c                <   [        U[        5      (       d   eU R                  R                  U5      nU R	                  U5      nU R                  XRS5      u  pVU R                  U5      (       d  Ub  U R                  5       nU R                  Xg5      nUR                  U5      n	SR                  S/[        U5      -  5      =(       d    Sn
U R                  R                  [        X SU
 SU S35      5        OU R                  USS	9n[        U5      n	[         R"                  R%                  U5      nUc  U SU S
['        U5       SU	 S3nO,US:X  a  U SU S['        U5       SU	 S3nO[)        SU 35      eU R                  R                  [        X5      5        g)z"Codegen a store to an OutputBufferTNra   r8  r  r  z] = hl.undef(z.type()))r=  z] = hl.cast(r8   
atomic_addz] += hl.cast(zstore mode=)r9   r  rt   outputr  r  is_indirect_indexingr  r  r(  r  rs   r  re  r(   r  r$   r3  r  r   r  )rM   r   r  r   moder  r  r'  r  	value_str
undef_dimsr   r   s                rD   storeHalideKernel.storeu  s    %!23333iit$%%e,//DA	$$U++t/?224L++D?I|4I))ZL3t9$<=F$JIITU!J<}SE#RS ++DD+AIE
I!!$'<U!I;l;u3E2FbSTUD\!U!I;mK4F3Gr)TUVD%D6&:;;		L45rF   c           	        U R                   (       d   eU R                  (       a   eX#U4nXPR                  R                  ;   a  U R                  R                  U   $ [	        U[
        5      (       a2  US:X  d   eU R                  " U6 =U R                  R                  U'   nU$ [	        U[        5      (       a  UR                  c   e[        U R                  5      nU R                  UR                   Vs/ s H  oU;  d  M
  UPM     sn5      n	U[        UR                  5      -
  (       a:  U R                  U U R                  [        / UR                  QUQ75      5      5      nUR                  U R                  5      n
[        R                   R#                  X25      n[%        U5      nUS;   a  U	R&                   SU 3nU R(                  R+                  U SU SU
 S35        / nSn[-        U R                  5       HD  u  nnUR/                  U S	U S
35        US:w  a  US==   SU 3-  ss'   XR0                  U   -  nMF     U R(                  R+                  U	 SSR3                  U5       35        OUS:X  a  U R5                  X5      n	O[7        X<5      n[8        R:                  " [=        [?        5       5      5         U" X5      nSSS5        SU S[A        U5       S3nU R(                  R+                  U	 SU 35        U R(                  R+                  U	 SW 35        XR                  R                  U'   U	$ s  snf ! , (       d  f       N= f)zCodegen a reduction operationwelford_combineN)argmaxargmin_z = hl.z(rdom, r8   r%   r  r  *r  r  welford_reducer`   ra   )!rS  r  r  reduction_cacher9   tuplewelford_combine_implr  r  r   rZ  r  r  r  r(  r   	Reductiondefault_accumulatorr   r   r  re  r  r  rX  r  welford_reduce_fallbackr   r$   set_ops_handlerr   r   rE   )rM   r   r   reduction_typer   	cache_keyresult_tuplereduction_varsr  
result_varr&  defaultacc_typer  partsr0  r  r>  
combine_fncombine_strdefault_strs                        rD   	reductionHalideKernel.reduction  s	    $$$$??""6	00088++I66eU##!%6666))51DHH$$Y/,  %!2338SSS#D$:$:;\\C1N+BQC

 Ju77LL'##J/R/R>/R$STE NN4#9#9:	,,22>M"5)11!'q(89EII5'/?wykQR STEF#D$:$:;3was!_-Q;"I1VH-I**3//	 <
 II:,c%**U2C1D EF//55eCJ1.KJ""??3D#EF(? G$XJb1I0J!LKII:,c+ ?@II:,c+ ?@.8  +G D8 GFs   6	L?L?	M
Mc                   [        U[        5      (       a  UR                  c   e[        U[        5      (       a  UR                  c   e[        U[        5      (       a  UR                  c   e[        / UR                  QUR                  QUR                  Q7=(       d    U R                  5      nU[        U R
                  5      -  nU R                  U R                  U5      5      nXU4 Vs/ s H  nSUR                   S3PM     nnUR                  nU R                  R                  U SSR                  U5       S35        U R                  R                  U SU S35        U R                  R                  U SU S	35        U R                  R                  U S
U S35        U R                  R                  U SUR                  U R
                  5       35        U R                  R                  U SUR                  U R
                  5       35        U R                  R                  U SUR                  U R
                  5       35        U R                  R                  U SU SU S35        U R                  R                  U SU SU S35        U R                  R                  U SU SU SU S35        U SU SU S3U SU SU SU SU SU S3U S3/n	U R                  R                  U SSR                  U	5       S35        / n
[        S 5       HT  nU
R                  U R                  UR                  5      5        U R                  R                  U
S!    S"U S#U S$35        MV     [        U
5      $ s  snf )%Nr`   z.type(), 0)z = hl.Tuple([ra   r  z
_mean_1 = z[0]z_m2_1 = z[1]z_weight_1 = z[2]z
_mean_2 = z_m2_2 = z_weight_2 = z	_delta = z
_mean_2 - _mean_1z_new_weight = z_weight_1 + 	_weight_2z_w2_over_w = hl.select(z_new_weight == 0.0, 0.0, z_weight_2 / z_new_weight)z
_mean_1 + z	_delta * 
_w2_over_wz_m2_1 + z_m2_2 + z_weight_1 * _new_weightr   r/  r  r  r  )r9   r  r  r   rX  rZ  r  r  r   r  re  r  r(  r  r  r3  )rM   meanm2weightr  r=  r   r>  pfxr
  unpackedr  s               rD   r4  !HalideKernel.welford_combine_impl  sh   $ 122t~~7QQQ"/00R\\5MMM&"3449I9I9UUU?dnn?r||?f.>.>?S4CSCS
	 	Z 6 677	\\$"5"5i"@A
<@f;MN;MaXaffX[1;MNoo		zl-		'8J7K2NO		se:j\=>		se8J<s;<		se<
|3?@		se:dmmD<R<R.S-TUV		se8BKK8N8N,O+PQR		e<0F0F GHI	
 			se9SEC5HI		se>#l3%yQR		e*3%/H\Z]Y^^jk	
 e:cU)C5
;e8C5Yse9SEVYUZZdee;

 			zl-		&8I7J"MNqAOODLL)=)=>?II8B<.J<q1 EF  X7 Os   .Nc           
        U R                   (       d   e[        U5      [        U5      :X  d   e/ n[        [        R                     " 5       nU H  n[        U[        5      (       a  UR                  c   e[        UR                  5      [        U R                  5      -  (       a  UR                  U5        O?UR                  U R                  U / UR                  Q/ U R                  QS S P5      5        UR                  UR                  5        M     U R                  U R                  U5      5      nUR                  (       a0  [        UR                  5      [        U R                  5      -  (       d   e[        X5       VVs/ s H  u  pS[        U5       SU S3PM     n	nnU R!                  U R#                  U R$                  S   R&                  5      5      n
UR(                   S3nU S3nU R*                  R-                  U SU
 S	35        [        U R                  5      S:X  d   S
5       e/ U R                  Qu  nU[/        U5      0nU[/        U5      S-
  0n[        U5      S:X  a(  S nUR1                  U5      /nUR1                  U5      /nOwS n[3        [        U5      5       Vs/ s H  nUR1                  U5      SU S3-   PM     nn[3        [        U5      5       Vs/ s H  nUR1                  U5      SU S3-   PM     nnU R*                  R-                  U SU" U	5       35        [4        R6                  " [9        [;        5       5      5         U" UU5      nS S S 5        U R*                  R-                  UR1                  U5       SU" W5       35        [        U5      S:X  a  U4$ U Vs/ s H#  nU R                  U R                  U5      5      PM%     nn[=        U5       H*  u  nnU R*                  R-                  U SU SU S35        M,     [?        U5      $ s  snnf s  snf s  snf ! , (       d  f       N= fs  snf )Nr%   r`   ra   r8   r/  _rdomz.xz = hl.RDom([hl.Range(1, z)])z&multi-dimensional scan not implementedc                    U S   $ r2  rQ   r   s    rD   maybe_tuple&HalideKernel.scan.<locals>.maybe_tuple$  s    trF   c                ,    SSR                  U 5       S3$ )Nz
hl.Tuple([ra   r  )r  r   s    rD   rT  rU  +  s    #DIIaL>44rF   r  r  r  ) rS  rs   r   r   r;  r9   r  r  rZ  r  r  r
  r  r  r  r   r  r  r  r  r   r  re  r!   r(  r  r$   r8  r   r   r  r3  )rM   dtypesrA  values_origr	  all_used_dimsr   r=  r   initialru  scan_domscanscan_varscan_renames_curscan_renames_prirT  	read_left
read_rightr  rB  r.  unpack_varsr  s                           rD   r\  HalideKernel.scan  s'    $$$$6{c+....*,"5<<02 Ee%677EOO<WWW%//*Z8N8N-OOe$LL '$Ueoo$U7P9O9O7PQSRS7T$U
   1 ! \\$"5"5m"DE
##
:3G3G(H:""L
 )
 	
 

 !$F 3
 3 u-.bq9 3 	 

 D001A1A"1E1K1KLM oo&e,2		xj(@LM4))*a/ 	
4	
/ 0../$&8&>?$&8&>&BCv;! $,,-=>?I$--.>?@J5
 s6{++A ##$45!A3a@+   s6{++A ##$45!A3a@+  
 			zl#k'.B-CDE /@AB$Y
;K C		""#345S[9Q8RS	
 v;!= QWXQWAt||D$7$7$FGQWXk*DAqII1#SAaS :; +[!!k
: CB Ys$   P(0"P.*"P3
P88*Q	8
Qr  c                   U R                   R                  U R                  XS9n[        U[        5      (       d   eX$l        U$ r  )r  generater  r9   r  r  )rM   r   r  r  r  s        rD   r  HalideKernel.genfuncH  s?     hh		4?#01111!
rF   c                t    U R                   R                  5       n[        U[        5      (       d   eXl        U$ r   )r  newvarr9   r  r  )rM   r  r  s      rD   r  HalideKernel.newfuncP  s0    hhoo#01111!
rF   c                x    [         R                  R                  U5      R                  5       R	                  5       $ )z
We map all tensors to 1D buffers in Halide since Halide has trouble representing some strides that PyTorch
supports.  If there are gaps in the underlying layout the numel we pass to Halide includes the gaps while
PyTorch's numel excludes them.
)r$   r3  
get_buffer
get_layoutstorage_sizer  s     rD   halide_buffer_numel HalideKernel.halide_buffer_numelV  s+     ww!!$'224AACCrF   c                  ^ S n/ nU R                   R                  5       u  p4pS[        [        XE5      US9 H  u  nmUR	                  UT45        [        T[        5      (       d  M0  TR                  S:X  a  TR                  b   eUR                  U4S jU R                  R                  TR                  S5       5       5        M     U$ )zH
Halide requires scalar inputs before outputs, so need to reorder args.
c                ~    U u  p[        U[        5      (       a  gSUR                  ;   a  gSUR                  ;   d   eg)Nr%   out_ptrr   in_ptrr   )r9   r-   r   )	arg_tuple	_call_strr  s      rD   	arg_order.HalideKernel.halide_argdefs.<locals>.arg_orderc  s=    &NI#w''chh&388+++rF   r  r   c           
   3     >#    U  H<  nS [        UTR                  TR                  TR                  TR                  S94v   M>     g 7f)N)alias_of)r.   bufferr   r~  r   )r!  aliasr  s     rD   r"  .HalideKernel.halide_argdefs.<locals>.<genexpr>s  sJ       "G !!JJIIJJ%(XX	 "Gs   AArQ   )rt   python_argdefsr  r  r  r9   r.   r~  ry  r  r]  r&  r   )rM   rv  r  r.  r   r   call_strr  s          @rD   halide_argdefsHalideKernel.halide_argdefs^  s    
	 =?YY--/
a#CI9=MHcMM8S/*#y))zzQ3<<+???  "&!4!4!8!82!F 	 >" rF   c                .   / nU R                  5        GH=  u  p#[        U[        5      (       a	  SnSnSnSnOU R                  UR                      Vs/ s H'  n[        U R                  UR                  5      5      PM)     nnU R                  UR                      Vs/ s H'  n[        U R                  UR                  5      5      PM)     nn[        U5      [        U5      :X  d   e[        U R                  UR                     5      n[        UR                      S3nUR                  [        UUR                  UUUUR                  S95        GM@     [         R"                  R%                  5       n	U	R&                  S:X  aE  [(        R*                  R,                  /n
[(        R*                  R.                  nS[1        5       0nSnGOU	R&                  S:X  d   S5       eU	R2                  S	::  d   S
5       e[(        R*                  R4                  /n
[(        R*                  R6                  n[8        R:                  R=                  U	5      nSU
S	   ;  aF  S H@  u  nnUR>                  U:  d  M  UR@                  U:  d  M*  U
R                  SU U 35          O   U
R                  S5        SURB                  0n[E        S	U	R2                  5      nU
R                  S5        U
R                  S5        [(        R*                  RF                  (       d  U
R                  S5        [(        R*                  RH                  (       a  U
R                  S5        SU RJ                  ;   a  U
R                  S5        [M        USRO                  U
5      UUUS9$ s  snf s  snf )z)Compute metadata required by codecache.pyNlongr0  )shaper0  r~  ry  cpuparallelismcudazonly cpu/cuda supportedr   zonly default device supportedcuda_capability))      )r  r   )      )r  r   )r  r%   cuda_capability_user_contextstrict_float
no_runtime
no_assertsdebug64large_buffers-)target	schedulerscheduler_flagscuda_device)(r  r9   r-   rV  r   r0   r  r  r0  rs   rW  r/   r   r  r   ry  r$   r3  get_current_device_or_throwtyper   r   
cpu_targetscheduler_cpur    r  
gpu_targetscheduler_cudar;   r  get_device_propertiesmajorminormulti_processor_countr?   assertsr  rc   r   r  )rM   argtypesr.  r  r  r0  r~  r   r   current_devicer  schdulerr  r  
capabilityr  r  s                    rD   halide_kernel_metaHalideKernel.halide_kernel_meta  s   ))+FA#w'' "33CHH== $..qvv67=   "33CHH== $..qxx89=   5zS[000t22388<='		2315OOHH!! \\	% ,: <<>%'mm../F}}22H35O K!&&&0K2KK0!''1,M.MM,mm../F}}33H99.IJ q	1$LLE5!''50Z5E5E5N(8w&GH %M MM.)z??O
 a!5!56K 	n% 	l#}}$$MM,'==MM'"4###MM/*88F#+#
 	
Cs   .N.Nc                
  ^  T R                   R                  (       a  [        S5      eT R                  5       n[	        5       nUR                  SSS9  UR                  5         T R                  5        H  u  pE[        U[        5      (       a-  UR                  UR                   ST R                   S35        MG  UR                  (       d   U5       eSUR                  ;   a  SOS	n[        UR                  5      n[!        T R"                  UR                     5      nUR                  UR                   S
U SU SU S35        M     UR                  S5        UR                  5         T R                  5        H/  u  pEUR                  UR                   SUR                   35        M1     T R                   R%                  5        H  u  pUR                  U	 S
U
 35        M     UR                  T R&                  5        U 4S jnT R(                  R*                   HH  n[        U[,        5      (       a  [.        R0                  R3                  X5      nUR                  U5        MJ     UR                  S5        UR                  S5        T R                  5        GH  u  pE[        U[        5      (       aU  [4        R6                  R8                  R;                  UR<                  SS9nUR                  UR                   SU S35        Mp  T R"                  UR                     n/ n[?        U5       GH  u  nnT RA                  [4        R6                  R8                  R;                  URB                  SS9U5      nURE                  SU S35        SUR                  ;  d  Mp  UR                  UR                   SU S35         UR                  UR                   SU S[G        URH                  5       S35         UR                  UR                   SU S[G        URB                  5       S35        GM     UR                  UR                   SSRM                  U5       S35        GM     URO                  S5        UR                  SRQ                  5       5        URR                  (       am  UR                  S[T        RV                  " URR                  5      < SURX                  < S URR                  < SURZ                  < S!3	SS9  UR]                  5       $ UR                  S"URX                  < S#3SS9  UR]                  5       $ ! [J         a     GN\f = f! [J         a     GM7  f = f)$z3Called at the end to generate a final kernel stringinplace_buffersz
            import halide as hl
            from torch._inductor.runtime import halide_helpers
            from math import inf, nan

            @hl.generator(name="kernel")
            class Kernel:
        Tstripz = hl.InputScalar(r8   outzhl.OutputBufferzhl.InputBufferr  r   ra   z&
            def generate(g):
        z = g.c                   > [        [        TR                  R                  U R	                  S5         5      nUR
                  c   U5       e[        U5      $ )Nr%   )r   r  r  r  r  r  r  )r  r  rM   s     rD   update_index1HalideKernel.codegen_kernel.<locals>.update_index  sE    ($((*>*>qwwqz*JKC==,1c1,s8OrF   r  zassert g.using_autoscheduler()r%   ri  z.set_estimate(r  z.dim(z).set_min(0)z).set_stride(z).set_extent(z.set_estimates([r  r   zN
            if __name__ == "__main__":
                hl.main()
            z:
                else:
                    hl.load_plugin(z))
                    target = hl.Target(z=)
                    autoscheduler = hl.AutoschedulerParams(a  )
                    with hl.GeneratorContext(target, autoscheduler):
                        gen = Kernel()
                        pipeline = gen._build_pipeline()
                        # gen.compile_to_callable() does not run the autoscheduler
                        pipeline.apply_autoscheduler(target, autoscheduler)
                        kernel = pipeline.compile_to_callable([
                                gen._get_input_parameter(a.name)._to_argument()
                                for a in gen._get_arginfos()
                                if a.dir == hl.ArgInfoDirection.Input
                            ], target)
                zR
                  else:
                      with hl.GeneratorContext(hl.Target(zX)):
                          kernel = Kernel().compile_to_callable()
                  )/rt   r  rH   r  r)   splice	do_indentr  r9   r-   re  r   rc   rz  r   r   rs   rV  aliasesr  r  _linesr  r  r,  r  r$   r3  r4  rE  re   r  _autoscheduler_workaroundsr  r  r:   r0  rF  r  do_unindentrstripr  r   find_libautoscheduler  r  getvalue)rM   r   metacoder.  r  argclsargtypendimr  r   r  r   hintr  range_hintsr  dims   `                 rD   codegen_kernelHalideKernel.codegen_kernel  s   99$$/00&&(  	 
	
 	))+FA#w''#((+=d>N>N=OqQRzz&3&z.3sxx.?*EU%cii0411#((;<#((3vhay4&JK , 		

 	))+FANNchhZuSXXJ78 ,		))+HCNNcU#cU+, ,D&&'	
 II$$D$$$(5599,MNN4 	 %
 	r78))+FA #w''ww''11#((Q1G#((>$qAB--chh7 'oFAs::((22388a2H$D  &&dV1'=>CHH,#((5<'HI! NN#&88*E!M#cjj/ARRS T
! NN#&88*E!M#chh-PQ R .& #((+;DIIk<R;SSUVW; ,> 	 		
 >>KK$$3$H$H$X#[ \((, 7<<@NN;MRPTPdPdOg h	  #  8 }} KK::>++ I
    }}]  ) ! !  ) ! !s$   7U97U
UU
U'&U'c                    [        U5      S:X  aV  [        R                  R                  S:X  a8  [        R
                  R                  5       R                  S:X  a  [        SU 5      n U $ )Nr%   Anderson2021r  r   )	rs   r   r   r  r$   r3  r  r  r?   )r   r  s     rD   r  'HalideKernel._autoscheduler_workaroundsN  sN     IN,,>335::fD Aq	ArF   c                   [         R                  R                  nU R                  5        VVs/ s H  u  pEUR                  b  M  U PM     nnn[         R                  R                  5       nUR                  S:X  a;  UR                  UR                  [         R                  5      nUR                  U5        UR                  UUUSS9  gs  snnf )zCodegen a call to this kernelNr  F)devicetriton)r$   r3  wrapper_coder  ry  r  r  write_get_raw_streamr  r  generate_kernel_call)	rM   r   ry  wrapperr   r  	call_argsr  stream_names	            rD   call_kernelHalideKernel.call_kernelY  s    ''&&*.*=*=*?X*?3<<VsV*?	X<<>&(!66~7K7KQWWUK[)$$!	 	% 	
 Ys   CCc                    gr@  rQ   )rM   r  s     rD   generate_assertHalideKernel.generate_asserth  s    rF   c                    g r   rQ   )rM   re   r  loweruppers        rD   check_boundsHalideKernel.check_boundsk  s     	rF   )r]  rV  rW  rO  r[  rX  r^  rU  rY  rR  rP  rT  rZ  rQ  )r_  zdict[str, sympy.Expr]rS   rT   )r   r  rS   r  )NN)r  zSequence[sympy.Expr])r  r/  )r  r  r  r/  r  r   r@  )r   r  r  r/  )r   r  r   )
r   r  r  r/  r   r'   r%  r6   rS   rT   )
r   r  r   r  r9  r5   r   +Union[CSEVariable, tuple[CSEVariable, ...]]rS   r  )rW  ztuple[torch.dtype, ...]rA  zUCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]rX  tuple[CSEVariable, ...]rS   r  )rS   r  )rS   r   )re   r/  r  r/  r  r   r  r   ),rU   rV   rW   rX   r   	overridestexprr  rA  rL   rb  rf  r  r  r  r  r  r  r  r  r  r  r  r  r<  r(  rD  r4  r\  r   unknownr  r  rn  r  r  r  r   r  r  r  r  rY   rZ   r[   s   @rD   rM  rM    s   I).E&.+%+ 
	+6"6iV*BNN%f:P(
..
%HN? SW66 *63>6FO6	6:;; ; &	;
 ;; 
5;z$LQ"'Q"
Q" -Q" 
!Q"h *5)<)<)>	D"HQ
fwr  
&09=FJ rF   rM  c                  2    \ rS rSr\r\SS j5       rS rSr	g)HalideSchedulingiq  c                    [        [        R                  [        R                  [        R                  /5      n[
        R                  R                  (       a  UR                  [        R                  5        U$ r   )
r   r&   TUPLE_REDUCTIONPREFER_STORE_LOOP_ORDERREDUCE_TO_SINGLE_ELEMENTr   r   scan_kernelsrp  SCAN)r   r  r  s      rD   get_backend_features%HalideScheduling.get_backend_featurest  sR    ..6677
 ==%%JJ~**+rF   c                   [         R                  R                  nXR                  ;   a  UR                  U   nU$ SUR	                  5        3nXTR                  U'   UR                  S5        [        5       nUR                  SUR                  5       < S35        UR                  USS9  UR                  S5        [        X$5      u  pxU SU 3n	UR                  XVR                  5       U	5        [        S	5      (       a  [        US
U5        U$ )z6Codegen kernel definition to go in output wrapper codehalide_kernel_zEfrom torch._inductor.runtime.hints import HalideMeta, HalideInputSpeczasync_compile.halide(z, '''Tr  z''')
kernel_metadatar  )r$   r3  r  src_to_kernelnext_kernel_suffixadd_import_oncer)   re  r  r  r   define_kernelr  r   r   )
rM   src_codenode_schedulerb   r  kernel_namecompile_wrapperoriginsdetailed_originsmetadata_comments
             rD   r  HalideScheduling.define_kernel  s   ''&&,,,!//9K. + +7+E+E+G*HIK.9!!(+##W -.O%%'(A(A(C'FeL ""84"8%%f-(;M(S%G")"-=,>?!!5579I ''899#KX>rF   rQ   N)r  ztorch.devicerS   zOrderedSet[BackendFeature])
rU   rV   rW   rX   rM  kernel_typer  r  r  rY   rQ   rF   rD   r  r  q  s    K
 
rF   r  )r
__future__r   dataclassesr  r  loggingr*  collectionsr   mathr   typingr   r   r   r	   r
   r   r   r;   torch._logging_prims_commonr   utils._ordered_setr   utils._sympy.functionsr   r   utils._sympy.symbolr   r   utils._sympy.value_rangesr   r  r   r   	codecacher   r   metricsr   r   ops_handlerr   runtime.hintsr   r   utilsr   r   r    r!   r"   virtualizedr#   r  r$   commonr&   r'   r(   r)   r*   r+   r,   r-   r.   cppr/   	cpp_utilsr0   simdr1   r2   r3   collections.abcr4   r5   r6   	getLoggerrU   r  rE   RuntimeErrorrH   r]   r   r  pexprr   r   r   r   float64r  int16r   r=   uint8uint16uint32uint64r   r   r   r   _initialize_pointwise_overridesr  	dataclassr.  rH  r  rM  r  rQ   rF   rD   <module>r     s   "     	 #  F F    - , ? 7 4  ' ) B ) 7  )
 
 
   ; ; (6!
F, F
zQM zQz 	 
JJ	NNO	MM>	MM>	MM>	JJ	KK	KK	KK	KK	LL-	LL-	LL-"C+k C+L
  / / 9$P $PN + + +>
 T: Tn+~ +rF   