
    shLs                         S SK r S SKrS SKrS SKJr  S SKrS SKJr  S SKJ	r	  S SK
JrJrJrJr  S SKJr  S SKJr  SS	KJrJr  SS
KJrJr  SSKJr  SSKJr  SSKJrJr  SSK J!r!  SSK"J#r#J$r$J%r%  SSK&J'r'J(r(J)r)J*r*J+r+J,r,J-r-J.r.J/r/  SSK0J1r1J2r2J3r3J4r4J5r5J6r6J7r7J8r8J9r9J:r:J;r;J<r<J=r=  S\>S\\?\@S4      4S jrA S SKBrB\A" \BR                  5      rDSrE\Db  \Du  rFrGOS rFS rG \R                  " \J5      rK\R                  R                  rM\%" S\7\R                  R                  b  \E(       a  \FS:  a  \GS:  a  SOSS9rP\%" S\:SS9rQ\ R                  " S5      S  5       rS\$" \R                  S!5      rU\$" \R                  S"\MR                  R                  S#9rX\$" \R                  S$5      rZ\$" \R                  S%SS&9r\S' r]S( r^S) r_SSSS*.S+ jr`\$" \`S5      ra\!" \MR                  SS,9SS-.S. j5       rb\!" \MR                  SS,9SS-.S/ j5       rc\!" \MR                  SS,9SSSS0.S1 j5       rd\!" \MR                  SS,9SSS2.S3 j5       re\ R                  " S5      S4\\@   S\f4S5 j5       rgS6 rhS7 ri  S<S8\\@   4S9 jjrjS: rkS; rlg! \H a    SrDSrES rFS rG GNf = f)=    N)Optional)counters)AutoHeuristicSelectAlgorithm)	AHContextcontext_add_stridescontext_add_using_tf32mm_operations)CppGemmTemplate)V   )configir)CUTLASS2xGemmTemplateCUTLASS3xGemmTemplate)CKGemmTemplate)PythonWrapperCodegen)FlexibleLayout	is_triton)register_lowering)autotune_select_algorithmExternKernelChoiceTritonTemplate)	get_gpu_shared_memoryget_tma_workspace_arguse_aten_gemm_kernelsuse_ck_gemm_templateuse_cpp_gemm_templateuse_cutlass_templateuse_max_autotuneuse_triton_templateuse_triton_tma_template   )_is_static_problemaddmm_epilogueextra_mm_configsint8_mm_configsmm_args
mm_configsmm_grid
mm_optionspersistent_mm_configspersistent_mm_gridpersistent_mm_optionsshould_fallback_to_atentriton_configversion_stringreturn.c                     Sn[         R                  " X5      nU(       a   [        S UR                  5        5       5      $ g )Nz(\d+)\.(\d+)?c              3   8   #    U  H  n[        U5      v   M     g 7fN)int).0groups     m/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/_inductor/kernel/mm.py	<genexpr> parse_version.<locals>.<genexpr>?   s     <^ESZZ^s   )rematchtuplegroups)r0   patternr<   s      r8   parse_versionr@   :   s2    GHHW-E<U\\^<<<    TFmm   aX	  
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    if ((stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1)) and M >= BLOCK_M:
        offs_a_m = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        offs_a_m = rm % M
    if ((stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1)) and N >= BLOCK_N:
        offs_b_n = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        offs_b_n = rn % N
    offs_k = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    for k_idx in range(0, tl.cdiv(K, BLOCK_K)):
        {% if not EVEN_K %}
        a_mask = offs_k[None, :] < (K - k_idx * BLOCK_K)
        b_mask = offs_k[:, None] < (K - k_idx * BLOCK_K)
        {% endif %}
        a_k_idx_vals = offs_k[None, :] + (k_idx * BLOCK_K)
        b_k_idx_vals = offs_k[:, None] + (k_idx * BLOCK_K)

        idx_m = offs_a_m[:, None]
        idx_n = a_k_idx_vals
        {{load_input("A", "a", ("idx_m", "idx_n"), mask=None if EVEN_K else "a_mask", indent_width=8)}}

        idx_m = b_k_idx_vals
        idx_n = offs_b_n[None, :]
        {{load_input("B", "b", ("idx_m", "idx_n"), mask=None if EVEN_K else "b_mask", indent_width=8)}}
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32)

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
a2	  
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1):
        offs_a_m = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        offs_a_m = rm % M
    if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1):
        offs_b_n = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        offs_b_n = rn % N
    offs_k = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    for k_idx in range(0, tl.cdiv(K, BLOCK_K)):
        {% if not EVEN_K %}
        a_mask = offs_k[None, :] < (K - k_idx * BLOCK_K)
        b_mask = offs_k[:, None] < (K - k_idx * BLOCK_K)
        {% endif %}
        a_k_idx_vals = offs_k[None, :] + (k_idx * BLOCK_K)
        b_k_idx_vals = offs_k[:, None] + (k_idx * BLOCK_K)

        idx_m = offs_a_m[:, None]
        idx_n = a_k_idx_vals
        {{load_input("A", "a", ("idx_m", "idx_n"), mask=None if EVEN_K else "a_mask", indent_width=8)}}

        idx_m = b_k_idx_vals
        idx_n = offs_b_n[None, :]
        {{load_input("B", "b", ("idx_m", "idx_n"), mask=None if EVEN_K else "b_mask", indent_width=8)}}
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32)

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
)namegridsourcemm_persistent_tmaal  
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return

    start_pid = tl.program_id(0)
    grid_m = tl.cdiv(M, BLOCK_M)
    grid_n = tl.cdiv(N, BLOCK_N)
    k_tiles = tl.cdiv(K, BLOCK_K)
    num_tiles = grid_m * grid_n
    tiles_per_SM = num_tiles // NUM_SMS
    if start_pid < num_tiles % NUM_SMS:
        tiles_per_SM += 1

    tile_id = start_pid - NUM_SMS
    ki = -1

    width = GROUP_M * grid_n
    rk_for_mask = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    workspace_base = ws_ptr + start_pid * 2 * TMA_SIZE
    a_desc_ptr = workspace_base
    b_desc_ptr = workspace_base + TMA_SIZE

    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=a_desc_ptr,
        global_address=A,
        load_size=[BLOCK_M, BLOCK_K] if A_ROW_MAJOR else [BLOCK_K, BLOCK_M],
        global_size=[M, K] if A_ROW_MAJOR else [K, M],
        element_ty=A.dtype.element_ty,
    )
    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=b_desc_ptr,
        global_address=B,
        load_size=[BLOCK_K, BLOCK_N] if B_ROW_MAJOR else [BLOCK_N, BLOCK_K],
        global_size=[K, N] if B_ROW_MAJOR else [N, K],
        element_ty=B.dtype.element_ty,
    )

    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(a_desc_ptr)
    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(b_desc_ptr)

    pid_m = 0
    pid_n = 0
    rm = 0
    rn = 0

    for _ in range(0, k_tiles * tiles_per_SM):
        ki = tl.where(ki == k_tiles - 1, 0, ki + 1)
        if ki == 0:
            tile_id += NUM_SMS
            # re-order program ID for better L2 performance
            group_id = tile_id // width
            group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
            pid_m = group_id * GROUP_M + (tile_id % group_size)
            pid_n = (tile_id % width) // (group_size)

            rm = pid_m * BLOCK_M
            rn = pid_n * BLOCK_N

        rk = ki * BLOCK_K

        a = tl._experimental_descriptor_load(
            a_desc_ptr,
            [rm, rk] if A_ROW_MAJOR else [rk, rm],
            [BLOCK_M, BLOCK_K] if A_ROW_MAJOR else [BLOCK_K, BLOCK_M],
            A.dtype.element_ty,
        )
        b = tl._experimental_descriptor_load(
            b_desc_ptr,
            [rk, rn] if B_ROW_MAJOR else [rn, rk],
            [BLOCK_K, BLOCK_N] if B_ROW_MAJOR else [BLOCK_N, BLOCK_K],
            B.dtype.element_ty,
        )
        acc += tl.dot(
            a if A_ROW_MAJOR else a.T,
            b if B_ROW_MAJOR else b.T,
            allow_tf32=ALLOW_TF32,
        )

        if ki == k_tiles - 1:
            # rematerialize rm and rn to save registers
            rcm = rm + tl.arange(0, BLOCK_M)
            rcn = rn + tl.arange(0, BLOCK_N)
            idx_m = rcm[:, None]
            idx_n = rcn[None, :]
            mask = (idx_m < M) & (idx_n < N)

            # inductor generates a suffix
            {{store_output(("idx_m", "idx_n"), "acc", "mask", indent_width=12)}}
            acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
c                     [        U 5      $ r4   )r   )fns    r8   lazy_register_extern_choicerJ   M  s    b!!rA   z
at::mm_outzat::addmm_out)op_overloadzat::_int_mm_outzat::_sparse_semi_structured_mm)has_out_variantc                 d    U R                  5       [        R                  [        R                  4;   $ r4   )	get_dtypetorchint8uint8)mats    r8   _is_int8_matrS   a  s     ==?uzz5;;777rA   c                     X-  S:  $ )Ni     )mnks      r8   _is_large_block_for_cpurY   e  s    55=rA   c                 $    U S:X  a	  S[         S.$ 0 $ )Ncpug      ?)scaleexclude)rY   )devices    r8   mm_config_kwargsr_   j  s     .
 	
 IrA   outalphabetac          	          U R                  S5      S:X  d  U R                  S5      S:X  a  [        R                  " U S   XX4US9$ [        R                  " XX#XES9$ )z
Giving torch.addmm a 1D tensor calls a different (faster) cublasLt
kernel under the hood.  There are a few shapes where this is slower,
but they are rare.
r   r"   r`   )stridesizerO   addmm)inpmat1mat2ra   rb   rc   s         r8   
bias_addmmrk   s  sO     zz!}SXXa[A-{{3q643$OO;;s$uHHrA   )type_promotion_kindlayoutc                x   [        XUS9u  p4pRpSn[        S   SU SU SU 3==   S-  ss'   [        R                  SUUUU R	                  5       UR	                  5       U5        Un[        5       (       d)  [        UR                  UR                  UR                  S9n[        5       (       a  [        R                  X4U5      /O/ n[        U5      u  pU
(       a  [        U5      (       a  [        X4U40 [!        ["        R$                  " U 5      5      D6 H)  n[&        R(                  " U4X4US	.[+        XXEU5      D6  M+     [-        X5      (       ay  [/        X4U40 [!        ["        R$                  " U 5      5      D6 HK  n[0        R(                  " U4X4U[3        S
U R5                  5       S9S.[+        XXEU5      D[7        X5      D6  MM     U
(       a)  [9        X#XE5      (       a  [:        R<                  " XX/5        U
(       a)  [?        X#XE5      (       a  [@        RB                  " XX/5        [E        X U5      (       a  [F        RH                  " UUX/5        X/nU
(       GaQ  [        U5      (       Ga@  [J        RL                  RN                  RQ                  U5      (       Ga  [S        U 5      (       Ga   / n[        5       (       a  URU                  S5        [W        U5      n[Y        X4U40 [!        ["        R$                  " U 5      5      D6 H)  n[&        R(                  " U4X4US	.[+        XXEU5      D6  M+     [[        U UUUUUUU[]        5       S SUS9n[J        RL                  RN                  R_                  U5      (       d2  Ub*  [W        U5      S:  a  U Vs/ s H  nUU;   d  M  UPM     nnOUS U n[`        Rb                   H.  nURU                  [e        U5      R                  X4U5      5        M0     [g        U5      (       a%  [        R                  X4U5      Ri                  5       $ [k        XhX/U5      $ s  snf )Nrm   rB   aten_mm_infozaten.mm__r"   zOTuned aten.mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sr^   dtyperf   input_nodesrn   r   num_tma_descriptorsr^   ru   rn   workspace_arg	extern_mm
   )top_kalways_includedr   )6r'   r   loginforN   r   r   r^   rs   rf   r   aten_mmbindr#   r    r(   r_   r   get_device_typemm_templatemaybe_append_choicer*   r!   r+   persistent_tma_mm_templater   
get_devicer-   r   r   add_cutlass_gemm_choicesr   r   add_ck_gemm_choicesr   r
   add_choicesrO   	_inductorr   run_autoheuristicr   appendlenr%   mm_autoheuristicr	   collect_autoheuristicinductor_configexternal_matmulrJ   r.   output_noder   )ri   rj   rn   rV   rW   rX   rD   aten_layoutchoicesstatic_shape
is_nonzeror   ru   r}    num_choices_before_extra_configs
ah_choiceschoices                    r8   tuned_mmr     s   ")$V"DA!TD ^xs!A3as3494HHY			 K$==6;;
 6K5L5LtlK	01RT   2&9L)&11 qW,<R=O=OPT=U,VWF++!L Vf5	 X #4../a+B,>,>t,DE +>>
!%!"7,-#0#	
 !A&9
 ,D7
 *6a;;66wU*6a;;**7TLIV400##L	
 ,K''OO""44T::dOO """";/+.w<(&!
'(:(:4(@A
F ++!L Vf5	
 &O+

 %%;;DAA%#j/A*=
 18Pf6Z;O6P!"C#CD,,215::D<PQ - w''||TL+6BBDD$TTL&II Qs   
P7$P7c                ,   [        XU[        R                  S9u  p4pRp[        S   SU SU SU 3==   S-  ss'   [        R                  SUUUU R                  5       UR                  5       U5        [        U5      u  pgU=(       a    U=(       a    [        X#XE5      n[        5       (       a  [        R                  X4U5      /O/ n	U(       a  [        R                  " XX/SSS9  U(       af  [        USS	9(       aW  [        X4U40 [!        ["        R$                  " U 5      5      D6 H)  n
[&        R(                  " U	4X4US
.[+        XXEU5      D6  M+     [-        U	5      (       a%  [        R                  X4U5      R/                  5       $ [1        SXU/U5      $ )N)rn   	out_dtyperp   zaten._int_mm_rq   r"   zTTuned aten._int_mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sTfuseablenon_fuseable)enable_int32rt   int_mm)r'   rO   int32r   r~   r   rN   r#   r   r   aten__int_mmr   r   r   r    r&   r_   r   r   r   r   r*   r.   r   r   )ri   rj   rn   rV   rW   rX   r   r   use_cutlassr   r   s              r8   tuned_int_mmr     s   ")6U[[#A!T
 ^}QCq1QC89Q>9HH^			  2&9LW:W2FvRS2WK 6K5L5L		D<	01RT  66d\Dt	
 )&tD%!
'(:(:4(@A
F ++!L Vf5	
 w''  $v6BBDD$XwtfMMrA   )rb   rc   rn   c                   Sn[        XXS9u  pxppn
[        U5      u  p[        S   SU SU SU	 3==   S-  ss'   [        R	                  SUUU	UR                  5       UR                  5       U5        U(       a  [        5       (       dw  SS	KJnJ	n  [        X]5      (       a&  U" UR                  UR                  UR                  S
9n[        5       (       a  [        R!                  XU4UUUS9/O/ n[#        SXX/U5      $ [        5       (       a  [        R!                  XU4UUUS9/O/ n[        5       (       a{  U
R%                  5       S   S:X  ad  U
R'                  5       R(                  S:X  aF  [*        R,                  R.                  (       a'  UR1                  S[2        R!                  XU4XSUS95        U(       Ga%  [5        U5      (       Ga  [7        XxU	40 [9        [:        R<                  " U5      5      D6 HC  n[>        R@                  " U4XU4US.[C        UXxX5      DS[E        UR                  X45      S.D6  ME     [G        X5      (       a  [I        XxU	40 [9        [:        R<                  " U5      5      D6 He  n[J        R@                  " U4XU4U[M        SUR'                  5       S9S.[C        UXxX5      D[O        X5      DS[E        UR                  X45      S.D6  Mg     U(       af  U(       a_  [Q        XWX5      (       aN  [R        RT                  " U
RV                  RX                  S   5      S:w  a  [Z        R\                  " UUXU
/UU/ SQS9  U(       a.  [_        XWX5      (       a  [`        Rb                  " UUXU
/UU/ SQS9  [e        XQU5      (       a  [f        Rh                  " UUXU/UUSS9  [k        U5      (       a  URm                  [        R!                  XU4UUUUS95        U
R%                  5       S   S:X  ad  U
R'                  5       R(                  S:X  aF  [*        R,                  R.                  (       a'  UR1                  S[2        R!                  XU4XSUS95        [#        SXX/U5      $ )N)rc   rb   rm   rp   zaten.addmm_rq   r"   zRTuned aten.addmm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sr   )FixedLayoutr   rr   )rb   rc   rg   cudart   )prefix_argsepilogue_fnr   rv   rx   )r   r   r"   )rb   rc   input_reorderT)rb   rc   has_bias)7r'   r#   r   r~   r   rN   r   torch._inductor.irr   r   
isinstancer^   rs   rf   r   
aten_addmmr   r   
get_strider   typer   tritonautotune_cublasLtinsertaten_bias_addmmr    r(   r_   r   r   r   r   r*   r$   r!   r+   r   r   r-   r   r   statically_known_int_or_nonern   re   r   r   r   r   r   r   r
   r   r.   r   )rh   ri   rj   rb   rc   rn   ordered_kwargs_for_cpp_kernelrV   rW   rX   inp_expandedr   r   r   r   r   r   s                    r8   tuned_addmmr   (  s   $5!07C0W-A!T1&9L ^{1#Qqc1#671<7HH\			  0 2 2 	Cf**#}}FLLv{{F %&& %	     	 )';LfUU !"" OOT*	  	
   	##%a(A-##%**f4""44 	  T*Fd ! 	
 )&11 qW,<R=O=OPT=U,VWF++)6 VQ15	
 *6<<E X #4../a+B,>,>t,DE +>>!-T :!"7,-#0#	 !q9 ,D7 !" .v||U I" 
';Fq'L'L
 !==##**2. 
 "::\*' *6a;;**&#	
 V400##&	
 w''OOT*-  	
 ##%a(A-'')..&8&&88 NN$$!.$ %  %4f rA   )r   rn   c                   SSK Jn  U" XU5      u  pnU R                  5       u  pgUR                  5       u  pUR                  5       u  p[        R                  R
                  R                  Xh5      n[        R                  R
                  R                  SU-  U
5      nUc:  SSKJn  U" UR                  5       U(       a  UOUR                  5       X/US/5      nO
Ub   S5       e[        5       (       a  [        R                  XU4XCS9/O/ nX-  S:w  a*  [        XLX5      (       a  [        R                   " XXU/SSS	9  [#        S
XX/U5      $ )Nr   )realize_inputsr   )r   r"   z,out_dtype is ignored if layout is specified.)r   Tr   sparse_semi_structured_mm) torch._inductor.select_algorithmr   get_sizer   graphsizevarsguard_equalsr   r   r   rN   r   aten__sparse_semi_structured_mmr   r   r   r   r   )ri   	mat1_metarj   r   rn   r   m1k1m2rq   k2rW   rV   rX   r   r   s                   r8   tuned_sparse_semi_structured_mmr     sP    @*4DADT]]_FB EBMMOEB	%%b-A	%%a"fb1A~2OO"I(8FF	
  P"PP  !""	 ,00$' 1 	
   	uz*6a;;66d)4tRV	
 %#WY.Ev rA   indexc                 p    [         R                  R                  U =(       d    S5      nUR                  S:*  $ )Nr      )rO   r   get_device_propertiesmajor)r   propss     r8   _is_sm7x_or_older_gpur     s)    JJ,,UZa8E;;!rA   c                 &    [        S U  5       5      $ )Nc              3   B   #    U  H  n[        U[        5      v   M     g 7fr4   )r   r5   )r6   dims     r8   r9   dims_are_int.<locals>.<genexpr>  s     4tz#s##ts   )all)dimss    r8   dims_are_intr      s    4t444rA   c                    [        XEXU5      u  pn[        XU/5      (       d  g UR                  [        R                  :w  a  g [        R
                  R                  5       S:  a  [        5       S:w  a  g U S:X  a  US-  S:w  d	  US-  S:w  a  g U S::  a  US:  a  US:  a  [        SSSS	S
S9$ U S:  a  U S::  a  US:  a  US:  a  [        SSSS	S
S9$ U S:  a  U S::  a  US:  a  US:  a  [        SSSS	S
S9$ g )N)   r   i  r"      r   i   @            )BLOCK_MBLOCK_NBLOCK_K
num_stages	num_warps    )	get_size_hintsr   rs   rO   float16r   get_device_capabilityr   r/   )rV   rW   rX   r   ri   rj   
mat2_dtypern   s           r8   try_heuristicr     s   Tq1GA!q	""zzU]]" JJ,,.&8		 F	*Av1r6Q;!b&A+Bw19d
 	
 
RAGT	a4i
 	
 
RAGT	a4i
 	
 rA   r|   c           
         ^ [        XX#U5      u  p#n[        X#U/5      (       d  g [        X5      u  pU4S jnS nU" X$X0XU5      n[        UUUUTUU	S9nU
b  UR	                  XS9$ UR                  5       $ )Nc                 >  > [        5       nUR                  SU 5        UR                  SU5        UR                  SU5        UR                  SUR                  R                  SS9  UR                  SUR                  R                  SS9  [	        USU5        [	        US	U5        UR                  S
UR                  R                  5       SS9  UR                  SUR                  R                  5       SS9  TS:X  a  [        XsR                  R                  5        U$ )NrV   rX   rW   
mat1_dtypeT)is_categoricalr   ri   rj   mat1_iscontigmat2_iscontigrB   )r   add_featurern   rs   r   is_contiguousr   )	rV   rX   rW   ri   rj   mat1_stridemat2_stridecontextrD   s	           r8   get_context%mm_autoheuristic.<locals>.get_contextE  s    +C#C#C#L$++*;*;DQL$++*;*;DQGV[9GV[9T[[668 	 	
 	T[[668 	 	
 4<"7KK,=,=>rA   c                      g r4   rU   rU   rA   r8   fallback"mm_autoheuristic.<locals>.fallbackY  s    rA   )r   r   ru   r   rD   augment_contextprecondition)r}   )r   r   get_size_hints_stridesr   get_top_k_choices_callerget_choice_caller)ri   rj   rV   rW   rX   r   rD   ru   opsr   r|   r}   r   r   r   r   r   autoheuristics         `           r8   r   r   2  s     Tq1GA!q	""5dAK( !KHG0!M 55 6 
 	
 **,,rA   c                    [        U[        5      (       a  [        U[        5      (       dZ  [        R                  R                  R                  U R                  5       [        R                  R                  R                  S9u  p$[        U[        5      (       a  [        U[        5      (       dZ  [        R                  R                  R                  UR                  5       [        R                  R                  R                  S9u  pCX#U4$ )Nr   )r   r5   r   r   r   
size_hintsr   rO   r   r   unbacked_symint_fallback)ri   rj   rV   rW   rX   s        r8   r   r   p  s    aZ3%7%7!!,,MMO__++DD - 

 aZ3%7%7!!,,MMO__++DD - 
 7NrA   c                 j   U R                   R                  nUR                   R                  nX#/n/ nU Hs  n[        U[        5      (       dJ  [        R
                  R                  R                  U[        R                  R                  R                  S9nUR                  U5        Mu     US   US   4$ )Nr  r   r"   )rn   re   r   r5   r   r   r   r  rO   r   r   r  r   )ri   rj   r   r   stridesstrides_hintsre   s          r8   r   r     s    ++$$K++$$K(GM&#&&WW%%00//HH 1 F 	V$  ]1---rA   )NN)m	functoolsloggingr;   typingr   rO   torch._dynamo.utilsr   +torch._inductor.autoheuristic.autoheuristicr   1torch._inductor.autoheuristic.autoheuristic_utilsr   r   r   r	   )torch._inductor.codegen.cpp_gemm_templater
   torch._inductor.virtualizedr    r   r   r   codegen.cuda.gemm_templater   r   'codegen.rocm.ck_universal_gemm_templater   codegen.wrapperr   r   r   loweringr   select_algorithmr   r   r   utilsr   r   r   r   r   r   r   r    r!   	mm_commonr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   strr=   r5   r@   r   __version__triton_version
has_tritontriton_majortriton_minorImportError	getLogger__name__r~   r   atenversionhipr   r   	lru_cacherJ   rB   r   rg   defaultr   _int_mmr   _sparse_semi_structured_mmr   rS   rY   r_   rk   r   r   r   r   r   boolr   r   r   r   r   r   rU   rA   r8   <module>r)     s     	   ( T  F ) , U D 2 * ( 

 
 
   "# (5c?*C "6#5#56NJ!%3"l !yy~~		F MM%<1,1BE@	L@UL\ ,		`d P T" " UXX|
4	KKdjj.@.@
 "%--1BC"4	$$$# 8
 (,11 I %Z6 4775#' uJ 6uJp 4<<T:'+ *N ;*NZ 4::48*+!D a 9aH 422M(,T) N)X T# 4  
5+r  ;- C=;-|.c  NJLL	s   5 I; I; ;JJ