
    shX              	         S SK Jr  S SKrS SKJrJrJrJr  S SKrS SK	J
r
  SSKJrJr  SSKJrJrJrJr  SSKJrJrJrJr  SS	KJrJrJrJrJrJr  SS
K J!r!  SSK"J#r#J$r$  \(       a  S SK%J&r&  SSKJ'r'  \RP                  " \)5      r*\RV                  RX                  r,\S 5       r-\S 5       r.SSS.SSS.SSS.SSS.SSS.SSS.SSS./r/\0" S \/ 5       5      r1\Rd                  Rf                  (       a'  \Rh                  Rk                  5       (       a  \#" \15      r1S r6S r7Sr8 \" S\-S\8-   S -   \8-   S!-   S"9r9S#r:\" S$\.S%\:-   S&-   \:-   S'-   S"9r;\" \Rx                  S(S)\,Rx                  Rz                  S*9r>S+ r?\" \?S5      r@ " S, S-\5      rA                    S4S. jrBS/ rCS0 rD\" \,Rx                  5                        S5S1 j5       r<\" \,R                  5      S2 5       rES3 rF\" \,Rx                  \F5        g)6    )annotationsN)castOptionalTYPE_CHECKING	TypedDict)CKGroupedConvFwdTemplate   )configir)add_layout_constraintconstrain_to_fx_strides	loweringsregister_lowering)autotune_select_algorithmExternKernelChoiceSymbolicGridFnTritonTemplate)is_onesis_zerospad_listlikesympy_productuse_ck_conv_templateuse_triton_template)V   )build_rocm_gemm_configsfiltered_configs)Sequence)	TensorBoxc               B    U" X-  U-  US   5      U" XS   5      US   4$ NBLOCK_MBLOCK_NGROUPS )nchwmetacdivs         o/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/_inductor/kernel/conv.pyconv2d_gridr-   .   s5     	QUQYY(QY X     c               H    U" X-  U-  U-  US   5      U" XS   5      US   4$ r!   r%   )r&   r'   dr(   r)   r*   r+   s          r,   conv3d_gridr1   7   s9     	QUQY]DO,QY X r.   )@         r	      T)r
   cond)r3   r2   r4   r	   r5   )i   r4   r4   r      )   r8       r	   r7   )r2   r2   r9   r	   r5   )r2   r3   r9   r	   r7   )r3   r2   r9   r	   r7   c           	   #     #    U  H@  nUS    (       d  M  [        [        [        [        [        [        [        4   US   5      v   MB     g7f)r6   r
   N)r   tupleint).0r
   s     r,   	<genexpr>r>   N   s?       f~ ;DsCc3&	')9:: s
   A
6A
c                <    U S:  d  US:  d  US:  a  gX-  U-  S:  $ )Nr3   Ti   r%   )mr&   ks      r,   _is_large_block_for_cpurB   Y   s)    3w!c'QW519ur.   c          	     X    US:X  a  [        U UU[        S[        S9$ [        XU[        S9$ )Ncpug      ?)configsscaleexclude)rE   )r   platform_configsrB   )r@   r&   rA   device_typekwargss        r,   conv_configsrK   `   s;    e$+
 	
 A!-=>>r.   a  
        idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
        idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
        idx_x_c = tl.arange(0, BLOCK_K) + k

        x_ptrs = x_base + (
            (idx_x_h * stride_xh)[:, None]
            + (idx_x_w * stride_xw)[:, None]
            + (idx_x_c * stride_xc)[None, :]
        )
        mask_x = (
            (idx_n < BATCH)[:, None]
            & (idx_x_h >= 0)[:, None]
            & (idx_x_h < IN_H)[:, None]
            & (idx_x_w >= 0)[:, None]
            & (idx_x_w < IN_W)[:, None]
            & (idx_x_c < GROUP_IN_C)[None, :]
        )
        matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0)

        w_ptrs = w_base + (
            (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww)
        )
        mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C)
        matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0)
        acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32)
convolution2dag  
{{def_kernel("X", "W")}}
    # Tensor dimensions
    BATCH = {{size("X", 0)}}
    IN_C = {{size("X", 1)}}
    IN_H = {{size("X", 2)}}
    IN_W = {{size("X", 3)}}
    OUT_C = {{size(None, 1)}}
    OUT_H = {{size(None, 2)}}
    OUT_W = {{size(None, 3)}}

    # Strides:
    stride_xn = {{stride("X", 0)}}
    stride_xc = {{stride("X", 1)}}
    stride_xh = {{stride("X", 2)}}
    stride_xw = {{stride("X", 3)}}
    stride_wc_out = {{stride("W", 0)}}
    stride_wc_in = {{stride("W", 1)}}
    stride_wh = {{stride("W", 2)}}
    stride_ww = {{stride("W", 3)}}

    nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
    idx_y_w = nhw % OUT_W
    nh = nhw // OUT_W
    idx_y_h = nh % OUT_H
    idx_n = nh // OUT_H
    idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)

{% if GROUPS == 1 %}
    group = 0
    GROUP_IN_C = IN_C
    GROUP_OUT_C = OUT_C
{% else %}
    group = tl.program_id(2)
    GROUP_IN_C = IN_C // GROUPS
    GROUP_OUT_C = OUT_C // GROUPS
{% endif %}

    x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None]
    w_base = (
        W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :]
    )

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

{% if UNROLL %}
{% for i in range(KERNEL_H) %}
{% for j in range(KERNEL_W) %}
    i = {{i}}
    j = {{j}}
    for k in range(0, GROUP_IN_C, BLOCK_K):
        a  
{% endfor %}
{% endfor %}
{% else %}
    # Could be simplified, but slightly slower:
    # for i in range(KERNEL_H):
    #     for j in range(KERNEL_W):
    #         for k in range(0, GROUP_IN_C, BLOCK_K):
    BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K
    for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT):
        k = (ijk % BLOCK_K_COUNT) * BLOCK_K
        ij = ijk // BLOCK_K_COUNT
        i = ij // KERNEL_W
        j = ij % KERNEL_W
        a  
{% endif %}

    mask = (
        (idx_n < BATCH)[:, None]
        & (idx_y_h < OUT_H)[:, None]
        & (idx_y_w < OUT_W)[:, None]
        & (idx_y_c < GROUP_OUT_C)[None, :]
    )
    idx_n = idx_n[:, None]
    idx_c = idx_y_c[None, :] + group * GROUP_OUT_C
    idx_h = idx_y_h[:, None]
    idx_w = idx_y_w[:, None]

    # inductor generates a suffix
    {{store_output(("idx_n", "idx_c", "idx_h", "idx_w"), "acc", "mask")}}
)namegridsourcea  
        idx_x_d = d - PADDING_D + idx_y_d * STRIDE_D
        idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
        idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
        idx_x_c = tl.arange(0, BLOCK_K) + k

        x_ptrs = x_base + (
            (idx_x_d * stride_xd)[:, None]
            + (idx_x_h * stride_xh)[:, None]
            + (idx_x_w * stride_xw)[:, None]
            + (idx_x_c * stride_xc)[None, :]
        )
        mask_x = (
            (idx_n < BATCH)[:, None]
            & (idx_x_d >= 0)[:, None]
            & (idx_x_d < IN_D)[:, None]
            & (idx_x_h >= 0)[:, None]
            & (idx_x_h < IN_H)[:, None]
            & (idx_x_w >= 0)[:, None]
            & (idx_x_w < IN_W)[:, None]
            & (idx_x_c < GROUP_IN_C)[None, :]
        )
        matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0)

        w_ptrs = w_base + (
            (idx_x_c * stride_wc_in)[:, None] +
            (d * stride_wd) + (i * stride_wh) + (j * stride_ww)
        )
        mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C)
        matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0)
        acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32)
convolution3daH  
{{def_kernel("X", "W")}}
    # Tensor dimensions
    BATCH = {{size("X", 0)}}
    IN_C = {{size("X", 1)}}
    IN_D = {{size("X", 2)}}
    IN_H = {{size("X", 3)}}
    IN_W = {{size("X", 4)}}
    OUT_C = {{size(None, 1)}}
    OUT_D = {{size(None, 2)}}
    OUT_H = {{size(None, 3)}}
    OUT_W = {{size(None, 4)}}

    # Strides:
    stride_xn = {{stride("X", 0)}}
    stride_xc = {{stride("X", 1)}}
    stride_xd = {{stride("X", 2)}}
    stride_xh = {{stride("X", 3)}}
    stride_xw = {{stride("X", 4)}}
    stride_wc_out = {{stride("W", 0)}}
    stride_wc_in = {{stride("W", 1)}}
    stride_wd = {{stride("W", 2)}}
    stride_wh = {{stride("W", 3)}}
    stride_ww = {{stride("W", 4)}}

    ndhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
    idx_y_w = ndhw % OUT_W
    ndh = ndhw // OUT_W
    idx_y_h = ndh % OUT_H
    nd = ndh // OUT_H
    idx_y_d = nd % OUT_D
    idx_n = nd // OUT_D
    idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)

{% if GROUPS == 1 %}
    group = 0
    GROUP_IN_C = IN_C
    GROUP_OUT_C = OUT_C
{% else %}
    group = tl.program_id(2)
    GROUP_IN_C = IN_C // GROUPS
    GROUP_OUT_C = OUT_C // GROUPS
{% endif %}

    x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None]
    w_base = (
        W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :]
    )

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

{% if UNROLL %}
{% for d in range(KERNEL_D) %}
{% for i in range(KERNEL_H) %}
{% for j in range(KERNEL_W) %}
    d = {{d}}
    i = {{i}}
    j = {{j}}
    for k in range(0, GROUP_IN_C, BLOCK_K):
        aF  
{% endfor %}
{% endfor %}
{% endfor %}
{% else %}
    # Could be simplified, but slightly slower:
    # for d in range(KERNEL_D):
    #   for i in range(KERNEL_H):
    #     for j in range(KERNEL_W):
    #         for k in range(0, GROUP_IN_C, BLOCK_K):
    BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K
    for dijk in range(KERNEL_D * KERNEL_H * KERNEL_W * BLOCK_K_COUNT):
        k = (dijk % BLOCK_K_COUNT) * BLOCK_K
        dij = dijk // BLOCK_K_COUNT
        j = dij % KERNEL_W
        di = dij // KERNEL_W
        i = di % KERNEL_H
        d = di // KERNEL_H
        a  
{% endif %}

    mask = (
        (idx_n < BATCH)[:, None]
        & (idx_y_d < OUT_D)[:, None]
        & (idx_y_h < OUT_H)[:, None]
        & (idx_y_w < OUT_W)[:, None]
        & (idx_y_c < GROUP_OUT_C)[None, :]
    )
    idx_n = idx_n[:, None]
    idx_c = idx_y_c[None, :] + group * GROUP_OUT_C
    idx_d = idx_y_d[:, None]
    idx_h = idx_y_h[:, None]
    idx_w = idx_y_w[:, None]

    # inductor generates a suffix
    {{store_output(("idx_n", "idx_c", "idx_d", "idx_h", "idx_w"), "acc", "mask")}}
zat::convolutionF)has_out_variantop_overloadc          
         [         R                  " [         R                  " US5      S5      n[         R                  " U R                  SSSS5      UR                  SS5      UR                  SSSS5      S9$ )Nr   r	      r   )out)torchsqueezematmulpermute)xr)   rV   s      r,   conv1x1_via_mmr\   {  s]    emmAr*B/A<<			!Q1qyyACKK1a4K r.   c                  R    \ rS rSr% S\S'   S\S'   S\S'   S\S'   S\S'   S	\S
'   Srg)ConvLayoutParamsi  tuple[int, ...]stridepaddingdilationbool
transposedoutput_paddingr<   groupsr%   N)__name__
__module____qualname____firstlineno____annotations____static_attributes__r%   r.   r,   r^   r^     s%    ##Kr.   r^   c	                p   [         R                  R                     [        R                  R
                  R                  [        R                  " U SS9[        R                  " USS9[        R                  " USS9[         R                  R                  R                  U5      [         R                  R                  R                  U5      [         R                  R                  R                  U5      U[         R                  R                  R                  U5      U5	      n	[        R                  " U	R                  5       5      n
[        R                  " U	R                  5       5      nSSS5        [        R                  " U R                  5       U R!                  5       W
U5      $ ! , (       d  f       NC= f)z)Determine output layout for a convolutionT)guard_shapeN)r   graph	fake_moderW   opsatenconvolutionr   ir_node_to_tensorsizevars
size_hintsconvert_shape_to_inductorsizer`   FixedLayoutget_device_or_error	get_dtype)r[   weightbiasr`   ra   rb   rd   re   rf   outputsizess              r,   conv_layoutr     s)    
		++  5  T:  48GG''/GG''0GG''1GG''7

 ,,V[[];--fmmo> 
 >>			  
	s   EF''
F5c                    [        [        [        U 5      5      5      nUR                  SUR	                  S5      5        U$ )Nr   rT   )listreversedrangeinsertpop)rankorders     r,   channels_last_orderr     s0    %+&'E	LLEIIbM"Lr.   c                   [        UR                  5       5      n[        US-
  5       H  n[        [        R
                     " USS9nM!     [        [        R                     " USS/5      n[        R                  R                  U [        U5      5      n [        [        U5      5      nUR                  UR                  S5      5        [        [        R                     " X5      n U R                  5       Gt pg[        [        R                     " U [        U5      U/5      n Uc  [        [        R                      " X5      nO[        [        R"                     " X U5      n[        [        R                     " U/ UQSP5      n[        [        U5      5      n	U	R%                  SU	R                  S5      5        [        [        R                     " X5      $ )Nr	   rT   dimr   r   )lenget_sizer   Lrr   rX   rZ   r   ExternKernelrequire_stride_orderr   r   appendr   reshaper   mmaddmmr   )
r[   r|   r}   r   _	x_permuter   in_chanresultresult_permutes
             r,   convert_1x1_conv_to_mmr     sR   v !D4!8_4<<R0 t||_VaV,F
,,Q0CD0IJAU4[!IY]]1%&	$,,%AjjlOU	$,,M%0':;A|477A&4::t/t||_V\u\b\2F%+&N!^//34T\\?622r.   c	                v  ^ ^^^ [        U5      n[        U5      n[        U5      n[        U5      n[        U[        5      (       d)  [        R                  R
                  R                  U5      n[        U[        5      (       d   e[        [        R                  R
                  R                  U5      5      n[        [        R                  R
                  R                  U5      5      nUUUUUUS.m[        T R                  5       5      [        TR                  5       5      S-
  :X  aU  [        [        R                     " [        [        [        R                     " T S/T R                  5       Q5      TU40 TD6SS9$ [        R                  R
                  R                  TR                  5       5      tpn[        T R                  5       5      S:X  a  [        U5      S:X  a  [        R                   " T 5      S:X  a  TR#                  SU-   SU-   SU-   SU-   S	.5        [        [        R$                     " T S
S9m [        [        R$                     " TS
S9m[        [        R                     " [        T TU40 TD6S
S9$ [        U5      m['        UT5      n['        UT5      n['        UT5      n['        UT5      nUUUU 4S jn[(        R*                  =(       d    [(        R,                  n[(        R.                  (       d  U(       a  U" 5       (       a  [1        U5      (       a  [1        U5      (       a  [3        U5      (       a  [1        U5      (       ap  U(       di  [3        U5      (       aY  US:X  aS  [        R                  R
                  R5                  [7        T R                  5       5      S5      (       a  [9        T TU5      $ Ubz  [        R                   " T 5      S:w  a`  [        T TS 40 TD6n[        [        R:                     " U[        [        R<                     " X.R                  5       S   /TS/-  -   5      5      $ T R?                  5         TR?                  5         [        R                  R@                  (       av  TS
:X  ap  [        R                  =RB                  S-  sl!        [        RD                  RG                  T 5      m [        RD                  RG                  T5      m[I        T TS 40 TD6nO[I        T TS 40 TD6n[        RJ                  " [        R                  R
                  RM                  URN                  5      5      n[        RD                  RQ                  T U5      m [        RD                  RQ                  TU5      m/ SQnUc  T T/nS TS'   URS                  SS5        O\T TU/nUR?                  5         URU                  5         [        R                  R
                  R                  UR                  5       5        / n[V        RX                  RZ                  R]                  S5      (       a  [^        R`                  " UUU40 TD6/n[V        RX                  RZ                  R]                  S5      (       Gam  [c        U5      (       Ga\  [1        U5      (       GaK  U(       GdC  [3        U5      (       Ga2  [        R                  R
                  Re                  U
T R                  5       S   5      (       Ga  [1        U5      (       aK  [1        U5      (       a;  [3        U5      (       a+  US:X  a%  URg                  [h        Ra                  UU5      5        [k        [7        T R                  5       S   /T R                  5       S
S  Q5      U	U
[        R                   " T 5      S9 GHC  nTS
:X  a  [l        Rn                  " U4T T4UUS   US   US   US   US   US   U[1        U5      [V        Rp                  Rr                  Rt                  URv                  URx                  S.URz                  D6  M  TS:X  d  M  [|        Rn                  " U40 ST T4_SU_SUS   _SUS   _SUS
   _SUS   _SUS   _SUS
   _SUS   _SUS   _SUS
   _SU_S[1        U5      _S[V        Rp                  Rr                  Rt                  _S URv                  _S!URx                  _URz                  D6  GMF     [        U5      (       a.  [        R                  " UUT T4Ub  U4O	[        5       -   UUUUTS"9  [        S#UUU5      $ )$N)r`   ra   rb   rd   re   rf   r   r   r   rU   xpu)r   )r   )r`   ra   rb   re   r	   c                    > [         R                  R                  (       a  TS:X  a  g[        TTS 40 TD6n [        R
                  " [         R                  R                  R                  U R                  5      5      nU[        R                  :H  $ )Nr	   T)
r   ro   
layout_optr   r   get_stride_orderru   rv   r`   NHWC_STRIDE_ORDER)layoutreq_stride_orderrJ   ndimr|   r[   s     r,   channels_last_conv'convolution.<locals>.channels_last_conv  sl    77$!)Q77..GG''6
  2#7#777r.   rD   r}   ATENTRITON)rI   )input_nodesr   KERNEL_HKERNEL_WSTRIDE_HSTRIDE_W	PADDING_H	PADDING_Wr$   UNROLL
ALLOW_TF32
num_stages	num_warpsr   r   KERNEL_Dr   r   STRIDE_Dr   r   	PADDING_Dr   r   r$   r   r   r   r   )r   r`   ra   rb   rf   n_spatial_dimensionsrs   )Cr;   
isinstancer<   r   ro   ru   evaluate_static_shapeevaluate_static_shapesr   r   r   rr   rX   rs   expandr   get_device_typeupdate	unsqueezer   r
   max_autotunemax_autotune_gemmconv_1x1_as_mmr   r   statically_known_gtr   r   addviewrealizer   num_channels_last_convr   require_channels_lastr   r   rv   r`   r   r   freeze_layoutrW   	_inductorutils_use_conv_autotune_backendaten_convolutionbindr   statically_known_equalsr   aten_conv1x1_via_mmrK   conv2d_templatemaybe_append_choicebackendscudnn
allow_tf32r   r   rJ   conv3d_templater   r   add_ck_conv_choicesr   )r[   r|   r}   r`   ra   rb   rd   re   rf   out_chanr   kernel_shaper   autotuning_gemmr   r   r   ordered_kwargs_for_cpp_kernelargschoicescfgrJ   r   s   ``                   @@r,   rs   rs     s    6]FGnGXH>*Nfc""!!77?fc"""" 177##::6BCFAGG$$;;GDEG  ( F 1::<C 12Q66$++q1*<qzz|*<=vtVvV
 	

 ()ww'7'7'N'N($H 	AJJLQ"q!U*-'> 8O"&"7		
 dnnaQ'4>>"6q164262
 	

 |D&$'F7D)GHd+H!.$7N8 8 ))EV-E-EO 
		?7I7K7KL!!FOOWH^$$aKGG00qzz|1LaPP%a66B..q1U:Q77{AdiiL(9!(<'=s
'JK
 	
 IIK
NN
 	wwdai	&&!+&OO11!4 66v>Q77Q77..GG''6
 OO004DE55f>NO%! |6{v%,,Q764 	//@G77??!!- 	
 	88BB''H^$$GG44Wajjl1oNN L!!!!!NN.33D&AB1::<?>QZZ\!"-=>?**1-	
C qy33!"F!)!_)!_#AY#AY%aj%aj! #<0$~~33>>"~~!mm!" jj#& 33!"F " *!_	
 *!_ *!_ $AY $AY $AY &aj &aj &aj "  #<0!"  %~~33>>#$  #~~%& "mmjj)7
b F## 44F$2BwP!%		
 %]GT6JJr.   c                     [        XX#XEXgU5	      $ N)rs   )r[   r|   r}   r`   ra   rb   rd   re   rf   	benchmarkdeterministiccudnn_enabledr   s                r,   _convolutionr     s      	4JPV r.   c                    U R                   [        R                  R                  R                  R
                  :X  d   e[        R                  R                  (       a  X4$ [        U /UQ70 UD6$ r   )
targetrW   rq   rr   rs   defaultr   ro   r   r   )fx_noder   rJ   s      r,   constrain_conv_to_fx_stridesr     sR    >>UYY^^77?????ww|&w@@@@r.   )r[   r   r|   r   r}   Optional[TensorBox]r`   Sequence[int]ra   r_   rb   r_   rd   rc   re   r_   rf   r<   returnz	ir.Layout)r[   r   r|   r   r}   r   r`   r   ra   r   rb   r   rd   rc   re   r   rf   r<   )G
__future__r   loggingtypingr   r   r   r   rW   -torch._inductor.codegen.rocm.ck_conv_templater    r
   r   loweringr   r   r   r   r   select_algorithmr   r   r   r   r   r   r   r   r   r   r   virtualizedr   	mm_commonr   r   collections.abcr   r   	getLoggerrg   logrq   rr   r-   r1   kernel_configsr;   rH   versionhipcudais_availablerB   rK   LOOP_BODY_2Dr   LOOP_BODY_3Dr   rs   r   r   r\   r   r^   r   r   r   r   r   r%   r.   r,   <module>r      sO   "  ; ;  R      @ (! yy~~     #D1"D1#T2#T2!40"D1"D1	      	==0022./?@
?8
 !		3h i4jkCH IDJKUYvB !		;x y<z{O` aPbccgR &	  ((	  )> y       	 
       $     F3. 4##$oKoKoK oK 	oK
 oK oK oK "oK oK %oKd 4$$% &(A d&&(D Er.   