
    sh6                     *    S SK r S rS rS rS rS rg)    Nc                 H   [         R                  " SXS9SS2S4   R                  SU5      n[         R                  " SXS9R                  U S5      nUR                  S:X  a  SOSnUR                  S:X  a  SOSnXF-  U-  US	-  U-  -   XF-  S	-  -   nUS-  S:H  US-  S:H  -  R	                  [         R
                  5      nUS-  S:H  US-  S:H  -  R	                  [         R
                  5      n	XHU	-
  -  nXXU	-
  -  nSn
XZ-  nXZ-  nX-  U
-  XJ-  -   U-   R                  S
5      $ )a  
This is PyTorch implementation of main part of reorder_meta()
function, from tools/util/include/cutlass/util/host_reorder.h file
of CUTLASS source tree.  Furthermore, CUTLASS template for sparse
GEMM decides upon layout of this matrix, and at the moment for the
sparse GEMM executed on tensor cores, this is layout described by
ColumnMajorInterleaved<2> data structure, in
include/cutlass/layout/matrix.h of CUTLASS source tree.  The
reordering of meta matrix into meta_reordered matrix calculated
according to these segments of CUTLASS code is re-implemented here.
Note that this calculation produces offsets for scattering metadata
matrix elements into reordered metadata matrix elements (or,
equivalently, for gathering reordered metadata matrix element back
into metadata matrix elements).
r   deviceN                   )torcharangerepeatitemsizetoint8view)m
meta_ncols
meta_dtyper   dst_rowsdst_colsgroup
interweavetopright
bottomleft
interleavecols_majcols_mins                }/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/sparse/_semi_structured_conversions.py*_calculate_meta_reordering_scatter_offsetsr!      sP     ||Aq0D9@@JOH||Az9@@AFH %%*BE ))Q.AJE!a<:
%	&
!	"  A"x!|q'89==ejjIHa<1$A):;??

KJ:%%H:%%H
 J%H$HL:%(==HNNrRR    c                    U R                  5       S:w  a  [        SU R                  5        S35      eU R                  u  pU R                  n[        R
                  nU R                  [        R
                  :X  a  [        R                  nOgU R                  [        R                  [        R                  [        R                  4;   a  [        R                  nO[        SU R                   S35      eUR                  S-  S-  nUS;  a  [        S	5      eU[        R                  :X  a  US
-  S:w  a  [        SU S35      eOUS-  S:w  a  [        SU S35      eUSU-  -  S:w  a  [        SU SSU-   35      eU R                  [        R                  :w  a/  SnU R                  SX&-  U5      nUS:g  R                  S5      u  ppO1SnU R                  SX&-  U5      nUS:g  R                  S5      =u  pu  pX&U-  -  nX-  nU) U	-  nU) U	) -  nUnUnUU-  U-  nX) -  nUUR                  [        R                   5      S-  -  nUUR                  [        R                   5      S-  -  nU R                  [        R                  :w  al  WR#                  SUR%                  S5      5      nUR#                  SUR%                  S5      5      n[        R&                  " UU4SS9R                  XS-  5      nO6WR#                  SUR%                  S5      S-  5      R                  XS-  5      nUUS-  -  nUR                  SX45      R                  U5      nUS:X  a=  USS2SS2S4   USS2SS2S4   S-  -  USS2SS2S4   S-  -  USS2SS2S4   S-  -  nOUS:X  a|  USS2SS2S4   USS2SS2S4   S-  -  USS2SS2S4   S-  -  USS2SS2S4   S-  -  USS2SS2S4   S
-  -  USS2SS2S4   S-  -  USS2SS2S4   S-  -  USS2SS2S4   S-  -  nWR)                  X-  45      n[+        XXC5      nUR-                  SUUR                  S5      5        UUR                  X5      4$ )z
This function converts dense matrix into sparse semi-structured
representation, producing "compressed" matrix, in the layout used by
CUTLASS backend, and corresponding metadata matrix.
r   z)Expected 2-dimensional dense tensor, got -dimensional tensorInvalid datatype z of dense matrixr   r
   )r
   r   z6Invalid number of elements per meta element calculatedr	   r   zNumber of rows of dense matrix z must be divisible by 16r   z must be divisible by 32z"Number of columns of dense matrix z must be divisible by r   r   )dimN                        )r&   RuntimeErrorshaper   r   r   dtypeint32halfbfloat16floatint16r   r   unbindr   int64gather	unsqueezestack	new_emptyr!   scatter_)denser   kr   r   quadbits_per_meta_elemksparsedense_4m0m1_m2m3dense_2r   expr0expr1expr2bit0bit1bit2bit3idxs0idxs1sparse0sparse1sparsemeta_4meta_nmetameta_reorderedmeta_offsetss                                  r    )sparse_semi_structured_from_dense_cutlassrY   /   s    yy{a7		}DWX
 	
 ;;DA\\FJ{{ejj [[
	U^^U[[A	A[[
.u{{m;KLMM'00149V+STTU[[ r6Q;1!4LM  
 r6Q;1!4LM  	A&&'1,03I!NdJdIef
 	
 {{ekk!**Rw7"a<//3R**Rw7#qL0044&"!778JH GEC"HEC2#IEDD5=2D3;DDGGEKK(A-.EDGGEKK(A-.E{{ekk!..U__R%89..U__R%89gw/R8==aaHEOOB$71$<=BB11fMeqj!F[["jABEEjQF"1a7OaAg!#%aAg!#% aAg"$& 	 
 1	$1a7OaAg!#%aAg!#% aAg"$& aAg"$	&
 aAg"$& aAg"$& aAg"$& 	 ^^Q^$56N=	zL A|TYYr];N''677r"   c                 r	   U R                  5       S:w  a  [        SU R                  5        S35      eU R                  u  p#U R                  nUR                  5       S:w  a  [        SUR                  5        S35      eUR                  U:w  a  [        SU SUR                   S35      eUR                  nU[
        R                  [
        R                  4;  a  [        SU S	35      eUR                  S
-  S-  nU R                  [
        R                  :w  a  SnOSnUR                  u  pX:w  a  [        SU SU 35      eX-  U-  SU-  :w  a  [        SU SX-  U-  S-   S35      e[        X)XT5      n
[
        R                  " UR                  S5      SU
5      R                  X)5      n[
        R                  " X)SU-  4UUS9nUS:X  a  US-  USS2SS2S4'   US-	  S-  USS2SS2S4'   US-	  S-  USS2SS2S4'   US-	  S-  USS2SS2S4'   US
-	  S-  USS2SS2S4'   US-	  S-  USS2SS2S4'   US-	  S-  USS2SS2S4'   US-	  S-  USS2SS2S4'   GO$US
:X  Ga  US-  USS2SS2S4'   US-	  S-  USS2SS2S4'   US-	  S-  USS2SS2S4'   US-	  S-  USS2SS2S4'   US
-	  S-  USS2SS2S4'   US-	  S-  USS2SS2S4'   US-	  S-  USS2SS2S4'   US-	  S-  USS2SS2S4'   US-	  S-  USS2SS2S
4'   US-	  S-  USS2SS2S4'   US -	  S-  USS2SS2S4'   US!-	  S-  USS2SS2S"4'   US#-	  S-  USS2SS2S4'   US$-	  S-  USS2SS2S%4'   US&-	  S-  USS2SS2S4'   US'-	  S-  USS2SS2S(4'   UR                  S5      [
        R                  " SSU-  U-  U-  US)9S-  R                  SS5      R                  SS5      R                  S5      -   n[
        R                   " US-  U-  4U R                  US9nU R                  [
        R                  :w  a"  UR#                  SXR                  S5      5        O[UR                  [
        R$                  5      R#                  SXR                  [
        R$                  5      R                  S5      5        UR                  USU-  5      $ )*z
This function performs reverse of the function above - it
reconstructs dense matrix from a pair of "compressed" matrix, given
in the layout used by CUTLASS backend, and accompanying metadata
matrix.
r   z*Expected 2-dimensional sparse tensor, got r$   z(Expected 2-dimensional meta tensor, got zExpected meta matrix to be on z device, got matrix on z devicer%   z of meta matrixr   r
   zNumber of rows of meta matrix z4 must be equal to number of columns of spase matrix z#Number of columns of sparse matrix z different from the z<, expected according to the number of columns of meta matrixr   r   r1   r   r'   Nr   r+   
   r)   r(      r-   r	      	   r*         r,         r.         r   )r&   r/   r0   r   r1   r   r6   r2   r   r5   r!   r9   r   emptyr   r   zerosr=   r3   )rS   rW   r   r?   r   r   r@   rA   
meta_nrowsr   rX   rV   meta_2dense_offsetsr>   s                  r    'sparse_semi_structured_to_dense_cutlassrk      s3    zz|q8FYZ
 	
 <<DA]]Fq 6~7I7I7K6LL_`
 	
 &,VH4KNLaLaKbbij
 	
  %%J%++u{{33.zl/JKK'00149||u{{"+11J,ZL8lmnlop
 	
 44A=1!4HI]`vIvz{I{H| }I I
 	
 >	zL <<++B/LAFFqUD [[	
223F
 "+q!Qw19,q!Qw19,q!Qw19,q!Qw19,q!Qw2:-q!Qw2:-q!Qw2:-q!Qw	1	$+q!Qw19,q!Qw19,q!Qw19,q!Qw19,q!Qw2:-q!Qw2:-q!Qw2:-q!Qw2:-q!Qw2:-q!Qw BJ$.q!Rx BJ$.q!Rx BJ$.q!Rx BJ$.q!Rx BJ$.q!Rx BJ$.q!RxKKOQA	W,V<q@
d2qk&&A,ttBx(M KKQFLLHE||u{{"q-R9

5::''}kk%**5::2>	
 ::aQr"   c                     S nU R                  SSS5      R                  SSS5       H  nU H  nU" U5        M     M     U $ )a  
This function computes a 2:4 sparse tile by greedily taking the largest values.

Since we take the largest values greedily, how the sorting algorithm handles duplicates affects
the ultimate sparsity pattern.

Note that this function does not have the same sorting semantics as our CUDA backend,
which is exposed via `torch._sparse_semi_structured_tile` and thus returns a different pattern.
c                     / SQn/ SQnU R                  5       R                  SSS9R                   H<  nUS-  US-  pTX   S:  a"  X%   S:  a  X==   S-  ss'   X%==   S-  ss'   M6  SXU4'   M>     g )N)r   r   r   r   T)
descendingstabler
   r   r   r   )flattensortindices)tilenum_kept_rownum_kept_colxrcs         r    greedy_prune_tile7_sparse_semi_structured_tile.<locals>.greedy_prune_tile(  sx    ##$$T$BJJA61q5q"|':1$1$T
 Kr"   r   r
   r   )unfold)r>   ry   batchrs   s       r    _sparse_semi_structured_tiler}     sH    
 aA&--aA6Dd#  7 Lr"   c                    U R                  5       R                  [        R                  5      nUR	                  SSS5      R	                  SSS5      nUR	                  SSS5      R	                  SSS5      nUR
                  " / UR                  SS QSPSPSP76 nS[        R                  " S[        R                  SS	9-  nUR                  [        R                  5      U-  R                  [        R                  5      nU$ )
z@
Calculates the compressed swizzled bitmask from a dense tensor
r   r   r   r   r
   r'   Ncudar[   )	boolr   r   uint8r{   reshaper0   r   r5   )r>   int_bitmaskbitmask_8x8_chunksbitmask_4x4_chunksbitmask_binary_representationpowers_of_twocompressed_swizzled_bitmasks          r    $_compute_compressed_swizzled_bitmaskr   ;  s     **,//%++.K %++Aq!4;;Aq!D ,221a;BB1aK %7$>$> %		!	!"1	%%'(%*+%-.%!
 au{{6JJM 	&((5Ebo   '&r"   )r   r!   rY   rk   r}   r    r"   r    <module>r      s'    'STB8Jf R<)'r"   