
    sh                     :    S SK Jr  S SKJr  S SKrS SKJr  SS jrg)    )cuda)driverN)numpy_supportc           	        ^^ [        U SS5      nU(       dr  U R                  u  p4U R                  R                  U-  U R                  R                  4n[        R
                  R                  R                  XC4UU R                  US9n[        R                  " U R                  5      m[        R                  " 5       R                  n[        [        R                  " S[        R                   " US5      S-  5      5      n[        Xg-  5      nXS-   4m[        R"                  UU4S j5       n	[        UR                  S   U-  S-   5      [        UR                  S   U-  S-   5      4n
X4nXX4   " X5        U$ )a  Compute the transpose of 'a' and store it into 'b', if given,
and return it. If 'b' is not given, allocate a new array
and return that.

This implements the algorithm documented in
http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

:param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
    the device its stream will be used to perform the transpose (and to copy
    `b` to the device if necessary).
streamr   )dtyper         c                   > [         R                  R                  T
T	S9n[         R                  R                  n[         R                  R
                  n[         R                  R                  [         R                  R                  -  n[         R                  R
                  [         R                  R
                  -  nXc-   nXT-   nXd-   U R                  S   :  a$  XS-   U R                  S   :  a  XU-   XS-   4   X$U4'   [         R                  " 5         XR                  S   :  a  XqR                  S   :  a  X#U4   XU4'   g g g )N)shaper   r   r
   )
r   sharedarray	threadIdxxyblockIdxblockDimr   syncthreads)inputoutputtiletxtybxbyr   r   dt
tile_shapes            p/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/numba/cuda/kernels/transpose.pykerneltranspose.<locals>.kernel)   s     {{  z <^^^^]]__t}}.]]__t}}.GG7U[[^#%++a.(@ b"'!12DRL||A1||A#6B<Fa4L $7    )getattrr   r   itemsizer   cudadrvdevicearrayDeviceNDArraynps
from_dtyper   
get_deviceMAX_THREADS_PER_BLOCKintmathpowlogjit)abr   colsrowsstridestpb
tile_widthtile_heightr   blocksthreadsr   r   s               @@r   	transposer:      sH    Q!$FWW
''""T)177+;+;;LL$$22L''	 3  
	 B




3
3CTXXa#q!1A!567Jc&'KA~.J	XX( ($ k)A-.AGGAJ4Ka4O0PPF%G
7"#A)Hr!   )N)	numbar   numba.cuda.cudadrv.driverr   r,   numba.npr   r'   r:    r!   r   <module>r?      s     ,  ):r!   