
    sh$                     8    S r SSKJr  SrSrS r " S S\5      rg)	zC
A library written in CUDA Python for generating reduction kernels
    )
from_dtype       c                 H  ^^^^^^^^	 SSK Jm  TR                  SS9" U 5      m	[        S-   m[        [        -  mTR                  SS9UU	4S j5       mTR                  SS9UUU	4S j5       mTR                  SS9UUU	4S j5       mUUUUUUU	4S	 jnTR                  U5      $ )
Nr   cudaT)device   c                   > TR                   R                  nU[        -  nU[        -  nXSS24   nXU'   TR                  5         [        S-  nU(       a7  XF:  a  XT   nT	" XuXF-      5      XT'   TR                  5         US-  nU(       a  M6  gg)z(
Compute reduction within a single warp
N   )	threadIdxx	_WARPSIZEsyncwarp)
sm_partialsinittidwarpidlaneidsm_thiswidtholdr   	reduce_ops
           p/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/numba/cuda/kernels/reduction.pyinner_warp_reduction1_gpu_reduce_factory.<locals>.inner_warp_reduction   s    
 nn	!yai(Q~o"+C1H"IMMOaKE e    c                   > TR                   R                  nTR                  R                  nTR                  R                  nTR                  R                  nX5U-  -   nU R
                  nXV-  n	X   n
[        Xy-   X5       H  nT" XU   5      n
M     TR                  5         T" X*5        TR                  5         US:  a(  T" X#S4   X#S-   S4   5      X#S4'   TR                  5         US:X  a  T" US   US   5      X'   gg)a$  
Partially reduce `arr` into `partials` using `sm_partials` as working
space.  The algorithm goes like:

    array chunks of 128:  |   0 | 128 | 256 | 384 | 512 |
                block-0:  |   x |     |     |   x |     |
                block-1:  |     |   x |     |     |   x |
                block-2:  |     |     |   x |     |     |

The array is divided into chunks of 128 (size of a threadblock).
The threadblocks consumes the chunks in roundrobin scheduling.
First, a threadblock loads a chunk into temp memory.  Then, all
subsequent chunks are combined into the temp memory.

Once all chunks are processed.  Inner-block reduction is performed
on the temp memory.  So that, there will just be one scalar result
per block.  The result from each block is stored to `partials` at
the dedicated slot.
r   r   r   r   )r
   r   N)	r   r   blockIdxblockDimgridDimsizerangesyncthreadsr   )arrpartialsr   r   blkidblkszgridszstartstopsteptmpir   r   r   s               r   device_reduce_full_block5_gpu_reduce_factory.<locals>.device_reduce_full_block(   s   * nn em#xx~ ju|T0ACQ(C 1 	[. 7"+KQ,?,7a
,C#EKQMMO!8'D(9;t;LMHO r   c                 v  > TR                   R                  nTR                  R                  nTR                  R                  nU[        -  nU[        -  nU R
                  nTR                   R                  nX   n	XXg4'   TR                  5         US-   [        -  U:  a	  T" X)5        O=US:X  a7  X&SS24   n
U[        -  n[        SX-
  5       H  nT" U
S   X   5      U
S'   M     TR                  5         US:X  a=  U[        -   S-
  [        -  nUS   n[        SU5       H  nT" XUS4   5      nM     XU'   gg)z
This computes reduction on `arr`.
This device function must be used by 1 threadblock only.
The blocksize must match `arr.size` and must not be greater than 128.
r
   r   Nr   )r   r   r    r!   r   r#   r%   r$   )r&   r'   r   r   r(   r)   r   r   r#   valuer   baser/   num_active_warpsresultr   r   r   s                  r   device_reduce_partial_block8_gpu_reduce_factory.<locals>.device_reduce_partial_block_   s>    nn	!yxxnn&+FN#QJ)#d* 4 {%ai0	)q$+.A!*71:wz!BGAJ / 	!8 %	 1A 5)C &F1./"6q!t+<= 0 %UO r   c                 >  > TR                   R                  nTR                  R                  [        T	4TS9nTR
                  R                  T
:X  a
  T" XU5        O	T" XU5        U(       a2  US:X  a+  TR                  R                  S:X  a  T" US   U5      US'   gggg)a  
Perform reductions on *arr* and writing out partial reduction result
into *partials*.  The length of *partials* is determined by the
number of threadblocks. The initial value is set with *init*.

Launch config:

Blocksize must be multiple of warpsize and it is limited to 4 warps.
)dtyper   N)r   r   sharedarray	_NUMWARPSr!   r    )r&   r'   r   use_initr   r   r   r0   r7   inner_sm_sizemax_blocksizenbtyper   s         r   gpu_reduce_block_strided5_gpu_reduce_factory.<locals>.gpu_reduce_block_strided   s     nnkk''M(B.4 ( 6==??m+$SK@'{CqT]]__%9#HQK6HQK &:8r   )numbar   jitr   r=   )
fnrA   rB   r   r0   r7   r?   r   r@   r   s
    ` @@@@@@@r   _gpu_reduce_factoryrG      s    %b)IMM	)M	XXTX ( 
XXTX4N 4Nl 
XXTX)% )%V7 7, 88,--r   c                   2    \ rS rSrSr0 rS rS rSS jrSr	g)	Reduce   zCreate a reduction object that reduces values using a given binary
function. The binary function is compiled once and cached inside this
object. Keeping this object alive will prevent re-compilation.
c                     Xl         g)z
:param functor: A function implementing a binary operation for
                reduction. It will be compiled as a CUDA device
                function using ``cuda.jit(device=True)``.
N_functor)selffunctors     r   __init__Reduce.__init__   s	      r   c                     U R                   U4nX R                  ;   a  U R                  U   nU$ [        U R                   [        U5      5      nX0R                  U'   U$ )N)rM   _cacherG   r   )rN   r:   keykernels       r   _compileReduce._compile   sW    mmU"++[[%F  )
58IJF%KKr   Nc                    SSK Jn  UR                  S:w  a  [        S5      eUb  USU nUR                  R                  U5      nUR                  S:  a  U$ U R                  UR                  5      n[        [        -  nUR                  U-  U-  n	UR                  U	-
  n
[        X-  [        S-  5      nUnU
(       a  US-  nUR                  XR                  S9nU	(       a  X{X4   " USU	 USU US5        U
(       a  USX4   " XS XS UU(       + 5        UR                  S:  a  USX4   " XUS	5        Ub  USS R                  USS US
9  gUS   $ )a  Performs a full reduction.

:param arr: A host or device array.
:param size: Optional integer specifying the number of elements in
            ``arr`` to reduce. If this parameter is not specified, the
            entire array is reduced.
:param res: Optional device array into which to write the reduction
            result to. The result is written into the first element of
            this array. If this parameter is specified, then no
            communication of the reduction output takes place from the
            device to the host.
:param init: Optional initial value for the reduction, the type of which
            must match ``arr.dtype``.
:param stream: Optional CUDA stream in which to perform the reduction.
            If no stream is specified, the default stream of 0 is
            used.
:return: If ``res`` is specified, ``None`` is returned. Otherwise, the
        result of the reduction is returned.
r   r   r
   zonly support 1D arrayNr   )shaper:   TF)stream)rD   r   ndim	TypeErrorr:   typer#   rV   r=   r   mindevice_arraycopy_to_device)rN   r&   r#   resr   rZ   r   rU   	blocksize	size_fullsize_partialfull_blockctpartials_sizer'   s                 r   __call__Reduce.__call__   s   ( 	 88q=344 et*Cyy~~d# 88a<Ksyy) 	)	XX*i7	xx)+919q=A %QM$$=		$J23C
O4<]l4K4848:
 1l*+C
O,4],C,00<,<>
 ==11m+,XuM ?G""8BQ<"?A;r   rL   )NNr   r   )
__name__
__module____qualname____firstlineno____doc__rS   rP   rV   rg   __static_attributes__ r   r   rI   rI      s    
 F Ir   rI   N)rm   numba.np.numpy_supportr   r   r=   rG   objectrI   ro   r   r   <module>rr      s0    . 		U.pbV br   