
    sh_                     $   S SK Jr  S SKJrJr  S SKJr  S SKJrJ	r	  S SK
Jr  S SKJr  S SKJr  S SKJr  S	 r\S
 5       r\S 5       r\S 5       r\" \R,                  " \5      SSS9S 5       r\S 5       rS r\S 5       r\S 5       r\S 5       rg)    )ir)cudatypes)cgutils)RequireLiteralValueNumbaValueError)	signature)overload_attribute)	nvvmutils)	intrinsicc                     U R                   nUS:X  a  [        R                  nO7US;   a&  [        R                  " [        R                  U5      nO[	        S5      e[        U[        R                  5      $ )N   )      zargument can only be 1, 2, 3)literal_valuer   int64UniTupler   r	   int32)ndimvalrestypes      i/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/numba/cuda/intrinsics.py_type_grid_functionr      sU    


C
ax++	..c2<==Wekk**    c                 x    [        U[        R                  5      (       d  [        U5      e[	        U5      nS nX#4$ )a  grid(ndim)

Return the absolute position of the current thread in the entire grid of
blocks.  *ndim* should correspond to the number of dimensions declared when
instantiating the kernel. If *ndim* is 1, a single integer is returned.
If *ndim* is 2 or 3, a tuple of the given number of integers is returned.

Computation of the first integer is as follows::

    cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x

and is similar for the other two indices, but using the ``y`` and ``z``
attributes.
c                    UR                   nU[        R                  :X  a  [        R                  " USS9$ [        U[        R                  5      (       a4  [        R                  " XR                  S9n[        R                  " X5      $ g )Nr   )dim)
return_typer   r   r   get_global_id
isinstancer   countr   
pack_array)contextbuildersigargsr   idss         r   codegengrid.<locals>.codegen1   se    //ekk!**7::00))'}}EC%%g33 1r   r    r   IntegerLiteralr   r   )	typingctxr   r%   r(   s       r   gridr-      s;    " dE0011!$''
d
#C4 <r   c                    ^ [        U[        R                  5      (       d  [        U5      e[	        U5      nS mU4S jnX#4$ )a  gridsize(ndim)

Return the absolute size (or shape) in threads of the entire grid of
blocks. *ndim* should correspond to the number of dimensions declared when
instantiating the kernel. If *ndim* is 1, a single integer is returned.
If *ndim* is 2 or 3, a tuple of the given number of integers is returned.

Computation of the first integer is as follows::

    cuda.blockDim.x * cuda.gridDim.x

and is similar for the other two indices, but using the ``y`` and ``z``
attributes.
c                     [         R                  " S5      n[        R                  " U SU 35      n[        R                  " U SU 35      nU R	                  U R                  X25      U R                  XB5      5      $ )N@   zntid.znctaid.)r   IntTyper   	call_sregmulsext)r$   r   i64ntidnctaids        r   _nthreads_for_dim#gridsize.<locals>._nthreads_for_dimR   sb    jjn""7eC5M:$$Wuo>{{7<<2GLL4MNNr   c                 `  > UR                   nT" US5      nU[        R                  :X  a  U$ [        U[        R                  5      (       ac  T" US5      nUR
                  S:X  a  [        R                  " XU45      $ UR
                  S:X  a!  T" US5      n[        R                  " XXg45      $ g g )Nxyr   r   z)r   r   r   r    r   r!   r   r"   )	r#   r$   r%   r&   r   nxnynzr8   s	           r   r(   gridsize.<locals>.codegenX   s    //w,ekk!I00"7C0B}}!))'8<<!#&w4))'<@@ $ 1r   r*   )r,   r   r%   r(   r8   s       @r   gridsizerB   <   sC    " dE0011!$''
d
#COA <r   c                 @    [        [        R                  5      nS nX4$ )Nc                 0    [         R                  " US5      $ )Nwarpsize)r   r2   )r#   r$   r%   r&   s       r   r(   _warpsize.<locals>.codegenn   s    ""7J77r   )r	   r   r   r,   r%   r(   s      r   	_warpsizerH   j   s    
EKK
 C8 <r   rE   r   )targetc                     S nU$ )zS
The size of a warp. All architectures implemented to date have a warp size
of 32.
c                     [        5       $ )N)rH   )mods    r   getcuda_warpsize.<locals>.getz   s
    {r    )rL   rM   s     r   cuda_warpsizerP   t   s    Jr   c                 @    [        [        R                  5      nS nX4$ )a  
Synchronize all threads in the same thread block.  This function implements
the same pattern as barriers in traditional multi-threaded programming: this
function waits until all threads in the block call it, at which point it
returns control to all its callers.
c                     SnUR                   n[        R                  " [        R                  " 5       S5      n[        R
                  " XVU5      nUR                  US5        U R                  5       $ )Nzllvm.nvvm.barrier0rO   )moduler   FunctionTypeVoidTyper   get_or_insert_functioncallget_dummy_value)r#   r$   r%   r&   fnamelmodfntysyncs           r   r(   syncthreads.<locals>.codegen   sU    $~~r{{}b1--d%@T2&&((r   )r	   r   nonerG   s      r   syncthreadsr_      s!     EJJ
C) <r   c                    ^ [        U[        R                  5      (       d  g [        [        R                  [        R                  5      nU4S jnX44$ )Nc                    > [         R                  " [         R                  " S5      [         R                  " S5      45      n[        R                  " UR
                  UT5      nUR                  XS5      $ )N    )r   rT   r1   r   rV   rS   rW   )r#   r$   r%   r&   r[   r\   rY   s         r   r(   '_syncthreads_predicate.<locals>.codegen   sM    rzz"~

2/@A--gnndEJ||D''r   )r    r   Integerr	   i4)r,   	predicaterY   r%   r(   s     `  r   _syncthreads_predicaterg      s:    i//
EHHehh
'C(
 <r   c                     Sn[        XU5      $ )z
syncthreads_count(predicate)

An extension to numba.cuda.syncthreads where the return value is a count
of the threads where predicate is true.
zllvm.nvvm.barrier0.popcrg   r,   rf   rY   s      r   syncthreads_countrk      s     &E!)>>r   c                     Sn[        XU5      $ )z
syncthreads_and(predicate)

An extension to numba.cuda.syncthreads where 1 is returned if predicate is
true for all threads or 0 otherwise.
zllvm.nvvm.barrier0.andri   rj   s      r   syncthreads_andrm      s     %E!)>>r   c                     Sn[        XU5      $ )z
syncthreads_or(predicate)

An extension to numba.cuda.syncthreads where 1 is returned if predicate is
true for any thread or 0 otherwise.
zllvm.nvvm.barrier0.orri   rj   s      r   syncthreads_orro      s     $E!)>>r   N)llvmliter   numbar   r   
numba.corer   numba.core.errorsr   r   numba.core.typingr	   numba.core.extendingr
   
numba.cudar   numba.cuda.extendingr   r   r-   rB   rH   ModulerP   r_   rg   rk   rm   ro   rO   r   r   <module>ry      s       B ' 3   *	+  @ * *Z   ELL&
6B C  ( ? ? ? ? ? ?r   