
    shݜ                        S SK rS SKrS SKrS SKrS SKrS SKJrJrJ	r	J
r
JrJr  S SKJrJr  S SKJr  S SKJr  S SKJr  S SKJrJr  S SKJr  S S	KJr  S S
KJrJr  S SK J!r!  S SK"J#r#  S SK$J%r%  S SK&J'r'J(r(  S SK)J
r*  S SK+J,r,  S SK+J-r-  S SK.J/r/  / SQr0 " S S\Rb                  5      r2 " S S\35      r4 " S S5      r5 " S S\5      r6 " S S\5      r7 " S S\\Rb                  5      r8g)     N)config	serializesigutilstypestypingutils)Cache	CacheImpl)global_compiler_lock)
Dispatcher)NumbaPerformanceWarning)Purposetypeof)get_current_device)wrap_arg)compile_cudaCUDACompiler)driver)get_context)cuda_target)missing_launch_config_msgnormalize_kernel_dimensions)r   cuda)_dispatcher)warn)hsinhcoshloghlog10hlog2hexphexp10hexp2hsqrthrsqrthfloorhceilhrcphrinthtrunchdivc                   4  ^  \ rS rSrSr\   SU 4S jj5       r\S 5       r\S 5       r	S r
\S 5       r\S 5       r\U 4S	 j5       rS
 rS r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       rS rS rS rS rSS jrSS jrSS jrS rSrU =r$ )_Kernel(   zx
CUDA Kernel specialized for a given set of argument types. When called, this
object launches the kernel on the device.
c                 N  > U(       a  [        S5      e[        TU ]	  5         SU l        S U l        Xl        X l        X@l        XPl        U=(       d    / U l	        UU
(       a  SOSS.n[        5       R                  n[        U R
                  [        R                  U R                  U R                  UUUUUS9	nUR                  nU R
                  R                   nUR"                  nUR$                  nUR'                  UR(                  UR*                  XEUUUU	5      u  nnU(       d  / nSUR-                  5       ;   U l        U R.                  (       a  SUl        [2         Vs/ s H  nS	U 3UR-                  5       ;   d  M  UPM      nnU(       aq  [4        R6                  R9                  [4        R6                  R;                  [<        5      5      n[4        R6                  R?                  US
5      nURA                  U5        U H  nURC                  U5        M     URD                  U l#        URH                  U l$        URJ                  U l&        UU l'        URP                  U l(        Xl        UR*                  U l        URR                  U l)        / U l*        / U l+        / U l,        g s  snf )Nz,Cannot compile a device function as a kernelF   r   )fastmathoptdebuglineinfoinliner2   nvvm_optionscccudaCGGetIntrinsicHandleT__numba_wrapper_zcpp_function_wrappers.cu)-RuntimeErrorsuper__init__
objectmodeentry_pointpy_funcargtypesr5   r6   
extensionsr   compute_capabilityr   r   voidtarget_context__code__co_filenameco_firstlinenoprepare_cuda_kernellibraryfndescget_asm_strcooperativeneeds_cudadevrtcuda_fp16_math_funcsospathdirnameabspath__file__joinappendadd_linking_filename
entry_name	signaturetype_annotation_type_annotation_codelibrarycall_helperenvironment_referenced_environmentsliftedreload_init)selfrA   rB   linkr5   r6   r7   r2   rC   max_registersr3   devicer8   r9   crestgt_ctxcodefilenamelinenumlibkernelfnresbasedirfunctions_cu_pathfilepath	__class__s                             i/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/numba/cuda/dispatcher.pyr>   _Kernel.__init__.   sI   
 MNN     
 $* !1

  !44DLL%**dmm"&**%-#)%-)5!# %%||$$##%%11$,,27<2:G2?AV
 D 69JJ"&C0 B0b%bT*coo.?? 0 B ggoobggooh&?@G "W-G!IKK)*H  *  !++ $ 4 4++ &kk++(*%;Bs    J"?J"c                     U R                   $ N)r^   rd   s    ru   rK   _Kernel.library   s           c                     U R                   $ rx   )r]   ry   s    ru   r\   _Kernel.type_annotation   s    $$$r{   c                     U R                   $ rx   )ra   ry   s    ru   _find_referenced_environments%_Kernel._find_referenced_environments   s    ,,,r{   c                 6    U R                   R                  5       $ rx   )rF   codegenry   s    ru   r   _Kernel.codegen   s    ""**,,r{   c                 @    [        U R                  R                  5      $ rx   )tupler[   argsry   s    ru   argument_types_Kernel.argument_types   s    T^^(())r{   c	                    > U R                  U 5      n	[        X	]  5         SU	l        Xl        X)l        X9l        SU	l        XIl        XYl	        Xil
        Xyl        Xl        U	$ )
Rebuild an instance.
N)__new__r=   r>   r@   rN   rZ   r[   r]   r^   r5   r6   r_   rC   )clsrN   rY   r[   codelibraryr5   r6   r_   rC   instancert   s             ru   _rebuild_Kernel._rebuild   sb     ;;s#c%'#*"&$(! +$*(r{   c                     [        U R                  U R                  U R                  U R                  U R
                  U R                  U R                  U R                  S9$ )z
Reduce the instance for serialization.
Compiled definitions are serialized in PTX form.
Type annotation are discarded.
Thread, block and shared memory configuration are serialized.
Stream information is discarded.
)rN   rY   r[   r   r5   r6   r_   rC   )	dictrN   rZ   r[   r^   r5   r6   r_   rC   ry   s    ru   _reduce_states_Kernel._reduce_states   sL      0 0t"nn$:K:K**t}} $ 0 0T__N 	Nr{   c                 8    U R                   R                  5         g)z'
Force binding to current CUDA context
N)r^   
get_cufuncry   s    ru   bind_Kernel.bind   s     	$$&r{   c                 ^    U R                   R                  5       R                  R                  $ )z>
The number of registers used by each thread for this kernel.
)r^   r   attrsregsry   s    ru   regs_per_thread_Kernel.regs_per_thread   s%    
   ++-33888r{   c                 ^    U R                   R                  5       R                  R                  $ )z4
The amount of constant memory used by this kernel.
)r^   r   r   constry   s    ru   const_mem_size_Kernel.const_mem_size   %    
   ++-33999r{   c                 ^    U R                   R                  5       R                  R                  $ )z=
The amount of shared memory used per block for this kernel.
)r^   r   r   sharedry   s    ru   shared_mem_per_block_Kernel.shared_mem_per_block   s%    
   ++-33:::r{   c                 ^    U R                   R                  5       R                  R                  $ )z*
The maximum allowable threads per block.
)r^   r   r   
maxthreadsry   s    ru   max_threads_per_block_Kernel.max_threads_per_block   s%    
   ++-33>>>r{   c                 ^    U R                   R                  5       R                  R                  $ )z=
The amount of local memory used per thread for this kernel.
)r^   r   r   localry   s    ru   local_mem_per_thread_Kernel.local_mem_per_thread   r   r{   c                 6    U R                   R                  5       $ )z&
Returns the LLVM IR for this kernel.
)r^   get_llvm_strry   s    ru   inspect_llvm_Kernel.inspect_llvm   s       --//r{   c                 4    U R                   R                  US9$ )z'
Returns the PTX code for this kernel.
)r9   )r^   rM   )rd   r9   s     ru   inspect_asm_Kernel.inspect_asm   s       ,,,33r{   c                 6    U R                   R                  5       $ )z^
Returns the CFG of the SASS for this kernel.

Requires nvdisasm to be available on the PATH.
)r^   get_sass_cfgry   s    ru   inspect_sass_cfg_Kernel.inspect_sass_cfg   s       --//r{   c                 6    U R                   R                  5       $ )zX
Returns the SASS code for this kernel.

Requires nvdisasm to be available on the PATH.
)r^   get_sassry   s    ru   inspect_sass_Kernel.inspect_sass   s       ))++r{   c                     U R                   c  [        S5      eUc  [        R                  n[	        U R
                  < SU R                  < 3US9  [	        SUS9  [	        U R                   US9  [	        SUS9  g)
Produce a dump of the Python source of this function annotated with the
corresponding Numba IR and type information. The dump is written to
*file*, or *sys.stdout* if *file* is *None*.
Nz Type annotation is not available filezP--------------------------------------------------------------------------------zP================================================================================)r]   
ValueErrorsysstdoutprintrZ   r   )rd   r   s     ru   inspect_types_Kernel.inspect_types  sg       (?@@<::D$*=*=>TJhT"d##$/hT"r{   c                     [        5       nU R                  R                  5       n[        U[        5      (       a  [
        R                  " S U5      nUR                  UUU5      nUR                  R                  nXV-  $ )a  
Calculates the maximum number of blocks that can be launched for this
kernel in a cooperative grid in the current context, for the given block
and dynamic shared memory sizes.

:param blockdim: Block dimensions, either as a scalar for a 1D block, or
                 a tuple for 2D or 3D blocks.
:param dynsmemsize: Dynamic shared memory size in bytes.
:return: The maximum number of blocks in the grid.
c                 
    X-  $ rx    )xys     ru   <lambda>5_Kernel.max_cooperative_grid_blocks.<locals>.<lambda>&  s    QUr{   )
r   r^   r   
isinstancer   	functoolsreduce$get_active_blocks_per_multiprocessorrg   MULTIPROCESSOR_COUNT)rd   blockdimdynsmemsizectxcufuncactive_per_smsm_counts          ru   max_cooperative_grid_blocks#_Kernel.max_cooperative_grid_blocks  sr     m""--/h&& ''(:HEH@@AIALN ::22''r{   c                 F  ^ U R                   R                  5       mU R                  (       a{  TR                  S-   nTR                  R                  U5      u  pxU[        R                  " [        R                  5      :X  d   e[        R                  " 5       n	UR                  SUS9  / n
/ n[        U R                  U5       H  u  pU R                  XXJU5        M     [        R                  (       a   [        R                  R!                  S5      nOS nU=(       a    UR"                  =(       d    Un[        R$                  " TR"                  /UQUQUPUPUP7SU R&                  06  U R                  (       Ga  [        R(                  " [        R*                  " W	5      WW5        U	R,                  S:w  a  U4S jnS Vs/ s H  nU" SU-   5      PM     nnS Vs/ s H  nU" SU-   5      PM     nnU	R,                  nU R.                  R1                  U5      u  nnnUc  S	nO4Uu  nnn[2        R4                  R7                  U5      nS
U< SU< SU< S3nU< SU< SU< 3nU(       a  U< SUS   < 34USS  -   nOU4nU" U6 eU
 H
  nU" 5         M     g s  snf s  snf )N__errcode__r   )streamrN   c                    > TR                   R                  TR                  < SU < S35      u  p[        R                  " 5       n[
        R                  " [        R                  " U5      X5        UR                  $ )N__)	moduleget_global_symbolrY   ctypesc_intr   device_to_host	addressofvalue)rY   memszvalr   s       ru   load_symbol#_Kernel.launch.<locals>.load_symbolS  s\    $mm==?E{{?C?E FGC !,,.C))&*:*:3*?I99$r{   zyxtidctaid zIn function z, file z, line z, ztid=z ctaid=z:    )r^   r   r5   rY   r   r   r   sizeofr   memsetzipr   _prepare_argsr   USE_NV_BINDINGbindingCUstreamhandlelaunch_kernelrN   r   r   r   r_   get_exceptionrQ   rR   rT   )rd   r   griddimr   r   	sharedmemexcnameexcmemexcszexcvalretr
kernelargstvzero_streamstream_handler   ir   r   rj   excclsexc_argsloclocinfosymrs   linenoprefixwbr   s                                 @ru   launch_Kernel.launch-  sk   ""--/::kkM1G"MM;;GDMFFMM&,,7777\\^FMM!FM+ 
++T2DAqV:> 3    ..11!4KK06==?K 	V]] 	;%	;&	; '	; +		;
 (	; *.)9)9	; :::!!&"2"26":FEJ||q % 8==u!{519-u=;@A5aWq[15A||(,(8(8(F(Ft(L%#; G,/)C6!wwx8HFIFNFLOG 18eD,2HQK @B  %H  &wHh'' BD / >As   JJc                 <   [        U R                  5       H  nUR                  UUUUS9u  pM     [        U[        R
                  5      (       Ga  [        U5      R                  XC5      n[        R                  n[        R                  " S5      n	[        R                  " S5      n
U" UR                  5      nU" UR                  R                  5      n[        R                  " U5      n[        R                   (       a  [#        U5      n[        R                  " U5      nUR%                  U	5        UR%                  U
5        UR%                  U5        UR%                  U5        UR%                  U5        ['        UR(                  5       H'  nUR%                  U" UR*                  U   5      5        M)     ['        UR(                  5       H'  nUR%                  U" UR,                  U   5      5        M)     g[        U[        R.                  5      (       a+  [1        [        SU-  5      " U5      nUR%                  U5        gU[        R2                  :X  aY  [        R4                  " [6        R2                  " U5      R9                  [6        R:                  5      5      nUR%                  U5        gU[        R<                  :X  a(  [        R>                  " U5      nUR%                  U5        gU[        R@                  :X  a(  [        RB                  " U5      nUR%                  U5        gU[        RD                  :X  a1  [        RF                  " [#        U5      5      nUR%                  U5        gU[        RH                  :X  a_  UR%                  [        RB                  " URJ                  5      5        UR%                  [        RB                  " URL                  5      5        gU[        RN                  :X  a_  UR%                  [        R>                  " URJ                  5      5        UR%                  [        R>                  " URL                  5      5        g[        U[        RP                  [        RR                  45      (       aC  UR%                  [        RT                  " UR9                  [6        RV                  5      5      5        g[        U[        RX                  5      (       al  [        U5      R                  XC5      nURZ                  n[        R                   (       a  [        R                  " [#        U5      5      nUR%                  U5        g[        U[        R\                  5      (       aD  [_        U5      [_        U5      :X  d   e[a        X5       H  u  nnU Rc                  UUX4U5        M     g[        U[        Rd                  5      (       a*   U Rc                  UR                  URf                  X4U5        g[i        X5      e! [h         a    [i        X5      ef = f)z6
Convert arguments to ctypes and append to kernelargs
)r   r  r   zc_%sN)5reversedrC   prepare_argsr   r   Arrayr   	to_devicer   	c_ssize_tc_void_psizedtypeitemsizer   device_pointerr   intrW   rangendimshapestridesIntegergetattrfloat16c_uint16npviewuint16float64c_doublefloat32c_floatbooleanc_uint8	complex64realimag
complex128
NPDatetimeNPTimedeltac_int64int64Recorddevice_ctypes_pointer	BaseTuplelenr   r   
EnumMemberr   NotImplementedError)rd   tyr   r   r  r  	extensiondevaryc_intpmeminfoparentnitemsr  ptrdataaxcvaldevrecr  r  s                       ru   r   _Kernel._prepare_argsu  sA    "$//2I,,	 - GB 3 b%++&&c],,T:F%%Fooa(G__Q'FFKK(Ffll334H''/C$$#h??3'Dg&f%f%h'd#FKK(!!&b)9":; )FKK(!!&);"<= ) EMM**66B;/4Dd#5== ??2::c?#7#7		#BCDd#5== ??3'Dd#5== >>#&Dd#5== >>#c(+Dd#5??"fnnSXX67fnnSXX675###foochh78foochh78U--u/@/@ABBfnnSXXbhh-?@AELL))c],,T:F..C$$ooc#h/c"EOO,,r7c#h&&&B1""1azB % E,,--3""HHciiz &b..	 ' 3)"223s   (X X)r^   ra   r]   rB   r_   rN   r5   rZ   r@   r`   rC   rL   rb   r6   r?   rA   rc   r[   rF   )	NFFFFNNTFrx   )r   r   r   ) __name__
__module____qualname____firstlineno____doc__r   r>   propertyrK   r\   r   r   r   classmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r   __static_attributes____classcell__rt   s   @ru   r.   r.   (   s1   
 ;@JN6;Z Zx ! ! % %- - - * *  *N' 9 9 : : ; ; ? ? : :040,#"(,FP\/ \/r{   r.   c                   &    \ rS rSrS rS rS rSrg)ForAlli  c                 h    US:  a  [        SU-  5      eXl        X l        X0l        X@l        XPl        g )Nr   z0Can't create ForAll with negative task count: %s)r   
dispatcherntasksthread_per_blockr   r   )rd   rZ  r[  tpbr   r   s         ru   r>   ForAll.__init__  s;    A:O%& ' '$ #"r{   c                 .   U R                   S:X  a  g U R                  R                  (       a  U R                  nOU R                  R                  " U6 nU R	                  U5      nU R                   U-   S-
  U-  nX$X0R
                  U R                  4   " U6 $ )Nr   r   )r[  rZ  specialized
specialize_compute_thread_per_blockr   r   )rd   r   r`  r   r   s        ru   __call__ForAll.__call__  s    ;;!??&&//K//44d;K11+>;;)A-(:Hkk>>* +,02 	2r{   c                    U R                   nUS:w  a  U$ [        5       n[        [        UR                  R                  5       5      5      n[        UR                  R                  5       SU R                  SS9nUR                  " S0 UD6u  pbU$ )Nr   i   )funcb2d_funcmemsizeblocksizelimitr   )r\  r   nextiter	overloadsvaluesr   r^   r   r   get_max_potential_block_size)rd   rZ  r]  r   rn   kwargs_s          ru   rb   ForAll._compute_thread_per_block  s    ##!8J -C $z33::<=>F((335#	F 55??FAJr{   )rZ  r[  r   r   r\  N)rM  rN  rO  rP  r>   rc  rb  rT  r   r{   ru   rX  rX    s    #2r{   rX  c                        \ rS rSrS rS rSrg)_LaunchConfigurationi  c                     Xl         X l        X0l        X@l        XPl        [
        R                  (       a4  SnUS   US   -  US   -  nXv:  a  SU S3n[        [        U5      5        g g g )N   r   r      z
Grid size zB will likely result in GPU under-utilization due to low occupancy.)	rZ  r   r   r   r   r   CUDA_LOW_OCCUPANCY_WARNINGSr   r   )	rd   rZ  r   r   r   r   min_grid_size	grid_sizemsgs	            ru   r>   _LaunchConfiguration.__init__  sy    $ "--  M
WQZ/'!*<I(#I; /A A,S12 ) .r{   c                     U R                   R                  XR                  U R                  U R                  U R
                  5      $ rx   )rZ  callr   r   r   r   rd   r   s     ru   rc  _LaunchConfiguration.__call__  s4    ##D,,$(KKA 	Ar{   )r   rZ  r   r   r   N)rM  rN  rO  rP  r>   rc  rT  r   r{   ru   rs  rs    s    3.Ar{   rs  c                   &    \ rS rSrS rS rS rSrg)CUDACacheImpli  c                 "    UR                  5       $ rx   )r   )rd   rn   s     ru   r   CUDACacheImpl.reduce   s    $$&&r{   c                 .    [         R                  " S0 UD6$ )Nr   )r.   r   )rd   rF   payloads      ru   rebuildCUDACacheImpl.rebuild#  s    *'**r{   c                     g)NTr   )rd   rh   s     ru   check_cachableCUDACacheImpl.check_cachable&  s     r{   r   N)rM  rN  rO  rP  r   r  r  rT  r   r{   ru   r  r    s    '+r{   r  c                   0   ^  \ rS rSrSr\rU 4S jrSrU =r	$ )	CUDACachei1  zK
Implements a cache that saves and loads CUDA kernels and compile results.
c                 t   > SSK Jn  U" S5         [        TU ]  X5      sS S S 5        $ ! , (       d  f       g = f)Nr   )target_overrider   )numba.core.target_extensionr  r=   load_overload)rd   sigrF   r  rt   s       ru   r  CUDACache.load_overload7  s*     	@V$7(= %$$s   )
7r   )
rM  rN  rO  rP  rQ  r  _impl_classr  rT  rU  rV  s   @ru   r  r  1  s      K> >r{   r  c                   d  ^  \ rS rSrSrSr\r\4U 4S jjr	\
S 5       rS r\R                  " SS9S$S	 j5       rS
 rS%S jr\
S 5       rS rS rS rS rS r\
S 5       rS&S jrS&S jrS&S jrS&S jrS&S jrS rS&S jrS r S r!S&S jr"S&S jr#S&S jr$S&S jr%S&S  jr&\'S! 5       r(S" r)S#r*U =r+$ )'CUDADispatcheri@  az  
CUDA Dispatcher object. When configured and called, the dispatcher will
specialize itself for the given arguments (if no suitable specialized
version already exists) & compute capability, and launch on the device
associated with the current context.

Dispatcher objects are not to be constructed by the user, but instead are
created using the :func:`numba.cuda.jit` decorator.
Fc                 >   > [         TU ]  XUS9  SU l        0 U l        g )N)targetoptionspipeline_classF)r=   r>   _specializedspecializations)rd   rA   r  r  rt   s       ru   r>   CUDADispatcher.__init__R  s.    (6 	 	8 "  "r{   c                 .    [         R                  " U 5      $ rx   )
cuda_typesr  ry   s    ru   _numba_type_CUDADispatcher._numba_type_b  s    ((..r{   c                 8    [        U R                  5      U l        g rx   )r  rA   _cachery   s    ru   enable_cachingCUDADispatcher.enable_cachingf  s    -r{   ru  )maxsizec                 6    [        X5      u  p[        XX#U5      $ rx   )r   rs  )rd   r   r   r   r   s        ru   	configureCUDADispatcher.configurei  s    7J#D8YOOr{   c                 T    [        U5      S;  a  [        S5      eU R                  " U6 $ )N)rv  r1      z.must specify at least the griddim and blockdim)r<  r   r  r~  s     ru   __getitem__CUDADispatcher.__getitem__n  s)    t9I%MNN~~t$$r{   c                     [        XX#US9$ )a  Returns a 1D-configured dispatcher for a given number of tasks.

This assumes that:

- the kernel maps the Global Thread ID ``cuda.grid(1)`` to tasks on a
  1-1 basis.
- the kernel checks that the Global Thread ID is upper-bounded by
  ``ntasks``, and does nothing if it is not.

:param ntasks: The number of tasks.
:param tpb: The size of a block. An appropriate value is chosen if this
            parameter is not supplied.
:param stream: The stream on which the configured dispatcher will be
               launched.
:param sharedmem: The number of bytes of dynamic shared memory required
                  by the kernel.
:return: A configured dispatcher, ready to launch on a set of
         arguments.)r]  r   r   )rX  )rd   r[  r]  r   r   s        ru   forallCUDADispatcher.foralls  s    ( diPPr{   c                 8    U R                   R                  S5      $ )a  
A list of objects that must have a `prepare_args` function. When a
specialized kernel is called, each argument will be passed through
to the `prepare_args` (from the last object in this list to the
first). The arguments to `prepare_args` are:

- `ty` the numba type of the argument
- `val` the argument value itself
- `stream` the CUDA stream used for the current call to the kernel
- `retr` a list of zero-arg functions that you may want to append
  post-call cleanup work to.

The `prepare_args` function must return a tuple `(ty, val)`, which
will be passed in turn to the next right-most `extension`. After all
the extensions have been called, the resulting `(ty, val)` will be
passed into Numba's default argument marshalling logic.
rC   )r  getry   s    ru   rC   CUDADispatcher.extensions  s    & !!%%l33r{   c                      [        [        5      erx   )r   r   )rd   r   ro  s      ru   rc  CUDADispatcher.__call__  s    233r{   c                     U R                   (       a-  [        [        U R                  R	                  5       5      5      nO![
        R                  R                  " U /UQ76 nUR                  XX4U5        g)z:
Compile if necessary and invoke this kernel with *args*.
N)	r`  rj  rk  rl  rm  r   r   
_cuda_callr  )rd   r   r   r   r   r   rn   s          ru   r}  CUDADispatcher.call  sT     $t~~44678F ++66tCdCFdXyAr{   c                     U(       a   eU Vs/ s H  o0R                  U5      PM     nnU R                  [        U5      5      $ s  snf rx   )typeof_pyvalcompiler   )rd   r   kwsarB   s        ru   _compile_for_args CUDADispatcher._compile_for_args  s>    w267$Q%%a($7||E(O,, 8s   Ac                      [        U[        R                  5      $ ! [         aL    [        R
                  " U5      (       a/  [        [        R                  " USS9[        R                  5      s $ e f = f)NF)sync)r   r   argumentr   r   is_cuda_arrayas_cuda_array)rd   r   s     ru   r  CUDADispatcher.typeof_pyval  sf    		#w//00 	!!#&& d005A%..0 0 	s    AA20A2c                   ^  T R                   (       a  [        S5      e[        5       R                  n[	        U 4S jU 5       5      nT R
                  R                  X#45      nU(       a  U$ T R                  n[        T R                  US9nUR                  U5        UR                  5         SUl        UT R
                  X#4'   U$ )zL
Create a new instance of this dispatcher specialized for the given
*args*.
zDispatcher already specializedc              3   F   >#    U  H  nTR                  U5      v   M     g 7frx   )r  ).0r  rd   s     ru   	<genexpr>,CUDADispatcher.specialize.<locals>.<genexpr>  s     <t!**1--ts   !)r  T)r`  r<   r   rD   r   r  r  r  r  rA   r  disable_compiler  )rd   r   r9   rB   specializationr  s   `     ru   ra  CUDADispatcher.specialize  s    
 ?@@!44<t<<--112.A!!**'6CEx(&&(&*#-;R\*r{   c                     U R                   $ )z.
True if the Dispatcher has been specialized.
)r  ry   s    ru   r`  CUDADispatcher.specialized  s    
    r{   c                 Z   Ub#  U R                   UR                     R                  $ U R                  (       a6  [	        [        U R                   R                  5       5      5      R                  $ U R                   R                  5        VVs0 s H  u  p#X#R                  _M     snn$ s  snnf )a  
Returns the number of registers used by each thread in this kernel for
the device in the current context.

:param signature: The signature of the compiled kernel to get register
                  usage for. This may be omitted for a specialized
                  kernel.
:return: The number of registers used by the compiled variant of the
         kernel for the given signature and current device.
)rl  r   r   r`  rj  rk  rm  itemsrd   r[   r  overloads       ru   get_regs_per_thread"CUDADispatcher.get_regs_per_thread  s      >>)..1AAAT^^22456FFF *.)=)=)?A)? 111)?A A A   B'c                 Z   Ub#  U R                   UR                     R                  $ U R                  (       a6  [	        [        U R                   R                  5       5      5      R                  $ U R                   R                  5        VVs0 s H  u  p#X#R                  _M     snn$ s  snnf )a  
Returns the size in bytes of constant memory used by this kernel for
the device in the current context.

:param signature: The signature of the compiled kernel to get constant
                  memory usage for. This may be omitted for a
                  specialized kernel.
:return: The size in bytes of constant memory allocated by the
         compiled variant of the kernel for the given signature and
         current device.
)rl  r   r   r`  rj  rk  rm  r  r  s       ru   get_const_mem_size!CUDADispatcher.get_const_mem_size  s      >>)..1@@@T^^22456EEE *.)=)=)?A)? 000)?A A Ar  c                 Z   Ub#  U R                   UR                     R                  $ U R                  (       a6  [	        [        U R                   R                  5       5      5      R                  $ U R                   R                  5        VVs0 s H  u  p#X#R                  _M     snn$ s  snnf )a  
Returns the size in bytes of statically allocated shared memory
for this kernel.

:param signature: The signature of the compiled kernel to get shared
                  memory usage for. This may be omitted for a
                  specialized kernel.
:return: The amount of shared memory allocated by the compiled variant
         of the kernel for the given signature and current device.
)rl  r   r   r`  rj  rk  rm  r  r  s       ru   get_shared_mem_per_block'CUDADispatcher.get_shared_mem_per_block        >>)..1FFFT^^22456KKK *.)=)=)?A)? 666)?A A Ar  c                 Z   Ub#  U R                   UR                     R                  $ U R                  (       a6  [	        [        U R                   R                  5       5      5      R                  $ U R                   R                  5        VVs0 s H  u  p#X#R                  _M     snn$ s  snnf )a  
Returns the maximum allowable number of threads per block
for this kernel. Exceeding this threshold will result in
the kernel failing to launch.

:param signature: The signature of the compiled kernel to get the max
                  threads per block for. This may be omitted for a
                  specialized kernel.
:return: The maximum allowable threads per block for the compiled
         variant of the kernel for the given signature and current
         device.
)rl  r   r   r`  rj  rk  rm  r  r  s       ru   get_max_threads_per_block(CUDADispatcher.get_max_threads_per_block  s      >>)..1GGGT^^22456LLL *.)=)=)?A)? 777)?A A Ar  c                 Z   Ub#  U R                   UR                     R                  $ U R                  (       a6  [	        [        U R                   R                  5       5      5      R                  $ U R                   R                  5        VVs0 s H  u  p#X#R                  _M     snn$ s  snnf )ay  
Returns the size in bytes of local memory per thread
for this kernel.

:param signature: The signature of the compiled kernel to get local
                  memory usage for. This may be omitted for a
                  specialized kernel.
:return: The amount of local memory allocated by the compiled variant
         of the kernel for the given signature and current device.
)rl  r   r   r`  rj  rk  rm  r  r  s       ru   get_local_mem_per_thread'CUDADispatcher.get_local_mem_per_thread/  r  r  c                 ,   U R                   (       a  U R                  [        U5      5        U R                  R                  nSR                  U5      n[        R                  " XCU R                  S9n[        R                  " U R                  5      nXVX4$ )z
Get a typing.ConcreteTemplate for this dispatcher and the given
*args* and *kws* types.  This allows resolution of the return type.

A (template, pysig, args, kws) tuple is returned.
zCallTemplate({0}))key
signatures)_can_compilecompile_devicer   rA   rM  formatr   make_concrete_templatenopython_signaturesr   pysignature)rd   r   r  	func_namerY   call_templatepysigs          ru   get_call_template CUDADispatcher.get_call_templateB  s{     d, LL))	")))455D,D,DF!!$,,/T..r{   c                    XR                   ;  Ga"  U R                     U R                  R                  S5      nU R                  R                  S5      nU R                  R                  S5      nU R                  R                  S5      nU R                  R                  S5      (       a  SOSUS.n[	        5       R
                  n[        U R                  X!UUUUUUS	9	n	XR                   U'   U	R                  R                  U	R                  U	R                  U	R                  /5        S
S
S
5        U	$ U R                   U   n	U	$ ! , (       d  f       W	$ = f)zCompile the device function for the given argument types.

Each signature is compiled once by caching the compiled function inside
this object.

Returns the `CompileResult`.
r5   r6   r7   r2   r3   r1   r   )r3   r2   r4   N)rl  _compiling_counterr  r  r   rD   r   rA   rF   insert_user_functionr@   rL   rK   )
rd   r   return_typer5   r6   r7   r2   r8   r9   rh   s
             ru   r  CUDADispatcher.compile_device]  s6    ~~%((**..w7--11*=++//9--11*= !% 2 2 6 6u = =11 ( 
 ()<<#DLL+*/-5+1-51=')+ (,t$##889I9I9=:>,,I- )8  >>$'D9 )(8 s   DE
Ec                     U Vs/ s H  o3R                   PM     nnU R                  XASS9  XR                  U'   g s  snf )NTr   )_code_insertrl  )rd   rn   rB   r  c_sigs        ru   add_overloadCUDADispatcher.add_overload  s9    "*+(Q(+U.#)x  ,s   ;c                    [         R                  " U5      u  p#Ub  U[        R                  :X  d   eU R                  (       a,  [        [        U R                  R                  5       5      5      $ U R                  R                  U5      nUb  U$ U R                  R                  XR                  5      nUb  U R                  U==   S-  ss'   OU R                  U==   S-  ss'   U R                  (       d  [!        S5      e[#        U R$                  U40 U R&                  D6nUR)                  5         U R                  R+                  X5        U R-                  XB5        U$ )zg
Compile and bind to the current context a version of this kernel
specialized for the given signature.
r   zCompilation disabled)r   normalize_signaturer   noner`  rj  rk  rl  rm  r  r  r  	targetctx_cache_hits_cache_missesr  r<   r.   rA   r  r   save_overloadr  )rd   r  rB   r  rn   s        ru   r  CUDADispatcher.compile  s!   
 !) < <S A"kUZZ&??? T^^224566^^''1F! **3?S!Q&! s#q(#$$"#9::T\\8Jt7I7IJFKKMKK%%c2&+r{   c                    U R                   R                  S5      nUbK  U(       a'  U R                  U   R                  R	                  5       $ U R                  U   R                  5       $ U(       aG  U R                  R                  5        VVs0 s H  u  p4X4R                  R	                  5       _M!     snn$ U R                  R                  5        VVs0 s H  u  p4X4R                  5       _M     snn$ s  snnf s  snnf )z
Return the LLVM IR for this kernel.

:param signature: A tuple of argument types.
:return: The LLVM IR for the given signature, or a dict of LLVM IR
         for all previously-encountered signatures.

rg   )r  r  rl  rK   r   r   r  )rd   r[   rg   r  r  s        ru   r   CUDADispatcher.inspect_llvm  s     ##''1 ~~i088EEGG~~i0==??-1^^-A-A-CE-CMC --::<<-CE E .2^^-A-A-CE-CMC 2244-CE EEEs   &C5C;c                 2   [        5       R                  nU R                  R                  S5      nUbM  U(       a(  U R                  U   R
                  R                  U5      $ U R                  U   R                  U5      $ U(       aH  U R                  R                  5        VVs0 s H   u  pEXER
                  R                  U5      _M"     snn$ U R                  R                  5        VVs0 s H  u  pEXER                  U5      _M     snn$ s  snnf s  snnf )z
Return this kernel's PTX assembly code for for the device in the
current context.

:param signature: A tuple of argument types.
:return: The PTX code for the given signature, or a dict of PTX codes
         for all previously-encountered signatures.
rg   )	r   rD   r  r  rl  rK   rM   r   r  )rd   r[   r9   rg   r  r  s         ru   r   CUDADispatcher.inspect_asm  s     !44##''1 ~~i088DDRHH~~i0<<R@@-1^^-A-A-CE-CMC --99"==-CE E .2^^-A-A-CE-CMC 11"55-CE EEEs   $'D,Dc                    U R                   R                  S5      (       a  [        S5      eUb  U R                  U   R	                  5       $ U R                  R                  5        VVs0 s H  u  p#X#R	                  5       _M     snn$ s  snnf )aK  
Return this kernel's CFG for the device in the current context.

:param signature: A tuple of argument types.
:return: The CFG for the given signature, or a dict of CFGs
         for all previously-encountered signatures.

The CFG for the device in the current context is returned.

Requires nvdisasm to be available on the PATH.
rg   z'Cannot get the CFG of a device function)r  r  r<   rl  r   r  rd   r[   r  defns       ru   r   CUDADispatcher.inspect_sass_cfg  s     !!(++HII >>),==?? &*^^%9%9%;=%;	 ..00%;= = =   )B	c                    U R                   R                  S5      (       a  [        S5      eUb  U R                  U   R	                  5       $ U R                  R                  5        VVs0 s H  u  p#X#R	                  5       _M     snn$ s  snnf )ag  
Return this kernel's SASS assembly code for for the device in the
current context.

:param signature: A tuple of argument types.
:return: The SASS code for the given signature, or a dict of SASS codes
         for all previously-encountered signatures.

SASS for the device in the current context is returned.

Requires nvdisasm to be available on the PATH.
rg   z(Cannot inspect SASS of a device function)r  r  r<   rl  r   r  r  s       ru   r   CUDADispatcher.inspect_sass  s     !!(++IJJ >>),99;; &*^^%9%9%;=%;	 **,,%;= = =r  c                     Uc  [         R                  nU R                  R                  5        H  u  p#UR	                  US9  M     g)r   Nr   )r   r   rl  r  r   )rd   r   rp  r	  s       ru   r   CUDADispatcher.inspect_types  s<     <::D~~++-GAD) .r{   c                     U " X5      nU$ )r   r   )r   rA   r  r   s       ru   r   CUDADispatcher._rebuild  s    
 w.r{   c                 >    [        U R                  U R                  S9$ )zL
Reduce the instance for serialization.
Compiled definitions are discarded.
)rA   r  )r   rA   r  ry   s    ru   r   CUDADispatcher._reduce_states  s     
 DLL"&"4"46 	6r{   )r  r  r  rL  )r   r   r   rx   ),rM  rN  rO  rP  rQ  
_fold_argsr   targetdescrr   r>   rR  r  r  r   	lru_cacher  r  r  rC   rc  r}  r  r  ra  r`  r  r  r  r  r  r  r  r  r  r   r   r   r   r   rS  r   r   rT  rU  rV  s   @ru   r  r  @  s    JK>J "  / /. %P &P%
Q, 4 4(4	B-0 ! !A&A(A&A*A&/6%N*
"HE.E0=*=,
*  6 6r{   r  )9numpyr(  rQ   r   r   r   
numba.corer   r   r   r   r   r   numba.core.cachingr	   r
   numba.core.compiler_lockr   numba.core.dispatcherr   numba.core.errorsr   numba.core.typing.typeofr   r   numba.cuda.apir   numba.cuda.argsr   numba.cuda.compilerr   r   numba.cuda.cudadrvr   numba.cuda.cudadrv.devicesr   numba.cuda.descriptorr   numba.cuda.errorsr   r   
numba.cudar  numbar   r   warningsr   rP   ReduceMixinr.   objectrX  rs  r  r  r  r   r{   ru   <module>r*     s     	 
   H H / 9 , 5 4 - $ : % 2 -< *   * i/i## i/X+V +\A A:I $> >a6Z!6!6 a6r{   