
    sh                        S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKJrJr  S SKJr  S SKJrJrJrJr  S SKJrJrJrJrJr  S SKrS SKrS SKJr  S SKJr  S S	KJ r   S S
K!J"r"  S SK#J$r$J%r%J&r&J'r'J(r(  S SK)J*r*J+r+  S SK,J-r-  S SK.J/r/  \(       a  S SK0J1r1  S SK2J3r3  S SK4J5r5  S SK6J7r7  SSK8J9r9  SSK:J;r;  SSK8J<r<  SSK=J>r>  SSK?J@r@  SrASqB\-" \CS5      rD\R                  " \C5      rF " S S5      rG " S S5      rH " S  S!\I5      rJ\R                  S<S" j5       rL\R                   " S# S$5      5       rN\R                   " S% S&5      5       rO\O" 5       rP\\"R                  \"R                  4   rS\R                   " S' S(5      5       rT\R                   " S) S*5      5       rU " S+ S,\U5      rV " S- S.5      rW " S/ S05      rX " S1 S2\U5      rY " S3 S4\W\Y5      rZ " S5 S6\X\Y5      r[ " S7 S8\W\U5      r\ " S9 S:\X\U5      r]    S=S; jr^g)>    )annotationsN)IterableSequence)ThreadPoolExecutor)byrefc_size_tc_void_pCDLL)AnyCallableOptionalTYPE_CHECKINGUnion)multiprocessing)get_interface_for_device)rand_strided)ir)CppCodeCacheCUDACodeCache
DLLWrapperget_hashPyCodeCache)get_gpu_typeis_gpu)getArtifactLogger)
OrderedSet)BaseProcess)Queue)
ModuleType)TritonTemplateCaller   )WorkspaceArg)config)WorkspaceZeroMode)benchmarker)VCUDA_VISIBLE_DEVICESF
autotuningc                      \ rS rSrSrg)Ping;    N__name__
__module____qualname____firstlineno____static_attributes__r,       t/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/_inductor/autotune_process.pyr*   r*   ;       r3   r*   c                      \ rS rSrSrg)Pong?   r,   Nr-   r,   r3   r4   r7   r7   ?   r5   r3   r7   c                      \ rS rSrSrg)!NonzeroWorkspaceNotSupportedErrorC   r,   Nr-   r,   r3   r4   r:   r:   C   r5   r3   r:   c              #  v  #    U c  Sv   g[         R                  R                  [        5      n[	        U 5      [         R                  [        '    Sv   Uc  [         R                  [        	 gU[         R                  [        '   g! Uc  [         R                  [        	 f U[         R                  [        '   f = f7f)z
Context manager to set the CUDA_VISIBLE_DEVICES environment variable to the
specified single device. If device is None, don't manipulate the environment.
N)osenvirongetr'   str)devicecurrents     r4   set_cuda_visible_devicerC   G   s      ~jjnn12G'*6{BJJ#$7?

/0/6BJJ+, ?

/0/6BJJ+,s   AB9B 1B92B66B9c                      \ rS rSr% SrSrS\S'   SrS\S'   SrS\S	'   Sr	S\S
'   \
      SS j5       r\
SS j5       rSS jrSS jrSS jrSS jr S SS jjrSS jrSS jrSSS jjrSrg)TuningProcess\   z
Abstraction for launching a helper process to benchmark kernels. Spawns
the parent process and uses multiprocessing queues to send benchmark
requests and return results.
NOptional[int]rA   zOptional[BaseProcess]processzOptional[Queue[Any]]request_queueresponse_queuec                    [         R                  S[        R                  R	                  [
        5      5         [        R                  X5        g! [         a    [         R                  S5         gf = f)z$
Entry point for the child process.
z2Entering TuningProcess child. Visible devices = %szException in TuningProcessN)
autotuning_logdebugr=   r>   r?   r'   rE   workloop	Exception	exception)rI   rJ   s     r4   process_mainTuningProcess.process_maini   sZ     	@JJNN/0	
	C""=A 	C$$%AB	Cs   A A10A1c                &    U R                  5       nUc  g[        U[        5      (       a  UR                  [	        5       5        OL[        U[
        5      (       a   UR                  UR                  5       5        O[        S[        U5       35      eM  )z,
Work loop for the benchmarking subprocess.
NzInvalid request type )	r?   
isinstancer*   putr7   BenchmarkRequest	benchmarkRuntimeErrortype)rI   rJ   objs      r4   rN   TuningProcess.workloopz   sx    
 ##%C{C&&""46*C!122""3==?3"%:49+#FGG r3   c                r    U R                   SL=(       a#    U R                  SL=(       a    U R                  SL$ )z/
True if the sub-process has been initialized.
NrH   rI   rJ   selfs    r4   validTuningProcess.valid   s;    
 LL$ 0""$.0##4/	
r3   c                .    S=U l         =U l        U l        g)z"
Reset to an uninitialized state.
Nr]   r^   s    r4   clearTuningProcess.clear   s     CGFFt)D,?r3   c                   U R                  5       (       a  g[        R                  " S5      nUR                  5       U l        UR                  5       U l        UR                  U R                  U R                  U R
                  4S9U l        U R                  c   e[        U R                  5         U R                  R                  5         SSS5        g! , (       d  f       g= f)z
Create child process, request/response queues, and do the warm up.
Set the environment to make only the provided GPU device visible
to the process.
Nspawn)targetargs)r`   r   get_contextr   rI   rJ   ProcessrQ   rH   rC   rA   start)r_   ctxs     r4   
initializeTuningProcess.initialize   s     ::<< ))'2 YY[!iik{{$$""## # 
 ||'''$T[[1LL  211s   0C
C"c                x    U R                  5         U R                  c   eU R                  R                  U5        g)z(
Push a work item to the child process.
N)rm   rI   rU   )r_   rZ   s     r4   rU   TuningProcess.put   s4    
 	!!---s#r3   c                   U R                   c   eU R                  c   e  UnSnUb&  US:  a   US-  n U R                  R                  SS9n Uc  U R                  R                  US9nU$ ! [        R                   a#    U R                   R                  5       (       d  e  Of = fUc  M]  US:  a  M  Nf! [        R                   a<    U R                   R                  nUc  U R                  UUS9  e U R                  5         e f = f)a  
Get a response from the child process. Raises queue.Empty on timeout
or if the process dies.

This method is (so far) only used by TuningProcessPool, where torch._inductor.config entries are being used
to populate the timeouts:

Arguments:

    @param result_timeout: Timeout in seconds, defaults to 120.0 or to
                           config.max_autotune_subproc_result_timeout_seconds when called by TuningProcessPool
    @param graceful_timeout: Timeout in seconds to allow graceful shutdown (SIGTERM is sent after this time).
                            Defaults to 3.0 or to config.max_autotune_subproc_graceful_timeout_seconds
    @param terminate_timeout: Timeout in seconds after SIGTERM, until we send SIGKILL if the process
                              remains alive. Defaults to 1.0 or to
                              config.max_autotune_subproc_terminate_timeout_seconds.
Returns:
    A response from the child process (Any type)
N      ?g      ?timeout)graceful_timeoutterminate_timeout)	rH   rJ   r?   queueEmptyis_aliveexitcodekillrc   )r_   result_timeoutru   rv   remaining_timeoutresstatuss          r4   r?   TuningProcess.get   s,   , ||'''""...$2!'38IS8P%,%""1155c5B ;--11:K1LC
 !;; "#||4466!  7" (38IS8P ;; 
..>II)9*;    JJL
s5   B4 A, B4 ,4B# B4 "B##B4 +B4 4ADc                    U R                  5       (       a:  U R                  c   eU R                  c   eU R                  R                  S5        gg)z(
Signal the child process to terminate.
N)r`   rH   rI   rU   r^   s    r4   	terminateTuningProcess.terminate   sJ     ::<<<<+++%%111""4( r3   c                t    U R                   b+  U R                   R                  5         U R                  5         gg)z%
Wait for the child process to exit.
N)rH   joinrc   r^   s    r4   waitTuningProcess.wait   s,     <<#LLJJL $r3   c                R   U R                   Gb  U R                  5         U R                   R                  US9  U R                   R                  5       (       a  [        R                  SU R                   R                  5        U R                   R                  5         U R                   R                  US9  U R                   R                  5       (       aD  [        R                  SU R                   R                  5        U R                   R                  5         U R                  5         g g )Nrs   z&Sending SIGTERM to process with PID %dz&Sending SIGKILL to process with PID %d)
rH   r   r   ry   rL   warningpiderrorr{   rc   )r_   ru   rv   s      r4   r{   TuningProcess.kill  s    
 <<#NNLL&67||$$&&&&<LL$$ &&(!!*;!<<<((**"((@(( LL%%'JJL! $r3   r]   )rI   
Queue[Any]rJ   r   returnNone)r   boolr   r   )rZ   r   r   r   )g      ^@g      @rr   )r   r   )g      @rr   )r.   r/   r0   r1   __doc__rA   __annotations__rH   rI   rJ   staticmethodrQ   rN   r`   rc   rm   rU   r?   r   r   r{   r2   r,   r3   r4   rE   rE   \   s     !FM %)G")*.M'.+/N(/C!C"C 
C C  H H 
G!2$ MP1	1f) r3   rE   c                  p    \ rS rSr% SrSrS\S'   SrS\S'   SS jrSS	 jr	SS
 jr
SS jr    SS jrSrg)TuningProcessPooli  z
Maintains a pool of TuningProcesses to benchmark kernels in parallel
across devices. By default, we create one TuningProcess per device and
set the sub-process environment to make only that device visible.
Nz$Optional[queue.Queue[TuningProcess]]	processeszOptional[ThreadPoolExecutor]executorc                   U R                   SL U R                  SL :X  d   eU R                   b  gU R                  5       n[        R	                  SU5        [
        R                  " 5       U l         U HP  n[        US9nUR                  5         UR                  [        5       5        U R                   R                  U5        MR     U R                   R
                   H'  n[        UR                  SS9[        5      (       a  M'   e   [        [        U5      S9U l        [         (       d"  SqSSKnUR%                  U R&                  5        gg)z
Start the child processes.
Nz$Sub-process autotune device list: %s)rA   )r|   )max_workersTr   )r   r   get_device_listlogrM   rw   r   rE   rm   rU   r*   rT   r?   r7   r   lenEXIT_HANDLER_REGISTEREDatexitregisterr   )r_   devicesrA   pr   s        r4   rm   TuningProcessPool.initialize$  s    $&DMMT,ABBB>>%&&(		8'B FV,ALLNEE$&MNNq!	  %%Aaee4e8$???? & +s7|D
 '&&*#OODNN+	 'r3   c                   [         R                  (       d  S/$ [        5       n[        U5      nUR	                  5       n[
        [        R                  ;   aR  [        R                  [
           R                  S5       Vs/ s H  n[        U5      PM     nn[        U5      U::  d   eU$ [        [        U5      5      $ s  snf )z4
Gather the list of devices to be used in the pool.
N,)r#   autotune_multi_devicer   r   device_countr'   r=   r>   splitintr   listrange)r_   gpu_typedevice_interfacecountdr   s         r4   r   !TuningProcessPool.get_device_listI  s     ++6M>3H= --/  2::-')zz2F'G'M'Mc'RS'R!s1v'RGSw<5(((NE%L!!	 Ts   >B<c                >   U R                   b!  U R                   R                  5         SU l         U R                  bb  U R                  R                   H  nUR	                  5         M     U R                  R                   H  nUR                  5         M     SU l        gg)z*
Signal all child processes to terminate.
N)r   shutdownr   rw   r   r   )r_   r   s     r4   r   TuningProcessPool.terminate]  su     ==$MM""$ DM>>%^^)) *^^)) *!DN &r3   c                N   UR                   c   eU R                  c   eU R                  R                  5       nUR                  UR                   5         UR                  [        R
                  [        R                  [        R                  5      U R                  R                  U5        $ ! [        R                   aC    [        R                  " SU S35        [        S5      s U R                  R                  U5        $ f = f! U R                  R                  U5        f = f)z
Entry point for the thread-pool helper threads: Wait for an open TuningProcess,
remove it from the queue, execute the benchmark in that subprocess, and return
the TuningProcess to the queue.
zFailed to benchmark choice 'z['. It will be ignored. Please debug the root cause in case the choice can bring perf gains.inf)bmreqr   r?   rU   r#   +max_autotune_subproc_result_timeout_seconds-max_autotune_subproc_graceful_timeout_seconds.max_autotune_subproc_terminate_timeout_secondsrw   rx   warningswarnfloat)r_   choicerH   s      r4   rg   TuningProcessPool.targetl  s     ||'''~~)))..$$&FLL!	(;;BBDDEE NNw' {{ 	 MM.vh 7W W
 <NNw'	  NNw's$   <B- -9D&D DD D$c                    U R                   c   S5       eU R                  c   e0 n[        XR                  R                  U R                  U5      5       H	  u  p4XBU'   M     U$ )z.
Benchmark each choice in a separate process.
z&Tuning process pool is not initialized)r   r   zipmaprg   )r_   choicesresultsr   results        r4   rW   TuningProcessPool.benchmark  sf     ~~)S+SS)}}((( "'==+<+<T[['+RSNF$FO T r3   )r   r   r   )r   zSequence[Optional[int]])r   r    r   r   r   zlist[TritonTemplateCaller]r   z!dict[TritonTemplateCaller, float])r.   r/   r0   r1   r   r   r   r   rm   r   r   rg   rW   r2   r,   r3   r4   r   r     sK     7;I3:-1H*1#,J"("(6+ 
+r3   r   c                  |    \ rS rSr% S\S'   S\S'   S\S'   S\S	'   S
\S'   SrS\S'   \    SS j5       rSS jrSr	g)
TensorMetai  ztorch.devicerA   ztorch.dtypedtypeztorch._prims_common.ShapeTypesizesztorch._prims_common.StrideTypestridesr   offsetNzOptional[str]namec                D   [        U[        5      (       a;  U Vs/ s H  o R                  U5      PM     nn[        S U 5       5      (       d   eU$ Un[        U[        R
                  5      (       a  [        R                  " SUS9nUR                  5       nUc   eUR                  5       nUc   e[        UU[        R                  R                  R                  UR                  5       [        R                   S9[        R                  R                  R                  UR#                  5       [        R                   S9[        R                  R                  R%                  UR'                  5       R(                  [        R                   S9UR+                  5       S9$ s  snf )Nc              3  B   #    U  H  n[        U[        5      v   M     g 7fN)rT   r   .0xs     r4   	<genexpr>*TensorMeta.from_irnodes.<locals>.<genexpr>  s     A&Qz!Z00&s   fake)r   layout)fallback)rA   r   r   r   r   r   )rT   r   from_irnodesallr   LayoutBuffer	get_dtype
get_devicer   r&   graphsizevars
size_hintsget_sizer#   unbacked_symint_fallback
get_stride	size_hint
get_layoutr   get_name)clsirnodesr   r   noder   rA   s          r4   r   TensorMeta.from_irnodes  s]    gx((>E Fg!1!1!!4gF FA&AAAAAMdBII&&99&6D    "!!!''""--88 .  GG$$//!88 0  77##--!((88 .  
 	
 !Gs   Fc                    [        U R                  U R                  U R                  U R                  U R
                  S9$ )N)rA   r   
extra_size)r   r   r   rA   r   r   r^   s    r4   	to_tensorTensorMeta.to_tensor  s2    JJLL;;**{{
 	
r3   r,   )r   z/Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]]r   #Union[TensorMeta, list[TensorMeta]])r   torch.Tensor)
r.   r/   r0   r1   r   r   classmethodr   r   r2   r,   r3   r4   r   r     sQ    ((++KD-!
E!
	,!
 !
F
r3   r   c                      \ rS rSrSr          SS jr      SS jrSS jrSS.     SS jjrSS.     SS	 jjr	S
r
g)rV   i  a  
Only handle triton template benchmark for now. The extern kernel benchmark
can be done inside the same process since they usually don't cause crash.

Important: Instances of this class and subclasses have to be serializable
across process boundaries. Do not put CUDA Tensors in here!
c                   ^ Xl         [        U[        5      (       a  U/nX l        [        T[        [
        45      (       a0  [        T5      S:  a  [        U4S jT 5       5      (       d   eTS   mTU l        X@l	        g )Nr!   c              3  n   >#    U  H*  nS   H   n[        TS   U5      [        X5      :H  v   M"     M,     g7f))rA   r   r   r   r   r   N)getattr)r   r   attroutput_tensor_metas      r4   r   ,BenchmarkRequest.__init__.<locals>.<genexpr>  s=      / Q .q148GA<LL Q M/s   25r   )
kernel_namerT   r   input_tensor_metatupler   r   r   r   
extra_args)r_   r   r   r   r   s      ` r4   __init__BenchmarkRequest.__init__  s     ''44!2 3!2(5$-88%&* /    
 "4A!6"4$r3   c                   [         er   NotImplementedErrorr_   output_tensorinput_tensorss      r4   make_run_fnBenchmarkRequest.make_run_fn  s
     "!r3   c                    g r   r,   r^   s    r4   cleanup_run_fnBenchmarkRequest.cleanup_run_fn  s    r3   Nr  c                   [         er   r  r_   fnr  r  s       r4   do_benchBenchmarkRequest.do_bench  s
     "!r3   c               :   [         R                  [        R                  5      nU(       a  [        R                  " 5       nUcG  [        U5      S:X  d   e[        S U R                   5       5      nU R                  R                  5       nU(       a-  [        R                  " 5       W-
  n[        R                  " 5       n U R                  " USU06nU(       a-  [        R                  " 5       W-
  n[        R                  " 5       nU R                  " U/UQUP76 nU(       a:  [        R                  " 5       W-
  n	[         R                  S[!        U 5      WWU	5        U R#                  5         U$ ! [         a#    [         R                  S5        [        S5      s $ f = f)Nr   c              3  @   #    U  H  oR                  5       v   M     g 7fr   )r   r   s     r4   r   -BenchmarkRequest.benchmark.<locals>.<genexpr>  s     !P9OA++--9Os   r  z0Skipping op due to nonzero workspace requirementr   z6InChildProcess %s: load %f, create tensor %f, bench %f)rL   isEnabledForloggingDEBUGtimer   r   r   r   r   r  r:   infor   r  rM   r@   r	  )
r_   r  r  rM   start_tscreate_tensor_elapser  load_elapseoutbench_elapses
             r4   rW   BenchmarkRequest.benchmark  sM   
 ++GMM:yy{H  }%***!!P9O9O!PPM 33==?M#'99;#9 yy{H	 !!=NNB ))+0Kyy{HmmB>>>99;1L  HD	$ 	
+ 1 	  RS<	 s   ?E- -*FF)r   r   r   r   )
r   r@   r   r   r   r   r   Iterable[Any]r   r   r  r   r  r   r   zCallable[[], None]r   r  r   r  zOptional[torch.Tensor]r   r   )r.   r/   r0   r1   r   r   r  r	  r  rW   r2   r,   r3   r4   rV   rV     s    %% ?% @	%
 "% 
%6"*";G"	"
 15	" %" .	"
 
" 15)$) .) 
	) )r3   rV   c                  B    \ rS rSrSrSS	S jjrSS.     S
S jjrSrg)TestBenchmarkRequesti9  zx
Supports unit testing. Defined in this file so that the TuningProcess
sub-process knows how to unpickle these objects.
Nc                    Xl         g r   value)r_   r&  s     r4   r   TestBenchmarkRequest.__init__?  s    
r3   r  c               J    U R                   c  [        S5      eU R                   $ )NzFailed to run)r&  rO   r  s      r4   rW   TestBenchmarkRequest.benchmarkB  s#     ::O,,zzr3   r%  r   )r&  zOptional[float]r   r   r!  )r.   r/   r0   r1   r   r   rW   r2   r,   r3   r4   r#  r#  9  s5    
 UY*;Q	 r3   r#  c                  0    \ rS rSrSS.     SS jjrSrg)GPUDeviceBenchmarkMixiniJ  Nr  c                  [        S / UQUP 5       5      n[        U5      S::  d
   SU 35       e[        S U 5       S5      n[        U5      n[        U5      S:X  a  [        [	        U5      5      nOUR                  5       nUR                  U5         [        R                  " U5      nUR                  5         S S S 5        U$ ! , (       d  f       W$ = f)Nc              3    #    U  H{  n[        U[        R                  5      (       d  M$  [        UR                  R
                  5      (       d  MJ  UR                  R                  c  Mc  UR                  R                  v   M}     g 7fr   )rT   torchTensorr   rA   rY   indexr   tensors     r4   r   3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>Q  s^      $
9&%,,/   v}}))*   ##	  FMM9s   #B"BB(Br!   zCan not mix devices c              3     #    U  HA  n[        UR                  R                  5      (       d  M)  UR                  R                  v   MC     g 7fr   )r   rA   rY   r1  s     r4   r   r3  Z  s5      +F&--,,- #""+s
   (AAcuda)
r   r   nextr   itercurrent_devicerA   r%   benchmark_gpusynchronize)	r_   r  r  r  device_idx_setdevice_typer   
device_idxr  s	            r4   r   GPUDeviceBenchmarkMixin.do_benchK  s     $ $
9M9=9$
 
 >"a'P+??O)PP'+
 
 4K@~!#d>23J)88:J$$Z0++B/C((* 1 
	 10 
s   'C
Cr,   r!  r.   r/   r0   r1   r  r2   r,   r3   r4   r+  r+  J  s/    
 15	 % .	
 
 r3   r+  c                  0    \ rS rSrSS.     SS jjrSrg)CPUDeviceBenchmarkMixinim  Nr  c               .    [         R                  " U5      $ r   )r%   benchmark_cpur  s       r4   r   CPUDeviceBenchmarkMixin.do_benchn  s     ((,,r3   r,   r!  r?  r,   r3   r4   rA  rA  m  s/    
 15	- %- .	-
 
- -r3   rA  c                     ^  \ rS rSr    S                         SU 4S jjjr      S	S jrS rS
S jrSrU =r	$ )TritonBenchmarkRequestiw  c                   > [         TU ]  XX45        XPl        X`l        Xpl        Xl        Xl        Xl        Xl        Xl	        g r   )
superr   module_pathmodule_cache_key
num_stages	num_warpsmatrix_instr_nonkdimwaves_per_eukpackworkspace_arg)r_   r   r   r   r   rI  rJ  rK  rL  rM  rN  rO  rP  	__class__s                r4   r   TritonBenchmarkRequest.__init__z  sB     	9KX& 0$"$8!(
*r3   c                 ^^^^	^
^^ [         R                  " U R                  U R                  5      n[        R                  SU R                  U R                  5        [        X0R                  5      R                  m	[        U R                  5      mST	R                  l        0 mSS KnSUR                  T	5      R                  ;   a  STS'   TR                   R"                  S:X  a  Sm
OPTR                   R"                  n[%        U5      nUR'                  U R(                  R                   R*                  5      m
U R,                  b  U R,                  mUUUU	U
UU4S jnU$ [/        [        X0R                  5      [0        R2                  R4                  R6                  R8                  5      (       a"  [:        R<                  " T	/TQTPTQ70 TDST
0D6$ [:        R<                  " T	/TQTPTQ70 TDT
SS	.D6$ )
Nz"benchmark module key: %s, path: %sFr   warmupcpuc                   > TR                   n [        R                  " U 4S[        R                  TR                  S9nTR
                  [        R                  :w  a  UR                  5         T" / TQTPUPTQ70 TDTSS.D6  g )N)r!   r   rA   Tstreambenchmark_run)	r   r.  empty_strideduint8rA   	zero_moder$   UNINITIALIZEDzero_)	workspace_sizeworkspace_tensorr   r  r  
run_methodrY  
warmup_argrP  s	     r4   run_with_workspace>TritonBenchmarkRequest.make_run_fn.<locals>.run_with_workspace  s    !.!4!4#(#6#6#%++(//	$  !**.?.M.MM$**,  "! %  	
 ! ""&r3   rY  TrX  )r   load_by_key_pathrJ  rI  rL   rM   r   r   runr   r   __self__with_bandwidth_infoinspect	signature
parametersrA   rY   r   get_raw_streamr   r0  rP  rT   r.  	_inductorruntimetriton_heuristicsDebugAutotuner	functoolspartial)r_   r  r  modrj  r<  r   rd  r   rb  rY  rc  rP  s    ``     @@@@@r4   r  "TritonBenchmarkRequest.make_run_fn  s    **4+@+@$BRBRS0!!	
 S"2"2377
$//*
27
/ 
w((4???#(Jx $$-F'..33K7D%44''..44F ) ..M 2 &%C))*OO##55DD
 
 $$  	
    $$  	
  " r3   c                    [         R                  " U R                  U R                  5      n[	        XR
                  5      R                  5         g r   )r   rf  rJ  rI  r   r   
precompile)r_   rt  s     r4   rw  !TritonBenchmarkRequest.precompile  s7    **4+@+@$BRBRS%%&113r3   c                Z    SU R                   < SU R                  < SU R                  < 3$ )Nself.kernel_name=z, self.module_path=z, self.module_cache_key=)r   rI  rJ  r^   s    r4   __str__TritonBenchmarkRequest.__str__  s2    #$""$$8t'7'7&99RD<Q<Q;STTr3   )rO  rM  rJ  rI  rK  rL  rN  rP  )r   r   r   N)r   r@   r   r   r   r   r   r  rI  r@   rJ  r@   rK  r   rL  r   rM  r   rN  r   rO  r   rP  zOptional[WorkspaceArg]r   r   r   r   r@   )
r.   r/   r0   r1   r   r  rw  r{  r2   __classcell__rQ  s   @r4   rF  rF  w  s     %&04++ ?+ @	+
 "+ + + + + "+ + + .+ 
+ +2R*R;GR	Rh4U Ur3   rF  c                      \ rS rSrSrg)TritonGPUBenchmarkRequesti  r,   Nr-   r,   r3   r4   r  r    r5   r3   r  c                      \ rS rSrSrg)TritonCPUBenchmarkRequesti  r,   Nr-   r,   r3   r4   r  r    r5   r3   r  c                     ^  \ rS rSr            S
U 4S jjrS r      SS jrSS jrS rSS jr	SS jr
S	rU =r$ )CUDABenchmarkRequesti  c                   > [         TU ]  XX45        XPl        SU l        S U l        S U l        SU l        SU l        SU l        [        R                  " U R                  S5      u  U l        U l        g )Nr   F so)rH  r   source_coder`  	workspaceDLL_workspace_size_updatedhash_keysource_filer   writer_   r   r   r   r   r  rQ  s         r4   r   CUDABenchmarkRequest.__init__  sk     	9KX&#$15)-',$ "*7*=*=d>N>NPT*U't'r3   c                    [         R                  SU 5        [        R                  " U R                  S5        [         R                  SU 5        g )NPrecompiling %sr  Done precompiling %s)rL   rM   r   compiler  r^   s    r4   rw  CUDABenchmarkRequest.precompile  s<     	.5d..53T:r3   c          	     H   U R                  5         U R                  5         [        U5      U/-    Vs/ s H  n[        UR	                  5       5      PM     nn[
        R                  SU R                  U R                  U R                  U R                  UU R                  5        [        [        R                  R                  5       R                  5      n[!        U R                  U R                  5      n[        S5      nU R"                  S:  af  [        R$                  " U R"                  S-   S-  [        R&                  UR(                  S9U l        [        U R*                  R	                  5       5      n[,        R.                  " U/UQU R                  QS PUPUP76 $ s  snf )Nzqmake_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sr         rW  )ensure_dll_loadedupdate_workspace_sizer   r	   data_ptrrL   rM   r   r  r  r  r   r.  r5  current_streamcuda_streamr   r`  zerosfloat64rA   r  rr  rs  )r_   r  r  r2  rh   
stream_ptrrb  workspace_ptrs           r4   r   CUDABenchmarkRequest.make_run_fn  s|    	 ""$ }-?
? V__&'? 	 
 	MMHHOO	
 ejj779EEF
TXXt'7'78
 ""[[$$q(Q.mm$++DN
 %T^^%<%<%>?M   

 __
 	

 
 
 	
3
s   #Fc           
         U R                   (       a  g U R                  5         [        U R                   Vs1 s H  oR                  iM     sn5      n[        US-   5       Vs/ s H  n[        S 5      PM     nn[        [        R                  R                  5       R                  5      n[        U R                  U R                  5      n[        5       nU" / UQU R                  Q[!        U5      PS PUP76   [        R                  R#                  5         UR$                  U l        [(        R+                  SU R&                  U R                  U R,                  U R.                  U R                  UU R                  5        SU l         g s  snf s  snf )Nr!   zupdate_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sT)r  r  r   r   r   r   r	   r.  r5  r  r  r   r  r   r   r   r   r:  r&  r`  rL   rM   r  r  )r_   metaunique_input_count_rh   r  rb  c_workspace_sizes           r4   r  *CUDABenchmarkRequest.update_workspace_size;  s^   ''  #'#9#9:#94YY#9:
 )..@1.D(EF(E1(EFejj779EEF
TXXt'7'78
#: 	
	
__	
  	
 	
 	
 	

 .44 hMMHHOO		
 (,$; ;Fs   F"Fc                    U R                   c5  [        R                  " U R                  S5      u  U l         U l        U l        g g )Nr  )r  r   loadr  r  r  r^   s    r4   r  &CUDABenchmarkRequest.ensure_dll_loaded_  s:    888E8J8J  $95DHdmT%5 r3   c                `    U R                   b  U R                   R                  5         S U l        g r   )r  closer  r^   s    r4   r	  #CUDABenchmarkRequest.cleanup_run_fne  s!    88HHNNr3   c                Z    SU R                   < SU R                  < SU R                  < 3$ )Nrz  z, self.source_file=z, self.hash_key=)r   r  r  r^   s    r4   r{  CUDABenchmarkRequest.__str__j  s0    #$""$$8t'7'7&99JDMM;KLLr3   )r  r  r  r  r  r  r`  r   r@   r   r   r   r   r   r  r  r@   r   r   r   r   r}  )r.   r/   r0   r1   r   rw  r  r  r  r	  r{  r2   r~  r  s   @r4   r  r    s    VV ?V @	V
 "V V 
V$;%
*%
;G%
	%
N",H
M Mr3   r  c                  t   ^  \ rS rSr            SU 4S jjrS r      S	S jrS
S jrSS jrSr	U =r
$ )CppBenchmarkRequestin  c                `   > [         TU ]  XX45        XPl        [        U5      U l        S U l        g r   )rH  r   r  r   r  r  r  s         r4   r   CppBenchmarkRequest.__init__r  s.     	9KX& -6:r3   c                    [         R                  SU 5        [        R                  " U R                  SS9  [         R                  SU 5        g )Nr  rU  r<  r  )rL   rM   r   r  r  r^   s    r4   rw  CppBenchmarkRequest.precompile  s<     	.5$**>3T:r3   c               h   [         R                  " U R                  SS9U l        [	        U5      U/-    Vs/ s H  o3R                  5       PM     nn[        R                  SU R                  U R                  UU R                  5        [        U R                  U R                  5      n[        S U R                   5       5      (       d   e[        R                  /[        U5      [        [	        U R                  5      5      -   -  Ul        [         R"                  " U/UQU R                  Q76 $ s  snf )NrU  r  zJmake_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%sc              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7fr   )rT   ctypesc_ulonglong)r   args     r4   r   2CppBenchmarkRequest.make_run_fn.<locals>.<genexpr>  s      R/3:c6#5#566/s   '))r   r  r  r  r   r  rL   rM   r   r   r   r   r  r  r   argtypesrr  rs  )r_   r  r  r2  rh   rb  s         r4   r  CppBenchmarkRequest.make_run_fn  s     $$T%5%55I04]0C}o0UV0Uf!0UVXHHOO	
 TXXt'7'78
R$//RRRRR%112ID122


   

 __
 	
! Ws   D/c                    U R                   b8   [        U R                   S5      (       a  U R                   R                  5         g g g )Nr  )r  hasattrr  r^   s    r4   r	  "CppBenchmarkRequest.cleanup_run_fn  s<    88 txx))  *	  r3   c                "    SU R                   < 3$ )Nrz  )r   r^   s    r4   r{  CppBenchmarkRequest.__str__  s    #$""$%%r3   )r  r  r  r  r   r   r}  )r.   r/   r0   r1   r   rw  r  r	  r{  r2   r~  r  s   @r4   r  r  n  st    ;; ?; @	;
 "; ; 
;;
*
;G
	
6!& &r3   r  c                ,    [         R                  U 5      $ )zG
Do benchmarking in a subprocess and return the perf number (latency).
)tuning_poolrW   )r   s    r4   benchmark_in_sub_processr    s       ))r3   )rA   rG   r   )_
__future__r   
contextlibr  dataclassesrr  r  r=   rw   r  r   collections.abcr   r   concurrent.futuresr   r   r   r	   r
   typingr   r   r   r   r   r.  torch._inductor.async_compiler   torch._dynamo.device_interfacer   torch._dynamo.testingr   torch._inductorr   torch._inductor.codecacher   r   r   r   r   torch._inductor.utilsr   r   torch._loggingr   torch.utils._ordered_setr   multiprocessing.processr   multiprocessing.queuesr   typesr    torch._inductor.select_algorithmr    codegen.commonr"   r  r#   r$   runtime.benchmarkingr%   virtualizedr&   r'   r   r.   rL   	getLoggerr   r*   r7   rO   r:   contextmanagerrC   	dataclassrE   r   r  r   r   LayoutOrBufferr   rV   r#  r+  rA  rF  r  r  r  r  r  r,   r3   r4   <module>r     s   "      	    . 1 2 2 @ @  $ ! C .   7 , / 3, E,  - -  .  "8\:!	 		 			 	 7 7( y y yx ~ ~ ~B  ! ryy"))+, 3
 3
 3
l ] ] ]@+ "   F- -uU- uUp	 79O 		 79O 	tM24D tMn<&13C <&~*'*&*r3   