
    sh+                        S SK r S SKrS SKJrJr  S SKJr  S SKJr  S SK	J
r
Jr  S SKJrJrJrJr  S SKrS SKJrJr  S SKJr  \R.                  R1                  \S	5      r\=(       a    \R6                  R9                  5       rS
r\" S5      r\" S5      rS\\\
\4   \4   S\\\
\4   \4   4S jr  " S S5      r! " S S\!5      r" " S S\"5      r#\(       a  \#" 5       r$g\"" 5       r$g)    N)cached_propertywraps)chain)median)AnyCallable)Concatenate	ParamSpecSelfTypeVar)countersdynamo_timed)use_experimental_benchmarkerbenchmarkingi  PTfnreturnc           	         ^  [        T 5      S[        S[        R                  S[        R                  S[
        4U 4S jj5       nU$ )zWraps `fn` with `dynamo_timed` context, and increments the appropriate dynamo
counters. It is expected that `fn` is a method of `Benchmarker` or one of its
subclasses; typing limitations prevent us from declaring this directly.
selfargskwargsr   c                    > U R                   R                   STR                   3n[        S   SU 3==   S-  ss'   [        USS9   T" U /UQ70 UD6sS S S 5        $ ! , (       d  f       g = f)N.inductorzbenchmarking.   T)log_pt2_compile_event)	__class____name__r   r   )r   r   r   fn_qual_namer   s       x/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/_inductor/runtime/benchmarking.pywrappertime_and_count.<locals>.wrapper"   sg    ..112!BKK=A}\N;<A<,dCd,T,V, DCCs   A
A,)r   r   r   r   r   r   )r   r"   s   ` r!   time_and_countr$      sF     2Y-c -!&& -AHH - - - N    c                       \ rS rSrS\SS4S jr\S\S\S\4   S\	\S4   S	\
\\4   S
\S\4S j5       r\ SS\S\/ \4   S\S\S\4
S jj5       r\S\S\S
\S\4S j5       rSrg)Benchmarker,   r   r   Nc                     g N )r   s    r!   __init__Benchmarker.__init__-   s    r%   r   .fn_args	fn_kwargsr   c                   ^^^ Sn[        TTR                  5       5       HP  n[        U[        R                  5      (       d  M$  Uc  UR
                  nM5  UR
                  U:w  d  MG  [        S5      e   Uc  [        S5      eUUU4S jnU[        R
                  " S5      :X  a  U R                  " U40 UD6$ U R                  " U40 UD6$ )a  Benchmark `fn(*fn_args, *fn_kwargs)` and return the runtime, in milliseconds (the
actual runtime calculation is dictated by the benchmarking implementation, but may be
one of [mean, median, minimum, etc.]). Functions as a convenience wrapper around
device-specific implementations, like `benchmark_cpu` and `benchmark_gpu`. Raises
`ValueError(...)` if we can't safely infer the device type of `fn`; for example,
if multiple device types are found in `fn_args` and `fn_kwargs`, or if no device
types are found.

Arguments:
- fn: The function to benchmark.
- fn_args: The function's arguments.
- fn_kwargs: The function's kwargs.

Keyword Arguments:
- **kwargs: The benchmarking implementation's kwargs.

Returns:
- The runtime of `fn(*fn_args, **fn_kwargs)`, in milliseconds.
NzcCan't safely infer the device type of `fn` with multiple device types in `fn_args` and `fn_kwargs`!zCan't safely infer the device type of `fn` with no device types in `fn_args` or `fn_kwargs`! You should be calling `.benchmark_cpu` or `.benchmark_gpu` directly.c                     > T " T0 TD6$ r*   r+   )r   r.   r/   s   r!   <lambda>'Benchmarker.benchmark.<locals>.<lambda>Y   s    B595r%   cpu)	r   values
isinstancetorchTensordevice
ValueErrorbenchmark_cpubenchmark_gpu)r   r   r.   r/   r   inferred_devicearg_or_kwarg	_callables    ```    r!   	benchmarkBenchmarker.benchmark0   s    6 !'9+;+;+=>LlELL99&"."5"5$$7 y  ? " t  6	ell511%%i:6:: !!)6v66r%   r?   warmuprepc                 l   ^ S[         S[        [           4U4S jjnU" U5        [        U" U5      5      $ )a  Benchmark the CPU callable, `_callable`, and return the median runtime,
in milliseconds.

Arguments:
- _callable: The CPU callable to benchmark.

Keyword Arguments:
- warmup: Optionally, the duration, in milliseconds, to run `_callable`
before benchmarking starts.
- rep: Optionally, the duration, in milliseconds, to run `_callable`
during benchmarking.

Returns:
- The median runtime of `_callable`, in milliseconds.
msr   c                    > / n[         R                  " 5       n [         R                  " 5       nT" 5         [         R                  " 5       nUR                  XC-
  [        -  5        XB-
  [        -  U :  a   U$ M_  r*   )timeperf_counterappendMILLISECONDS_PER_SECOND)rE   timingsrun_start_tstart_tend_tr?   s        r!   run_for*Benchmarker.benchmark_cpu.<locals>.run_foru   sl    G++-K++-))+3JJK(,CCrIN r%   )intlistfloatr   )r   r?   rB   rC   rO   s    `   r!   r;   Benchmarker.benchmark_cpua   s2    (
	 
	U 
	 	gcl##r%   r   c                     [         er*   )NotImplementedError)r   r   r   s      r!   r<   Benchmarker.benchmark_gpu   s    !!r%   r+   )   d   )r   
__module____qualname____firstlineno__r   r,   r$   r   r   tupledictstrrS   r@   rQ   r;   r<   __static_attributes__r+   r%   r!   r'   r'   ,   s    t   .7.7S#X.7 sCx.7 S>	.7
 .7 
.7 .7` OR $ $'C0 $:= $IL $	 $  $D "D " " " " "r%   r'   c            	       h    \ rS rSr\S\S\S\4   4S j5       r\	S\S\/ \4   S\S\
4S j5       rS	rg
)TritonBenchmarker   r   r   .c                 P     SSK Jn  U$ ! [         a  n[        S5      UeSnAff = f)z"Lazily import Triton's `do_bench`.r   )do_benchzrequires TritonN)triton.testingre   ImportErrorrV   )r   re   es      r!   triton_do_bench!TritonBenchmarker.triton_do_bench   s4    	@/   	@%&78a?	@s   
 
% %r?   r   c                 @   [         R                  " U R                  5      R                  n[	        UR                  5       5       H  nXC;  d  M
  X$	 M     SU;   a  U R                  " U40 UD6S   $ SU;   a  U R                  " U40 UD6$ U R                  " U40 UDSS0D6$ )a  Benchmark the GPU callable, `_callable`, and return the runtime, in milliseconds.

Arguments:
- _callable: The GPU callable to benchmark.

Keyword Arguments:
- quantiles: Optionally, a tuple of floats denoting the requested quantiles.
- return_mode: Optionally, the requested return mode. Currently, Triton's
`do_bench` supports min, max, mean, and median return modes.
- **kwargs: Additional kwargs passed to Triton's `do_bench`.

Returns:
- The runtime of `callable`, in milliseconds. If `kwargs["quantiles"]` is specified,
this is the first requested quantile. Else, if `kwargs["return_mode"]` is specified,
this is the requested return mode. Otherwise, this is the median.
	quantilesr   return_moder   )inspect	signatureri   
parametersrR   keys)r   r?   r   do_bench_paramskwargs        r!   r<   TritonBenchmarker.benchmark_gpu   s    $ "++D,@,@ALL&++-(E+M ) & ''	<V<Q??f$''	<V<<##INNXNNr%   r+   N)r   rZ   r[   r\   r   r   r   r   ri   r$   rS   r<   r`   r+   r%   r!   rb   rb      sk    d xS'9   OD OXb#g-> O# ORW O Or%   rb   c                   f   \ rS rSr\S\S\4S j5       rS\S\S\\	\
R                  R                  \
R                  R                  4      4S jrS\S\\	\
R                  R                  \
R                  R                  4      S\4S jr\    SS\S	\/ \4   S
\S\S\S\S\S\4S jj5       rSrg)InductorBenchmarker   r   r   c                     [         R                  R                  5       n[         R                  R                  U5      nUR                  $ )z7Get the L2 cache size, in bytes, of the current device.)r7   cudacurrent_deviceget_device_propertiesL2_cache_size)r   r9   propss      r!   r|   !InductorBenchmarker.L2_cache_size   s6     **,

008"""r%   itersc                     [        U5       Vs/ s H=  n[        R                  R                  SS9[        R                  R                  SS94PM?     sn$ s  snf )z!Get `iters` pairs of CUDA events.T)enable_timing)ranger7   ry   Event)r   r   _s      r!   get_event_pairs#InductorBenchmarker.get_event_pairs   s]     5\

 " 

  t 4

  t 4 "
 	
 
s   AAevent_pairsc           	      l    [        U VVs/ s H  u  p#UR                  U5      PM     snn5      $ s  snnf )zIGet the minimum timing, in milliseconds, for a group of CUDA event pairs.)minelapsed_time)r   r   start_event	end_events       r!   get_event_pairs_min_timing.InductorBenchmarker.get_event_pairs_min_timing   sA      /:.9*K ((3.9
 	
s   0
r?   estimation_itersmemory_warmup_itersbenchmark_itersmax_benchmark_durationr   c           	         [         R                  R                  5         U" 5         [         R                  R                  5         [         R                  " U R                  S-  [         R
                  SS9nUR                  5         U R                  U5      nU H<  u  pUR                  5         U	R                  5         U" 5         U
R                  5         M>     [         R                  R                  5         U R                  U5      n[        [        U[        X[-  5      5      S5      n[        U5       H  nUR                  5         M     U R                  U5      nU H<  u  pUR                  5         U	R                  5         U" 5         U
R                  5         M>     [         R                  R                  5         U R                  U5      nA[        X5      $ )a  Benchmark a GPU callable using a custom benchmarking implementation.

Arguments:
- _callable: The callable to benchmark.

Keyword Arguments:
- estimation_iters: Optionally, the number of iterations to run `_callable`
during runtime estimation.
- memory_warmup_iters: Optionally, the number of iterations to flush the L2
cache before starting benchmarking.
- benchmark_iters: Optionally, the number of iterations to run `_callable`
during the benchmarking.
- max_benchmark_duration: Optionally, the maximum duration of the benchmarking,
in milliseconds. An estimated duration is calculated based on the values
of `memory_warmup_iters` and `benchmark_iters`, along with the estimated
runtime of `_callable` and various other factors, and we then shrink
`benchmark_iters` to fit in the alloted maximum duration.
- **kwargs: Additional kwargs that may be passed to the fallback.

Returns:
- The minimum runtime of `_callable`, in milliseconds.
   ry   )dtyper9   r   )r7   ry   synchronizeemptyr|   rQ   zero_r   recordr   maxr   r   )r   r?   r   r   r   r   r   bufferr   r   r   estimated_timingr   benchmarked_timings                 r!   r<   !InductorBenchmarker.benchmark_gpu   sy   B 	

  	

  T//14EIIfU **+;<&1"KLLN K	 '2
 	

 ::;G %;%O!PQST

 *+ALLN , **?;&1"KLLN K	 '2
 	

 !<<[I  #88r%   r+   N)   rY   rY      )r   rZ   r[   r\   r   r   rQ   r|   rR   r]   r7   ry   r   r   rS   r   r$   r   r   r<   r`   r+   r%   r!   rv   rv      s#   #D #S # #





	eEJJ$$ejj&6&667	8

	
	
!%eEJJ,<,<ejj>N>N,N&O!P	
		
  !"#&"&(M9M9BG$M9 M9 !	M9
 M9 !$M9 M9 
M9 M9r%   rv   )%rn   rG   	functoolsr   r   	itertoolsr   
statisticsr   typingr   r   typing_extensionsr	   r
   r   r   r7   torch._dynamo.utilsr   r   torch._inductor.configr   _logginggetArtifactLoggerr   loggerry   is_availablerJ   r   r   r$   r'   rb   rv   benchmarkerr+   r%   r!   <module>r      s      ,     C C  6 ? 
	)	)(N	C >UZZ%<%<%> 
  cNCLS!V$a'(k#q&!1$%$Z" Z"z$O $ONm9+ m9b : ?P?R r%   