
    sh                         S SK r S SKrS SKrS SKrS SKrS SKrS SKJrJr  S q	    SS jr
S rS rS rS rS	 r/ qS
\S\4S jr   SS jrg)    N)profileProfilerActivityc                      g )N r       t/Users/tiagomarins/Projetos/claudeai/copy_bank/venv/lib/python3.13/site-packages/torch/_functorch/benchmark_utils.pysynchronizer	      s    r   c	                 b   Uc  S/nUS/:w  a=  [         R                  R                  5       (       a  [         R                  R                  qUc  0 nUc  0 nU   [         R                  " S5        [        S5       H  n	U " U40 UD6  [        5         M     [         R                  " S5        [        R                  " 5       n
[        U5       H  n	U " U40 UD6  [        5         M     [        R                  " 5       nSSS5        WW
-
  n[        SSU0UD6 nU   [        5         [         R                  " S5        [        U5       H  n	U " U40 UD6  [        5         M     SSS5        SSS5        WR                  U5        U$ ! , (       d  f       N= f! , (       d  f       N:= f! , (       d  f       NC= f)a  
Output the chrome trace of running f(input, **kwargs_for_f) with [optimize_ctx]
[num_runs] times to [trace_filename].

[activities] are the activities that the profiler will record, e.g. ProfilerActivity.CUDA.
Return total runtime without the profiler

Outputs to trace_filename
Ncudacpui9     
activitiesr   )
torchr   is_availabler	   manual_seedrangetimeperf_counterr   export_chrome_trace)finputtrace_filenameoptimize_ctxr   num_runsdeviceskwargs_for_fkwargs_for_profiler_t0t1timingprofs                 r   dump_chrome_tracer#      sd   * ( 5'ejj5577jj,," 	$qAe$|$M  	$ xAe$|$M !   
 "WF		>J	>*=	>$Md#8_%(<( %  
? 	^,M- 
 \ 
?	>s2   B!E>F AFF >
F
F	F  
F.c                 R    [        U 5      n[        R                  " U5      nUS   nU$ )NtraceEvents)openjsonload)filenamer   dataeventss       r   get_chrome_trace_eventsr,   K   s'    XA99Q<D- FMr   c                 h    SU ;   =(       a'    U S   [         ;   =(       a    SU ;   =(       a    U S   S:H  $ )NpidphX)gpu_pidsevents    r   is_gpu_compute_eventr4   R   s@     	 	%LH$	EM	 $K3	r   c                     / nU  H&  n[        U5      (       d  M  UR                  U5        M(     [        U[        R                  " S5      S9$ )Nts)key)r4   appendsortedoperator
itemgetter)r+   sorted_gpu_eventsr3   s      r   get_sorted_gpu_eventsr=   \   sI    #E**  '  #)<)<T)BCCr   c                     [        U 5      S:X  a  gU S   nUS   US   -   nUS   nU SS   H9  n[        US   U5      nUS   US   -   nU[        XT-
  S5      -   n[        X%5      nM;     U$ )Nr   r6   dur   )lenmax)r<   r3   current_end_timetotal_duration
start_timeend_times         r   get_durationrG   e   s    
"a ET{U5\15\N"12&t&67
;u-'#h.CQ*GG/:	 '
 r   c                 x    S n[        U 5      n/ nU H#  nU" U5      (       d  M  UR                  U5        M%     U$ )Nc                     SU ;   =(       a5    SU S   ;   =(       d&    SU S   ;   =(       d    SU S   ;   =(       d    SU S   ;   $ )Nnamegemmconvcutlasswgradr   r2   s    r   is_mm_conv_event7get_sorted_gpu_mm_conv_events.<locals>.is_mm_conv_eventt   sT     
eFm# (v&(E&M)( %-'		
r   )r=   r8   )r+   rO   
gpu_eventssorted_eventsr3   s        r   get_sorted_gpu_mm_conv_eventsrS   s   sF    
 'v.JM&&U#  r   r)   total_lengthc                    [        U 5      n/ qU H<  nSU;  a  M  US   S:X  d  M  SUS   S   ;   d  M$  [        R                  US   5        M>     US-  n[        U5      n[	        U5      U-  n[        U5      n[	        U5      U-  nXW4$ )a  
Process the chrome traces outputs by the pytorch profiler to compute GPU Utilization
and percent of times spent on matmul and convolution

Args:
    filename(str): Name of chrome traces file produced by pytorch profiler

    total_length(float): total length of the process without profiler in second

Return:
    tuple: (GPU Utilization, percent of time spent on matmul and convolution)
rJ   process_labelsGPUargslabelsr.   g    .A)r,   r1   r8   r=   rG   rS   )r)   rT   r+   r3   r<   utilizationsorted_gpu_mm_conv_eventsmm_conv_utilizations           r   compute_utilizationr]      s     %X.F H=,,%-:Q1QOOE%L)	   #%L-f501L@K =f E&'@ALP++r   c           
      h   [         R                  R                  U5      nU(       d$  [         R                  " U5        [	        SU-   5        Uc  [
        R                  " 5       n[         R                  R                  X$S-   5      n[        U UUU[        R                  /US/S9n[        Xx5      u  pX4$ )am  
Benchmark the GPU Utilization and percent of time spent on matmul and convolution operations of
running f(input, **kwargs_for_f) with [optimize_ctx] [num_runs] times.
It will produce a chrome trace file in trace_folder/trace_file_name.json

Example:

```
def f(a):
    return a.sum()
a = torch.rand(2**20, device="cuda")
utilization, mm_conv_utilization = benchmark_utilization(f, a, "tmp", trace_file_name = "tmp_chrome_trace")
```

Args:
    f: function to benchmark

    input: input to :attr:`f`

    trace_folder: name of the folder to store the chrome trace

    optimize_ctx: the context in which f will run

    trace_file_name: name of the dumped chrome trace file, default to "tmp_chrome_trace"

    num_runs: number of times to run f, excluding the warm-up runs, default to 1.

Return:
    tuple: (GPU Utilization, percent of time spent on matmul and convolution)

zcreate folder z.jsonr   )r   r   )ospathexistsmakedirsprint
contextlibnullcontextjoinr#   r   CUDAr]   )r   r   trace_folderr   trace_file_namer   isExistchrome_trace_file_namerT   rZ   r\   s              r   benchmark_utilizationrl      s    N ggnn\*G
L!-.!--/WW\\,'8QR$				L (;($K ++r   )r@   NNN)Ntmp_chrome_tracer@   )rd   r'   r:   r_   r   r   torch.profilerr   r   r	   r#   r,   r4   r=   rG   rS   r1   strfloatr]   rl   r   r   r   <module>rq      sz       	   4	 7tD$ ,# ,U ,L &=,r   