
    Ph#(                         d dl Z d dlZd dlmZ d dlZd dlmZ ddlmZm	Z	m
Z
 g dZd Zd Zd	 Ze j                   G d
 d             Zd Zd Zy)    N)defaultdict)
DeviceType   )create_bandwidth_info_strdo_benchget_num_bytes)	pointwise	reductionpersistent_reductiontemplateforeachc                 j    t         D cg c]  }d| | v s| }}t        |      dk(  r|d   S yc c}w )z
    Similar to get_kernel_category but use the source code. Call this API
    if we have not compile the src_code to module yet.
    @r   r   unknown)_kernel_category_choiceslen)src_codechchoicess      lC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/_inductor/wrapper_benchmark.py"get_kernel_category_by_source_coder      sF    
 5M4b!B4H8Lr4GM
7|qqz	 Ns   00c                 x    t         D cg c]  }|| j                  v s| }}t        |      dk(  r|d   S yc c}w )a  
    Given the module defining a triton kernel, return the category of the kernel.
    Category can be one of:
    - pointwise
    - reduction
    - persistent_reduction

    Currently we simply decide the category depending on what decorator is imported
    by the kernel.
    r   r   r   )r   __dict__r   )
kernel_modr   r   s      r   get_kernel_categoryr      sF     5R4bj>Q>Q8Qr4GR
7|qqz	 Ss   77c                    ddl m} d }d}|j                  j                         D ]  \  }t	        d      rt	        d      s  |      }t              }j                         t        |j                  j                  D cg c]  }|j                  d      r| c}      }	t        d|	idz  dfd		}
| d
d|dd j                          d|dd  }|rt	        d      sJ j                        }t        |       |j                         D ]G  \  }}t        d |
||j                  |j                   |j"                         d|j$                          I nwt'        fddd      }t        |j(                        dk(  sJ d       |j(                  d   }t         |
||j                  |j                   |j"                  | d             |dz  } |dk(  rt        d       yyc c}w )aX  
    An experimental API used only when config.benchmark_kernel is true.

    Run the kernel benchmarks for all the kernels cached in PyCodeCache.
    Used in the compiled modules.

    Put this method here rather than codegen it for convenience since its implementation
    does not change based on different graph modules being compiled.
    r   )PyCodeCachec                     ddl m} | j                  j                         D cg c]$  \  }}|j	                  d      rt        ||      r|& }}}t        |      dk(  sJ |d   S c c}}w )Nr   )CachingAutotunertriton_r   )!torch._inductor.triton_heuristicsr   r   items
startswith
isinstancer   )modr   kv	cand_lists        r   get_triton_kernelz0benchmark_all_kernels.<locals>.get_triton_kernel<   sq    F **,
,1||I&:a9I+J , 	 

 9~"""|
s   )A%get_argscall
in_out_ptrnum_in_out_argsg    eAc                     t        d |||fD              sd|dd|dd|dd}nd}| d	z  z  }t        | |||
      S )Nc              3   $   K   | ]  }|d u  
 y wN ).0xs     r   	<genexpr>z>benchmark_all_kernels.<locals>.get_info_str.<locals>.<genexpr>Y   s     E*DQqDy*Ds     3z regs  z	 spills  8z shared mem g     @@)prefixsuffix)anyr   )msn_regsn_spillssharedr9   kernel_detail_strgb_per_snum_gbs          r   get_info_strz+benchmark_all_kernels.<locals>.get_info_strX   sj    E68V*DEE
'(1YvajT " %'!c*H,FHV<M     20 N   
   benchmark_all_configsr5   z @ c                  &    j                         S r0   )r+   )argsr   s   r   <lambda>z'benchmark_all_kernels.<locals>.<lambda>q   s    *//$"7rD   (   T)rep
fast_flushr   z.Autotuner should have selected the best config)r9   zpNo kernel with benchmark functionality found. Make sure you run inductor with config.benchmark_kernel being True)r8   )torch._inductor.codecacher   cacher"   hasattrr   r*   r   fn	arg_namesr#   r   upperrI   printr=   r>   r?   configr   	launchers)benchmark_namerI   r   r)   nfound
kernel_keytriton_kernelkernel_categoryarg_namenum_in_out_ptrsrC   kernel_descbench_resultlauncherr<   rK   r   rB   s                  @@@r   benchmark_all_kernelsrc   0   s"    6	 F"-"3"3"9"9";
Jz:.gj&6Q)*5-j9""$ !. 0 0 : : :H&&|4  :
 FoFL	 b!?2A#6#<#<#>"?qCR@QR 	 !:'>???%;;DAL+ , 2 2 4"b(//8;L;Lhoo^__bckcrcrbst !5
 7RDQBM++,1@?@1$..q1HOO%%OO)]!, 	!q #<r {~	
 cs   	G5
c                   6    e Zd ZU eed<   eed<   eed<   eed<   y)ProfileEventcategorykeyself_cuda_time_mscountN)__name__
__module____qualname__str__annotations__floatr1   rD   r   re   re      s    M	H LrD   re   c                    	
 fd	t        t              	fd}|D ]  }|j                  rJ d       |j                  t        j
                  k(  r4d}|j                  j                  d      r\|j                  j                  d      rd}n>|j                  j                  d      rd	}n |j                  j                  d
      rd}nd} |||        fd
 
fd} |        y )Nc                 (    | j                   dz  z  S )zT
        ev.self_cuda_time_total is in microsecond. Convert to millisecond.
          )self_cuda_time_total)evnrunss    r   get_self_cuda_timez4parse_profile_event_list.<locals>.get_self_cuda_time   s     &&-55rD   c                     t        || j                   |       | j                  z        }|   j                  |       y )N)rf   rg   rh   ri   )re   rg   ri   append)rt   rf   
profile_ev
all_eventsrv   ru   s      r   	add_eventz+parse_profile_event_list.<locals>.add_event   s@    !04((U"	

 	8##J/rD   z!Don't support the legacy profilerr   r    
triton_poitriton_pointwise
triton_redtriton_reduction
triton_pertriton_persistent_reductiontriton_unknownc                    ddl m } |j                  d d       g }d}t        d|  d       |D ]]  }||j                  z  }|j                  z  d	z  d
d}|j	                  |j
                  d d |j                  |j                  |g       _ |j	                  d|d|z  d	z  d
dg       t         ||g d             |S )Nr   )tabulatec                     | j                   S r0   )rh   )rt   s    r   rL   zCparse_profile_event_list.<locals>.report_category.<locals>.<lambda>   s
    2+?+?rD   T)rg   reverse        z
  == z category kernels == d   .2f%x   Totalr8   )KernelzSelf CUDA TIME (ms)CountPercent)headers)r   sortrV   rh   rx   rg   ri   )rf   profile_eventsr   rows
total_timert   percentwall_time_mss          r   report_categoryz1parse_profile_event_list.<locals>.report_category   s    % ?N
z!678 B"...J--<sB3GqIGKKr';';RXXwOP ! 	j"l)BS)H(MQ&OP	
 	S	

 rD   c                     g d} t        j                               j                  t        |             sJ t        j                                       i }d}| D ]  }|v s 
||         }|||<   ||z  } |z  dz  dd}t	        d|        t	        ddd	       d
	 }| D ]&  }|j                  |d      z  dz  dd}|d| z  }( |d| dddz  }t	        |       y )N)r}   r   r   r   r   r   r   r   r   z#
Percent of time when GPU is busy: zTotal wall time z.3fz mszOutput for tabulate: z, r<   )setkeysissubsetlistrV   get)category_listper_category_wall_timetotal_cuda_msrf   _timegpu_busy_percenttabulate_liner   rz   rY   r   r   s           r   reportz(parse_profile_event_list.<locals>.report   sU   
 :??$%..
 	(:??$%&	( 
 "$%H:%'*X2FG38&x0&	 & ,l:S@EQG45E4FGH c 2#67 0/?@%H)--h<|KcQRUVVWX  r'^+M	 &
 	2./r,s1C2FFmrD   )r   r   	is_legacydevice_typer   CPUrg   r#   )rY   
event_listr   ru   r{   rt   rf   r   rz   rv   r   s   ` ``    @@@r   parse_profile_event_listr      s    6 T"J0 <<D!DD>>Z^^+66Y'vv  .-""<0-""<08+"h# &,$L HrD   c                    ddl }|j                         }|j                  dddd       |j                  dd	dd
       |j                  dddd       |j                         }|j                  rt        | |j                         yd}d} |||      |z  dz  }|j                  syt        j                  j                  d      5 } |||       ddd       t        j                          d}	j                  |	       t        d|  d       t        d|	        |j                  d      }
t        |
j                  dd             t!        | |
|||z         y# 1 sw Y   xY w)zM
    This is the function called in __main__ block of a compiled module.
    r   Nz--benchmark-kernelsz-k
store_truez,Whether to benchmark each individual kernels)actionhelpz--benchmark-all-configsz-cz8Whether to benchmark each individual config for a kernelz	--profilez-pz&Whether to profile the compiled modulerH   )timesrepeatrr   T)record_shapesz/compiled_module_profile.jsonz4Profiling result for a compiled module of benchmark :z+Chrome trace for the profile is written to )group_by_input_shapers   )sort_by	row_limit)argparseArgumentParseradd_argument
parse_argsbenchmark_kernelsrc   rI   profiletorchprofilertempfile
gettempdirexport_chrome_tracerV   key_averagestabler   )rY   benchmark_compiled_module_fnr   parserrK   r   r   r   ppathr   s              r   compiled_module_mainr      s    $$&F
;	   !G	   5	   Dnd.H.HI(uVDuLtS 	 ||^^##$#71(uVD 8 %%'((EF	d#D^DTTUVW;D6BC^^^>
j'=LM Jefn	
 87s   E  E))dataclassesr   collectionsr   r   torch.autogradr   utilsr   r   r   r   r   r   rc   	dataclassre   r   r   r1   rD   r   <module>r      s^      #  % E E 	$T
n   aH2
rD   