
    PhM                        d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
mZ d dlmZmZmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZ d dlZd dlmZ d d	l m!Z! d d
l"m#Z# d dl$m%Z%m&Z&m'Z' erd dl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ dZ0da1 ejd                  e3      Z4 G d d      Z5 G d d      Z6ejn                  d'd       Z8ejr                   G d d             Z:ejr                   G d d             Z; e;       Z<ee#jz                  e#j|                  f   Z?ejr                   G d d             Z@ejr                   G d d             ZA G d  d!eA      ZB G d" d#eA      ZC G d$ d%eA      ZD	 	 	 	 d(d&ZEy))    )annotationsN)ThreadPoolExecutor)byrefc_size_tc_void_p)BaseProcess)Queue)	AnyCallableDictIterableListOptionalSequenceTYPE_CHECKINGUnion)multiprocessing)rand_strided)ir)CUDACodeCache
DLLWrapperPyCodeCache)TritonTemplateCaller   )config)do_bench)VCUDA_VISIBLE_DEVICESFc                      e Zd Zy)PingN__name__
__module____qualname__     kC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/_inductor/autotune_process.pyr    r    0       r&   r    c                      e Zd Zy)PongNr!   r%   r&   r'   r*   r*   4   r(   r&   r*   c              #  p  K   | d yt         j                  j                  t              }t	        |       t         j                  t        <   	 d |t         j                  t        = y|t         j                  t        <   y# |t         j                  t        = w |t         j                  t        <   w xY ww)z
    Context manager to set the CUDA_VISIBLE_DEVICES environment variable to the
    specified single device. If device is None, don't manipulate the environment.
    N)osenvirongetr   str)devicecurrents     r'   set_cuda_visible_devicer2   8   s      ~jjnn12G'*6{BJJ#$7?

/0/6BJJ+, ?

/0/6BJJ+,s   AB6B 0B61B33B6c                      e Zd ZU dZdZded<   dZded<   dZded<   dZded	<   e		 	 	 	 	 	 dd
       Z
e	dd       ZddZddZddZddZddZddZddZy)TuningProcessz
    Abstraction for launching a helper process to benchmark kernels. Spawns
    the parent process and uses multiprocessing queues to send benchmark
    requests and return results.
    NOptional[int]r0   zOptional[BaseProcess]processzOptional[Queue[Any]]request_queueresponse_queuec                    t         j                  dt        j                  j	                  t
                     	 t        j                  | |       y# t        $ r }t         j                  d|       Y d}~yd}~ww xY w)z4
        Entry point for the child process.
        z2Entering TuningProcess child. Visible devices = %szException in TuningProcess: %sN)
logdebugr,   r-   r.   r   r4   workloop	Exception	exception)r7   r8   exs      r'   process_mainzTuningProcess.process_mainZ   s^     			@JJNN/0	
	@""=.A 	@MM:B??	@s   A 	A9A44A9c                   	 | j                         }|yt        |t              r|j                  t	                      nGt        |t
              r |j                  |j                                nt        dt        |             )z<
        Work loop for the benchmarking subprocess.
        NzInvalid request type )	r.   
isinstancer    putr*   BenchmarkRequest	benchmarkRuntimeErrortype)r7   r8   objs      r'   r<   zTuningProcess.workloopk   sr    
 ##%C{C&""46*C!12""3==?3"%:49+#FGG r&   c                ^    | j                   duxr | j                  duxr | j                  duS )z?
        True if the sub-process has been initialized.
        Nr6   r7   r8   selfs    r'   validzTuningProcess.valid|   s;    
 LL$ 0""$.0##4/	
r&   c                .    dx| _         x| _        | _        y)z2
        Reset to an uninitialized state.
        NrJ   rK   s    r'   clearzTuningProcess.clear   s     CGFFt)D,?r&   c                   | j                         ryt        j                  d      }|j                         | _        |j                         | _        |j                  | j                  | j                  | j
                  f      | _        | j                  J t        | j                        5  | j                  j                          ddd       y# 1 sw Y   yxY w)z
        Create child process, request/response queues, and do the warm up.
        Set the environment to make only the provided GPU device visible
        to the process.
        Nspawn)targetargs)rM   r   get_contextr	   r7   r8   Processr@   r6   r2   r0   start)rL   ctxs     r'   
initializezTuningProcess.initialize   s     ::< ))'2 YY[!iik{{$$""## # 
 ||'''$T[[1LL  211s   ,CCc                v    | j                          | j                  J | j                  j                  |       y)z8
        Push a work item to the child process.
        N)rX   r7   rC   )rL   rH   s     r'   rC   zTuningProcess.put   s4    
 	!!---s#r&   c                    | j                   J | j                  J 	 	 | j                  j                  d      S # t        j                  $ r, | j                   j
                  }|Y K| j                           w xY w)z8
        Get a response from the child process.
        g      ?)timeout)r6   r8   r.   queueEmptyexitcoderO   )rL   statuss     r'   r.   zTuningProcess.get   s~     ||'''""...	**..s.;;;; ..>

s   ; +A:(A:c                    | j                         r8| j                  J | j                  J | j                  j                  d       yy)z8
        Signal the child process to terminate.
        N)rM   r6   r7   rC   rK   s    r'   	terminatezTuningProcess.terminate   sH     ::<<<+++%%111""4( r&   c                r    | j                   +| j                   j                          | j                          yy)z5
        Wait for the child process to exit.
        N)r6   joinrO   rK   s    r'   waitzTuningProcess.wait   s,     <<#LLJJL $r&   )r7   
Queue[Any]r8   re   returnNone)rf   boolrf   rg   )rH   r
   rf   rg   )rf   r
   )r"   r#   r$   __doc__r0   __annotations__r6   r7   r8   staticmethodr@   r<   rM   rO   rX   rC   r.   ra   rd   r%   r&   r'   r4   r4   M   s     !FM %)G")*.M'.+/N(/@!@"@ 
@ @  H H 
G!2$$)r&   r4   c                  ^    e Zd ZU dZdZded<   dZded<   ddZddZdd	Z	dd
Z
	 	 	 	 ddZy)TuningProcessPoolz
    Maintains a pool of TuningProcesses to benchmark kernels in parallel
    across devices. By default, we create one TuningProcess per device and
    set the sub-process environment to make only that device visible.
    Nz$Optional[queue.Queue[TuningProcess]]	processeszOptional[ThreadPoolExecutor]executorc                   | j                   du | j                  du k(  sJ | j                   y| j                         }t        j	                  d|       t        j                         | _         |D ]R  }t        |      }|j                          |j                  t                      | j                   j                  |       T | j                   j
                  D ]"  }t        |j                         t              r"J  t        t        |            | _        t         s"daddl}|j%                  | j&                         yy)z,
        Start the child processes.
        Nz$Sub-process autotune device list: %s)r0   )max_workersTr   )ro   rp   get_device_listr:   r;   r\   r	   r4   rX   rC   r    rB   r.   r*   r   lenEXIT_HANDLER_REGISTEREDatexitregisterra   )rL   devicesr0   prv   s        r'   rX   zTuningProcessPool.initialize   s     $&DMMT,ABBB>>%&&(		8'B FV,ALLNEE$&MNNq!	  %%Aaeegt,,, & +s7|D
 '&*#OODNN+	 'r&   c                ^   t         j                  sdgS t        j                  j	                         }t
        t        j                  v rNt        j                  t
           j                  d      D cg c]  }t        |       }}t        |      |k  sJ |S t        t        |            S c c}w )zD
        Gather the list of devices to be used in the pool.
        N,)r   autotune_multi_devicetorchcudadevice_countr   r,   r-   splitintrt   listrange)rL   countdrx   s       r'   rs   z!TuningProcessPool.get_device_list  s     ++6M

'')  2::-')zz2F'G'M'Mc'RS'R!s1v'RGSw<5(((NE%L!!	 Ts   0B*c                2   | j                   !| j                   j                          d| _         | j                  ^| j                  j                  D ]  }|j	                           | j                  j                  D ]  }|j                           d| _        yy)z:
        Signal all child processes to terminate.
        N)rp   shutdownro   r\   ra   rd   )rL   ry   s     r'   ra   zTuningProcessPool.terminate  su     ==$MM""$ DM>>%^^)) *^^)) *!DN &r&   c                   |j                   J | j                  J | j                  j                         }|j                  |j                          	 |j                         | j                  j                  |       S # t        j
                  $ rB t        j                  d| d       t        d      cY | j                  j                  |       S w xY w# | j                  j                  |       w xY w)z
        Entry point for the thread-pool helper threads: Wait for an open TuningProcess,
        remove it from the queue, execute the benchmark in that subprocess, and return
        the TuningProcess to the queue.
        zFailed to benchmark choice 'z['. It will be ignored. Please debug the root cause in case the choice can bring perf gains.inf)	bmreqro   r.   rC   r\   r]   warningswarnfloat)rL   choicer6   s      r'   rR   zTuningProcessPool.target#  s     ||'''~~)))..$$&FLL!
	(;;= NNw' {{ 	 MM.vh 7W W
 <NNw'	  NNw's$   A> >7C5C CC C3c                    | j                   J d       | j                  J i }t        || j                  j                  | j                  |            D ]
  \  }}|||<    |S )z>
        Benchmark each choice in a separate process.
        z&Tuning process pool is not initialized)ro   rp   zipmaprR   )rL   choicesresultsr   results        r'   rE   zTuningProcessPool.benchmark:  sl     ~~)S+SS)}}((( "'4==+<+<T[['+RSNFF$GFO T r&   ri   )rf   zSequence[Optional[int]])r   r   rf   r   r   zList[TritonTemplateCaller]rf   z!Dict[TritonTemplateCaller, float])r"   r#   r$   rj   ro   rk   rp   rX   rs   ra   rR   rE   r%   r&   r'   rn   rn      sK     7;I3:-1H*1#,J"$"(.+ 
+r&   rn   c                  b    e Zd ZU ded<   ded<   ded<   ded<   d	ed
<   e	 	 	 	 dd       ZddZy)
TensorMetaztorch.devicer0   ztorch.dtypedtypeztorch._prims_common.ShapeTypesizesztorch._prims_common.StrideTypestridesr   offsetc           
        t        |t              r4|D cg c]  }| j                  |       }}t        d |D              sJ |S |}t        |t        j
                        rt	        j                  d|      }|j                         }|J t        |j                         |t        j                  j                  j                  |j                         t        j                         t        j                  j                  j                  |j#                         t        j                         t        j                  j                  j%                  |j'                         j(                  t        j                               S c c}w )Nc              3  <   K   | ]  }t        |t                y wN)rB   r   .0xs     r'   	<genexpr>z*TensorMeta.from_irnodes.<locals>.<genexpr>b  s     A&Qz!Z0&   fake)fallback)r0   r   r   r   r   )rB   r   from_irnodesallr   LayoutBuffer	get_dtyper   
get_devicer   graphsizevars
size_hintsget_sizer   unbacked_symint_fallback
get_stride	size_hint
get_layoutr   )clsirnodesr   r   noder   s         r'   r   zTensorMeta.from_irnodes\  s:    gx(>E Fg!1!1!!4gF FA&AAAAMdBII&99VT*D    ??$''""--88 .  GG$$//!88 0  77##--!((88 . 
 	
 !Gs   Fc                    t        | j                  | j                  | j                  | j                  | j
                        S )N)r0   r   
extra_size)r   r   r   r0   r   r   rK   s    r'   	to_tensorzTensorMeta.to_tensor}  s2    JJLL;;**{{
 	
r&   N)r   z/Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]]rf   #Union[TensorMeta, List[TensorMeta]])rf   torch.Tensor)r"   r#   r$   rk   classmethodr   r   r%   r&   r'   r   r   T  sG    ((++K
E
	,
 
@
r&   r   c                  \    e Zd ZdZ	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 d	dZd
dZdd	 	 	 	 	 ddZy)rD   z
    Only handle triton template benchmark for now. The extern kernel benchmark
    can be done inside the same process since they usually don't cause crash.
    c                    || _         t        |t              r|g}|| _        t        |t        t
        f      rt        |      dk(  sJ |d   }|| _        || _        y )Nr   r   )	kernel_namerB   r   input_tensor_metatupler   rt   output_tensor_meta
extra_args)rL   r   r   r   r   s        r'   __init__zBenchmarkRequest.__init__  se     ''4!2 3!2(5$-8)*a///!3A!6"4$r&   c                   t               r   )NotImplementedErrorrL   output_tensorinput_tensorss      r'   make_run_fnzBenchmarkRequest.make_run_fn  s     "##r&   c                     y r   r%   rK   s    r'   cleanup_run_fnzBenchmarkRequest.cleanup_run_fn  s    r&   Nr   c                  t         j                  t        j                        }|rt	        j                         }|Ft        |      dk(  sJ t        d | j                  D              }| j                  j                         }|r+t	        j                         z
  }t	        j                         } | j                  |d|i}|r+t	        j                         z
  }t	        j                         }t        |      }t        j                  j                          |r9t	        j                         z
  }	t         j                  dt!        |       |	       | j#                          |S )Nr   c              3  <   K   | ]  }|j                           y wr   )r   r   s     r'   r   z-BenchmarkRequest.benchmark.<locals>.<genexpr>  s     !P9OA!++-9Or   r   z6InChildProcess %s: load %f, create tensor %f, bench %f)r:   isEnabledForloggingDEBUGtimert   r   r   r   r   r   r   r}   r~   synchronizer;   r/   r   )
rL   r   r   r;   start_tscreate_tensor_elapsefnload_elapseoutbench_elapses
             r'   rE   zBenchmarkRequest.benchmark  s   
   /yy{H  }%***!!P9O9O!PPM 33==?M#'99;#9 yy{HT}JMJ))+0Kyy{Hrl

 99;1LIIHD	$ 	
r&   )r   r/   r   r   r   r   r   Iterable[Any]r   r   r   r   rf   zCallable[[], None]ri   r   r   r   zOptional[torch.Tensor]rf   r   )r"   r#   r$   rj   r   r   r   rE   r%   r&   r'   rD   rD     s{    
%% ?% @	%
 "%*$*$;G$	$
 15&$& .& 
	&r&   rD   c                  2    e Zd ZdZdddZdd	 	 	 	 	 ddZy)	TestBenchmarkRequestz
    Supports unit testing. Defined in this file so that the TuningProcess
    sub-process knows how to unpickle these objects.
    Nc                    || _         y r   )value)rL   r   s     r'   r   zTestBenchmarkRequest.__init__  s	    
r&   r   c               H    | j                   t        d      | j                   S )NzFailed to run)r   r=   r   s      r'   rE   zTestBenchmarkRequest.benchmark  s#     ::O,,zzr&   r   )r   zOptional[float]rf   rg   r   )r"   r#   r$   rj   r   rE   r%   r&   r'   r   r     s0    
 UY*;Q	r&   r   c                  `     e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	 	 	 	 	 	 ddZddZ xZS )TritonBenchmarkRequestc
                p    t         
|   ||||       || _        || _        || _        || _        |	| _        y r   )superr   module_pathmodule_cache_keygrid
num_stages	num_warps)rL   r   r   r   r   r   r   r   r   r   	__class__s             r'   r   zTritonBenchmarkRequest.__init__  s?     	&79KZX& 0	$"r&   c                  t        j                  | j                  | j                        }t        j                  d| j                  | j                         t        || j                        j                  }t        j                  |g||| j                  | j                  | j                  | j                  t        j                   j#                         j$                  dS )Nz"benchmark module key: %s, path: %s)r   r   r   stream)r   load_by_key_pathr   r   r:   r;   getattrr   run	functoolspartialr   r   r   r   r}   r~   current_streamcuda_stream)rL   r   r   mod
run_methods        r'   r   z"TritonBenchmarkRequest.make_run_fn  s     **4+@+@$BRBRS		0!!	
 S$"2"2377
  	
	
 	
 __		

 nn::,,.::	
 		
r&   c                T    d| j                   d| j                  d| j                  S )Nself.kernel_name=z, self.module_path=z, self.module_cache_key=)r   r   r   rK   s    r'   __str__zTritonBenchmarkRequest.__str__  s2    #$""$$8t'7'7&99RD<Q<Q;STTr&   )r   r/   r   r   r   r   r   r   r   r/   r   r/   r   z	List[int]r   r   r   r   r   rf   r/   )r"   r#   r$   r   r   r   __classcell__r   s   @r'   r   r     s    ## ?# @	#
 "# # # # # #&
*
;G
	
.Ur&   r   c                  X     e Zd Z	 	 	 	 	 	 	 	 	 	 d fdZ	 	 	 	 	 	 ddZddZddZ xZS )	CUDABenchmarkRequestc                    t         |   ||||       || _        d| _        d | _        d | _        d| _        d| _        t        j                  | j                  d      \  | _        | _        y )Nr    so)
r   r   source_codeworkspace_size	workspaceDLLhash_keysource_filer   write)rL   r   r   r   r   r  r   s         r'   r   zCUDABenchmarkRequest.__init__  sj     	&79KZX&#$15)- "*7*=*=d>N>NPT*U't'r&   c          	        t        j                  | j                  d      \  | _        | _        | _        t        |      |gz   D cg c]  }t        |j                                }}t        j                  d| j                  | j
                  | j                  | j                  || j                         t        | j                  | j                        }t        t        j                  j!                         j"                        }t%               } |g || j                  t'        |      d |  |j(                  | _        | j*                  dk(  sJ d       t-        j.                  |g|| j                  d d | S c c}w )Nr  zqmake_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sr   zThings need to be fixed to support non-zero workspace_size: 1) max autotune cache needs to store workspace size; 2) memory allocation needs to allocate / deallocate workspace correctly; )r   loadr  r  r  r  r   r   data_ptrr:   r;   r   r   r   r}   r~   r   r   r   r   r   r  r   r   )rL   r   r   tensorrS   r   
stream_ptrc_workspace_sizes           r'   r   z CUDABenchmarkRequest.make_run_fn&  s    5B4F4Fd5
1$-!1
 }-?
? V__&'? 	 
 			MMHHOO	
 TXXt'7'78
ejj779EEF
 $: 	
	
__	
  	
 	
 	
 /44""a' 	
X	
'   

 __
 	

 
 
 	
I
s    Fc                ^    | j                   | j                   j                          d | _        y r   )r  closer  rK   s    r'   r   z#CUDABenchmarkRequest.cleanup_run_fnY  s!    88HHNNr&   c                T    d| j                   d| j                  d| j                  S )Nr   z, self.source_file=z, self.hash_key=)r   r  r  rK   s    r'   r   zCUDABenchmarkRequest.__str__^  s0    #$""$$8t'7'7&99JDMM;KLLr&   )
r   r/   r   r   r   r   r   r   r  r/   r   ri   r   )r"   r#   r$   r   r   r   r   r   r   s   @r'   r   r     sh    VV ?V @	V
 "V V"1
*1
;G1
	1
f
Mr&   r   c                ,    t         j                  |       S )zO
    Do benchmarking in a subprocess and return the perf number (latency).
    )tuning_poolrE   )r   s    r'   benchmark_in_sub_processr  b  s       ))r&   )r0   r5   r   )F
__future__r   
contextlibdataclassesr   r   r,   r\   r   r   concurrent.futuresr   ctypesr   r   r   multiprocessing.processr   multiprocessing.queuesr	   typingr
   r   r   r   r   r   r   r   r   r}   r   torch._dynamo.testingr   torch._inductorr   torch._inductor.codecacher   r   r    torch._inductor.select_algorithmr   r  r   utilsr   virtualizedr   r   ru   	getLoggerr"   r:   r    r*   contextmanagerr2   	dataclassr4   rn   r  r   r   LayoutOrBufferr   rD   r   r   r   r  r%   r&   r'   <module>r(     s   "     	    1 , , / (
 
 
  ! .  L LE   -  g!	 		 	 7 7( A A AH x x xv  ! ryy"))+, /
 /
 /
d I I IX+ ",U- ,U^KM+ KM\*'*&*r&   