
    Ph                    ~   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZmZmZ d dlmZ d dlZd dlmZmZ d dl m!c m"c m#Z$ d dl%Z&d dl'Z&d dl(m)c m*Z+ d d	l,m-Z- d d
l.m/Z/ d dl0m1Z1 d dl2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8 d dl9m:Z: d dl;m<Z<m=Z= d dl>m?Z?m@Z@mAZA ddlBmCZCmDZD ddlEmFZF ddlDmGZGmHZHmIZI ddl)mJZJmKZKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZSmTZTmUZU ddlVmWZWmXZX  ej                  eZ      Z[ ej                  ej                  d      Z\e&j                  j                  Z]	 d Z^d Z_d Z`d Zad Zbg dZcd Zddeee   d eee   fd!Zfdd#Zgd$ Zhd% Zid& Zjd' Zk G d( d)      Zlej                   G d* d+el             Znd, Zo G d- d.en      Zpej                   G d/ d0ep             Zq G d1 d2e      Zr G d3 d4e      Zs e_d5       e_d6       e_d7       e_d8       e_d9       e_d:      d;Ztd< Zuej                   G d= d>en             Zvd? Zw G d@ dAev      ZxdB ZydC ZzddDZ{ ej                  e{d"E      Z|dF Z}ej                   G dG dHel             Z~ej                   G dI dJe~             Zej                   G dK dLe~             Z G dM dNe~      Zej                   G dO dPe~             Zej                   G dQ dRe             Zej                   G dS dTe~             Z G dU dVe      Z G dW dXel      Zej                   G dY dZe             Zej                   G d[ d\e             Zej                   G d] d^el             Z G d_ d`e      Z G da dbe      Z G dc dde      Z G de dfel      Z G dg dhe      Zej                   G di djel             Z G dk dle      Z G dm dne      Z G do dpel      Z G dq drel      Zej                   G ds dte             Z G du dve      Z G dw dxe      Z G dy dze      Zej                   G d{ d|e             Z G d} d~e      Z G d de      Zej                   G d de             Zej                   G d de             Z G d de      Z G d de      Z G d de      Zd Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Zej                   G d d             Ze]jR                  jT                  e]jV                  jX                  e]jZ                  jX                  e]j\                  jX                  e]j^                  jT                  e]j`                  jb                  e]jd                  jT                  hZ G d de      Zej                   G d de             Zej                   G d del             Z G d de      Z	 	 ddddddddeee   deee   deee   deededeeee      fdZ	 	 	 	 	 	 ddZ G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G dĄ de      Zej                   G dƄ del             Z G dȄ deƫ      Z G dɄ deƫ      Z G d˄ de&j                  j                        Z G d̈́ dΫ      Z G dτ dЫ      Z G dф de      Z G dӄ de      Z G dՄ deϫ      Z G dׄ deϫ      Z G dل de      Z G dۄ de      Z G d݄ de      Z G d߄ deЫ      Z G d deЫ      Z G d deЫ      Z G d deѫ      Z G d deѫ      Z G d deѫ      Z G d deѫ      Z G d de      Z G d deܫ      Zd Z G d deѫ      Zy)    N)nullcontext)Enum)partial)	signature)AnyCallableClassVarDictIterableListOptionalSequenceSetTupleUnion)patch)ExprInteger)get_interface_for_device)identity)GraphModuleSerializer)compute_required_storage_lengthis_boolean_dtypeis_float_dtypemake_channels_last_strides_formake_contiguous_strides_for
StrideType)get_schema_info)free_unbacked_symbolsSymTypes)CleanDivFloorDivModularIndexing   )configdependencies)index_prevent_reordering)#extract_input_node_reduction_rangesextract_read_writesvar_builder)argsortcache_on_selfconvert_shape_to_inductorconvert_shape_to_symintdeveloper_warningget_kernel_metadata
is_dynamicpad_listlike	sympy_dotsympy_product
sympy_subssympy_symbol)opsVz  prefixc                      fd |        y )Nc           	         t        | t        t        f      r| D ]
  } |        y t        | t              r| j	                         D ]
  } |        y t        | t
        j                  j                  j                  t        t        t        j                  t        j                  j                  j                  t         f      sJ dt#        |        d       y )NzFound zE, which is not a supported top level IR node. See [Note: Inductor IR])
isinstancelisttupledictvaluestorch	_inductorir
ExpandViewDynamicScalar	TensorBoxsympySymbollogicboolalgBooleanr   type)nodesnode_check_tensorboxs     ]C:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/_inductor/ir.pyrP   z%validate_ir.<locals>._check_tensorboxs   s     edE]+ & t$ & ' OO&&11!LLKK''//
 
k U}$ij
k 
     )node_or_nodesrP   s    @rQ   validate_irrU   r   s    k. ]#rR   c                 6     t         t              sJ  fd}|S )Nc                  0     t        t              | i |S N)getattrr7   )argskwargsnames     rQ   fnzops_wrapper.<locals>.fn   s    !wsD!42622rR   )r=   str)r\   r]   s   ` rQ   ops_wrapperr_      s    dC   3 IrR   c           
      `    t        t        | t        t        |                         fd}|S )Nc                     t        |       t              k(  sJ t        t        |             D cg c]
  }| |       c}S c c}w rX   lenrange)indexi	inv_orders     rQ   reindexz inverse_reorder.<locals>.reindex   sE    5zS^+++-23u:->?->il#->???   A)r@   ziprd   rc   )orderrh   rg   s     @rQ   inverse_reorderrl      s*    Sc%j 123I@ NrR   c                       fd}|S )Nc                     t        |       t              k(  sJ t        t        |             D cg c]
  }| |       c}S c c}w rX   rb   )re   rf   rk   s     rQ   rh   zsame_reorder.<locals>.reindex   sD    5zSZ''').s5z):;):AeAh):;;;ri   rS   )rk   rh   s   ` rQ   same_reorderro      s    < NrR   c                       fd}|S )Nc                        |             S rX   rS   )re   reindex1reindex2s    rQ   rh   z fuse_reindexing.<locals>.reindex   s    ((rR   rS   )rr   rs   rh   s   `` rQ   fuse_reindexingrt      s    ) NrR   )   r      r$   c                     t        |       D ci c]  \  }}||
 }}}t        t        |             D cg c]  }||   	 }}|S c c}}w c c}w )z
    Convert stride order to fill order
    For channel last format,
    stride order = [3, 0, 2, 1] and fill order = [1, 3, 2, 0]
    )	enumeraterd   rc   )rk   idxposlookuprf   
fill_orders         rQ   stride_order2fill_orderr}      sX     (1'78'783c3h'7F8%*3u:%67%6&)%6J7 97s
   AAseqreturnc                     t        |       }t        t        |             D cg c]  }d }}t        |      D ]
  \  }}|||<    |S c c}w )z)
    Convert strides to stride order
    r   )r+   rd   rc   rx   )r~   
sorted_idx_outrf   elems         rQ   get_stride_orderr      sO     $CLJCHo
&o1oC
&Z(4D	 )J 's   	ATc                    | y |s%t         j                  j                  j                  }nt        }| j                         D cg c]
  } ||       }}t        |       r.| j                         j                  D cg c]
  } ||       }}nt        |      }| j                         }| j                         }t        |      }t        |      }t        j                  ||||      j                         }|S c c}w c c}w )N)sizestridedtypedevice)r8   graphsizevars	size_hintr   get_sizeis_storage_and_layout
get_layoutr   r   	get_dtype
get_devicer.   rB   empty_stridedzero_)	xguard_shapeshape_fnsr   r   r   r   ts	            rQ   ir_node_to_tensorr      s    y 77##--!".AHQKD.Q'(||~'<'<='<!(1+'<=,T2KKME\\^F"4(D$V,F&f	eg  H / >s   C7;C<c                 0    t        | t              r| sd gS | S rX   )r=   r>   values    rQ   may_convert_to_optionalr      s    %u vLrR   c                     t        | dd       rt        | j                               S t        | t        j
                        r| j                  S y )Nr   )rY   get_device_typer   r=   rB   r   rM   r   s    rQ   r   r      s9    q,%q||~..!U\\"vvrR   c                     t        |       dk(  S )Ncudar   r   s    rQ   	is_tritonr      s    1''rR   c                     t        |       dk(  S )Ncpur   r   s    rQ   is_cpur      s    1&&rR   c                      e Zd ZU  e       Zeee      ed<   e	e
j                  deej                  j                     fd              Zd Zd Zd Zd Zd Zed	        Zd
 Zd Zd Zd Zd ZddZeg ej<                  f   ed<   eg ej>                  f   ed<   eg e f   ed<   eg ef   ed<   eg ef   ed<   eg ef   ed<   eg e!f   ed<   eg eegef   f   ed<   eg eegef   f   ed<   ee"gdf   ed<   eg df   ed<   y)IRNode_current_originsoriginsc              #      K   t         j                  }|| z  t         _        	 d  |t         _        y # |t         _        w xY wwrX   )r   r   )r   olds     rQ   current_originszIRNode.current_origins   s7      %%"%-	*&)F#cF#s   A2 A?Ac                     t        | j                        | _        t        j                  rt        j                         | _        y d | _        y rX   )setr   r   r%   debug_ir_traceback	tracebackformat_stackselfs    rQ   __post_init__zIRNode.__post_init__  s3    40015;5N5N//1TXrR   c                     | j                   S rX   )r   r   s    rQ   get_tracebackzIRNode.get_traceback
  s    ~~rR   c                 T    dt        | dd       }t        |      dkD  r|d d  d}|gS )Nzorigins=r    @   =   z...)rY   rc   )r   r   s     rQ   common_reprzIRNode.common_repr  s>    WT9b9:;w<" "c*GyrR   c                     || j                         z   }t        dj                  t        t        |                  }t        |       j                   d| dS )Nz,
z(
z
))r   indentjoinmapr^   rM   __name__)r   liness     rQ   
str_helperzIRNode.str_helper  sL    ((**uzz#c5/23t*%%&c%44rR   c                 &    || j                         v S rX   )get_read_namesr   r\   s     rQ   
is_user_ofzIRNode.is_user_of  s    t**,,,rR   c                 \    | j                         D ch c]  }|j                   c}S c c}w rX   )	get_readsr\   )r   deps     rQ   r   zIRNode.get_read_names  s'    $(NN$45$4S$4555s   )c                 2    t        dt        |        d      )Nz#get_layout() is not implemented by !NotImplementedErrorrM   r   s    rQ   r   zIRNode.get_layout   s    !$GT
|ST"UVVrR   c                 2    t        dt        |        d      )Nz!get_size() is not implemented by r   r   r   s    rQ   r   zIRNode.get_size#  s    !$Ed4j\QR"STTrR   c                 4    t        | j                               S rX   )r4   r   r   s    rQ   	get_numelzIRNode.get_numel&  s    T]]_--rR   c                     t         j                  j                  j                  t	        j
                  | j                         d            S Nr   r8   r   r   is_expr_static_and_truerH   Eqr   r   s    rQ   is_zero_elementszIRNode.is_zero_elements)  0    ww77AQST8UVVrR   c                 0    t        dt        |              )a)  
        If the IRNode refers to data which has not been materialized (e.g.,
        it is a Pointwise/Reduction that could potentially have more
        compute fused into it), realize the IRNode into physical memory,
        ending the possibility of fusing into it, but allowing, e.g., multiple
        users to access the data without having to recompute.

        Check StorageBox.realize for a particularly notable implementation.

        TODO(ezyang): I think, in principle, every IRNode should have an
        implementation of this, and most of the time no-op is OK, but you
        really do have to audit each IRNode for this, so for now, raise
        an error if it's not implemented.  Note that some code in graph.py
        will catch this thrown error and suppress it with a warning.
        zrealize NYI on r   r   s    rQ   realizezIRNode.realize,  s      "ODJ<"@AArR   Nc                 0    t        dt        |              )Nzcodegen_reference NYI on r   r   writers     rQ   codegen_referencezIRNode.codegen_reference>  s    !$=d4j\"JKKrR   r   r   get_namer   
get_strideget_storage_numelhas_exceeded_max_readsmake_loadermake_indexer
mark_reuserealize_hintrX   )#r   
__module____qualname__r   r   r	   r   r   __annotations__staticmethod
contextlibcontextmanagerrB   fxNoder   r   r   r   r   r   r,   r   r   r   r   r   r   r   r   r   r   r^   boolintrS   rR   rQ   r   r      sg   +.5hs3x(0*UXX]]!3 *  *Y5
- 6 6WU.WB$L U\\)**EKK((r3wC  S!!C(($RX.."hucz22332xs
3344#%%2t8$$rR   r   c                        e Zd ZU ej                  ed<   ej                  ed<   edef   ed<   e	e
   ed<   ddZ fdZeZd Zd	 Zd
 Zd Zd Zed        Zedd       Zed        Zd Zd Zd Zd Zd Z xZS )Loopsr   r   .inner_fnrangesc                    | j                  d| j                  j                   dt        | j                        | j                         g|D cg c]  }| dt        | |        c}z   d| j                  gz         S c c}w )N'=origin_node=)r   r   rM   r^   r   inner_fn_strrY   origin_node)r   namesr\   s      rQ   __str__zLoops.__str__X  s    DKK$$%Q'DJJ!!#
 <AA54$qt,-.5AB d..1234
 	
 Bs   A?
c                 0    t         |           d | _        y rX   superr   r   r   	__class__s    rQ   r   zLoops.__post_init__c      rR   c                     | j                   S rX   r   r   s    rQ   r   zLoops.get_dtypei      zzrR   c                     | j                   S rX   r   r   s    rQ   r   zLoops.get_devicel      {{rR   c                     | j                   S rX   r   r   s    rQ   get_origin_nodezLoops.get_origin_nodeo      rR   c                     | j                   S rX   r   r   s    rQ   r   zLoops.get_sizer  r
  rR   c                      yNFrS   r   s    rQ   	is_externzLoops.is_externu      rR   c                     |j                  dd       }|j                  dd       } | |i |}||_        t        j                  r|xs t	        j
                         nd |_        t        j                  |      S )Nr   r   )popr   r%   r   r   r   rG   create)clsrZ   r[   r   tbrs         rQ   r  zLoops.createx  sm    jj5ZZT*  #.4.G.GB*)((*T 	
 ""rR   c                     t        |       D cg c].  \  }}|dk(  rt        j                  d      nt        | |       0 c}}S c c}}w Nr$   r   )rx   rH   r   r6   )r   r:   nr   s       rQ   _indexzLoops._index  sS     "&)
)1 !"QEMM!LF8A3,HH)
 	
 
s   3Ac                 4    t        | j                               S rX   )rc   r   r   s    rQ   inner_fn_str_lenzLoops.inner_fn_str_len  s    4$$&''rR   c                     | j                  | j                        }t        j                  j	                  | j
                  |      S rX   )r  r   r8   KernelFormatterHandlerir_to_stringr   )r   re   s     rQ   r   zLoops.inner_fn_str  s1    DKK(''44T]]EJJrR   c                 |   t        j                  t        dd      5  | j                         rJt	        | j                         | j                         | j                               j                  cd d d        S t	        | j                         | j                               j                  cd d d        S # 1 sw Y   y xY wNallow_indexingT)	r   objectFlexibleLayoutget_reduction_typer)   r   r   get_reduction_sizereadsr   s    rQ   r   zLoops.get_reads  s    \\.*:DA&&(*$$&MMO++- % BA +$$&MMO % BAAs   AB271B22B;c                 2    t        dt        |        d      )Nz+get_reduction_size() is not implemented by r   r   r   s    rQ   r*  zLoops.get_reduction_size      !9$t*QG
 	
rR   c                 2    t        dt        |        d      )Nz+get_reduction_type() is not implemented by r   r   r   s    rQ   r)  zLoops.get_reduction_type  r-  rR   c                 2    t        dt        |        d      )Nz+constant_to_device() is not implemented by r   r   r   r   s     rQ   constant_to_devicezLoops.constant_to_device  r-  rR   )r  )rf   )r   r   r   rB   r   r   r   r   r   r   r   r   r   __repr__r   r   r  r   r  classmethodr  r   r  r,   r   r   r   r*  r)  r1  __classcell__r  s   @rQ   r   r   Q  s    LL;;sCx  J	
  H  # # 
 
 ( (K




rR   r   c                    |j                   rt        j                  t        d      |      S t        j                  d|      S )Nnanr   )is_floating_pointr7   constantfloat)ry   r   s     rQ   nop_loader_fnr;    s1    ||E%L%00||Au%%rR   c                   *    e Zd Zd Zd Zd Zd Zd Zy)	Pointwisec                 p    | j                         rt        t        | j                        S | j                  S )Nr  )r   r   r;  r   r   r   s    rQ   r   zPointwise.make_loader  s)      "=

;;}}rR   c                     g S rX   rS   r   s    rQ   r*  zPointwise.get_reduction_size      	rR   c                      y rX   rS   r   s    rQ   r)  zPointwise.get_reduction_type      rR   c                 h    | j                         }t        j                  | ||       ||            S rX   )r   r7   storer   output_nameindexervarsloaders        rQ   store_outputzPointwise.store_output  s+    !!#yygdmVD\BBrR   c                     | j                         } t        j                  t        d|      |      }t	        || j
                  || j                        S FMove this to a given device. Requires that all reads are to constants.override_device)r   r   r'  ConstantBufferr=  r   r   r   r   rI  s      rQ   r1  zPointwise.constant_to_device  sC    !!#Hn.?HPVT[[AArR   N)r   r   r   r   r*  r)  rJ  r1  rS   rR   rQ   r=  r=    s    CBrR   r=  c                   J    e Zd ZU eee   gef   ed<   dZee	   ed<   d Z
d Zy)Scatteroutput_indexerNscatter_modec                     | j                         } t        j                  t        d|      |      }t	        || j
                  || j                  | j                  | j                        S rL  )	r   r   r'  rO  rR  r   r   rS  rT  rP  s      rQ   r1  zScatter.constant_to_device  s]    !!#Hn.?HPJJKK
 	
rR   c                     | j                         }t        j                  | || j                  |             ||      | j                        S )N)mode)r   r7   rD  rS  rT  rE  s        rQ   rJ  zScatter.store_output  sG    !!#yyD''-.4L""	
 	
rR   )r   r   r   r   r   r   r   rT  r   r^   r1  rJ  rS   rR   rQ   rR  rR    s0    d4j\4/00"&L(3-&

rR   rR  c                       e Zd ZdZdZdZdZy)ReductionHintr   r$   rv   ru   N)r   r   r   INNEROUTER
OUTER_TINYDEFAULTrS   rR   rQ   rY  rY    s    EEJGrR   rY  c                       e Zd ZdZdZy)TileHintr   r$   N)r   r   r   SQUAREr]  rS   rR   rQ   r_  r_    s    FGrR   r_  
logical_ormaximumminimummuladdbitwise_xor)anymaxminprodsumxor_sumc                 t      t         v rt             }|S  dv r fd}|S  dk(  rd }|S t        d        )N>   argmaxargminc           
         | \  }}|\  }}dk(  rt        j                  ||      }nt        j                  ||      }t        j                  ||      }t	        
      rt        j
                  ||      }t        j
                  ||      }	t        j                  |t        j                  ||	            }t        j                  |t        j                  ||	            }t        j                  |t        j                  |t        j                  ||                  }t        j                  |||      t        j                  |||      fS )Nro  )	r7   ltgteqr   nera  logical_andwhere)aba_valuea_indexb_valueb_indexmaskequala_isnanb_isnanr   reduction_types             rQ   
combine_fnz,get_reduction_combine_fn.<locals>.combine_fn  s    GW GW)vvgw/vvgw/FF7G,Ee$&&'2&&'2~~dCFF7G,DEucoogw.OP>>cooeSVVGW-EFD 		$1		$1 rR   welford_combinec                 l    | \  }}}|\  }}}||z
  }||z   }	||	z  }
|||
z  z   ||z   ||z  |z  |
z  z   |	fS rX   rS   )rw  rx  a_meana_m2a_weightb_meanb_m2b_weightdelta
new_weight	w2_over_ws              rQ   r  z,get_reduction_combine_fn.<locals>.combine_fn!  sk    %&"FD(%&"FD(VOE!H,J :-I**teemh6BB rR   zunknown reduction_type=)REDUCTION_COMBINE_FNr   )r  r   r  s   `` rQ   get_reduction_combine_fnr    sg    --).9
Z Y 
/	/	T % 
,	,	   "$;N;K"LMMrR   c                   v   e Zd ZU ee   ed<   eed<   ej                  ed<   e	ed<   d Z
d Zd Zd Zd	 Zd
 Zd Zd Ze	 d'dee   fd       Zed        Zee	j0                  dfdej2                  dej                  dej                  dedef   dee   dee   dede	dee   fd       Zed        Zed        Zededede	de	fd       Z ed        Z!ed        Z"edej2                  dej                  dej                  dedef   d ee   d!ee   d"ee   d#ee   dedede	fd$       Z#edej2                  dej                  dej                  dedef   dee   dee   dedede	fd%       Z$edej2                  dej                  dej                  dedef   d ee   d!ee   d"ee   d#ee   dede	fd&       Z%y)(	Reductionreduction_rangesr  	src_dtypereduction_hintc                 0    t         j                  | d      S )N)r   r  r  )r   )r   r   r   s    rQ   r   zReduction.__str__<  s    }}H  
 	
rR   c                 "    | j                         S rX   )r   r   s    rQ   r2  zReduction.__repr__A  s    ||~rR   c                     | j                   S rX   )r  r   s    rQ   r*  zReduction.get_reduction_sizeD  s    $$$rR   c                     | j                   S rX   r  r   s    rQ   r)  zReduction.get_reduction_typeG  s    """rR   c           	          t        j                  | j                  | j                  | j                  | j                  ||            }t        j                  | ||      |      S rX   )r7   	reductionr   r  r  r   store_reduction)r   rF  rG  rH  reduction_varsr   s         rQ   r  zReduction.store_reductionJ  sP    JJNNMM$/	
 "";uEErR   c                 X    t        | j                        t        | j                        z   S rX   )rc   r   r  r   s    rQ   index_lengthzReduction.index_lengthS  s!    4;;#d&;&;"<<<rR   c                     | j                  | j                        }| j                  | j                  d      }t        j                  j                  | j                  ||      S )Nr  )r  r   r  r8   r"  r#  r   )r   re   rindexs      rQ   r   zReduction.inner_fn_strV  sP    DKK(T22C8''44MM
 	
rR   c           
         | j                         } t        j                  t        d|      |      }t	        || j
                  || j                  | j                  | j                  | j                  t        j                        S rL  )r   r   r'  rO  r  r   r   r  r  r  rY  r]  rP  s      rQ   r1  zReduction.constant_to_device_  sm    !!#Hn.?HPJJKK!!NN!!	
 		
rR   N
input_nodec	           
         !"#$ d }	t         j                  j                  j                  |      }
t         j                  j                  j                  t	        |            }t        |       xr* |dvxr$ t        j                  xr  |	|
      xr  |	|      }|st        j                  dfS t        t        |             }|j                  j                  |       j                  #d"d d$"#z  $z  ! #z  $z   !"#$fd} !"#$fd}|dk(  r ||
|      }|dk(  rt        j                  |fS t!        |      d	k(  r|t#        |t$              rxt'        |      \  }}|h|ft         j                  j                  j                  t	        ||z               }|
|k(  r,t(        j+                  d
|||||       t        j                  dfS t        j                  |fS |
"k  s|#dz  dz  k\  rt        j                  dfS t-        | ||||||t        j                        }d } ||      \  }}|r ||      \  }}t!        |      d	k(  rt        j                  dfS t/        j0                  |j3                         |j5                               \  \  }}}d	}d	}|D ]  }t         j                  j                  j7                  ||      }t         j                  j                  j9                  |||j;                               }t=        d |D              }|r|dz  }|dz  } ||kD  rt        j                   ||
|      fS t        j>                   ||
|      fS )Nc                 B    t        | t        t        j                  f      S rX   )r=   r   rH   r   r   s    rQ   
_is_staticz(Reduction.num_splits.<locals>._is_staticz  s    a#u}}!566rR   >   rn  ro  r$          i   c                   	 d}d|z  }|dz  k\  ry| dk  ry| |z  k  r}n| |z  
k  rmz  d|z  z  }||z   dz
  |z  }| ||z  z   dz
  ||z  z  	t        j                  |       }t        |	fd      }t        |	z
        dk  rt	        |      }n>	}n;t        j                  |       }t        |fd	      }t        |z
        d
k  r|}n}| ||z  z   dz
  ||z  z  S )N   r  rv   r$   i    c                      t        | z
        S rX   absr   tmp_split_sizes    rQ   <lambda>zFReduction.num_splits.<locals>.inner_reduction_splits.<locals>.<lambda>      c!n:L6MrR   key   c                      t        | z
        S rX   r  r   max_elements_per_threads    rQ   r  zFReduction.num_splits.<locals>.inner_reduction_splits.<locals>.<lambda>      c!>U:U6VrR   2   rH   divisorsri  r  rh  )reduction_numel_hint
numel_hint	num_warpsnum_threads
split_sizetarget_blocksblocks_per_outputr  closestr  max_elements_per_devicer  min_elements_per_devicemin_elements_per_threadnum_smthreads_per_sms            @rQ   inner_reduction_splitsz4Reduction.num_splits.<locals>.inner_reduction_splits  s8    Iy.KQZ'#t+#j04KK4
%
25LL & 7AO L%2Z%?!%C
$R!(;9J+JJQN!$55"7 !>>*>?h,MNw/025!$W.E!FJ!/J >>*>?h,VWw!889B>!(J!8J(:+CCaG[( rR   c                    d}|dz  }d}d}||z   dz
  |z  }| |z  k  r}n| |z  k  rjz  |z  }||z   dz
  |z  }| ||z  z   dz
  ||z  z  t        j                  |       }	t        |	fd      }
t        |
z
        dk  rt	        |
      }n>}n;t        j                  |       }	t        |	fd	      }
t        |
z
        d
k  r|
}n}| ||z  z   dz
  ||z  z  S )Nr  r        r$   c                      t        | z
        S rX   r  r  s    rQ   r  zFReduction.num_splits.<locals>.outer_reduction_splits.<locals>.<lambda>  r  rR   r     c                      t        | z
        S rX   r  r  s    rQ   r  zFReduction.num_splits.<locals>.outer_reduction_splits.<locals>.<lambda>  r  rR   r  r  )r  r  r  r  rvals_per_threadxvals_per_blockxblocksr  r  r  r  r  r  r  r  r  r  r  s              @rQ   outer_reduction_splitsz4Reduction.num_splits.<locals>.outer_reduction_splits  s8    I#b.K !O!O3a7OKG#j03JJ4
%
25LL & 7K H!.!81!< H(+;m+KKaO&6"8 !>>*>?h,MN~/025!$W.E!FJ!/J >>*>?h,VWw!889B>!(J!8J(+;j+HH1L :- rR   r   zUse previous IRNode's range and reduction_ranges instead of split. current ranges: %s, current reduction ranges: %s, current split: %d, new ranges: %s, new reduction ranges: %srv   c           	         t        d t        | j                         | j                         | j	                               |       }|j                         }|j                  D  cg c]8  } t        | t        j                        rt        | t        j                        s| : }} g }d}t        |j                  d       D ]  t        fd|D              s|j                  j                         j                   t"        j$                  j&                  v sZt"        j$                  j&                  j                      }|j(                  j*                  }|j-                          |j(                  j*                  |k7  sd} ||fS c c} w )Nr   r   r   r\   layoutdataFc                     | j                   S rX   r\   r   s    rQ   r  z@Reduction.num_splits.<locals>.get_read_indices.<locals>.<lambda>*  s    affrR   r  c              3   N   K   | ]  }|j                   j                  v   y wrX   )re   free_symbols).0r  mds     rQ   	<genexpr>zAReduction.num_splits.<locals>.get_read_indices.<locals>.<genexpr>+  s!     F:aqBHH111:s   "%T)ComputedBufferr(  r   r   r   get_read_writes
range_varsr=   rH   r   Numbersortedr+  allappendre   r\   r8   r   name_to_bufferr  r   decide_layout)	r  cbread_writesr  indiceschangedbuforiginal_strider  s	           @rQ   get_read_indicesz.Reduction.num_splits.<locals>.get_read_indices  s@   %<<>++-
 B ,,.K %///Aa,Z5<<5P /  
 GG[..4DEF:FFNN288,ww!''"8"88gg44RWW=*-***;*;))+::,,?&*G F G##!s   $=Fc              3   &   K   | ]	  }|d kD    yw)r$   NrS   r  r   s     rQ   r  z'Reduction.num_splits.<locals>.<genexpr>E  s     /w!Aws   ) r8   r   r   symbolic_hintr4   r   r%   split_reductionsrY  r]  r   r   Workerget_device_propertiesmulti_processor_countrZ  rc   r=   rG   r(   logdebugr  r&   index_vars_squeezer   r*  simplify_with_rangesstride_hintskeysr  r[  )%r   	dst_dtyper  r   r   r  r  reduction_numelr  r  r  r  should_splitdevice_interfacer  r  split
new_rangesnew_reduction_rangesextracted_numel_hintr  r  r  r  r   r  	num_outer	num_innerrf   stridesouterr  r  r  r  r  r  s%                                  @@@@@@rQ   
num_splitszReduction.num_splitsn  s   	7  !ww//==oNWW%%33M&4IJ
 f 	'	' ''	' /0	' :& 	  ((!++3OF4KL!((>>


 	 #%"%"9F"B^"S"9F"B^"S"	 "	H!	 !	H ?*+?LEz$**E11Fq *z95
 4W40
0 ).B.N+,77+;+;+I+I%j3G&GH,( ,/CC		G #,!&0	  -22B66 &&-- $;;VaZ"_, ((!++!!	
	$@ ,A.)!,JGQw<1 ((!++&2&E&EJJL!..0'
#NV 		A  55a@Agg&&33A~v{{}UG/w//EQ	Q	  y  &&(>$j)   !&&(>$j)  rR   c                 0    D cg c]+  }t         j                  j                  j                  |      - c}t	        ||      fd|dv r:t        ddt        j                              j                          fdfdS  S c c}w )z1Convert inner_fn from a reduction to an pointwisec                      t        j                   fdt        j                  D cg c]  }t	        |       c} D              S c c}w )Nc              3   0   K   | ]  } |        y wrX   rS   )r  r  re   value_fns     rQ   r  z=Reduction._unroll_reduction_fn.<locals>.fn.<locals>.<genexpr>_  s$      # UF+#   )	functoolsreduce	itertoolsproductrd   )re   r   r  r  r  s   ` rQ   r]   z*Reduction._unroll_reduction_fn.<locals>.fn\  sN    ##"+"3"3,<=,<q%(,<=# 
 >s   A
ro  rn  Nc                     |D cg c]  }t        j                  |       }} | |      t        j                   |      t        j
                        fS c c}w rX   )rH   expandr7   
index_exprrB   int64)re   r  rf   flatten_indexr   s      rQ   r  z0Reduction._unroll_reduction_fn.<locals>.value_fno  sO    39:6a%,,q/6:UF+NN=#8%++F  ;s   Ac                      |       d   S Nr$   rS   )re   r]   s    rQ   r  z0Reduction._unroll_reduction_fn.<locals>.<lambda>v  s    E1rR   )	r8   r   r   evaluate_static_shaper  FixedLayoutr(  contiguous_stridesr   )	r   r  r  r  r   r  r  r]   r  s	   ``   @@@@rQ   _unroll_reduction_fnzReduction._unroll_reduction_fnS  s     @P
?O!AGG2215?O
 .niH
		 11' 112BC	
 ln  .-HIG
s   0Br   r  r   .r   c
                    t         j                  j                  j                  t	                    }
|
dk(  rifd} |d       |d       |d       |d      dj                         v s
J  d       fd}t        j                  |||t        |            S |
dk(  r(dv rfd	}nfd
}t        j                  |||      S t        |
t        j                        rrt         j                  j                  j                  |
      t        j                  k  r8t	        |      dk7  r*t        j                  || j                  |      |      S | j!                  ||||
|		      \  }}|t"        j$                  k(  r|}|dk(  r4|	J t'        |	      \  }}|J |J | j)                  ||||||
      S |dkD  r| j+                  |||||	      S t,        j                  t/        ||||            S )Nr   c                     t         j                  k(  rt        |       S j                  rt        |       S t	        |       S rX   )rB   r   r8  r:  r   )valr  s    rQ   py_cnstz!Reduction.create.<locals>.py_cnst  sF     !EJJ. I !22 s
 SrR   r$   )rk  rl  rj  rg  z* not supported for zero-dimension tensors!c                 6    t        j                           S rX   r7   r9  )re   r  r  rtypes_to_initss    rQ   const_fnz"Reduction.create.<locals>.const_fn  s    ||ON$CYOOrR   r   r   r   r   r  c                 0    t        j                  d      S r   r%  )re   r  s    rQ   r]   zReduction.create.<locals>.fn  s    <<955rR   c                 d    D cg c]  }t        j                  d       }} | |      S c c}w r   rH   r   )re   r   reduction_indexr   r  s      rQ   r]   zReduction.create.<locals>.fn  s5    AQ&RAQAu}}Q'7AQO&R#E?;; 'S   -r  )r8   r   r   simplifyr4   r  r=  r  r>   r=   rH   r   r   r%   unroll_reductions_thresholdr  r
  rY  r]  r(   !create_multilayer_existing_rangescreate_multilayerrG   r  )r  r   r  r  r   r   r  r  r  r  r  r#  r'  r]   hintr  r  r  r&  s     ` ` ``          @rQ   r  zReduction.create{  s    ''**33MBR4STa qz"1:
qz	O /"6"6"88M !!KLM8P ##!F|	 $   a!556
< ##FIr6BB 6  **?;001f%*##((.	   nn

e ]222!NB;)))/R0,J, )))'33388 $  QY(( 
 
  	
 	
rR   c                 .   | dv rAt        |      rt        d      S t        |      ryt        j                  |      j
                  S | dv rAt        |      rt        d      S t        |      ryt        j                  |      j                  S ddddddd|    S )	N>   rh  rn  z-infr   >   ri  ro  infr$   r   r   r   )rk  rj  rl  rg  welford_reducer  )r   r:  r   rB   iinfori  rh  r  r   s     rQ   default_accumulatorzReduction.default_accumulator
  s    ..e$V}$!%({{5)---..e$U|#!%({{5)--- '(
  	rR   c                 :    | dk(  ryt         j                  | |      S )Nr6  r   )r  r9  r8  s     rQ   default_valuezReduction.default_value$  s!    --,,^UCCrR   r  r  r   c                     | dk(  r|S | dk  r(|dk  r#|t         j                  k(  rt         j                  S | dk  r(|dk  r#|t         j                  k(  rt         j                  S |S )Nr  r  i      )rY  r[  r\  )r  r  r  s      rQ   _multilayer_second_step_hintz&Reduction._multilayer_second_step_hint*  sg     B;!!C<J#-.MDWDW2W +++TMc!-"5"55 +++rR   c                    	 t         j                  |g      	t        j                  j                  j                  t        j                  |z  d             	fd}|S )Nr   c                 .   |\  }| ^ }|z  |z   	fd}
rqt        j                  t        j                  t        j                        t        j                  t        j                              }t        j
                  ||      S  |       S )Nc                  $       g            S rX   rS   )r  rI  	new_indexrh   s   rQ   bodyzCReduction._multilayer_wrap_loader.<locals>.wrapper_fn.<locals>.bodyO  s    i');<<rR   )r7   rq  r  rB   int32masked)re   r,  reduction_blockrC  r}  r  rB  
block_sizedefaultrI  	need_maskr  rh   s        @@rQ   
wrapper_fnz5Reduction._multilayer_wrap_loader.<locals>.wrapper_fnJ  s|    !0_*/'Y ?2_DG= vvNN7EKK8NN?EKK@ zz$g66vrR   )Viewdynamic_reshape_indexerr8   r   r   r   rH   r   )
r  rI  r  r  r  rG  rH  rJ  rI  rh   s
    ` ` `` @@rQ   _multilayer_wrap_loaderz!Reduction._multilayer_wrap_loader;  s_     ../?/ARS((@@HH_u,a0
 
		 	" rR   c                     t        |      dk(  s
J | d       t        j                  |t        |      t        |      z         fd}|S )Nr   z= is not equal to []c           	      L     g  t        |       t        |      z               S rX   r?   )re   r,  rI  rh   s     rQ   rJ  zEReduction._multilayer_wrap_loader_existing_ranges.<locals>.wrapper_fnl  s#    "geElU?5K&KLMMrR   )rc   rK  rL  r?   )	r  rI  original_rangesoriginal_reduction_rangesr  r  rH  rJ  rh   s	    `      @rQ   '_multilayer_wrap_loader_existing_rangesz1Reduction._multilayer_wrap_loader_existing_ranges]  sY     ?#q(R_,==Q*RR(..%uZ'85AU;V'V
	N rR   rJ  rQ  rR  r  r  c                     |t         j                  t         j                  fvr|nt         j                  }t        j                  |||||||	|      }|j                          |j                         fd}t        j                  j                  j                  t        |            }| j                  |
||      }||dt        |       k(  sJ t        j                  t	        |||||t        |      d |	||            S )a
        Break a large reduction up into multiple smaller reductions
        recursively
        c                      g | |      S rX   rS   )re   r,  intermediate_loaders     rQ   intermediate_fnz;Reduction.create_multilayer_helper.<locals>.intermediate_fn  s    &'A'A'ABBrR   N)rB   float16bfloat16r:  r  r  r   r   r8   r   r   r   r4   r>  rc   rG   )r  r   r  r  rJ  rQ  rR  r  r  r  r  r  intermediate_dtypeintermediaterX  r  rW  s                   @rQ   create_multilayer_helperz"Reduction.create_multilayer_helperq  s   0  ??  	
 !'' 	
 	*668	C WW%%//o0NO
99:~
 *-Cs?/C"DDDD3/12	
 	
rR   c
                     t        |      }
t        |
|dz
  z   |      }| j                  ||      }| j                  |||
|||      }| j	                  ||||||g |||g|||	      S )rU  r$   )r4   r"   r;  rM  r]  )r  r   r  r  r   r   r  r  r  r  r  rG  rH  rJ  s                 rQ   r1  zReduction.create_multilayer  s    $ ((89o;UC
##NI>00&
G

 ++feL
 	
rR   c                     | j                  |	|      }| j                  ||||||      }| j                  |||||||||	d|
      S )rU  r  )r;  rS  r]  )r  r   r  r  r   rQ  rR  r  r  r  r  rH  rJ  s                rQ   r0  z+Reduction.create_multilayer_existing_ranges  sn    $ ##NI>@@% 

 ++% 
 	
rR   rX   )&r   r   r   r   r   r   r^   rB   r   rY  r   r2  r*  r)  r  r  r   r1  r   r   r   r
  r  r3  r]  r   r   r   r  r9  r;  r   r>  rM  rS  r]  r1  r0  rS   rR   rQ   r  r  4  s   4j {{!!

%#F=

  (,b V$b bH % %N  )6(=(='+L
L
 ;;L
 ;;	L

 38$L
 T
L
 t*L
 L
 &L
 V$L
 L
\  2 D D
  #5B	    B  & ;
;
 ;;;
 ;;	;

 S#X&;
 d;
 $(:;
 J;
 #4j;
 ;
 ;
 &;
 ;
z $
$
 ;;$
 ;;	$

 38$$
 T
$
 t*$
 $
 $
 &$
 $
L &
&
 ;;&
 ;;	&

 38$&
 d&
 $(:&
 J&
 #4j&
 &
 &&
 &
rR   r  c                     d| v rdS dS )Nwelfordru   r$   rS   r  s    rQ   num_reduction_outputsrb     s    ^+122rR   c                   @    e Zd ZU eed<    fdZd Zeej                  fde
j                  de
j                  deedef      dee   d	ee   d
edefd       Zed        Zede
j                  de
j                  deedef      dee   d	ee   d
ededefd       Z xZS )WelfordReductionoutput_indexc	           
      t    t              dk(  rd   }	nfd}	t        
| 	  |||	|||||       || _        y )Nr$   r   c                 2     t         fdD              S )Nc              3   0   K   | ]  } |        y wrX   rS   )r  r]   ry   reduction_idxs     rQ   r  z<WelfordReduction.__init__.<locals>.loader.<locals>.<genexpr>  s     HiR]3ir  rP  )ry   ri  	inner_fnss   ``rQ   rI  z)WelfordReduction.__init__.<locals>.loader  s    HiHHHrR   )rc   r  __init__re  )r   r   r   rj  r   r  r  r  re  rI  r  s      `      rQ   rk  zWelfordReduction.__init__  sQ     y>Qq\FI 			
 )rR   c           	          t        j                  | j                  | j                  | j                  | j                  ||            }|| j                     }t        j                  | ||      |      S rX   )r7   r  r   r  r  r   re  r  )r   rF  rG  rH  r  rA   r   s          rQ   r  z WelfordReduction.store_reduction%  s`    JJNNMM$/	
 t(()"";uEErR   r   r   rj  .r   r  r  r  c                    |dv sJ t         j                  j                  j                  t	                    }fd}	|dk(  r |	d      }
 |	d      } |	d      }|
||fS |dk(  r;fd|dk(  r |d          |	d       |	d      fS t        fd|D              S t        j                  |d   ||      \  }}|t        j                  k(  r|}|dkD  r| j                  ||||      S t        d	      D cg c]'  }t        j                  t        ||||            ) }}|D ]  }|j                           |S c c}w )
N>   r6  r  c                 V      fd}t         j                  |t                    S )Nc                 0    t        j                        S rX   r%  )ry   r   r"  s    rQ   r   z8WelfordReduction.create.<locals>.const.<locals>.inner_fn?  s    || rR   r(  r=  r  r>   )r"  r   r   r   r   s   ` rQ   constz&WelfordReduction.create.<locals>.const>  s2     ##!F|	 $  rR   r   r$   c                 V      fd}t         j                  |t                    S )Nc                 d    D cg c]  }t        j                  d       }} | |      S c c}w r   r+  )ry   r   r,  rI  r  s      rQ   r   z7WelfordReduction.create.<locals>.copy.<locals>.inner_fnU  s5    AQ&RAQAu}}Q'7AQO&R!#77 'Sr-  r(  rp  )rI  r   r   r   r   r  s   ` rQ   copyz%WelfordReduction.create.<locals>.copyT  s2    8 !''!%<	 (  rR   r6  c              3   .   K   | ]  } |        y wrX   rS   )r  r]   rt  s     rQ   r  z*WelfordReduction.create.<locals>.<genexpr>c  s     :	"T"X	   )r  r  ru   )r8   r   r   r.  r4   r?   r  r
  rY  r]  r1  rd   rG   r  rd  r   )r  r   r   rj  r   r  r  r  r  rq  meanm2weightr2  r  
output_idxresultsr   rt  s    `` ``            @rQ   r  zWelfordReduction.create/  s    !FFFF''**33MBR4ST	 a8DqB1XFV##a
 !11IaL)58U1X==:	:::&  **aL)+ + 	
e ]222!N19(( 	 	0 $Ah
 '
  $""	 ' 	 
 AIIK #
s   	,Ec                      y)Nr5  rS   r8  s     rQ   r;  zWelfordReduction.default_value  s    rR   r  c	                     t              t        j                  j                  j	                  t        j                  z  d             }	|	r?|dk7  r:fd}
 j                  ||d   t        |
d      t        |
d      f|d|      S t        dz
  z         t        j                  |t         fd|D              g |g||      }|D ]  }|j                           |D cg c]  }|j                          }}d t        j                  j                  j                  t        |            } j!                  ||      }t        j                  |t        fd	|D              |gd|      S c c}w )
rU  r   r  c                 0    t        j                  |      S rX   r%  )ry   ri  r   r   s      rQ   r9  z4WelfordReduction.create_multilayer.<locals>.constant  s    ||E511rR   r   r$   )r   r   rj  r   r  r  r  r  c           	   3   L   K   | ]  }j                  |d         yw)r   )rH  N)rM  )r  rI  rG  r  r  r  r  s     rQ   r  z5WelfordReduction.create_multilayer.<locals>.<genexpr>  s?      
 (F ++$# ,  (s   !$c                      |g | |      S rX   rS   )re   r,  rI  s      rQ   intermediate_loader_fnzBWelfordReduction.create_multilayer.<locals>.intermediate_loader_fn  s    4E4O455rR   c              3   T   K   | ]  }t        |j                                 ! yw))rI  N)r   r   )r  rf   r  s     rQ   r  z5WelfordReduction.create_multilayer.<locals>.<genexpr>  s*      &A .q}}GG&   %()r4   r8   r   r   r   rH   r   r1  r   r"   rd  r  r?   r   r   r   r>  )r  r   r   rj  r   r  r  r  r  rI  r9  intermediatesrf   	i_loadersr  rG  r  r  s   ` `  ` `       @@@rQ   r1  z"WelfordReduction.create_multilayer  s     ((89((@@HH_u,a0
 
	 +<<2 ((aLHA.HA.
 !10- )   o;UC
(// 
 (
 
 feL#
& AIIK  /<<mQ]]_m	<	6 WW%%//f0EF
99:~
  && &  G
 	
 =s   8F)r   r   r   r   r   rk  r  r3  rY  r]  rB   r   r   r   r   r   r   r   r^   r  r   r;  r1  r4  r5  s   @rQ   rd  rd    s=   )<F  )6(=(=ss {{s HS#X./	s
 T
s t*s s &s sj   V
V
 {{V
 HS#X./	V

 T
V
 t*V
 V
 V
 &V
 V
rR   rd  c                 >    	 t        | d       y# t        $ r Y yw xY w)NFfreezeT)as_storage_and_layoutr   r   s    rQ   r   r     s&    a. s    	c                 b    	 t        | d      \  }}|j                         S # t        $ r Y yw xY wNFr  )r  is_contiguousr   )r   bufferr  s      rQ    is_contiguous_storage_and_layoutr    s8    .q?##%% s   " 	..c                 L   t        | t              rt        | j                  |||      S t        | t              rt        | j                  t
              r|rz|r@| j                  j                          | j                  j                  j                         s:J || j                  j                  |       n| j                  j                          | | j                  j                  fS t        | t              r(t        | j                  |      \  }}|| j                  fS t        )z0Try to simplify x into a StorageBox and a Layoutr  want_contiguousstride_orderr  )r=   rG   r  r  
StorageBoxBufferfreeze_layoutr  r  freeze_layout_with_stride_orderr  ReinterpretViewr   )r   r  r  r  r  r   s         rQ   r  r    s    !Y$FF+%	
 	
 !Z Z%?$$&vv}}22444)66|D$$&!&&--!_% *FF
	 qxx
rR   )r  c                 d    	 t        | d      \  }}|j                  |      S # t        $ r Y yw xY wr  )r  is_stride_orderedr   )r   r  r  r  s       rQ   "is_stride_order_storage_and_layoutr  6  s:    .q?''55 s    # 	//c                   ~    e Zd ZU eed<   d Zd Zd Zd Zd Z	d Z
d Zd	 Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zy)BaseViewr  c                     t        d|        )Nzmake_reindexer NYI on r   r   s    rQ   make_reindexerzBaseView.make_reindexerB  s    !$:4&"ABBrR   c                 j    | j                   j                         | j                         fd}|S )Nc                        |             S rX   rS   ry   innerrh   s    rQ   rG  z&BaseView.make_indexer.<locals>.indexerI      &&rR   )r  r   r  )r   rG  r  rh   s     @@rQ   r   zBaseView.make_indexerE  s/    		&&(%%'	' rR   c                 j    | j                   j                         | j                         fd}|S )Nc                        |             S rX   rS   r  s    rQ   rI  z$BaseView.make_loader.<locals>.loaderR  r  rR   )r  r   r  )r   rI  r  rh   s     @@rQ   r   zBaseView.make_loaderN  s/    		%%'%%'	' rR   c                 6    | j                   j                         S rX   )r  r   r   s    rQ   r   zBaseView.get_dtypeW      yy""$$rR   c                 6    | j                   j                         S rX   )r  r   r   s    rQ   r   zBaseView.get_layoutZ      yy##%%rR   c                 6    | j                   j                         S rX   )r  r   r   s    rQ   r   zBaseView.get_device]  r  rR   c                      y rX   rS   r   s    rQ   r  zBaseView.get_origin_node`  rB  rR   c                 6    | j                   j                         S rX   r  r   r   s    rQ   r   zBaseView.get_namec      yy!!##rR   c                 8    | j                   j                  |      S rX   )r  r   r   userss     rQ   r   zBaseView.mark_reusef  s    yy##E**rR   c                 6    | j                   j                         S rX   )r  r   r   s    rQ   r   zBaseView.has_exceeded_max_readsi  s    yy//11rR   c                 6    | j                   j                         S rX   r  r   r   s    rQ   r   zBaseView.realizel      yy  ""rR   c                 6    | j                   j                         S rX   )r  r   r   s    rQ   r   zBaseView.realize_hinto  s    yy%%''rR   c                 6    | j                   j                         S rX   )r  r   r   s    rQ   r   zBaseView.get_storage_numelr  s    yy**,,rR   c                 6    | j                   j                         S rX   )r  r  r   s    rQ   r  zBaseView.is_externu  r  rR   c                     t        j                  t        dd      5  t        | j	                         | j                               j                  cd d d        S # 1 sw Y   y xY wr%  )r   r'  r(  r)   r   r   r+  r   s    rQ   r   zBaseView.get_readsx  sD    \\.*:DA&  " e	 BAAs   2AA!c                 d    | }t        |t              r|j                  }t        |t              r|S rX   )r=   r  r  r   r   s     rQ   unwrap_viewzBaseView.unwrap_view  s+    H%A H%rR   c                     | j                         } t        j                  t        d|      |      }t	        || j                         || j                               S rL  )r   r   r'  rO  r=  r   r   rP  s      rQ   r1  zBaseView.constant_to_device  sH    !!#Hn.?HP!164==?KKrR   N)r   r   r   r   r   r  r   r   r   r   r   r  r   r   r   r   r   r   r  r   r  r1  rS   rR   rQ   r  r  >  s^    
LC%&&$+2#(-%LrR   r  c                   J    e Zd ZU ee   ed<   ed        Zed        Z	d Z
d Zy)rE   r   c                 N   t        t        t        j                  |            }| j	                         }dgt        |      t        |      z
  z  t        |      z   }t        |      t        |      k(  sJ t        t        |            D ]  }||   dk(  s||   J ||   ||<    |S )zReplace `-1` with correct sizesNr  )r>   r   rH   r  r   rc   rd   )r   new_sizeold_sizerf   s       rQ   _normalize_sizezExpandView._normalize_size  s     ELL(34::<6S]S]:;d8nL8}H---s8}%A{b {...&qk & rR   c                    | j                  ||      }t        |      rt        |      \  }}t        |      t        |j                        z
  }|dk\  sJ t        j                  d      g|z  }t        |j                  |j                        D ]0  \  }}|j                  |dk7  r|nt        j                  d             2 t        |j                  |j                  t        |      ||j                        }	t        ||	      S t!        ||      S Nr   r$   )r  r   r  rc   r   rH   r   rj   r   r  r  r   r   r>   offsetr  rE   )
r  r   r  storage
old_layoutskip
new_strider   r   
new_layouts
             rQ   r  zExpandView.create  s    &&q(3 #"7":GZx=3z#77D199--*+d2J #J$5$5z G!!DAI&5==;KL !H$!!  X!!J #7J77!X&&rR   c                     | j                   S rX   r   r   s    rQ   r   zExpandView.get_size      yyrR   c                     | j                         }| j                  j                         t        |      t              z
  fd}|S )Nc                     t        | d        } t        |       t              k(  sJ t        t                    D ]#  }|   dk(  st        j                  d      | |<   % | S r  )r>   rc   rd   rH   r   )re   rf   actualr  s     rQ   rh   z*ExpandView.make_reindexer.<locals>.reindex  s`    tu&Eu:V,,,3v;'!9>$}}Q/E!H ( LrR   )r   r  rc   )r   targetrh   r  r  s      @@rQ   r  zExpandView.make_reindexer  s>    ##%6{S[(	 rR   N)r   r   r   r   r   r   r   r  r3  r  r   r  rS   rR   rQ   rE   rE     s<    
t*
 
 ' '*rR   rE   c                   J    e Zd ZU ee   ed<   ed        Zed        Zd Z	d Z
y)PermuteViewdimsc           
         | j                  |      }t        |      t        t        t        |                  k(  sJ t	        |      r}t        |      \  }}t        |j                  |j                  |D cg c]  }|j                  |    c}|D cg c]  }|j                  |    c}|j                        }t        ||      S t        ||      S c c}w c c}w rX   )_map_neg_dimsr   rd   rc   r   r  r  r   r   r   r   r  r  r  )r  r   r  r  r  rf   r  s          rQ   r  zPermuteView.create  s      &4yCc$i 01111 #"7":GZ$!!  -12T#T2/34t!""1%t4!!J #7J771d## 34s   5CCc                 R    |D cg c]  }|dk\  r|nt        |      |z    c}S c c}w r   )rc   )r  r  dims      rQ   r  zPermuteView._map_neg_dims  s-    @DEsaxSY_4EEEs   $c                    t        | j                  | j                              t        t        t	        | j                                    k(  sJ | j
                  j                         }| j                  D cg c]  }||   	 c}S c c}w rX   )r   r  r  rd   rc   r  r   )r   r   rf   s      rQ   r   zPermuteView.get_size  sh    4%%dii01Ss499~9N5OOOOyy!!#!%+AQ+++s   7Bc                 B   t        | j                        D ci c]  \  }}||
 c}}t        t        | j                              D cg c]  }|   	 c}t	              t	        t        t        | j                                    k(  sJ fd}|S c c}}w c c}w )Nc                 4    D cg c]  }| |   	 c}S c c}w rX   rS   )re   rf   invs     rQ   rh   z+PermuteView.make_reindexer.<locals>.reindex  s    &)*cE!Hc***s   )rx   r  rd   rc   r   )r   rf   jrh   r  s       @rQ   r  zPermuteView.make_reindexer  s     )$)) 45 41q!t 45$S^454!s1v453x3uS^45555	+  65s   BBN)r   r   r   r   r   r   r3  r  r  r   r  rS   rR   rQ   r  r    s>    
t*$ $" F F,
rR   r  c                   \    e Zd Zeddd       Zedeej                  df   fd       Z	d Z
y)SqueezeViewN)r  c          	         t        |      rt        |      \  }}g }g }|6t        |t              sJ d       d|k  r|t	        |j
                        k  sJ t        t        |j
                  |j                              D ]g  \  }\  }}	|)|dk7  s|j                  |       |j                  |	       4||k7  r#|j                  |       |j                  |	       \|dk(  rbJ d        t        |j                  |j                  |||j                        }
t        ||
      S |8t        j!                  ||j#                         D cg c]
  }|dk7  s	| c}      S |j#                         |   dk(  sJ t        j!                  |t        |j#                               D cg c]  \  }}||k7  s| c}}      S c c}w c c}}w )Nzexpected integer dim argumentr   r$   zexpected squeezed size to be 1)r   r  r=   r   rc   r   rx   rj   r   r  r  r   r   r  r  rK  r  r   )r  r   r  r  r  r  r  rf   r   r   r  r   s               rQ   r  zSqueezeView.create  s    #"7":GZHJ!#s+L-LL+CxC#joo*>$>>>%.s:??JDUDU/V%W!>D&;qy -"))&1Cx -"))&1#qyJ*JJy &X %!!  !!J #7J77;;;qajjl"Ela1f1l"EFF::<$)));;q1::<1H"U1HAAQTH11H"UVV #F #Vs   
F;
F;
!G /G r   .c                    | D cg c]
  }|dk7  s	| }}t        |       D cg c]  \  }}|dk7  s| c}}t        |       dt        t        j                     dt
        t        j                  df   ffd}||fS c c}w c c}}w )Nr$   re   r   .c                     t        |       t              k(  sJ |  d        t        j                  d      gz  }t        |       D ]
  \  }}|||<    t	        |      S )N r   )rc   rH   r   rj   r?   )re   rB  ry   r   lengthnot_ones       rQ   rh   z%SqueezeView.squeezer.<locals>.reindex  sh    u:W-C%'/CC-q)*V3Igu-Q!"	# .##rR   )rx   rc   r   rH   r   r   )r   r   r  rf   rh   r  r  s        @@rQ   squeezerzSqueezeView.squeezer  s    #.t!qAvAt.!*4;AAF1;T	$4

+ 	$ejj#o0F 	$    /;s   
BBBBc                     t        d      )Nzuse SqueezeView.create())AssertionError)r   r  s     rQ   rk  zSqueezeView.__init__'  s    788rR   )r   r   r   r3  r  r   r   rH   r   r  rk  rS   rR   rQ   r  r    sG    " #W #WJ !uUZZ_- ! !9rR   r  c                   ^    e Zd ZU ee   ed<   edef   ed<   d Zd Z	d Z
e
Zed        Zd Zy	)
GenericViewr   .rh   c                     | j                   S rX   )rh   r   s    rQ   r  zGenericView.make_reindexer0  s    ||rR   c                     t        t        | j                              D cg c]  }t        d|        }}t	        | j                  |            }ddj                  t        t        |             d| S c c}w )Nrf   zlambda , : )	rd   rc   r   r6   r>   rh   r   r   r^   )r   r  	index_old	index_news       rQ   reindex_strzGenericView.reindex_str3  sm    49#dii.4IJ4Iq\AaS'*4I	Ji01	3sI#6789+FF Ks   A7c                 z    | j                  | j                  d| j                   d| j                          g      S )Nsize=zreindex=)r   r  r   r  r   s    rQ   r   zGenericView.__str__8  s=    YY%		{+x8H8H8J7K-LM
 	
rR   c                 (     | |t        |      |      S rX   )r>   )r  r   r  rh   s       rQ   r  zGenericView.create?  s    1d8ng..rR   c                     | j                   S rX   r  r   s    rQ   r   zGenericView.get_sizeC  r  rR   N)r   r   r   r   r   r   r   r   r  r  r   r2  r3  r  r   rS   rR   rQ   r  r  +  sI    
t*c3hG


 H/ /rR   r  c                   \    e Zd Zed        Zed        Zed        Zed        Zed        Z	y)rK  c                     t        j                  |       } t        j                  |      }t        j                  j                  j
                  j                  } |t        j                  | d            r| |z   } | S r   )rH   r  r8   r   r   	shape_envevaluate_exprLt)ry   r   r  s      rQ   handle_negative_indexzView.handle_negative_indexI  sZ    ll3||D!((22@@#q)**C
rR   c                   	 t        |t        t        f      sJ | j                  |j	                         |      \  	}t
        j                  j                  j                  	|      r|S d}t        t        	            dkD  st        t        |            dkD  rd}d|v r	fd} | |t        |      |      S t        |      s|rw|rt        |      s|j                          t        |      \  }}t        |j                  |j                   |t"        j%                  |      |j&                        }t)        ||      S | j+                  	|      } | |t        |      |      S )NFr   Tc                 4    t        dgt              z        S r   )r?   rc   )re   r  s    rQ   fake_reindexz!View.create.<locals>.fake_reindexd  s    aS3x=011rR   )r=   r?   r>   resolve_negative_sizer   r8   r   r   statically_known_list_equalsrc   r   r  r    as_contiguous_storage_and_layoutr  r   r   r(  r  r  r  rL  )
r  r   r  unbacked_symbols_in_sizesr  r  r  r  rh   r  s
            @rQ   r  zView.createR  sC   (UDM222 66qzz|XN( 77888LH$)!%h/014(23a7(,%=2 q$x.,77-a04M(2RST2U 		"B1"EGZ$!!  11(;!!J #7J77--hA1d8ng..rR   c                 <   |D cg c]+  }t         j                  j                  j                  |      - }}| D cg c]+  }t         j                  j                  j                  |      - } }t	        |      }t        t        |            D ]E  }||   dk(  st        j                  d      ||<   t        t        |       t        |            ||<    n t         j                  j                  j                  t        |       t        |             | |fS c c}w c c}w )Nr  r$   )r8   r   r   r.  r>   rd   rc   rH   r   r!   r4   guard_equals)r  r  r   rf   s       rQ   r   zView.resolve_negative_size{  s    :BC(QAGG$$--a0(C:BC(QAGG$$--a0(C>s8}%A{b #mmA.&}X'>h@WX	 & 	
%%mH&=}X?VW!! DCs
   0D0Dc                     	 | j                  ||      }|S # t        t        f$ r@ t        |      g}| j                  ||      }| j                  ||      }t	        ||      }Y |S w xY wrX   )_dynamic_reshape_indexerr  
IndexErrorr4   rt   )r  r  r  rh   flatrr   rs   s          rQ   rL  zView.dynamic_reshape_indexer  sx    	:228XFG  
+ 	:!(+,D33HdCH33D(CH%h9G	:s    AA%$A%c                     t         j                  j                  j                  }t	        t        |            D cg c]  }t        d|        c}t        t        |            }t        |       }g |r7|r4|j                         }|j                         \  }}|dk(  r9j                  t        j                  d             |j                  ||f       n|dk(  r|j                  |       n ||       ||      k(  r=j                  |       t         j                  j                  j                  ||       ng ||       ||      k  r ||       ||      k  r2|j                         \  }	}
|	|z  |z   }||
z  } ||       ||      k  r2j                  |       t         j                  j                  j                  ||       n ||       ||      kD  rt        j                  d      }|}j                  t        |||             ||z  } ||       ||      kD  rH|j                         }j                  t        |||             ||z  }||z  } ||       ||      kD  rHt         j                  j                  j                  ||       n
t               |r|r4|ra|j                         }t         j                  j                  j                  |d       j                  t        j                  d             |ra|r@|j                         \  }}t         j                  j                  j                  |d       |r@t        t!                    t              t        |       k(  sJ fd}|S c c}w )zG
        Perform a reshape entirely by modifying indexing math
        viewr$   r   c                     t        |       t              k(  sJ t        |       t              f       t        t        |             t        fdD              S )Nc              3   6   K   | ]  }t        |        y wrX   )r5   )r  r   replacementss     rQ   r  zAView._dynamic_reshape_indexer.<locals>.reindex.<locals>.<genexpr>  s     HiA|4is   )rc   r@   rj   r?   )re   r  rH  	view_exprs    @rQ   rh   z.View._dynamic_reshape_indexer.<locals>.reindex  sM    u:T*CSZT,CC*D% 01LHiHHHrR   )r8   r   r   r   rd   rc   r6   r>   rj   r  r  rH   r   r  r#   r  reversed)r  r  r   rf   	stack_new	stack_oldsize_oldvarsize_newvar2	size_new2divisormodulusrh   rH  r  s                 @@rQ   r  zView._dynamic_reshape_indexer  s   
 GG$$..	27H2FG2FQtA3Z(2FGT8,-	N		I }}H%MMOMC1}  q!12  #x1Q  *8$	((;;  %  --hA8$y'::)Ih,??&/mmoOD)/C/C')3H  )Ih,??   %  --hA8$y'::--*"  gw!GH!G+)Ih,??'mmoG$$_S'7%KL%/G''1H	  )Ih,??
   --hA$&&= I@  }}HGG))(A6U]]1-. 
 %MMOMCGG))(A6  ),-	9~X...	I
 o Hs   NN)
r   r   r   r   r  r3  r  r   rL  r  rS   rR   rQ   rK  rK  G  se      &/ &/P " " 	 	 < <rR   rK  c                   |     e Zd ZU dZded<    fdZd ZeZd Zd Z	d Z
d	 Zd
 Zd Zd Zd Zd Zd ZddZ xZS )r  z*Pretend our storage has a different layoutLayoutr  c                     t         |           t        | j                  t              r | j                  j                         | _        y y rX   )r  r   r=   r  r  r  r  s    rQ   r   zReinterpretView.__post_init__  s5    dii*		--/DI +rR   c                 P    | j                  | j                  | j                  g      S rX   )r   r  r  r   s    rQ   r   zReinterpretView.__str__  s&    		
 	
rR   c                 6    | j                   j                         S rX   r  r   s    rQ   r   zReinterpretView.get_name  r  rR   c                 .    | j                   j                  S rX   r  r   r   s    rQ   r   zReinterpretView.get_device      {{!!!rR   c                      y rX   rS   r   s    rQ   r  zReinterpretView.get_origin_node  rB  rR   c                 .    | j                   j                  S rX   )r  r   r   s    rQ   r   zReinterpretView.get_dtype  s    {{   rR   c                 @    t        | j                  j                        S rX   r>   r  r   r   s    rQ   r   zReinterpretView.get_size      DKK$$%%rR   c                 @    t        | j                  j                        S rX   r>   r  r   r   s    rQ   r   zReinterpretView.get_stride      DKK&&''rR   c                       fd}|S )Nc                     j                   j                         }t        j                  j	                          ||             S rX   )r  r   r7   loadr   re   rG  r   s     rQ   rI  z+ReinterpretView.make_loader.<locals>.loader  s0    kk..0G88DMMOWU^<<rR   rS   r   rI  s   ` rQ   r   zReinterpretView.make_loader  s    	= rR   c                 6    | j                   j                         S rX   r  r   r   s    rQ   r   zReinterpretView.make_indexer      {{''))rR   c                     | j                   S rX   r  r   s    rQ   r   zReinterpretView.get_layout  r
  rR   c                      y rX   rS   r   s    rQ   r  zReinterpretView.freeze_layout
      rR   c                     t         j                  j                  j                  | j                  | j
                  j                  | j
                  j                  | j
                  j                  |      S rX   )	r8   r   wrapper_codecodegen_reinterpret_viewr  r  r   r   r  r   s     rQ   r   z!ReinterpretView.codegen_reference  sR     ww##<<IIKKKKKK
 	
rR   rX   )r   r   r   __doc__r   r   r   r2  r   r   r  r   r   r   r   r   r   r  r   r4  r5  s   @rQ   r  r    sT    40

 H$"!&(*

rR   r  c                       e Zd Zedd       Zy)	SliceViewc           	         t        j                        dkD  sJ 	 dk(  r|dk\  rdk(  r|S t        j                  j
                  }t        |j                               | j                           | j                  |         }t              st        |      r0t        j                  |         }t        j                  |      n'|j                  |         }|j                  |      t        |z
  dz
  z         <   t        |      rzt        |      \  }}t        |j                        }	|	   z  |	<   t!        |j"                  |j$                  |	|j&                  |j                     z  z         }
t)        ||
      S fd}t+        ||      S # t        $ r Y w xY w)Nr   l    r$   c                     t        |       t              k(  sJ d|  d        t        |       } |    z  z   | <   | S )Nzwrong ndim r  )rc   r>   )re   r  r  startsteps    rQ   rh   z!SliceView.create.<locals>.reindexB  sP    u:X.P+eWAhZ0PP.KEsd*U2E#JLrR   )r   rh   )rH   r  	TypeErrorr8   r   r   r>   r   r  r   Minevaluate_minr"   r   r  r   r  r   r   r  r  r;  )r  r   r  r>  endr?  r   r  r  r  r  rh   r  s     `` `      @rQ   r  zSliceView.create  s   ||D!axx	zcY.419 77##

%))%#?''Xc]; '+@+E))C#/CIIeS)E''Xc];C))%5E uq!94@ #"7":GZj//0J(o4JsO$!!  !!J$5$5c$:U$BBJ #7J77	 7;;O  		s   F0 0	F=<F=N)r$   )r   r   r   r3  r  rS   rR   rQ   r;  r;    s    -< -<rR   r;  c                   z    e Zd ZU ej                  ed<   ej                  ed<   d Zd Zd Z	d Z
d Zd Zd	 Zd
 Zy)BaseConstantr   r   c                      yNrS   rS   r   s    rQ   r   zBaseConstant.get_sizeP      rR   c                     | j                   S rX   r  r   s    rQ   r   zBaseConstant.get_dtypeS  r  rR   c                     | j                   S rX   r	  r   s    rQ   r   zBaseConstant.get_deviceV  r
  rR   c                      y rX   rS   r   s    rQ   r  zBaseConstant.get_origin_nodeY  rB  rR   c                      y rX   rS   r  s     rQ   r   zBaseConstant.mark_reuse\  r5  rR   c                      yr  rS   r   s    rQ   r   z#BaseConstant.has_exceeded_max_reads_  r  rR   c                      yrG  rS   r   s    rQ   r   zBaseConstant.get_readsb  rH  rR   c                      yr  rS   r   s    rQ   r  zBaseConstant.is_externe  r  rR   N)r   r   r   rB   r   r   r   r   r   r   r  r   r   r   r  rS   rR   rQ   rE  rE  L  s>    ;;LLrR   rE  c                   f    e Zd ZU eed<   ej                  ed<   ej                  ed<   d Zd Z	d Z
y)Constantr   r   r   c                       fd}|S )Nc                 X    t        j                  j                  j                        S rX   )r7   r9  r   r   re   r   s    rQ   rI  z$Constant.make_loader.<locals>.loaderp  s    <<

DJJ77rR   rS   r.  s   ` rQ   r   zConstant.make_loadero  s    	8 rR   c                      y rX   rS   r   s    rQ   r   zConstant.realizeu  r5  rR   c                 D    t        | j                  | j                  |      S rX   )rQ  r   r   r0  s     rQ   r1  zConstant.constant_to_devicex  s    

DJJ77rR   N)r   r   r   r   r   rB   r   r   r   r   r1  rS   rR   rQ   rQ  rQ  i  s*    J;;LL8rR   rQ  c                   `    e Zd ZU eed<   ej                  ed<   ej                  ed<   d Zd Z	y)IndexingConstantre   r   r   c                       fd}|S )Nc                 X    t        j                  j                  j                        S rX   )r7   r  re   r   rT  s    rQ   rI  z,IndexingConstant.make_loader.<locals>.loader  s    >>$**djj99rR   rS   r.  s   ` rQ   r   zIndexingConstant.make_loader  s    	: rR   c                 D    t        | j                  | j                  |      S rX   )rX  re   r   r0  s     rQ   r1  z#IndexingConstant.constant_to_device  s    

DJJ??rR   N)
r   r   r   r   r   rB   r   r   r   r1  rS   rR   rQ   rX  rX  |  s&    J;;LL@rR   rX  c                       e Zd Z ed      fdej
                  dej                  dee   de	e
eeef         def
dZed        Zd	 ZeZd
 Zd Zd Zd Zd Zd Zd ZdefdZdej                  fdZy)r  r   r   r   r   r   r  c                     |$t        |      t        |      k(  sJ d| d|        || _        || _        t        d |D              sJ || _        || _        || _        y )Nr  	, stride=c              3   H   K   | ]  }t        |t        t        f        y wrX   )r=   r   r   r  s     rQ   r  z"Layout.__init__.<locals>.<genexpr>  s     <t!:a$-ts    ")rc   r   r   r  r   _strider  )r   r   r   r   r   r  s         rQ   rk  zLayout.__init__  sv     ~Tc/
 "
 	+4&	&*	+ 
 
<t<<<<	rR   c                     | j                   S rX   )r`  r   s    rQ   r   zLayout.stride  s    ||rR   c                     d}| j                   dk7  rd| j                    }t        |       j                   d| j                  j                   d| j                   d| j
                   d| j                   | dS )	Nr   r   z	, offset=z('', z, size=r^  ))r  rM   r   r   r   r   r   )r   r  s     rQ   r   zLayout.__str__  sw    ;;! .FDz""#2dkk&6&6%7s4::, GII;i}VHA?	
rR   c                     t        | j                  t        j                  | j                        | j                        D ]  \  }}}|dk7  s||k7  s y yNr$   FT)rj   r   r(  r  r   r   leftrightr   s       rQ   r  zLayout.is_contiguous  sM    !$KK::499Etyy"
D% qyTU]	"

 rR   c                     t        | j                        }|dvryt        | j                  t	        | j                        | j                        D ]  \  }}}|dk7  s||k7  s y y)N)r     Fr$   T)rc   r   rj   r   r   )r   ndimrh  ri  r   s        rQ   is_channels_last_contiguousz"Layout.is_channels_last_contiguous  s`    499~v!$KK7		BDII"
D% qyTU]	"

 rR   c                     t        | j                  t        t        j	                  | j
                              | j
                        D ]  \  }}}|dk7  s||k7  s y yrf  )rj   r   r  r(  r  r   rg  s       rQ   is_transposedzLayout.is_transposed  sT    !$KK^66tyyABII"
D%
 qyTU]"
 rR   c                    t        | j                        t        |      k(  sJ t        | j                        D cg c]5  \  }}t        j
                  j                  j                  |d      dk7  r|7 }}}|D cg c]  }| j                  |    }}|D cg c]  }||   	 }}d } ||      }dgt        |      z  }t        t        |            D ]4  }t        j
                  j                  j                  ||         |||   <   6 t        t        |      dz
        D ]  }||   ||dz      kD  s y yc c}}w c c}w c c}w )Nrv   )fallbackr$   c                 `    t        |       }| D cg c]  }|j                  |       c}S c c}w rX   )r  re   )arr
sorted_arrelements      rQ   sorted_indicesz0Layout.is_stride_ordered.<locals>.sorted_indices  s.    J=@AS'J$$W-SAAAs   +r  FT)	rc   r   rx   r   r8   r   r   r   rd   )r   rk   rf   r  non_1_indicesr   rv  stride_ordereds           rQ   r  zLayout.is_stride_ordered  sF   4;;3u:---
 $DII.
.3ww))#):a? . 	 
 +88-Q$++a.-8#01=aq=1	B
 u% E
*s5z"A'(ww'7'7'A'A&)'LN58$ # s5zA~&Aa >!a%#88 ' 1
 91s   :D9=D?Ec                     dgt        t        t        dt        | j                        dz
                    z   }t        |      g|z   }| j                  |      S r  )r>   r  rd   rc   r   r  r   rk   s     rQ   is_channels_last_stride_orderedz&Layout.is_channels_last_stride_ordered  sN    d8E!S-=-A$BCDDUu$%%e,,rR   c                     t        | j                  | j                  | j                  | j                  | j
                        S rX   )r  r   r   r   r   r  r   s    rQ   as_fixedzLayout.as_fixed  s2    KKJJIIKKKK
 	
rR   c                     t         j                  sJ dt        |       j                   d       | j	                         j                         S )Nzconvert z to FixedLayout first)r(  r&  rM   r   r}  r   r   s    rQ   r   zLayout.make_indexer  sG    ))	Ad4j))**?@	A)}}++--rR   r   c                    | j                   |j                   k(  xrj | j                  |j                  k(  xrO | j                  |j                  k(  xr4 | j                  |j                  k(  xr | j                  |j                  k(  S rX   r   r   r   r   r  )r   others     rQ   __eq__zLayout.__eq__  so    KK5<<' ,

ekk),		UZZ', u||+, u||+	
rR   c                 X    t        | j                  | j                  | j                        S rX   )r   r   r   r  r   s    rQ   storage_sizezLayout.storage_size	  s    .tyy$++t{{SSrR   N)r   r   r   r   rB   r   r   r   r   r   r   r   r   rk  propertyr   r   r2  r  rm  ro  r  r{  r}  r   r   r  rH   r  rS   rR   rQ   r  r    s     qz {{ 4j	
 %c	"234 $  
 H	<-
.
t 
Tejj TrR   r  c                        e Zd ZdZd ed      fdej                  dej                  dee	e
   e	e   f   deeee
ef         dee
ef   f
 fd	Zd
 Z xZS )r  z A Tensor layout we cannot changeNr   r   r   r   r   r  c                 Z    |t         j                  |      }t        |   |||||       y rX   )r(  r  r  rk  )r   r   r   r   r   r  r  s         rQ   rk  zFixedLayout.__init__	  s6     >#66t<F	
rR   c                       fd}|S )z1A closure containing math to read a given elementc                    t        |       t        j                        cxk(  rt        j                        k(  sJ  J j                  }t	        | j                  j                        D ]  \  }}}|dk7  s|||z  z   } |S r  )rc   r   r   r  rj   )re   resultry   r   szr   s        rQ   rG  z)FixedLayout.make_indexer.<locals>.indexer$	  sx    u:T[[!1CS^CCCCC[[F#&udkk499#EVR7#cFl2F $F MrR   rS   r   rG  s   ` rQ   r   zFixedLayout.make_indexer!	  s    	 rR   )r   r   r   r9  r   rB   r   r   r   r   r   r   r   r   rk  r   r4  r5  s   @rQ   r  r  	  s    * 8<#*1:

 {{
 DJS	)*	

 %c	"234
 dCi 
$rR   r  c                   z     e Zd ZdZdZed        Zed        Zed        Zed        Z	d Z
d Zd	 Zd fd
	Z xZS )r(  z(A Tensor layout we are allowed to changeFc                     t        |       dk(  rg S t        j                  d      g}t        | dd        D ]  }|j	                  ||d   z          t        t        |            S )Nr   r$   r  )rc   rH   r   r  r  r>   )sizesreversed_stridesr   s      rQ   r  z!FlexibleLayout.contiguous_strides4	  sc    u:?I!MM!,-U12Y'D##D+;B+?$?@ (H-.//rR   c                     t        t        t        |                   t        |      k(  sJ t        j                  d      }dgt        |      z  }|D ]  }|||<   || |   z  } |S )z
        Create a stride based on the order the dimensions should be filled in.

        In this format, channels last would be:
            [1, 3, 2, 0]
        r$   N)r   rd   rc   rH   r   )r  rk   next_strider  rf   s        rQ   fill_orderedzFlexibleLayout.fill_ordered=	  sj     5U$%U333mmA&&3u:%A$GAJ%a0K  rR   c                     t        t        t        |                   t        |      k(  sJ t        |      }t        j                  | |      S )z
        Create a stride based on the sorted order of a permuted range.

        In this format, channels last would be:
            [3, 0, 2, 1]
        )r   rd   rc   r}   r(  r  )r  rk   r|   s      rQ   rx  zFlexibleLayout.stride_orderedN	  sA     5U$%U333,U3
**5*==rR   c                 (   t        |       t        |      k(  sJ |D cg c]+  }t        j                  j                  j	                  |      - }}t        t        t        |            |j                        }t        j                  | |      S c c}w )z
        Create a stride that has the same stride order as given stride

        For example, if given stride is [1000, 1, 100, 10],
        the fill order should be [1, 3, 2, 0]
        r  )
rc   r8   r   r   r   r  rd   __getitem__r(  r  )r  r   r   r|   s       rQ   same_orderedzFlexibleLayout.same_orderedZ	  sw     5zS[(((9?@A!''"",,Q/@E#f+.F4F4FG
**5*== As   0Bc           	          t        | j                  | j                  | j                  | j	                  | j                  |      | j
                        S rX   )r  r   r   r   rx  r  rz  s     rQ   as_stride_orderzFlexibleLayout.as_stride_orderg	  s?    KKJJII		51KK
 	
rR   c           	          t        | j                  | j                  | j                  | j	                  | j                  |      | j
                        S rX   )r  r   r   r   r  r  rz  s     rQ   as_fill_orderzFlexibleLayout.as_fill_orderp	  s?    KKJJIIdii/KK
 	
rR   c           	          t        | j                  | j                  | j                  | j	                  | j                  |      | j
                        S rX   )r  r   r   r   r  r  r   r   s     rQ   as_same_orderzFlexibleLayout.as_same_ordery	  s?    KKJJIIdii0KK
 	
rR   c                     |rt         j                  ||      }nt         j                  |      }t        |   ||||       y rX   )r(  r  r  r  rk  )r   r   r   r   r  r  r  s         rQ   rk  zFlexibleLayout.__init__	  s;    $11$EG$77=Gg6rR   rX   )r   r   r   r9  r&  r   r  r  rx  r  r  r  r  rk  r4  r5  s   @rQ   r(  r(  /	  sr    2N0 0    	> 	> 
> 
>


7 7rR   r(  c                   >     e Zd ZdZdeedf   f fdZd Zd Z xZ	S )AliasedLayoutz)Shares the same storage as another tensorr  rG   c                     |j                         }t        | 	  |j                  |j                  |j
                  |j                         || _        y rX   )r   r  rk  r   r   r   r   r  )r   r  r  r  s      rQ   rk  zAliasedLayout.__init__	  sA    "MMLLKKMM		
 	rR   c                 >    | j                         j                         S rX   )r}  r   r   s    rQ   r   zAliasedLayout.make_indexer	  s    }}++--rR   c                     | j                   j                         j                  }|dk(  ryddlm} t
        j                  j                  j                  ||      S )Nr   Tr$   )	ALIGNMENT)	r  r   r  
compile_fxr  r8   r   r   statically_known_multiple_of)r   r  r  s      rQ   maybe_guard_alignedz!AliasedLayout.maybe_guard_aligned	  sD    %%'..Q;)ww<<VYOOrR   )
r   r   r   r9  r   r  rk  r   r  r4  r5  s   @rQ   r  r  	  s'    3U8[#89 .PrR   r  c                       e Zd Zd Zd Zd Zy)
NoneLayoutc                 2    || _         dg| _        dg| _        y r   )r   r   r   r0  s     rQ   rk  zNoneLayout.__init__	  s    C	crR   c                      yr   rS   r   s    rQ   r  zNoneLayout.storage_size	      rR   c                     | S rX   rS   r   s    rQ   r}  zNoneLayout.as_fixed	      rR   N)r   r   r   rk  r  r}  rS   rR   rQ   r  r  	  s    
rR   r  c                        e Zd Zdef fdZej                  j                  d        Zdej                  fdZ
ddZd Zedd       Zd	 Zd
 Z xZS )MutationLayoutr  c                    t         |   |j                         |j                         |j	                         d        || _        | j                         j                         }t        j                  j                  |       y rX   )r  rk  r   r   r   r  
get_bufferr   r8   r   mark_buffer_mutated)r   r  r\   r  s      rQ   rk  zMutationLayout.__init__	  se    OO		
  ))+	##D)rR   c                 6    | j                         j                  S rX   )real_layoutr   r   s    rQ   r   zMutationLayout.stride	  s    !(((rR   r   c                 >    | j                         j                         S rX   )r  r  r   s    rQ   r  zMutationLayout.storage_size	  s    !..00rR   c                 d    fd | j                         }t        |t              sJ d       |S )Nc                     t        | t              r | j                        S t        | t              r | j	                               S t        | t
              r | j                        S | S rX   )r=   r  r  r  r  
MutableBoxr  )r  unwrap_viewss    rQ   r  z/MutationLayout.get_buffer.<locals>.unwrap_views	  sX    &.1#FMM22&(+#F$6$6$899&*-#FKK00MrR   z%MutationLayout must refer to a buffer)r  r=   r  )r   r  r  s     @rQ   r  zMutationLayout.get_buffer	  s3    	 dkk*&&)R+RR)rR   c                 6    | j                         j                  S rX   )r  r  r   s    rQ   r  zMutationLayout.real_layout	  s     '''rR   c                    |j                          t        j                  j                  |j	                                t        |t              r|j                  }|j                          |st        j                  |j                         |j                         |j                         t        |j                         |j                               D cg c]/  \  }}t        j                  j                   j#                  ||      1 c}}      j                  }|j                          t        |j                  j$                  t&              sJ t)        |      |j                  _        |j                  S c c}}w )Nr(  )r   r8   r   r  r   r=   rG   r  r   r=  r  r   r   r   rj   r   r   r  r  r(  r  )r  srcdstunsafe_aliasrw  rx  s         rQ   realize_intozMutationLayout.realize_into	  s    	
##CLLN3c9%((C 	""~~'mmo* !$CLLNCLLN C C1 GG$$11!Q7 C	 #  d  	#((//>:::(-xxs   4E6c                     | S rX   rS   r   s    rQ   r}  zMutationLayout.as_fixed	  r  rR   c                 6    | j                   j                         S rX   )r  r   r   s    rQ   r   zMutationLayout.make_indexer
  r1  rR   )r   r  )F)r   r   r   r   rk  r  r   getterrH   r   r  r  r  r3  r  r}  r   r4  r5  s   @rQ   r  r  	  sf    	*v 	* ]]) )1ejj 1(    D*rR   r  c                        e Zd ZU ee   ed<   eed<    fdZd ZdefdZ	d Z
d Zd	 Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd#dZd Zd Zd Zd Zd Zd Z d Z!d Z"d  Z#d! Z$d" Z% xZ&S )$r  r\   r  c                 0    t         |           d | _        y rX   r   r  s    rQ   r   zBuffer.__post_init__
  r  rR   c                 6    | j                   j                         S rX   r0  r   s    rQ   r   zBuffer.make_indexer
  r1  rR   r   c                 6    | j                   sJ | j                   S rX   r  r   s    rQ   r   zBuffer.get_name
  s    yyyyyrR   c                 .    | j                   j                  S rX   r   r   s    rQ   r   zBuffer.get_device
  r!  rR   c                     | j                   S rX   r  r   s    rQ   r  zBuffer.get_origin_node
  r  rR   c                 0    t        | j                  dd       S )Nr   )rY   r  r   s    rQ   r   zBuffer.get_dtype!
  s    t{{GT22rR   c                 @    t        | j                  j                        S rX   r%  r   s    rQ   r   zBuffer.get_size$
  r&  rR   c                 @    t        | j                  j                        S rX   r(  r   s    rQ   r   zBuffer.get_stride'
  r)  rR   c                 .    | j                   j                  S rX   )r  r  r   s    rQ   
get_offsetzBuffer.get_offset*
  r!  rR   c                     | j                   S rX   r3  r   s    rQ   r   zBuffer.get_layout-
  r
  rR   c                 "    | j                         S rX   )r   r   s    rQ   r   zBuffer.get_storage_numel0
  s    ~~rR   c                      yr  rS   r   s    rQ   r  zBuffer.is_extern3
  r  rR   c                     t        | j                  t        t        f      s | j                  j	                         | _        y y rX   )r=   r  MultiOutputLayoutr  r}  r   s    rQ   r  zBuffer.freeze_layout6
  s0    $++(9='IJ++..0DK KrR   c                 |    t        | j                  t              sJ | j                  j                  |      | _        y rX   )r=   r  r(  r  rz  s     rQ   r  z&Buffer.freeze_layout_with_stride_order:
  s,    $++~666kk11%8rR   c                 |    t        | j                  t              sJ | j                  j                  |      | _        y rX   )r=   r  r(  r  rz  s     rQ   freeze_layout_with_fill_orderz$Buffer.freeze_layout_with_fill_order>
  s,    $++~666kk//6rR   c                 |    t        | j                  t              sJ | j                  j                  |      | _        y rX   )r=   r  r(  r  r  s     rQ   freeze_layout_with_same_orderz$Buffer.freeze_layout_with_same_orderB
  s,    $++~666kk//7rR   c                     t         j                  j                  j                  t	        j
                  | j                         d            S r   r   r   s    rQ   r   zBuffer.is_zero_elementsF
  r   rR   c                 p      j                         rt        t         j                               S  fd}|S )Nr  c                     j                   j                         }t        j                  j                   ||             S rX   )r  r   r7   r,  r\   r-  s     rQ   rI  z"Buffer.make_loader.<locals>.loaderN
  s.    kk..0G88DIIwu~66rR   )r   r   r;  r   r.  s   ` rQ   r   zBuffer.make_loaderI
  s/      "=0@AA	7 rR   c                      yr  rS   r   s    rQ   is_no_opzBuffer.is_no_opT
  r  rR   c                 "    | j                         S rX   )r   r   s     rQ   r   zBuffer.codegen_referenceW
  s    }}rR   c                      y rX   rS   r   s    rQ   r  zBuffer.decide_layoutZ
  r5  rR   c                     t        | j                  t              r%| j                  j                  j	                         gS yrG  )r=   r  r  r  r   r   s    rQ   get_alias_nameszBuffer.get_alias_names]
  s/    dkk=1KK$$--/00rR   c                     t        | j                  t              r%| j                  j                  j	                         gS yrG  )r=   r  r  r  r   r   s    rQ   get_mutation_nameszBuffer.get_mutation_namesb
  s/    dkk>2KK&&//122rR   c                     t        j                  t        dd      5  t        | j	                         | j                               cd d d        S # 1 sw Y   y xY wr%  )r   r'  r(  r)   r   r   r   s    rQ   r  zBuffer.get_read_writesg
  s=    \\.*:DA&  " BAAs   (AAc                 6    | j                         j                  S rX   )r  r+  r   s    rQ   r   zBuffer.get_readsn
  s    ##%+++rR   c                    t        | j                  t        t        f      r
t	               S t        | j                               t        | j                               z  t        | j                               z  }|| j                         z
  S )z
        Returns the unbacked symbols which are defined by this IR node,
        because this is a data-dependent IR node, or item()
        )
r=   r  r  r  r   r   r   r   r  get_unbacked_symbol_uses)r   defss     rQ   get_unbacked_symbol_defszBuffer.get_unbacked_symbol_defsq
  sr    < dkkJ0A#BC5L
 "$--/2#DOO$567#DOO$567 	
 d33555rR   c                     t               S )a  
        Returns the unbacked symbols which are required to be in scope in
        order to successfully perform codegen for this buffer.  For example,
        a buffer that corresponds to an extern kernel call that takes i0 as
        an argument would return {i0} here.  This is used to generate necessary
        dependencies that ensure we actually bind i0 in codegen before you
        try to use it.

        Note that this is NOT transitive; in particular, if this buffer takes
        in as input another buffer with dynamic shape (e.g., (i0,)), we will
        not report it here, because you will already have a dependency
        on that buffer, which will eventually have a dependency on i0 if
        necessary.
        )r   r   s    rQ   r  zBuffer.get_unbacked_symbol_uses
  s     urR   c           
         | j                         }t        | j                               D ]^  \  }}||v s|j                  |j	                  |       d| j                          d| d|j                          |j                  |       ` t        | j                               D ]^  \  }}||v s|j                  |j	                  |       d| j                          d| d|j                          |j                  |       ` | j                         x}|v rQ|j                  |j	                  |       d| j                          d|j                          |j                  |       |rJ d| d       y )N = z.size(rd  z.stride(z.storage_offset()zunbacked symint z% not written out, check comment above)
r  rx   r   	writelinecodegen_unbacked_symbol_declr   endingremover   r  )r   wrappersymbols_to_definerf   r   s        rQ   codegen_unbacked_symbol_defsz#Buffer.codegen_unbacked_symbol_defs
  s    !99;dmmo.DAq%%!!;;A>?s4==?BSSYZ[Y\\]^e^l^l]mn "((+ / doo/0DAq%%!!;;A>?s4==?BSS[\][^^_`g`n`n_op "((+ 1 ""A'8877:;3t}}>OO`ahaoao`pq $$Q'!	GaS EF	G!!rR   c                      y rX   rS   r   s    rQ   r   zBuffer.realize
  r5  rR   c                      y)z
        Gets extra global memory size needed by this buffer.
        Some algorithms (e.g. group gemm) may require extra global memory in the generated code.
        r   rS   r   s    rQ   get_workspace_sizezBuffer.get_workspace_size
  s    
 rR   c                      yr  rS   r   s    rQ   should_allocatezBuffer.should_allocate
  s    rR   rX   )'r   r   r   r   r^   r   r  r   r   r   r   r  r   r   r   r  r   r   r  r  r  r  r  r   r   r  r   r  r  r  r  r   r  r  r  r   r  r  r4  r5  s   @rQ   r  r  
  s     3-N
 *# " 3&(" 1978W	

,(6T"G4rR   r  c                       e Zd Zy)InputBufferNr   r   r   rS   rR   rQ   r  r  
      rR   r  c                   B    e Zd ZU dZeej                     ed<   d Zd Z	y)rO  NrN  c                       fd}|S )Nc                     j                   j                         }t        j                  t        j
                  j                  j                         j                         ||             S rX   )	r  r   r7   r,  r8   r   constant_namer   rN  r-  s     rQ   rI  z*ConstantBuffer.make_loader.<locals>.loader
  sM    kk..0G88%%dmmot7K7KL rR   rS   r.  s   ` rQ   r   zConstantBuffer.make_loader
  s    	 rR   c                     t        t        j                  j                  | j	                         |      | j
                        S rX   )rO  r8   r   r  r   r  r0  s     rQ   r1  z!ConstantBuffer.constant_to_device
  s/    GG!!$--/6:DKK
 	
rR   )
r   r   r   rN  r   rB   r   r   r   r1  rS   rR   rQ   rO  rO  
  s    .2OXell+2
rR   rO  c                       e Zd ZddZy)NoneAsConstantBufferNc                 J    t         j                  j                  j                  S rX   )r8   r   r7  none_strr   s     rQ   r   z&NoneAsConstantBuffer.codegen_reference
  s    ww##,,,rR   rX   )r   r   r   r   rS   rR   rQ   r  r  
  s    -rR   r  c                   &     e Zd Z fdZddZ xZS )ShapeAsConstantBufferc                 0    t         |           || _        y rX   )r  rk  shape)r   r
  r  s     rQ   rk  zShapeAsConstantBuffer.__init__
  s    
rR   c                     t         j                  j                  j                  t         j                  j                  j                  | j                              }t         j                  j                  rd| dS |S )Nztorch::tensor(rd  )r8   r   r7  expr_printerr   r.  r
  cpp_wrapper)r   r   exprs      rQ   r   z'ShapeAsConstantBuffer.codegen_reference
  sV    ww##001A1A1J1J4::1VW77#D6++KrR   rX   )r   r   r   rk  r   r4  r5  s   @rQ   r  r  
  s    rR   r  c                        e Zd ZU eed<   d Zed        Zd Zd Z	 fdZ
d Zd Zd	 Zd
 Ze	 	 dd       Zd Zd Zd Zd Zd Z xZS )r  r  c                     | j                   | j                   S t        | j                  d      r| j                  j                   S y)z
        Returns self.name if it exists, otherwise returns the name of the data node if that exists.
        If neither exist, returns None.
        Nr\   )r\   hasattrr  r   s    rQ   get_computed_buffer_namez'ComputedBuffer.get_computed_buffer_name  s7    
 99 99499f%99>>!rR   c                 H    t        | j                         j                        S rX   )rc   r  r+  r   s    rQ   	num_readszComputedBuffer.num_reads  s    4'')//00rR   c                    t        j                  t        dd      5  | j                  j	                         rTt        | j                         | j                  j                         | j                  j                               cd d d        S t        | j                         | j                  j                               cd d d        S # 1 sw Y   y xY wr%  )	r   r'  r(  r  r)  r)   get_store_functionr   r*  r   s    rQ   r  zComputedBuffer.get_read_writes  s    \\.*:DAyy++-*++-II&&(II002 BA +++-II&&( BAAs   A%C1CCc                     t        | j                               t        | j                               z  t        | j                               z  S rX   )r   r   r   r  r   s    rQ   r  z'ComputedBuffer.get_unbacked_symbol_uses   s?    & "$--/2#DOO$567#DOO$567	
rR   c                     t        | j                  d      rS| j                  t        j                  j
                  vr-| j                         dk(  r| j                  j                         S t        |          S )Nr   r   )	r  r  r\   r8   r   mutated_buffersr  r   r  r  s    rQ   r   zComputedBuffer.make_loader8  s[     DII}-		!8!88 A% 99((**w"$$rR   c                 j   | j                   j                         j                         }t        | j                  t
              r+t        | j                  j                  | j                  |      S t        | j                  t              sJ t        | j                  j                  | j                  |      S rX   )r  r}  r   r=   r  r  r   r  r\   r=  rJ  r  s     rQ   r  z!ComputedBuffer.get_store_functionC  sy    ++&&(557dii+49944diiIIdii33349911499gFFrR   c                    t        | j                  t              rt        j                  | j
                  j                         | j
                  j                               \  \  }}}| j                         j                  }|D cg c]_  }|j                  t        j                  j                  j                         v r't        j                  j                  |j                     nda }}t        d |D              sJ |D cg c]Z  }t        |t        j                         r>t#        |j$                  |D ci c]  }|dk7  s	|t'        j(                  d        c}      \ }}}|rT|D cg c],  }t        j                  j*                  j-                  ||      . }	}ddlm}
  |
|	| j                               S yc c}w c c}w c c}}w c c}w )al  
        If our layout is still flexible, try to determine the stride order based on stride orders of reads.

        TODO(jansel): A better algorithm here would look at downstream consumers of this
                      value and try to do global graph-level layout optimization.
                      This is also something just begging to be autotuned.
        Nc              3   p   K   | ].  }t        |t        j                  t        j                  f       0 y wrX   )r=   r&   StarDep	MemoryDepr  r  s     rQ   r  z0ComputedBuffer.get_fill_order.<locals>.<genexpr>`  s0      A 1|33\5K5KLMs   46r   r$   pick_loop_order)r=   r  r(  r&   r  r  r   r*  r  r+  r\   r8   r   r  r  r  r  r5   re   rH   r   r   r  	schedulerr!  )r   
index_varsr  r   r+  r  
reads_bufsvr  stride_lengthsr!  s              rQ   get_fill_orderzComputedBuffer.get_fill_orderK  s    dkk>2.:.M.M		""$dii&B&B&D/+(Z! ((*00E
 	 A 66QWW3388:: &&qvv. 	        	 Aa!7!78	 GG>T>aQRSVaq!11>T 	   PU"PUAGG$$11$
CPU  " 7&~t}}GG9 U"s+   A$F>3G8
GG	G-1GGc                     t        | j                  t              r5| j                         }|r| j	                  |       y | j                          y y rX   )r=   r  r(  r'  r  r  rz  s     rQ   r  zComputedBuffer.decide_layoutv  s@    dkk>2'')E2259""$ 3rR   c                     t        j                   j                  j                          j                  j	                         d      \  }}t        j                  t        d j                               5  t         j                          j                         r|n|dd |      }ddd       g j                  j                         |j                  j                         D cg c]K  }|t         j"                  j$                  j                         v rt         j"                  j$                  |   ndM }}g |j                  j                         |j&                  j                         g }g }g }g }	|j)                         D ]^  \  }
}|
|d   v r'|rJ |j+                  |
       |j+                  |       4|
|d   v sJ |j+                  |
       |	j+                  |       ` t-        t/        t1        |                  gt1              z  }t3        |      D ]2  \  }}t5        |t6              st9        |d      s$|j:                  ||<   4 d
 fd	}||z   } |||||      \  }}} ||||	      \  }}}t1        |      t1        |      k(  r| _        t        j<                  ||d	      \  \  }}}t        | ||       ||      g|      }||f|fS # 1 sw Y   4xY wc c}w )a  
        This is a main place where we do loop transformations in a
        backend-agnostic way.

        Here we:
            1) Remove any 1 dimensions
            2) Fuse contiguous dimensions together
            3) Reorder dimensions based on stride orders
        qr9   rN  Nr$   r   iter_reordering_reindexc           	          j                  | ||
|      \  }}} ||       } t        j                  j                  j	                  | |t        	| |            \  }}} ||       } t        ||      }|||fS rX   )_apply_loop_reorderingr8   r   r   _simplify_loopsr'   rt   )x_varssupport_varsr  reordering_reindexreindex0rr   rs   prunerh   index_formulasmemory_addrsr   s            rQ   simplify_and_reorderzAComputedBuffer.simplify_and_reorder.<locals>.simplify_and_reorder  s    (,(C(Ce\;M)%E8X f%F%&WW%5%5%E%E(G&"E8U
 6]F &h9G'8++rR   zrX   )r&   r  r  r   r*  r   r'  rO  r   LoopBodyr  r)  indexing_exprsrA   reads_name2exprr  r8   r   r  writes_name2expritemsr  ro   rd   rc   rx   r=   r  r  r+  index_vars_no_squeeze)r   rZ   
var_rangesrC  
reads_namer$  r#  reduce_vars
index_sizereduce_sizer%  r   r1  rf   	reads_bufr6  r0  iter_rangesiter_reindexr+  reduce_rangesreduce_reindexr   	iter_varsr4  r5  s   `                       @@rQ   r6  z#ComputedBuffer.simplify_and_reorder~  s    (::II $))">">"@
j \\.*;T__=NO'')002RaD P 94..5578
 #22779	
 :
 QWW3388:: GG"":. :	 	 

!!((*
""))+
 
!#
$$&DAqDG|&&!!!$!!!$DG|#|""1%""1% ' +5Z+ABCc,FWW%j1LAy)^44: )2(I(I"1%	 2	,$ "K/=Qj2D>
:\#: ,@{,
(~q
 {s:.+BD(/;/Q/Qs0
, K* <	*N;,GH*
 ]+T11W PO
s   81K)AK(K%c           
         ddl m} |g }	 |D cg c]-  }t        j                  j                  j                  || |      / }}t        |      t        |      k(  rt        |d         t        |       k(  sJ |+t        t        |            D ]  }		  ||	   ||	         ||	<    t        t         ||||                  }
|
D 	cg c]  }	||	   	 }}	|t%        |
      t'        |
      fS c c}w # t        $ r Y lw xY w# t        $ rZ t        j                  r*t        j                  dt!        t#        | |            |       t        t        t        |                  }
Y w xY wc c}	w )zU
        Shuffle the order of loops around to hopefully improve performance.
        r$   r   r   z%Did not simplify complex index:
%s
%s)r"  r!  r8   r   r   r  rc   rd   r  r>   r  	Exceptionr%   r  r  warningr@   rj   ro   rl   )r#  r0  r  r5  r1  priority_idxr!  r  r  rf   rk   s              rQ   r-  z%ComputedBuffer._apply_loop_reordering  sy    	/L	, )(D   --dJM(   w<3|#44WQZCM :   "-s<01A%:%7%:71:%F
 2 /'5,"OPQE $))5aq5)l5)?5+AAA3 *   	,||=Z/0 
 s5z*+E	, *sN   C= 2C)AC= C."C= E#)C= .	C:7C= 9C::C= =A E E c                 6    | j                   j                         S rX   )r  r*  r   s    rQ   r*  z!ComputedBuffer.get_reduction_size      yy++--rR   c                 6    | j                   j                         S rX   )r  r)  r   s    rQ   r)  z!ComputedBuffer.get_reduction_type  rN  rR   c                 6    | j                   j                         S rX   )r  r   r   s    rQ   r  zComputedBuffer.is_no_op  s    yy))++rR   c                      yNTrS   r   s    rQ   r  zComputedBuffer.should_allocate  rB  rR   c                 8    | j                   j                  |      S )rM  )r  r1  r0  s     rQ   r1  z!ComputedBuffer.constant_to_device  s    yy++F33rR   )NN)r   r   r   r   r   r  r,   r  r  r  r   r  r'  r  r6  r   r-  r*  r)  r  r  r1  r4  r5  s   @rQ   r  r  
  s    
K	 1 1
0	%G)V%X2t   *B *BX..,4rR   r  c                   L     e Zd ZdZ fdZd Zd Zd Zd Zd Z	d Z
d	 Z xZS )
TemplateBufferzt
    Represents a Triton (in the future other type) of template operator
    that we can fuse an epilogue onto.
    c                     t         |   d |       t        j                  |      | _        || _        t        j                  j                  |       | _	        y )N)r\   r  )
r  rk  InputsKernelunwrap_storageinputsmake_kernel_renderr8   r   register_bufferr\   )r   r  rY  rZ  r  s       rQ   rk  zTemplateBuffer.__init__  sE    d62"11&9"4GG++D1	rR   c                 "    | j                         S rX   )normalized_read_writesr   s    rQ   r  zTemplateBuffer.get_read_writes"  s    **,,rR   c                 B   | j                         | j                  j                         fd}t        j                  || j                         dd      }| j                  D ch c]%  }t        j                  |j                               ' c}|_        |S c c}w )Nc                 ^    t        |      dk(  sJ t        j                   |       d      S )Nr   fake)rc   r7   rD  )re   r  rG  r\   s     rQ   dummyz4TemplateBuffer.normalized_read_writes.<locals>.dummy)  s,    v;!###99T75>6::rR   rS   T)	normalize)	r   r  r   r&   r)   r   rY  r  r+  )r   ra  depsr   rG  r\   s       @@rQ   r]  z%TemplateBuffer.normalized_read_writes%  s    }}++**,	; //4==?B$
 CG++N+Ql**1::<8+N
 Os   (*Bc                      yr  rS   r   s    rQ   r*  z!TemplateBuffer.get_reduction_size3  r  rR   c                      y rX   rS   r   s    rQ   r)  z!TemplateBuffer.get_reduction_type6  rB  rR   c                      yr  rS   r   s    rQ   r  zTemplateBuffer.is_no_op9  r  rR   c                      yrR  rS   r   s    rQ   r  zTemplateBuffer.should_allocate<  rB  rR   c                 *    | j                         dfd fS rG  )r   r   s    rQ   r6  z#TemplateBuffer.simplify_and_reorder?  s$      
 	
rR   )r   r   r   r9  rk  r  r]  r*  r)  r  r  r6  r4  r5  s   @rQ   rU  rU    s0    
2-
rR   rU  c                       e Zd Zy)TritonTemplateBufferNr  rS   rR   rQ   rj  rj  I  r  rR   rj  c                   .     e Zd Zdeddf fdZd Z xZS )CUDATemplateBufferworkspace_sizetemplateCUDATemplatec                 D    t         |   |||       || _        || _        y rX   )r  rk  rm  rn  )r   r  rY  rZ  rm  rn  r  s         rQ   rk  zCUDATemplateBuffer.__init__N  s&     	);<, rR   c                 6    | j                   | j                   S dS r   )rm  r   s    rQ   r  z%CUDATemplateBuffer.get_workspace_size[  s    &*&9&9&Et""L1LrR   )r   r   r   r   rk  r  r4  r5  s   @rQ   rl  rl  M  s"    !
 ! !!MrR   rl  c                   P    e Zd ZU ee   ed<   d Zd Zed        Z	ed        Z
d Zy)rW  rY  c                 H    t        j                  |j                               S rX   )r&   r  r   r  s     rQ   get_read_writes_inputz"InputsKernel.get_read_writes_inputc  s    ##AJJL11rR   c           	         g }| j                   D ]a  }t        |t              r/|j                  |D cg c]  }| j	                  |       c}       B|j                  | j	                  |             c t        j                  t        |      t        j                  | j                               ht               g d t        j                               S c c}w )N)	op_counts)rY  r=   r>   extendrt  r  r&   
ReadWritesr   r  r   collectionsCounter)r   star_depinputr   s       rQ   r  zInputsKernel.get_read_writesf  s    [[E%& N1!;!;A!> NO : :5 AB	 ! &&M!!$--/23E!))+
 	
	 !Os   C
c                    t        | t              r| j                  } t        | t              r| j                  } t        | t              r%t        | t
              st        j                  |       } t        | t        t
        f      sJ |        | S rX   )	r=   rG   r  r  r  r  ExternKernelrealize_inputr  r   s    rQ   unwrap_storage_for_inputz%InputsKernel.unwrap_storage_for_inputw  sj    a#Aa$Aa":a+I**1-A!fo67::7rR   c                     g }| D ][  }t        |t              r#|D cg c]  }t        j                  |       }}nt        j                  |      }|j	                  |       ] |S c c}w rX   )r=   r>   rW  r  r  )rY  
inputs_newr   rf   s       rQ   rX  zInputsKernel.unwrap_storage  sf    
A!T"GHIq!\::1=qI 99!<a   	 Js   A%c                      yrR  rS   r   s    rQ   r  zInputsKernel.is_extern  rB  rR   N)r   r   r   r   r  r   rt  r  r   r  rX  r  rS   rR   rQ   rW  rW  _  sC    L2
"    rR   rW  c                       e Zd Zd Zy)	NopKernelc                      yrR  rS   r   s    rQ   r  zNopKernel.is_no_op  rB  rR   N)r   r   r   r  rS   rR   rQ   r  r    s    rR   r  c                   F    e Zd ZdZed        Zed        Zed        Zd Zy)ConcatKernelzn
    There isn't actually a real kernel for concat, we just change the
    storage for the upstream data.
    c                    |d   j                         }|d   j                         }t        |d   j                               }dg}||   g}d|cxk  rt	        |      k  sJ  J t        dt	        |            D ]  }||   j                         }	|j                  ||          t	        |	      t	        |      k(  sJ ||   j                         |k(  sJ ||   j                         |k(  sJ t        t	        |            D ]I  }
|
|k(  r||
   |	|
   z   ||
<   t        j                  j                  j                  ||
   |	|
         ||
<   K |j                  ||           t        j                  |      }t        t	        |            D ]Q  }||   }t        |      s|j                         }t        |t               s5|j#                         sFt%        |      } n t'        d t!        ||||      g       }t)        |      }g }t        t	        |            D ]  }| j+                  ||   t,        j/                  ||||   ||               }|j0                  j                  |       t        ||   j2                  t4              r||   j2                  j7                         }n||   j2                  }|j9                         s||   j                         j:                  dk(  st=        |      r|j                  |j?                                 t	        |      dkD  rt        j                  jA                  |       t        j                  jC                  |      |_"        | jG                  |j0                        |_        |S )Nr   r$   )r   r   r   r   r\   r  rY  r   )$r   r   r>   r   rc   rd   r  r8   r   r   r  r(  r  r   r   r=   r  rm  r   r  r  r  r;  r  rY  r  r  r  is_input_bufferrM   r1   r   register_listr[  r\   rX  )r  rY  r  r   r   r  offsets_startoffsets_endrf   
input_sizer  output_strider   r  concat_kernelkernelbuffer_namesinput_bufferinput_unwrappeds                      rQ   r  zConcatKernel.create  s0   %%'q	##%q	**,-}oC'#h-'''''q#f+&A++-J  #/z?c(m333!9&&(E111!9'')V3333x=)8"*1+
1"=HQK"#''"2"2"?"? Z]#HQK	 * x}- ' '99(Cs6{#Aq	A$Q'v{3::< %C8$LM $ %$	 	
 M*s6{#A++q	  mA.>AOL   ''5&)..(3"().."<"<">"()..  //11I((*//69"<0##L$9$9$;<# $& |q GG!!,/WW44]C"11-2F2FGrR   c                     t        |t              r| j                  |j                        S t        |j                  j                  t
              xr t        |j                  t               S rX   )r=   rG   can_realize_into_without_copyr  r  r(  ExternKernelAlloc)r  r  s     rQ   r  z*ConcatKernel.can_realize_into_without_copy  sU    c9%44SXX>>#((//>: 
:HH'D
 @
 	
rR   c                 H   t        |t              s%t        |      rt        |      \  }}t        ||      }t        |t              sJ |       t        |t              r| j                  |j                  |      S t        |t              r_|j                          t        |j                  d      sJ | j                  |      r&t        |      |j                  _        |j                  S t        j                  |j                         |j!                         |j#                         t%        |j'                         |j'                               D cg c]/  \  }}t(        j*                  j,                  j/                  ||      1 c}}      }| j                  ||      S c c}}w )Nr  r(  )r=   r  r   r  rG   r  r  r  r   r  r  r  r  r=  r  r   r   r   rj   r   r8   r   r   r  )r  r  r  r  r  rw  rx  pws           rQ   r  zConcatKernel.realize_into  sK   
 #/$S)"7"<%gv6#/44/c9%##CHHc22c:&KKM388X...005"/"4xx>>#--/__&  ??DAq   --a3?	  
 C((s   4Fc                      yrR  rS   r   s    rQ   r  zConcatKernel.should_allocate  rB  rR   N)	r   r   r   r9  r3  r  r  r  r  rS   rR   rQ   r  r    sK    
 I IV 
 
 ) )>rR   r  c                      e Zd ZU dZeedf   ed<    ej                  e	      Z
eeef   ed<   dZee   ed<    ej                  e      Zee   ed<   d	 Zd
 Zd Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zd Z d Z!d Z"d Z#d Z$d Z%d Z&d Z'd Z(d Z)d Z*e*Z+y)r~  rS   .constant_args)default_factoryr[   Noutput_viewordered_kwargs_for_cpp_kernelc                 z    t        | j                  t              r!| j                          | j	                          y y rX   )r=   r  r(  apply_constraintr  r   s    rQ   r  zExternKernel.decide_layout  s-    dkk>2!!#  3rR   c                 J    t        | |      \  }}|r|j                  |       y y rX   )r0   r  )r   r  
origin_strdetailed_origin_strs       rQ   codegen_commentzExternKernel.codegen_comment$  s*    *=dG*L'
'j) rR   c                     t               rX   r  r   r  s     rQ   codegenzExternKernel.codegen)  s    !##rR   c           	         t         j                  | j                         | j                         | j	                         | j                         | j                         | j                               }|j                          |S )N)r   r   r   r   r   r   )	r=  r  r   r   r   r   r  r   r   )r   r  s     rQ   
copy_inputzExternKernel.copy_input,  sa    <<>++-]]_::<))+oo'  
 	

	rR   c                 j    t        |      j                  |i |j                  }t        j                  |      \  }g g }g }|D ]  }j                  t        |t                     d   r|j                  |       9t        |t        j                        r5t        j                  j                  j                  j                  |d       }|j                  |        fd}	|D 
cg c]  }
| j                  |
       }}
|D ]  }
t!        |
      st#        |
d        g }|D ]  }
|
j%                         t        j                  j&                  v r;|j                  t        j                  j&                  |
j%                                   h|j                  t)        |
d               |	||      \  }} ||i |}t        |t*        t,        f      s|gn|}|D ]  }t        |t.        j0                        s|j2                  s+dt        j                  _        d}t        j                  j6                  j8                  j;                  dd       x}r| d	| }|t        j                  _         t?        |      r't        j                  j6                  j8                  d
   }||||	fS c c}
w )Nr  )r2  c                 $   g }t        |       }t        |      }D ]9  }|r|j                  t        |              |j                  t        |             ; t        j                  |      }|j                  dg       |j                  di       fS )NrZ   r[   )iterr  nextpytreetree_unflattenget)	new_tensor_argsnew_non_tensor_argsr  
it_tensorsit_non_tensors	is_tensorr  	args_specis_arg_tensors	          rQ   unflatten_argsz3ExternKernel.process_kernel.<locals>.unflatten_argsK  s    Fo.J!"56N*	MM$z"23MM$~"67	 +
 %%fi8A55$aeeHb&999rR   Tr  r   zEsparsity not handled. Please file issue for sparse inference weights.stack_tracez Found from : 
 r"  ) r   bind	argumentsr  tree_flattenr  r=   r   rH   r   r8   r   r   r  create_symintnoder  r   r  r   	constantsr   r>   r?   rB   Tensor	is_sparsedisable_cudagraphscurrent_nodemetar  disable_cudagraphs_reasonmaybe_free_unbacked_symbols)r  r  rZ   r[   binded_args	args_flattensor_argsnon_tensor_argsargr  r   example_argsnew_args
new_kwargsexample_outputexample_out_lir   msgr  r  r  s                      @@rQ   process_kernelzExternKernel.process_kernel9  s[   ,i',,d=f=GG%22;?	9%'C  C!89R ""3'c5::.''**44FFsQUFVC&&s+ 
	: 6AA[s((+[A A$Q'%a5  
 Azz|qww000##AGG$5$5ajjl$CD##$5aT$JK	   .lOL*8Z8 ntUm<  	
  A!U\\*q{{-1*]"#''"6"6";";"?"?t"TT;T E!2;-@C471   '~6WW1166u=N{O^KKW Bs   -J0c           
         t        |t              sJ t        |t              r|S |j                         j	                          t        j                  |j                         d      \  }}|d   } |j                         |      }t        j                  j                  j                  ||      }t        j                  j                  j                  ||      }t        j                  j                  j                  ||      }t        ||      |z   }||k7  r"t         j#                  d|||       t%               t        |j&                  t)        |j+                         |j-                         |j                         ||            S )z
        In order to pass this to an extern kernel we need a
        ReinterpretView not a View.  This allows us to avoid some
        unneeded copies.
        r  r9   r   z@convert_to_reinterpret_view failed: stride=%s offset=%s index=%sr  r  r  )r=   r  r  r  r  r&   r  r   r   r8   r   r   r  stride_vars
offset_varr3   r  r  r   r  r  r   r   )	r  r   
index_argsr>  r  re   r  r  expecteds	            rQ   convert_to_reinterpret_viewz(ExternKernel.convert_to_reinterpret_view  sB    !X&&&a)H 	
%%'!-!@!@JJL"

J  ]
  ,  55eZH''""..ujA!!,,UJ?Z1F:HIIR	 &''||~kkmZZ\	
 		
rR   c                 D   |
t               S t        |t        j                  t        j                  j
                  j                  t        f      rt        |      S t        |t              r[t        j                  j                  t        j                  |j                  |j!                         |j#                                     S t        |t$              r|S t        |t&              r| j)                  |j*                        S t        |t,              r|S t        |t.              r;|j1                          t3        |j5                               r	 | j7                  |      S t        |t:              r|j1                          |S | j=                  |      S # t8        $ r Y >w xY w)N)r   r   )r  r=   rH   r   rJ   rK   rL   r   r  rQ  r8   r   add_tensor_constantrB   tensorr   r   r   rO  rG   r  r  r  r  r   r   r  r  r   r  r  r  r   s     rQ   r  zExternKernel.realize_input  s4   9'))a%**ekk&9&9&A&A3GH(++a"77..QWWAKKM!,,.Q  a(Ha#$$QVV,,a)Ha"IIK$Q]]_5::1== a$IIKH~~a   + s   F 	FFc                     t        |      r<t        |j                               dk(  r|S |j                         D ]  }|dk(  s	|c S  | j                  |      S r  )r   rc   r   r  )r  r   r   s      rQ   require_stride1zExternKernel.require_stride1  sO     #1<<>"a',,.Q;H ) ~~a  rR   c                 ~   |j                         dk(  r|S t        |      rut        |j                         t              r9|j                         j
                  }t        |j                         t              r9t        |j                         t              rt        |dd|       |S t        |j                         t              r!|j                         j                  |      r|S t        |j                         t              rt        |j                         j                         t              rt        d      t        |j                         j                         t              r/|j                         j                         j                  |      r|S t        |t              r!|j                         j                  |      r|S t        |t              rt        |j                  t               rt        |j                  t"              stt        |j%                               r[t        |j%                         j                  t&              s3	 | j)                  |j                        |_        | j+                  ||      S | j/                  |      }t        |dd|       t1        ||      sJ |S # t,        $ r Y ;w xY w)Nr   TFr  z<the MutationLayout's real layout shouldn't be FlexibleLayout)r   r   r=   r   r  r  r(  r  r  r  r  r  r  r  rG   r  r  r  r  r  r  require_stride_orderr   r  r  )r  r   rk   s      rQ   r  z!ExternKernel.require_stride_order  s   ;;=AH !#Q\\^];LLN'' Q\\^];!,,..9%dE ,,.2259ALLNN;alln88:NK(V   LLN..0+lln002DDUKH a%!,,.*J*J5*QHq)$1668,qvv7%ammo6q}}335FG88@//599 NN1aeRWX1!U;;; ' s   1J0 0	J<;J<c                 .    | j                  |t              S rX   )r  NHWC_STRIDE_ORDERr  s     rQ   require_channels_lastz"ExternKernel.require_channels_last	  s    ''+<==rR   c                     | j                  |t        t        t        t	        |j                                                       S rX   )r  r>   r  rd   rc   r   r  s     rQ   require_contiguouszExternKernel.require_contiguous  s/    ''4s1::<?P9Q0R+STTrR   c                      y rX   rS   r   s    rQ   r  zExternKernel.apply_constraint  r5  rR   c                 r    t        t        j                  j                  j                  | j
                        S rX   )r   r8   r   r7  val_to_arg_strr  r   s    rQ   codegen_const_argszExternKernel.codegen_const_args  s%    177''668J8JKKrR   c                 Z   g }| j                   D ]u  }t        |t              rD|D cg c]  }|j                          }}ddj	                  |       d}|j                  |       W|j                  |j                                w |j                  | j                                |S c c}w )N[r  ])rY  r=   r>   r   r   r  rw  r  )r   rZ   r   rf   r   r   s         rQ   codegen_argszExternKernel.codegen_args  s    A!T"89:1,,.:&'		%(8'9$;!-.A//12  	D++-. ;s   B(c                     || j                   v r| j                   j                  |      S t        | d      r8|| j                  v r*| j                  j                  |      j                  d      S t	        d| d      )Nkwargs_default_valuer   zarg z6 not found in self.kwargs or self.kwargs_default_value)r[   r  r  r  r  )r   arg_names     rQ   get_kwargs_valuezExternKernel.get_kwargs_value#  sv    t{{";;??8,,D01D555,,00:>>wGG8*RS
 	
rR   c                      yr  rS   r   s    rQ   is_legacy_abi_kernelz!ExternKernel.is_legacy_abi_kernel/  r  rR   c           	         t         j                  j                  r| j                  r| j                  st        d      g }| j                  D ]  }| j                  |      }t        |t        j                        r|j                  |       @t        | d      r+| j                  j                  |      j                  d      }nd }|j                  t         j                  j                  j                  ||| j!                                       |S | j                  j#                         D cg c]3  \  }}| dt         j                  j                  j%                  |       5 }}}|S c c}}w )Nz(ordered_kwargs_for_cpp_kernel is missingr  rM   r   )r8   r   r  r[   r  r  r  r=   rH   r   r  r  r  r  r7  val_to_cpp_arg_strr  r<  r  )r   r[   r  r%  type_ks         rQ   codegen_kwargszExternKernel.codegen_kwargs2  s9   77 {{4#E#E$%OPPF >>))(3a,MM!$ t%;< $ 9 9 = =h G K KF S $MM,,??!1d&?&?&A ?(  !KK--//DAq #Qqww++::1=>?/   	s   .8E+c           	         t         j                  rt        j                  j                  st        j                  j
                  j                  | j                               }t        j                  j
                  j                  | j                               }|j                  d| j                          d| d| d       y y y )Nzassert_size_stride(r  rd  )r%   size_assertsr8   r   r  r7  codegen_shape_tupler   r   r  r   )r   r  r   r   s       rQ   codegen_size_assertsz!ExternKernel.codegen_size_assertsO  s    qww':':77'';;DMMOLDWW))==doo>OPF%dmmo%6bbJ (;rR   c                 N    | j                         }| j                         }|g g|fS )zD
        get output sizes and strides, for template_codegen
        )r   r   )r   _sizer`  s      rQ   get_group_stridezExternKernel.get_group_strideW  s*     //#r{G##rR   c                    t         j                  j                  }| j                         }| j	                         }|D cg c]  }|j                  |       }}t        t        |            D cg c]  }t        d|        }}t        t        t        |            |j                  d      }t        |      D 	ci c]  \  }}	|	|
 }
}}	t        t        |
            D cg c]  }|
|   	 }}|D cg c]  }||   	 }}| j                         } ||      }t         j                  j                  j                  |||g      \  }}}t        d      \  }}t        t!        | ||D cg c]
  } ||       c}                  }t#        t%        j&                  |      |      }|t)        |      fS c c}w c c}w c c}	}w c c}w c c}w c c}w )zC
        Manually get canonicalization of the output index
        dT)r  reversec)r8   r   r   r   r   r   rd   rc   r6   r  r  rx   r   r.  r*   r@   rj   r5   rH   r  r?   )r   r   r  r  r   rf   r#  index_orderry   rz   r{   rk   rG  re   	new_sizesrh   r3  r   add_varreplacements                       rQ   canonicalizezExternKernel.canonicalize`  s   
 77##//#29:'Q8%%a(':5:3u:5FG5FlQqc7+5F
GU3w<0g6I6ISWX+4[+AB+AxsC#s(+AB$)#f+$67$6q$67-23UjmU
3##%
#$%GG$4$4$D$Dw%
!	7E !%
73z7	3R	1GAJ	3R+STU5<<.<eI&&&) ;G C73 4Ss#   F5/F:>F?$G6G
+Gc                     t               }| j                  D ]  }|t        |      z  } | j                  j	                         D ]  }|t        |      z  } |S rX   )r   r  r  r[   rA   )r   r  r  s      rQ   r  z%ExternKernel.get_unbacked_symbol_uses~  sW     E%%C,S11A &;;%%'C,S11A (rR   c           
      "   t        | dd       }d|g}|t        j                  |       D cg c]'  }|j                   dt        | |j                         ) c}z  }|j	                  d| j
                         | j                  |      S c c}w )Nr  zkernel=r   r   )rY   dataclassesfieldsr\   r  r   r   )r   kernel_namer   fields       rQ   r   zExternKernel.__str__  s    dHd3k_%
 	$++D1
1 zzl!GD%**5671
 	
 	|D$4$4#789u%%
s   ,B),r   r   r   r  r   r   r   r  r  r@   r[   r
   r^   r  r   r  r>   r  r   r  r  r  r   r  r3  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r2  rS   rR   rQ   r~  r~    sZ   %'M5c?'.[..tDFDcND-1K/*13D;3D3D4!8C= !
*
$ 
 
 HL HLT *
 *
X ! !8 ! ! . .` > > U UL


:$'<
& HrR   r~  c                   N     e Zd ZU dZee   ed<   d Z	 	 	 	 	 	 d fd	Zd Z	 xZ
S )ExternKernelOutNr  c                 ,   | j                  |       g | j                         | j                         }|j                  | j                  | j                         |t        j                  j                  r| j                         y | j                         y rX   )r  r  r  generate_extern_kernel_outr  r   r8   r   r  
cpp_kernelr  r   r  rZ   s      rQ   r  zExternKernelOut.codegen  sx    W%=""$=t':':'<=**""$ ww22DOO		
 9=		
rR   c	                     t         	|   d || j                  |      ||xs i        || _        t        j
                  j                  |       | _        || _        || _	        || _
        y rX   )r  rk  rX  r  r8   r   r[  r\   r  r  r  )
r   r  rY  r  r[   r  r  r  r  r  s
            rQ   rk  zExternKernelOut.__init__  sc     	&$--f5}flPR	
 'GG++D1	$-J*rR   c                      yrR  rS   r   s    rQ   r  zExternKernelOut.should_allocate  rB  rR   )rS   NNNNrS   )r   r   r   r  r   r  r   r  rk  r  r4  r5  s   @rQ   r  r    s6    -1K/*1
 &(K(rR   r  c                   <     e Zd Zdedej
                  f fdZ xZS )RandomSeedscountr   c                     t        j                  t         j                        }t        |   t        |t         j                  |g      g |j                  |j                  |ggdd       y )Nr  zaten.randint.low_outzat::randint_out)r  rY  r  r  r  )rB   r7  r  r  rk  r  ri  rh  )r   r  r   limitsr  s       rQ   rk  zRandomSeeds.__init__  s]    U[[)kkW
 !::vzzE7;)( 	 
	
rR   )r   r   r   r   rB   r   rk  r4  r5  s   @rQ   r  r    s    
c 
5<< 
 
rR   r  c                   B     e Zd Zd Zd Z	 	 	 	 	 d fd	Zd Zd Z xZS )r  c                 f    t         j                  j                  r| j                  S | j                  S rX   )r8   r   r  r  r  r   s    rQ   codegen_kernel_namez%ExternKernelAlloc.codegen_kernel_name  s!    "#''"5"5tF4;;FrR   c                    | j                  |       g | j                         | j                         }t        j                  j
                  j                  | |       t        | j                  t              r| j                  |       y y rX   )r  r  r  r8   r   r7  generate_extern_kernel_allocr=   r  r  r  r  s      rQ   r  zExternKernelAlloc.codegen  sl    W%=""$=t':':'<=	99$Edkk6*%%g. +rR   c                     t         |   d || j                  |      ||xs i        t        j                  j                  |       | _        || _        || _        || _	        y rX   )
r  rk  rX  r8   r   r[  r\   r  r  r  )	r   r  rY  r  r[   r  r  r  r  s	           rQ   rk  zExternKernelAlloc.__init__  s[     	&$--f5}flPR	
 GG++D1	$-J*rR   c                      yr  rS   r   s    rQ   r  z!ExternKernelAlloc.should_allocate  r  rR   c                     t         rX   r  r   s    rQ   r  z"ExternKernelAlloc.apply_constraint  s    !!rR   )rS   NNNrS   )	r   r   r   r!  r  rk  r  r  r4  r5  s   @rQ   r  r    s/    G/ &(K$"rR   r  c                   H     e Zd Zd Zd Zd Zd Zd Zd Z fdZ	d Z
 xZS )	UserDefinedTritonKernelc                     ddl m} ddlm} |j	                  | j
                        }g }t        ||      r|j                  }|j                  }||fS )Nr   )	Autotuner)kernel_side_table)	triton.runtime.autotunerr*  *torch._higher_order_ops.triton_kernel_wrapr+  
get_kernel
kernel_idxr=   configsr]   )r   r*  r+  r  r0  s        rQ   get_kernel_and_configsz.UserDefinedTritonKernel.get_kernel_and_configs  sG    6P"--doo>fi(nnGYYFwrR   c                 z   | j                         \  }}|j                  ||| j                        }| j                         }t        j
                  j                  r,t        |      D cg c]  \  }}||j                  vs| }}}| j                  |       |j                  || j                  ||       y c c}}w rX   )r1  !define_user_defined_triton_kernelr[   r  r8   r   r  rx   
constexprsr  #generate_user_defined_triton_kernelgrid)r   r  r  r0  new_namerZ   rf   r  s           rQ   r  zUserDefinedTritonKernel.codegen  s    557 <<GT[[
 ""$77 '0oToFAs&BSBS9SCoDT 	W%33II		
	 Us   )B7 B7c                      yr  rS   r   s    rQ   r  z'UserDefinedTritonKernel.should_allocate  r  rR   c                      yrR  rS   r   s    rQ   has_side_effectsz(UserDefinedTritonKernel.has_side_effects  s     rR   c                     i S rX   rS   r   s    rQ   r  z0UserDefinedTritonKernel.get_unbacked_symbol_defs  r@  rR   c                     g S rX   rS   r   s    rQ   r  z*UserDefinedTritonKernel.get_mutation_names!  r@  rR   c          
         g }t               }g }|j                         D ]f  \  }}t        |t              r;t        j                  | j                  |            }	|j                  |	       |	||<   Q|j                  |       |||<   h t        |      dk7  sJ |d   j                         }
t        | -  d t        |
      |t        |      |       t        j                  j!                  |       | _        || _        || _        | j)                         \  }}|j*                  D cg c]	  }||v s| c}| _        t/        | g|j1                         D cg c]  }t        |t              s| c}  y c c}w c c}w r   )r@   r<  r=   rG   rW  r  r  r  rc   r   r  rk  r  r?   r8   r   r[  r\   r/  r6  r1  	arg_namesr  mark_node_as_mutatingrA   )r   r/  r6  kernel_argsrY  r[   r  r  r%  r   r   r  r   r  rw  r  s                  rQ   rk  z UserDefinedTritonKernel.__init__$  so   %%'DAq!Y' 99$:L:LQ:OPa q	$$Q'q	 ( 6{a%%'v- 	
 GG++D1	$	//1	 "++.
+Csk/AC+.
* 		
*113P3!z!Y7OA3P	
	.

 Qs   $	E4.E4E9
)E9
c                 \    | j                   D cg c]  }|j                          c}S c c}w rX   rY  r   )r   rf   s     rQ   r  z'UserDefinedTritonKernel.get_alias_namesI  s$    &*kk2k

k222s   ))r   r   r   r1  r  r  r:  r  r  rk  r  r4  r5  s   @rQ   r(  r(    s,    

0
#
J3rR   r(  c                     |D ]X  }t        |t              sJ t        j                  j	                  |j                                t        |j                  ||        Z y)z
    Allows ops in mutated_ops to be marked as being mutated as well as
    indicates to the scheduler that these ops depend on cur_buffer.
    N)r=   rG   r8   r   r  r   MutationOutputr  )
cur_buffermutated_opsops      rQ   r?  r?  M  sF    
 "i(((	##BKKM2ryy"j1 rR   c                   <     e Zd Zd Z fdZd Zd Zd Zd Z xZ	S )rD  c                 >    | j                   d   j                         gS r   rB  r   s    rQ   r  z!MutationOutput.get_mutation_namesY      A'')**rR   c                 v    t         |   d |||gd       t        j                  j	                  |       | _        y rG  r  rk  r8   r   r[  r\   )r   r  r|  parentr  s       rQ   rk  zMutationOutput.__init__\  s0    vv;GG++D1	rR   c                      yr  rS   r   s    rQ   r  zMutationOutput.should_allocate`  r  rR   c                      yrR  rS   r   s    rQ   r  zMutationOutput.is_no_opc  rB  rR   c                      yrR  rS   r   s    rQ   r:  zMutationOutput.has_side_effectsf  rB  rR   c                 >    | j                   d   j                         gS r   rB  r   s    rQ   r  zMutationOutput.get_alias_namesi  rJ  rR   )
r   r   r   r  rk  r  r  r:  r  r4  r5  s   @rQ   rD  rD  X  s!    +2+rR   rD  c                   >     e Zd ZdZdZd Zd Zd Zd Z fdZ	 xZ
S )InplaceBernoulliFallbackE
    This needs to be a custom class to handle mutation properly
    zaten.bernoulli_c                     d | j                   D        \  }|j                  | j                   d| ddj                  t	        t
        | j                               d       y )Nc              3   <   K   | ]  }|j                           y wrX   r   r  r   s     rQ   r  z3InplaceBernoulliFallback.codegen.<locals>.<genexpr>u  s     ;{!##%{   (r  rd  )rY  r  r  r   r   reprr  )r   r  r   s      rQ   r  z InplaceBernoulliFallback.codegent  sQ    ;t{{;{{m1QCr$))Cd6H6H,I"J!K1M	
rR   c                      yr  rS   r   s    rQ   r  z(InplaceBernoulliFallback.should_allocatez  r  rR   c                 >    | j                   d   j                         gS r   rB  r   s    rQ   r  z+InplaceBernoulliFallback.get_mutation_names}  rJ  rR   c                     i S rX   rS   r   s    rQ   r  z1InplaceBernoulliFallback.get_unbacked_symbol_defs  r@  rR   c                     t         |   d t        |j                               | j	                  |g      |       t
        j                  j                  |       | _        t        | |       y rX   
r  rk  r  r   rX  r8   r   r[  r\   r?  )r   r   r  r  s      rQ   rk  z!InplaceBernoulliFallback.__init__  sV    q||~&$		
 GG++D1	dA&rR   r   r   r   r9  r  r  r  r  r  rk  r4  r5  s   @rQ   rS  rS  m  s-     F
+' 'rR   rS  c                   >     e Zd ZdZdZd Zd Zd Zd Z fdZ	 xZ
S )AccumulateGradrT  zinductor_ops.accumulate_grad_c                 x    d | j                   D        \  }}|j                  | j                   d| d| d       y )Nc              3   <   K   | ]  }|j                           y wrX   rW  rX  s     rQ   r  z)AccumulateGrad.codegen.<locals>.<genexpr>  s     K{! 3 3 5{rY  rZ  r  rd  )rY  r  r  )r   r  variablenew_grads       rQ   r  zAccumulateGrad.codegen  s:    Kt{{K8T[[M8*BxjBCrR   c                      yr  rS   r   s    rQ   r  zAccumulateGrad.should_allocate  r  rR   c                 >    | j                   d   j                         gS r   rB  r   s    rQ   r  z!AccumulateGrad.get_mutation_names  rJ  rR   c                     i S rX   rS   r   s    rQ   r  z'AccumulateGrad.get_unbacked_symbol_defs  r@  rR   c                     t         |   d t        |j                               | j	                  ||g             t
        j                  j                  |       | _        t        | |       y rX   r`  )r   rf  rg  r  s      rQ   rk  zAccumulateGrad.__init__  sY    x**,-8 45	

 GG++D1	dH-rR   ra  r5  s   @rQ   rc  rc    s.     -FD+. .rR   rc  c                   \     e Zd ZdZd Zd Zd Zd Zd Zddd	d
e	de
e   def fdZ xZS )ScatterFallbackz
    This needs to be a custom class to handle mutation properly.
    This class handles both aten.scatter_ and aten.scatter_reduce_.
    It also handle the case `src` being a scalar properly.
    c           
      :   | j                   d   }t        j                  j                  r/ddd}||v r||   }| j	                  | j
                  |      | _        | j                  rd | j                  D        \  }}}n%d | j                  D        \  }}| j                  d   }|j                  ||| j                  d   ||gt        j                  j                  r| j                  n| j                  | j
                  | j                  || j                                y )	Nr  rk  rj  )re  multiplyc              3   <   K   | ]  }|j                           y wrX   rW  rX  s     rQ   r  z*ScatterFallback.codegen.<locals>.<genexpr>       Jkq224krY  c              3   <   K   | ]  }|j                           y wrX   rW  rX  s     rQ   r  z*ScatterFallback.codegen.<locals>.<genexpr>  s     EA!--/rY  r$   r   )r[   r8   r   r  get_cpp_kernelr]   r  src_is_tensorrY  r  generate_scatter_fallbackr  r  )r   r  r  get_operator_enumr   re   r  s          rQ   r  zScatterFallback.codegen  s    X&77(-6 B***62"11$''6BDOJdkkJOQsEEJQ$$Q'C))""1%uc2 ww22DOOGG!	
rR   c                      yr  rS   r   s    rQ   r  zScatterFallback.should_allocate  r  rR   c                 h    |dk(  r!| j                   r|dnd}|S |J d       d}|S |J d       d}|S )Naten.scatter_zat::scatter_outzat::scatter_reduce_outz:Expect reduce to be None for aten.scatter_ with scalar srcz5Expect reduce to be not None for aten.scatter_reduce_)rt  )r   r]   r  r  s       rQ   rs  zScatterFallback.get_cpp_kernel  ss     !!)/%=U   NPOP"*  "GFG"-FrR   c                 >    | j                   d   j                         gS r   rB  r   s    rQ   r  z"ScatterFallback.get_mutation_names  rJ  rR   c                     i S rX   rS   r   s    rQ   r  z(ScatterFallback.get_unbacked_symbol_defs  r@  rR   NTr  include_selfr  r  r}  c          	          |dv sJ t        |t              | _        || _        || _        | j                  r%|||fD cg c]  }| j                  |       }	}|f}
n$||fD cg c]  }| j                  |       }	}||f}
t        |   d t        |j                               | j                  |	      |
||d       ddg| _        t        j                  j                  |       | _        t!        | |       y c c}w c c}w )N>   aten.scatter_reduce_ry  r|  r  r}  )r=   rG   rt  r  r]   r  r  rk  r  r   rX  r  r8   r   r[  r\   r?  )r   r]   r   r  re   r  r  r}  r   tensorsr  r  s              rQ   rk  zScatterFallback.__init__  s    >>>>'Y7 78%oFot))!,oGF FM78%jAjt))!,jGA #JMq||~&(|<	
 /7-G*GG++D1	dA& G Bs   C6"C;)r   r   r   r9  r  r  rs  r  r  r   r   r^   r   rk  r4  r5  s   @rQ   rm  rm    sU    
0$+ !%!!' 	!' !' !' !'rR   rm  c                   :     e Zd ZdZd Zd Zd Zd Z fdZ xZ	S )IndexPutFallbackzQ
    This needs to be a custom class to handle mutation and indices properly
    c                    d | j                   D        ^}}}g }t        |      }t        | j                        D ]b  \  }}| j                  |   |j	                  t        |             0|j	                  t        j                  j                  j                         d t        j                  j                  j                   dj                  |       t        j                  j                  j                   }	||	|g| j                         }
|j                  |j                  t        j                  j                   r| j"                  n| j$                  |
             y )Nc              3   <   K   | ]  }|j                           y wrX   rW  rX  s     rQ   r  z+IndexPutFallback.codegen.<locals>.<genexpr>  s     &Rkq':':'<krY  r  )rY  r  rx   r  r  r  r8   r   r7  r  open_bracketr   closed_bracketr  r  wrap_kernel_callr  r  r  )r   r  r   rA   valid_indicesr  iter_valid_indicesrf   r   indices_strrZ   s              rQ   r  zIndexPutFallback.codegen  s   &Rdkk&R#F]!-0dll+DAq||A*t$678qww33<<=	 , --::;DIIg<N;OPQPWPWPdPdPsPsOtu;C)@)@)BC$$#$77#6#6DKK	
rR   c                      yr  rS   r   s    rQ   r  z IndexPutFallback.should_allocate!  r  rR   c                 >    | j                   d   j                         gS r   rB  r   s    rQ   r  z#IndexPutFallback.get_mutation_names$  rJ  rR   c                     i S rX   rS   r   s    rQ   r  z)IndexPutFallback.get_unbacked_symbol_defs'  r@  rR   c                 ~   || _         |D cg c]  }||	 }}||g|D cg c]  }| j                  |       }}t        |   d t	        j                               | j                  |      |f       t        j                  j                  |       | _
        d| _        d| _        t        | |       y c c}w c c}w )Nzat::index_put_zaten.index_put_)r  r  r  rk  r  r   rX  r8   r   r[  r\   r  r  r?  )	r   r   r  rA   
accumulaterf   r  r  r  s	           rQ   rk  zIndexPutFallback.__init__*  s    $+=Gqq}G=34f2M}2MN2MQ4%%a(2MNq||~&(M		
 GG++D1	*'dA& >Ns   B5B5B:)
r   r   r   r9  r  r  r  r  rk  r4  r5  s   @rQ   r  r  
  s&    
$+' 'rR   r  c                   "    e Zd Zed        Zd Zy)
DeviceCopyc                    |j                         s1t        d |j                         D              r|j                  |      S t        j
                  j                  |       t        j
                  j                  |j                                t        d       t        t        ||j                         |j                               | j                  |      g      S )Nc              3      K   | ]F  }|j                   t        j                  j                  v xr t	        |t
        j                         H y wrX   )r\   r8   r   r  r=   r&   r  r  s     rQ   r  z$DeviceCopy.create.<locals>.<genexpr>=  s@      %
" VVqww(((RZ<;Q;Q-RR"s   AAzDeviceCopy in input programr  )r  r  r   r1  r8   r   add_device_infor   r/   r  r(  r   r   r  )r  r   r   s      rQ   r  zDeviceCopy.create;  s    {{} %
[[]%
 "
 ''//	'	/78kkmZZ\
 q!"
 	
rR   c                     | j                         }t        |      dk(  sJ | j                  r.|j                  |d   | j                  j	                                y |j                  |d   | j	                                y r  )r  rc   r  codegen_device_copyr   r  s      rQ   r  zDeviceCopy.codegenP  si      "4yA~~''Q1A1A1S1S1UV''Q1G1G1IJrR   N)r   r   r   r3  r  r  rS   rR   rQ   r  r  :  s    
 
(KrR   r  c                   :     e Zd ZdZd Zd Z fdZd Zd Z xZ	S )rF   z;
    The result of a call to aten._local_scalar_dense.
    c                      yrG  rS   r   s    rQ   r   zDynamicScalar.get_reads^  rH  rR   c                      yr  rS   r   s    rQ   r  zDynamicScalar.should_allocatea  r  rR   c                    t         |   d t        t        j                  d            |g       t        |t        j                        r|| _        d| _	        y t        |t        j                        sJ |       t        |j                  d   t        j                        sJ |       |j                  d   dk(  sJ |       |j                  d   | _        d| _	        y )Nr   Fr   r$   T)r  rk  r  rB   r   r=   rH   rI   symis_boolr   rZ   )r   r  r  r  s      rQ   rk  zDynamicScalar.__init__e  s    z%,,u*=>Gc5<<(DH DL c588,1c1,chhqk5<<8=#=888A;!#(S(#xx{DHDLrR   c                     | j                   hS rX   )r  r   s    rQ   r  z&DynamicScalar.get_unbacked_symbol_defsw  s    zrR   c                    d | j                   D        \  }| j                  r"|j                  | j                   d| d       n!|j                  | j                   d| d       |j                  | j	                          d       y )Nc              3   <   K   | ]  }|j                           y wrX   rW  rX  s     rQ   r  z(DynamicScalar.codegen.<locals>.<genexpr>{  s     >+Q1&&(+rY  z = 1 if z.item() else 0r  z.item()z = None)rY  r  r  r  r   )r   r  r  s      rQ   r  zDynamicScalar.codegenz  su    >$++><<
(4&GH
#dV7;< 	T]]_-W56rR   )
r   r   r   r9  r   r  rk  r  r  r4  r5  s   @rQ   rF   rF   Y  s!     $7rR   rF   c                   6    e Zd ZU eed<   ej                  ed<   y)ExternKernelNoder\   rO   N)r   r   r   r^   r   export_schemar   rS   rR   rQ   r  r    s    
I


rR   r  c                        e Zd ZU eeeef      ed<   	 d fd	Zd Z	d Z
d Zd Zd Zed        Zd	 Zd
 Zd Zd Zedej*                  fd       Zed        Z fdZ xZS )FallbackKernelargs_default_valuec                    t         |   |t        |      t        |             g | _        d| _        d | _        t        |t        j                  j                  t        j                  j                  f      sJ d| dt        |       d       || _        || _        |i n|| _        t        j                   j#                  | j$                         y )NFz#Fails to create FallbackKernel for r  z not supported)r  rk  r?   outputsuse_runtime_dispatchabi_compatible_kernelr=   rB   _ops
OpOverloadHigherOrderOperatorrM   op_overloadr  r[   r8   r   warn_fallbackr  )r   r  r  r  nontensor_argsr  r[   r  s          rQ   rk  zFallbackKernel.__init__  s     	+.!	
 ')$)!%)"

%%

..
 	X 14<.W	X 
 ","Nb	dkk*rR   c                    ddl m} |j                  j                  rJ d|j                   d       d t        fd|j                  j                  D              sJ |j                   d       t        fd|j                  j                  D              sJ |j                   d	       |j                  j                  | _	        |j                  j                  | _        | j                  j                  d
d       d| j                   | _         ||      | _        |j                  j                  D cg c]  }|j                  s|j                   c}| _        y c c}w )Nr$   get_cpp_op_schemazmutable z" is not supported with cpp_wrapperc                 P    | j                   d u xs | j                   j                   S rX   )
alias_infois_write)r  s    rQ   is_not_writez3FallbackKernel.set_cpp_kernel.<locals>.is_not_write  s#    >>T)H1H1H-HHrR   c              3   .   K   | ]  } |        y wrX   rS   r  r   r  s     rQ   r  z0FallbackKernel.set_cpp_kernel.<locals>.<genexpr>  s      
%=LO%=rv  z< with alias_info arguments is not supported with cpp_wrapperc              3   .   K   | ]  } |        y wrX   rS   r  s     rQ   r  z0FallbackKernel.set_cpp_kernel.<locals>.<genexpr>  s      
%;LO%;rv  z: with alias_info returns is not supported with cpp_wrapper::r   )codegen.wrapperr  _schema
is_mutabler   r  r  returnsr\   r  overload_namecpp_kernel_overload_namereplacecpp_kernel_keycpp_op_schema
kwarg_onlyr  )r   r  r  r   r  s       @rQ   set_cpp_kernelzFallbackKernel.set_cpp_kernel  sY   6 ))	Jfoo&&HI	J)	I  
%+^^%=%=
 
 	\ooZ[	\ 
  
%+^^%;%;
 
 	ZooXY	Z 
 !..--(.(D(D%&&tS12!D4Q4Q3RS 	 /v6"NN44.
4qAFF4.
* .
s   9E"E"c                 0    dt        | j                        v S )N#_scaled_dot_product_flash_attention)r^   r  r   s    rQ   r  z#FallbackKernel.is_legacy_abi_kernel  s    4DKK8HHHrR   c                     t        | d      sJ d       |t        | j                        k  s J d| dt        | j                                | j                  |   d   S )Nr  z*self.args_default_value has to be providedzexpected the index z2 to be smaller than len(self.args_default_value): r   )r  rc   r  )r   rz   s     rQ   get_arg_default_valuez$FallbackKernel.get_arg_default_value  s    &
 	87	8 
 S##
 
 	w %WX[\`\s\sXtWuv	w 
 &&s+G44rR   c                      t         j                  j                  s j                  S  fd}d|i}|j	                   j
                  d       x} |       S  j
                  S )Nc                  v    t         fd j                  D              r j                   dS  j                  S )Nc              3   D   K   | ]  }j                  |      d u   y wrX   )r  )r  r  r   s     rQ   r  zQFallbackKernel._get_abi_compatible_kernel.<locals>.sdpa_ver_fn.<locals>.<genexpr>  s)       BH %%h/47 Bs    _v2)rg  r  r  r   s   rQ   sdpa_ver_fnz>FallbackKernel._get_abi_compatible_kernel.<locals>.sdpa_ver_fn  s>       $ B B  //*#..&rR   z'at::_scaled_dot_product_flash_attention)r8   r   r  r  r  r  )r   r  kernel_to_verver_fns   `   rQ   _get_abi_compatible_kernelz)FallbackKernel._get_abi_compatible_kernel  sY    ww"";;		' CKP#''>>FK8OrR   c           
         t         j                   G d d             }| j                  D cg c]  } ||j                                }}| j	                  || j
                        \  }}| j                         | _        t        j                  j                  rt        | j                  t        j                  j                        r~t!        | j                  j"                  j$                  |      D cg c]H  \  }}t        j                  j&                  j)                  |j*                  || j-                               J }}}n6|D cg c]+  }t        j                  j&                  j/                  |      - }}t        j                  j                  rt1        | d      rt3        |      }t3        | j4                        }||k  rot7        ||      D 	cg c]  }	| j9                  |	       }
}	|
D cg c]+  }t        j                  j&                  j/                  |      - }
}|j;                  |
       | j<                  j?                  |       |S c c}w c c}}w c c}w c c}	w c c}w )Nc                       e Zd ZU eed<   d Zy))FallbackKernel.codegen_args.<locals>.Shimrefc                     | j                   S rX   )r  r   s    rQ   r2  z2FallbackKernel.codegen_args.<locals>.Shim.__repr__  s    xxrR   N)r   r   r   r   r   r2  rS   rR   rQ   Shimr    s    H rR   r  r  ) r  	dataclassrY  r   r  r  r  r  r8   r   r  r=   r  rB   r  r  rj   r  r  r7  r  	real_typer  r  r  rc   r  rd   r  rw  r[   update)r   r  r   r  rZ   r[   paramn_args
n_pos_argsrf   pos_argss              rQ   r  zFallbackKernel.codegen_args  s   				  	  
	  =AKKHKqtA//12KH**;8J8JKf &*%D%D%F"77:d.>.>

@U@U#V
 !$D$4$4$<$<$F$F M	 !NHE1 $$77OOQ(A(A(C !N	   EIIDqAGG((77:DDI 77741E#FYFT445J
";@;T;TaD..q1;T   MUUHqAGG00??BHUH% 	6"K I J Vs   I4AI	0II"20I'c                    | r| d   j                         S t        |t        j                        r|j                  S t        |t
        t        f      rg|D ch c]  }t        j                  d |       }}|D cg c]  }|s|	 }}t        |      dk(  r|d   S |D ]  }|j                  dk(  s|c S  |d   S y c c}w c c}w )Nr   r$   r   )r   r=   rB   r  r   r>   r?   r  find_devicerc   rM   )r  r  r   devicesr   s        rQ   r  zFallbackKernel.find_device,  s    q>,,..nell3!(((ntUm4DRSNq~11$:NGS,3>G&vvGG>7|q qz!!;;&(!M " 1: T>s   B:9B?B?c                     t        | j                  t        j                  j                        syt        | j                        j                         S r  )r=   r  rB   r  r  r   r  r   s    rQ   r:  zFallbackKernel.has_side_effects>  s9    $**EJJ,A,ABt//0;;==rR   c                 &   t        | j                  t        j                  j                        sg S t        j
                  j                  j                  | j                        r(| j                  D cg c]  }|j                          c}S g S c c}w rX   )
r=   r  rB   r  r  rC   utilsis_viewrY  r   r   inps     rQ   r  zFallbackKernel.get_alias_namesD  sj    $**EJJ,A,ABI??  (()9)9: /3kk:ksCLLNk::	 ;s   2Bc           	         t        | t              sJ | j                  | j                  | j                        \  }}| j
                  D cg c]  }|j                  |d        }}t        d d       }|j                  | j                  ||      }d }| j                  }|j                  j                  }	t        |	      dk(  r$|	d   j                  }
 ||
| j                        g}nxt        | j                  t              sJ t        |	      t        | j                        k(  sJ t!        |	| j                        D cg c]  \  }} ||j                  |       }}}t#        | j%                         t'        j(                  | j                  j+                         ||i             }t,        j.                  j0                  j3                  |       g ||S c c}w c c}}w )Nc           	      p   t        | t        j                        ro|}t        |t        t        f      rt        |      dk(  sJ |d   }t        j                  j                  t        j                  |j                                     S t        | t        j                        rxt        | j                         t        j                        rPt        j                  j                  |D cg c]&  }t        j                  |j                               ( c}      S t        dt        |              c c}w )Nr$   r   r  )	as_tensor)
as_tensorszUnsupported return type )r=   rB   
TensorTyper>   r?   rc   r  Argumentr  TensorArgumentr   ListTypegetElementTypeRuntimeErrorrM   )return_typeoutputr   s      rQ   handle_single_outputzFFallbackKernel.export_extern_kernel_node.<locals>.handle_single_output_  s   +u'7'78ftUm4v;!+++ )C$--44+::O 5   K8Z**,e.>.>> %--44 $* #)C &44#,,.I#)  5   #%=d;>O=P#QRR s   )+D3r$   r   )r  rY  r  metadata)r\   rO   )r=   r  r  rY  r  r  r  r   serialize_inputsr  r  r  rc   r  r  r?   rj   r  r   r  r   r\   r8   r   extern_kernel_nodesr  )r   rZ   r[   r  ordered_kwargs
serializernamed_argumentsr  r  r  r  output_argumentsreturn_schemar  rO   s                  rQ   export_extern_kernel_nodez(FallbackKernel.export_extern_kernel_nodeT  s   $///**4;;8J8JKf-1-O-O
-OcFJJsD!-O 	 
 +46
$55d6F6FfU	S. !!..((w<1!!*..K 4[$,, OP dllE222w<3t||#4444 .1$,,-G -G)M6 %]%<%<fE-G   
  ##'',,.&(	
 	
##**40''''s
P s   
G=Gc                 r   | j                   }|j                  dk(  rt        |t        j                  j
                        sJ |j                  j                  d      d   }t        j                  j                  rWt        j                         r8|t        vr0t        j                  d|       d| _        | j#                  |       n|j$                  dk(  rd| nd|j                  j'                  dd	       d
| _        |j*                  }|j,                  D cg c]'  }|j.                  s|j0                  |j2                  d) c}| _        |j,                  D cg c]  }|j.                  s|j6                   c}| _        |j,                  D ci c]2  }|j.                  r$|j6                  |j0                  |j2                  d4 c}| _        nd| | _        nt        |t        j                  j>                        rQtA        t        jB                  jD                  |j                  d       |u rd|j                   | _        nntG        d      t        j                  j                  rd| _        | j#                  |       n0|jH                  j'                  dd       d|j                   | _        | j                   r| jK                  |       d }d }t        j                         r+t        j                  j                  r| jM                         }n"g | jO                         | jQ                         }|jS                  | jU                         | jW                         || jX                  | jZ                  | j\                  | j                   || j^                  	       y | jK                  |       g | jO                         | jQ                         }t        j                  j`                  jc                  | |       t        | jd                  tf              r| ji                  |       y y c c}w c c}w c c}w )Naten.r   zG%s is missing a c-shim implementation, using proxy executor as fallbackTrH  zat::z
at::_ops::r   z::call)rM   r   zaten.ztorch._prims.rng_prims.z.Unable to find HigherOrderOperator kernel namez._ops.z.ops.)5r  	namespacer=   rB   r  r  r   r  r8   r   r  r%   	is_fbcode
has_c_shimr  rK  r  r  _overloadnamer  r  r  r  r  r  r;  r  r\   r  r  r  r  rY   _prims	rng_primsr   r   r  r  r  r  6generate_extern_kernel_alloc_and_find_schema_if_neededr   r!  r  r  r  r  r7  generate_fallback_kernelr  r  r  )r   r  r  op_base_nameschemar   exported_argsrZ   s           rQ   r  zFallbackKernel.codegen  s   !!v%fejj&;&;<<<!??005a8Lww""##%&
*BKKa 15D-''/ "//9< |n-)&//*A*A#s*K)LFS O
 $^^F "(!1!1/!1A || "#qG!1/D+ )/(8(8:(81ALL(8:D6
 "(!1!11!1A<< q OO!11D- !&l^4

 > >?u||--vEO 77HI)D 
 ww"",0)##F+ ((007CDAfooEVW  $$  ) MD!agg&9&9 $ > > @E**,Et/B/B/DEJJ((*""##--  
   )AT&&(A4+>+>+@ADGG  99$E$++v.))'2 /u/
:1s   ,P*P//P/7P4r  c           	          t        | j                  | j                  t        | j	                               t        | j                                     S rX   )r  r   r   r-   r   r   )r  s    rQ   tensor_to_layoutzFallbackKernel.tensor_to_layout  s9    MMLL%fkkm4%fmmo6	
 	
rR   c                     t         j                  f}||vrt        j                  j                  n	t               }|5    j                  |g|i |\  }}}}	d d d         j                        }
|
sJ d         t        |
      ||	       fd |g       }t        |t        t        t        f      r	|_        |S |g_        |S # 1 sw Y   wxY w)N"Not sure where to find device infoc                 N    t         t        t        f      r. t                fdt	        t                     D              S t         t              r: j                         D ci c]  \  }}| |t               |fgz           c}}S t         t        j                        rt        j                               S t         t              r S t         t        j                        r j                  j                  S  J dt                d       y c c}}w )Nc              3   T   K   | ]  } |   t              |fgz          ! y wrX   )rM   )r  rf   generate_outputr  r  s     rQ   r  zAFallbackKernel.create.<locals>.generate_output.<locals>.<genexpr>  s5      $/ $F1Iw4<:K9L/LM/r  zFallbackKernel output type z is not supported)r=   r>   r?   rM   rd   rc   r@   r<  rB   r  MultiOutputr  r   SymIntrO   r  )r  r  r  r"  r  r  packeds   ``  rQ   r  z.FallbackKernel.create.<locals>.generate_output  s   &4-0#tF| $"3v;/$   FD) %+LLN$2S g$v,9L8M.MNN$2  FELL1"((0 
 FC(FELL1{{''' NQ0f>OPQ"%s   +#D!)r  *_fused_moving_avg_obs_fq_helper_functionalr8   r   	fake_moder   r  r  r  r=   r>   r?   r@   r  )r  r  rZ   r[   fake_incorrect_kernelscontextr  r  r  r  r   r  r  r  s   `           @@rQ   r  zFallbackKernel.create  s    "&"Q"Q!S!'/E!EAGG;= 	  #""6;D;F;  n=;;;vf%
	6 "."5geT23$FN  &YFNg Ws   CCc                      t         |          S rX   )r  r  r  s    rQ   r  zFallbackKernel.apply_constraint.  s    w'))rR   rX   )r   r   r   r   r
   r^   r   r   rk  r  r  r  r  r  r   r  r:  r  r  r  rB   r  r  r3  r  r  r4  r5  s   @rQ   r  r    s    T#s(^,,  +D
>I5(-^  "> <(|V3p 
 
 
 8 8t* *rR   r  c                   >     e Zd ZdZd Zd Z fdZed        Z xZ	S )ComplexViewz9View a complex number as two dtyped numbers or vice versac                      yr  rS   r   s    rQ   r  zComplexView.should_allocate6  r  rR   c                 >    | j                   d   j                         gS r   rB  r   s    rQ   r  zComplexView.get_alias_names9  s    A'')**rR   c                 h    t         |   |t        |      t        |             g | _        || _        y rX   )r  rk  r?   r  r  )r   r  r  r  r  r  s        rQ   rk  zComplexView.__init__=  s5     	+.!	
 ')rR   c           	         t         j                  j                  }|5   | j                  |g|i |\  }}}}d d d        t        j                        }	|	sJ d       t        t        |	      ||      }
t        |j                  |j                  t        |j                               t        |j                                     }t        ||
g       }|g|
_        |S # 1 sw Y   xY w)Nr  )r8   r   r  r  r  r  r  r  r  r   r   r-   r   r   r  r  )r  r  rZ   r[   r  r  r  r  r  r   r  r  r  s                rQ   r  zComplexView.createO  s    ''## #""6;D;F;   ++KH;;;vf%v{O
 !!  %n&9&9&;<%n&;&;&=>	
 ffb1!1 Ws   CC')
r   r   r   r9  r  r  rk  r3  r  r4  r5  s   @rQ   r  r  2  s)    C+$  rR   r  c                   ,    e Zd ZU ej                  ed<   y)r  r   N)r   r   r   rB   r   r   rS   rR   rQ   r  r  m  s    LLrR   r  c                   R     e Zd Zd Zd Zdeeedf      f fdZd Z	d Z
d Z xZS )	r  c                    t        |      dkD  r|d   \  }}|t        k(  r| j                  | d| d|dd        S |t        k(  rWt        j
                  j                  j                  || j                         t        |            }| j                  ||dd        S |t        k(  r| j                  | d| d|dd        S t        d      |S )Nr   r  r  r$   z['z']znon supported index type)rc   r>   codegen_list_tuple_accessr?   r8   r   r7  codegen_tuple_accessr   r^   r@   r  )r   basenamer  ityperf   tuple_accesss         rQ   r$  z%MultiOutput.codegen_list_tuple_accessv  s    w<!qzHE1}55
!A3a6H'RSRT+VV% ww33HHdmmos1v  55lGABKPP$55
"QCr6JGTUTVKXX$%?@@OrR   c                     |j                  | j                         | j                  | j                  d   j                         | j                               | j                  |       y r   )codegen_multi_outputr   r$  rY  r  r  r  s     rQ   r  zMultiOutput.codegen  sN    $$MMO**4;;q>+B+B+DdllS	
 	))'2rR   r  .c                     t         |   d ||gd       t        j                  j	                  |       | _        || _        y rG  )r  rk  r8   r   r[  r\   r  )r   r  r|  r  r  s       rQ   rk  zMultiOutput.__init__  s5    vw3GG++D1	rR   c                 <    | j                   d   j                         S r   )rY  r  r   s    rQ   r  z$MultiOutput.get_unbacked_symbol_uses  s    {{1~6688rR   c                      yr  rS   r   s    rQ   r  zMultiOutput.should_allocate  r  rR   c                     | j                   D cg c]D  }t        |t        t        f      r,t	        |j                               dkD  r|j                         F c}S c c}w r   )rY  r=   r  r  rc   r  r   r  s     rQ   r  zMultiOutput.get_alias_names  sX     {{
"#<=C'')*Q. LLN"
 	
 
s   A	A)r   r   r   r$  r  r   r   r   rk  r  r  r  r4  r5  s   @rQ   r  r  r  s5    $3tE#s(O/D 
9
rR   r  r   rG   ry  biaspaddingr   dilationgroups
transposedoutput_paddingc
                    d }
d }|j                          |j                          ||j                          t        j                  j                  5  t	        |d      }t	        |d      }t        |j                               dz
  }dt        |      cxk  r|k  sJ  J dt        |      cxk  r|k  sJ  J dt        |      cxk  r|k  sJ  J t        ||      }t        ||      }t        ||      }|	t        dg|      }	n%dt        |	      cxk  r|k  sJ  J t        |	|      }	t        |t              sJ |r( |||      }|j                         } |
||||	|||      }nR|t	        |d      n|}t        j                  j                  j                  ||||||||	|	      }|j                         }dgt        t        t!        dt        |      dz                     z   }t        |      g|z   }t#        |      }ddd       | j%                  |      }|j'                         j(                  d	k(  r|j'                         j(                  d	k(  sJ ||g}t+        |j'                         |j-                         t/              t/                    }||||g}|r|j1                  d|	       ||j3                  |       n|j1                  d|       ||||fS # 1 sw Y   xY w)
au  
    This function is a helper function to prepare inputs, layout and constant args
    for convolution post-op fusion's create function, including deciding the output
    layout (channels first or channels last), realizing inputs and make them etc. The
    function only supports the CPU device since conv post-op fusion kernel is only
    supported on CPU right now.
    c                    t        |       t        |      k(  sJ d       t        |       }|dkD  sJ d       d}d}	g }
|
j                  | |          |
j                  ||	   |z         t        d|      D ]P  }||   dz
  ||dz
     z  dz   }| |   dz
  ||dz
     z  ||dz
     dz  z
  |z   ||dz
     z   }|
j                  |       R t        t	        t
        |
            S )NzExpect input dim == weight dimrv   zExpect input dim > 2r   r$   )rc   r  rd   r>   r   r   )output_sizeweight_sizer0  r4  r   r1  r2  r  	BATCH_DIMWEIGHT_INPUT_CHANNELS_DIMr  r  r  input_size_ds                 rQ   _conv_input_sizez<_prepare_convolution_fusion_create.<locals>._conv_input_size  s    ;3{#33U5UU3+Qw...w	$%!
+i01+&?@6IJq#A!!nq(HQUO;a?FQ!#va!e}41q5>A%' !Q'(  l+  CZ())rR   c                 L   | j                         }t        |      }|dkD  sJ d       |dkD  rWg }|j                  |d   |z         |j                  |d   |z         t        d|      D ]  }|j                  ||           |S | j	                  dd      j                         }|S )Nrv   zExpect weight dim > 2r$   r   )r   rc   r  rd   	transpose)prepacked_weightr2  prepacked_weight_sizer  r8  r  s         rQ   _original_deconv_weight_sizezH_prepare_convolution_fusion_create.<locals>._original_deconv_weight_size  s     !1 5 5 7'(Qw///wA:K4Q7&@A4Q7&@A1c]""#8#;< #  +44Q:??AKrR   NTr  rv   r   r$   r   )r   r8   r   r  r   rc   r   r2   r=   r   rB   r7   r  convolutionr>   r  rd   r   r  r   rM   r  r   r-   insertr  )r  r   ry  r/  r0  r   r1  r2  r3  r4  r<  rA  x_fakeweight_faker  r8  r  r7  	bias_faker  req_stride_orderr  rY  kernel_layoutr  s                            rQ   "_prepare_convolution_fusion_createrI    s   **6" IIK
NN	
		"1$7'DA6;;=!A%3w<'4'''''3x=(D(((((3v;&$&&&&&w-$/fd+!)1#t4Ns>*2d22222).$?N&#&&& 7{FKKJ*K >B=M!$D9SW  YY^^//
F !++-K3huQFa/H&I!JJ 0125EE6{Ce 
h 	  $45A<<>%'F,=,=,?,D,D,MMM[F		!+.!-0	M fh7MQ/dQ%=-1AAAM 
	s   F1K  K)c                    |j                          |j                          ||j                          t        j                  j                  5  t	        |d      }t	        |d      }|t	        |d      n|}|6t
        j                  j                  j                  j                  |||      }n4t
        j                  j                  j                  j                  ||      }|j                         }ddg}	t        |      }
ddd       | j                  |	      }|j                         j                  dk(  r|j                         j                  dk(  sJ ||g}t!        |j                         |j#                         t%              t%        
            }g }||j'                  |       n|j)                  d|       ||||	fS # 1 sw Y   xY w)z
    This function is a helper function to prepare inputs, layout and constant args
    for linear post-op fusion's create function. The function only supports the CPU device
    since linear post-op fusion kernel is only supported on CPU right now.
    NTr  r$   r   r   )r   r8   r   r  r   rB   r7   r  addmmrH  mmr   r   r  r   rM   r  r   r-   r  rC  )r  r   ry  r/  rD  rE  rF  r  r7  rG  r  rY  rH  r  s                 rQ   _prepare_linear_fusion_createrM  1  s    IIK
NN	
		"1$7'DA9=9Id5t 	 YY^^))11F YY^^&&..F kkmq63K@) 
, 	  $45A<<>%'F,=,=,?,D,D,MMM[F		!+.!-0	M  "MdQ%=-1AAAM 
	s   B7GGc                   z     e Zd Z	 	 d fd	Zd Zedddddddee   dee   d	ee   d
edeee	      fd       Z
 xZS )ConvolutionUnaryc                 L    t         |   |||d dd       d| _        d| _        y )N'torch.ops.mkldnn._convolution_pointwisemkldnn::_convolution_pointwiser  r  convolution_pointwisea  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm)r  rk  r  r  )r   r  rY  r  r  r  s        rQ   rk  zConvolutionUnary.__init__j  s=     	<7 	 	
 6>rR   c                 F   |j                  | j                         t        j                  j                  r| j
                  n| j                  | j                         | j                  | j                         t        | j                  t              r| j                  |       y y rX   )r  r   r8   r   r  r  r  r  r  r  r=   r  r  r  r  s     rQ   r  zConvolutionUnary.codegen  ss    FFMMO ww22DOO	
 dkk6*%%g. +rR   r   rG   ry  r/  padding_stride_	dilation_r2  scalarsc           
      n    t        | |||||||      \  }}}}||t        |	      |
gz   }t        |||      S Nr  rY  r  )rI  r   rO  )r  r   ry  r/  rW  rX  rY  r2  attrrZ  	algorithmrY  r  rH  r   s                  rQ   r  zConvolutionUnary.create  s`     5WFD(GY5
1q &#G,)
 

   '
 	
rR   )rS   rQ  r   r   r   rk  r  r3  r   r   r   r   r  r4  r5  s   @rQ   rO  rO  i  s    
 8>:	/ 

 
 	

 s)
 c
 9
 
 $s)$
 
rR   rO  c                        e Zd Z	 	 d fd	Zd Zedddddddddee   d	ee   d
ee   dedede	e
   de	e   de	ee      de	e   fd       Z xZS )ConvolutionBinaryc                 h    t         |   |||d dd       d| _        d| _        d| _        || _        y )Nz.torch.ops.mkldnn._convolution_pointwise.binaryrR  rS  binaryconvolution_pointwise_binarya  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& other_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view binary_attr,
                c10::optional<at::Scalar> alpha,
                c10::optional<c10::string_view> unary_attr,
                torch::List<c10::optional<at::Scalar>> unary_scalars,
                c10::optional<c10::string_view> unary_algorithm))r  rk  r  r  r  cpp_constant_args)r   r  rY  r  rf  r  s        rQ   rk  zConvolutionBinary.__init__  sP     	C7 	 	
 )1%<D "3rR   c                    |j                  | j                         | j                         | j                         | j                  | j
                  | j                         t        | j                  t              r| j                  |       y y rX   )r  r   r!  r  r  r  r  r=   r  r  r  r  s     rQ   r  zConvolutionBinary.codegen  so    FFMMO$$&))	
 dkk6*%%g. +rR   r   rG   r  ry  r/  rW  rX  rY  r2  binary_attrbinary_alpha
unary_attrunary_scalarsunary_algorithmc           
          t        | |||||||      \  }}}}| j                  ||      }|j                  d|       ||	|
|t        |      |gz   }t	        |||      S )Nr$   r]  )rI  r  rC  r   rb  )r  r   r  ry  r/  rW  rX  rY  r2  rh  ri  rj  rk  rl  rY  r  rH  rG  s                     rQ   r  zConvolutionBinary.create  s    , /FD(GY
	
 ((0@Aa%#M2)
 
 ! '
 	
rR   )rS   rS   )r   r   r   rk  r  r3  r   r   r^   r   r:  r   r  r4  r5  s   @rQ   rb  rb    s    
  3D
/ %
%
 %
 	%

 %
 s)%
 c%
 9%
 %
 %
 uo%
 SM%
  S	*%
 "#%
 %
rR   rb  c                        e Zd Z	 d fd	Zd Zd Zd Zeddddddd	dd
ee	   dee	   dee	   de	de
dee   dee
   deee      dee
   fd       Z xZS )ConvolutionBinaryInplacec                 z    |d   |d   g|dd  z   }t         |   |||d dd       d| _        d| _        d	| _        y )
Nr$   r   rv   z/torch.ops.mkldnn._convolution_pointwise_.binaryzmkldnn::_convolution_pointwise_rS  rd  convolution_pointwise_binary_a  
            at::Tensor&(
                at::Tensor& other_t,
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view binary_attr,
                c10::optional<at::Scalar> alpha,
                c10::optional<c10::string_view> unary_attr,
                torch::List<c10::optional<at::Scalar>> unary_scalars,
                c10::optional<c10::string_view> unary_algorithm)r  rk  r  r  r  )r   rH  rY  r  reordered_inputsr  s        rQ   rk  z!ConvolutionBinaryInplace.__init__  sd     #1Ivay1F12J>D8 	 	
 )1%=DrR   c                     |j                  | j                         | j                         | j                         | j                  | j
                  | j                         y rX   r  r   r!  r  r  r  r  r  s     rQ   r  z ConvolutionBinaryInplace.codegen,  L    FFMMO$$&))	
rR   c                 >    | j                   d   j                         gS r   rB  r   s    rQ   r  z+ConvolutionBinaryInplace.get_mutation_names6  rJ  rR   c                     i S rX   rS   r   s    rQ   r  z1ConvolutionBinaryInplace.get_unbacked_symbol_defs9  r@  rR   r   rG   r  ry  r/  rW  rX  rY  r2  rh  ri  rj  rk  rl  c           
      *   t        | |||||||      \  }}}}| j                  ||      }|j                  d|       ||	|
|t        |      |gz   }t	        t        |d   j                               ||      }t        ||d          |j                  d   S )Nr$   )rH  rY  r  r   )	rI  r  rC  r   ro  r  r   r?  rY  )r  r   r  ry  r/  rW  rX  rY  r2  rh  ri  rj  rk  rl  rY  r  r   rG  r  s                      rQ   r  zConvolutionBinaryInplace.create<  s    , /FD(GY
	
 ((0@Aa%#M2)
 
 *$VAY%9%9%;<'

 	ffQi0 }}QrR   rS   )r   r   r   rk  r  r  r  r3  r   r   r^   r   r:  r   r  r4  r5  s   @rQ   ro  ro    s    
 	"DH
+ * *  *  	* 
 *  s)*  c*  9*  *  *  uo*  SM*   S	**  "#*  * rR   ro  c                   8     e Zd Z	 d fd	Zd Zed        Z xZS )MKLPackedLinearc                 L    t         |   |||d dd       d| _        d| _        y )Nztorch.ops.mkl._mkl_linearzmkl::_mkl_linearrS  
mkl_lineara  
            at::Tensor(
                const at::Tensor& self,
                const at::Tensor& mkl_weight_t,
                const at::Tensor& origin_weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                const int64_t prepack_batch_size)rU  r   r  rY  r  r  s       rQ   rk  zMKLPackedLinear.__init__k  s=     	.) 	 	
 +5rR   c                     |j                  | j                         | j                         | j                         | j                  | j
                         y rX   r  r   r!  r  r  r  r  s     rQ   r  zMKLPackedLinear.codegen  C    FFMMO$$&	
rR   c                    | j                  | j                  |            }| j                  | j                  |            }|j                         ^ }}|j                         \  }}t        |      |gz   }t	        |      }	|||g}
d |g}t        t        |j                         |j                         ||	      |
|      S r\  )	r  r  r   r>   r   r|  r  r   r   )r  r   packed_worig_w
batch_sizemr   ocr7  r  rY  r  s               rQ   r  zMKLPackedLinear.create  s     1 1! 45$$S%6%6v%>?

A!A1gn3K@Xv&z*{M '
 	
rR   rz  )r   r   r   rk  r  r3  r  r4  r5  s   @rQ   r|  r|  j  s&    
 	5.
 
 
rR   r|  c                   >     e Zd Z	 d fd	Zd Zed        Zd Z xZS )LinearUnaryc                 L    t         |   |||d dd       d| _        d| _        y )Nz"torch.ops.mkldnn._linear_pointwisemkldnn::_linear_pointwiserS  linear_pointwiseaL  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                c10::string_view attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm)rU  r  s       rQ   rk  zLinearUnary.__init__  s=     	72 	 	
 1>rR   c                     |j                  | j                         | j                         | j                         | j                  | j
                         y rX   r  r  s     rQ   r  zLinearUnary.codegen  r  rR   c           	         | j                  | j                  |            }| j                  | j                  |            }|j                         ^ }}|j                         \  }	}||g}
||r|ndg|g}|2| j                  | j                  |            }|
j                  |       n|j	                  dd        t        t        |j                         |j                         t        |      |	gz         |
|      S )Nr  r   r  r]  )
r  r  r   r  rC  r  r(  r   r   r>   )r  r   wrx  r^  rZ  r_  r  icr  rY  r  s               rQ   r  zLinearUnary.create  s    ""3#4#4Q#78""3#4#4Q#78BBQ'wtYG=&&s'8'8';<AMM!  D)!||~kkm!Wt^
 '
 	
rR   c                      y rX   rS   r   s    rQ   r  zLinearUnary.apply_constraint  r5  rR   rz  )	r   r   r   rk  r  r3  r  r  r4  r5  s   @rQ   r  r    s+    
 	>0
 
 
0rR   r  c                   B     e Zd ZdZ	 d fd	Zd Zed        Zd Z xZ	S )LinearBinary)torch.ops.mkldnn._linear_pointwise.binaryc                 Z    t         |   |||d dd       d| _        d| _        d| _        y )Nr  r  rS  rd  linear_pointwise_binarya  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& other_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                c10::string_view attr)
        rr  r  s       rQ   rk  zLinearBinary.__init__  sE     	>2 	 	
 )1%7rR   c                     |j                  | j                         | j                         | j                         | j                  | j
                  | j                         y rX   ru  r  s     rQ   r  zLinearBinary.codegen  rv  rR   c           	      0   | j                  | j                  |            }| j                  | j                  |            }| j                  | j                  |            }|j                         ^ }}|j                         \  }}|||g}	|g}
|2| j                  | j                  |            }|	j                  |       n|
j	                  d|       t        t        |j                         |j                         t        |      |gz         |	|
      S )Nr   r  r]  )
r  r  r   r  rC  r  r(  r   r   r>   )r  r   yr  rx  r^  r  r  r  rY  r  s              rQ   r  zLinearBinary.create  s    ""3#4#4Q#78""3#4#4Q#78""3#4#4Q#78BBQ=&&s'8'8';<AMM!  A&!||~kkm!Wt^
 '
 	
rR   c                      y rX   rS   r   s    rQ   r  zLinearBinary.apply_constraint  r5  rR   rz  )
r   r   r   r  rk  r  r3  r  r  r4  r5  s   @rQ   r  r    s0    8F 	2
 
 
4rR   r  c                        e Zd Z	 d fd	Zd Zedddddddee   dee   d	ee   d
ee   dedeee	      fd       Z
 xZS )ConvolutionTransposeUnaryc                 L    t         |   |||d dd       d| _        d| _        y )Nz1torch.ops.mkldnn._convolution_transpose_pointwisez(mkldnn::_convolution_transpose_pointwiserS  convolution_transpose_pointwisea  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef output_padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm)rU  r  s       rQ   rk  z"ConvolutionTransposeUnary.__init__$  s>     	FA 	 	
 @>rR   c                     |j                  | j                         | j                         | j                         | j                  | j
                         y rX   r  r  s     rQ   r  z!ConvolutionTransposeUnary.codegenA  r  rR   r   rG   ry  r/  rW  output_padding_rX  rY  groups_rZ  c                 v    d}t        | |||||||||
      \  }}}}||	t        |
      |gz   }t        |||      S )NTr]  )rI  r   r  )r  r   ry  r/  rW  r  rX  rY  r  r^  rZ  r_  r3  rY  r  rH  r   s                    rQ   r  z ConvolutionTransposeUnary.createJ  sy     
 /
	
 &#G,)
 

 ) '
 	
rR   rz  r`  r5  s   @rQ   r  r  #  s    
 	>:
 )
)
 )
 	)

 s))
 c)
 c)
 9)
 )
 $s)$)
 )
rR   r  c            !       z     e Zd Z	 d fd	Zeddddddddddddd	dd
edee   dededededededef d       Z xZ	S )MkldnnRnnLayerc                 0    t         |   |||d dd       y )Nzaten.mkldnn_rnn_layerzat::mkldnn_rnn_layerrS  r  rk  r  s       rQ   rk  zMkldnnRnnLayer.__init__x  s)     	*- 	 	
rR   r   rG   w0w1w2w3hxcxr  batch_sizesrW  hidden_size
num_layers
has_biasesbidirectionalbatch_firsttrainc                 b   | j                  | j                  |            }|j                          | j                  | j                  |            }| j                  | j                  |            }| j                  | j                  |            }| j                  | j                  |            }| j                  | j                  |            }|j                          | j                  | j                  |            }|j                          |j                         }t	        |      dk(  sJ d       |\  }}}|||g}|j                         }|j                         }g }|||||||g}||	|
||||||g	}t        t        |j                               ||      }d }|||g} |||      t        |      t        |      g}t        t        ||            D  cg c]D  \  }\  }} t        t        |j                         |j                         ||       |t        |fg      F }!}}} |!S c c} }}w )Nru   zExpect lstm input to be 3D)rY  r  c                 B    t        |       dk(  sJ d       t        |       S )Nru   zExpect output_shape to be 3D)rc   r   )output_shaper  s     rQ   get_strides_of_lstm_outputz9MkldnnRnnLayer.create.<locals>.get_strides_of_lstm_output  s&    |$)I+II).|<<rR   )r  r  r  r   rc   r  r  r   r   rx   rj   r  r  r   r?   )"r  r   r  r  r  r  r  r  r  r  rW  r  r  r  r  r  r  r  
seq_length
mini_batchr  hy_shapecy_shaperesrY  r  r  r  output_sizesoutput_stridesrf   r7  r  	output_irs"                                     rQ   r  zMkldnnRnnLayer.create  sF   (  1 1! 45 	
  !2!22!67  !2!22!67  !2!22!67  !2!22!67  !2!22!67
  !2!22!67
ZZ\
:!#A%AA# .8*
J
"J<;;=;;=RRR,

  alln-'
	= %h9&|[A'1'1
  4=L.14
4//K LLNKKM!	 	4 	 
  !
s   A	H*rz  )
r   r   r   rk  r3  r   r   r   r  r4  r5  s   @rQ   r  r  w  s    
 	
 YY Y 	Y
 Y Y Y Y Y #YY Y Y Y Y Y  !Y" #Y YrR   r  c                        e Zd Z	 d fd	Zd Zedddededdddd	dd
ddee   dee   dee   dededefd       Z	 xZ
S )QConvPointWisePT2Ec                 r    t        |      dk(  | _        t        |   |||ddd       d| _        d| _        y)a  
        if bias is not None
            - inputs = [x, w, b, weight_scale, weight_zp]
            - const_args is: [stride, padding, dilation, groups, x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, weight_scale, weight_zp]
            - const_args is: [bias, stride, padding, dilation, groups, x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        rk  Nz"torch.ops.onednn.qconv2d_pointwiseonednn::qconv2d_pointwiserS  qconv2d_pointwisea  
            at::Tensor(
                at::Tensor act,
                double act_scale,
                int64_t act_zero_point,
                at::Tensor weight,
                at::Tensor weight_scales,
                at::Tensor weight_zero_points,
                c10::optional<at::Tensor> bias,
                torch::List<int64_t> stride,
                torch::List<int64_t> padding,
                torch::List<int64_t> dilation,
                int64_t groups,
                double inv_output_scale,
                int64_t output_zero_point,
                c10::optional<c10::ScalarType> output_dtype,
                c10::string_view attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm)rc   has_biasr  rk  r  r  r  s       rQ   rk  zQConvPointWisePT2E.__init__  sM      Fq(72 	 	
 2>rR   c                    | j                   D cg c]  }|j                          }}g }|j                  | j                                |d   }|d   }| j                  r|d   n|d   }|d   |d   }}|dd  \  }	}
}}}}}}}}}}||||||||	|
||||||||f}|j                  | j                         | j                         || j                  | j                         t        | j                  t              r| j                  |       y y c c}w )Nr   r$   rv   r  irY  r   rw  r  r  r  r   r!  r  r  r=   r  r  r  )r   r  r   rZ   
const_argspacked_weightr/  w_scalew_zpr   r0  r1  r2  x_scalex_zpo_inv_scaleo_zpoutput_dtyperj  rk  rl  r  s                         rQ   r  zQConvPointWisePT2E.codegen  sK   /3{{;{!##%{;
$1134GQ--tAwZ]R$r( st	
 #
& 	FFMMO$$&	
 dkk6*%%g. +c <s   D
r   rG   r  r  ry  r  r  r/  rX  rW  rY  r2  r  output_zero_pointc                 v   d}d }t        | ||||	||
|||
      \  }}}}||d   |d   c|d<   |d<   n|d   |d   c|d<   |d<   |j                          |j                          |||gz   }|||||||t        |      |gz   }|+|t        j                  t        j
                  fv sJ ||_        t        |||      S NFrv   r$   r   r]  )rI  r   r   rB   float32rZ  r   r  )r  r   r  r  ry  r  r  r/  rX  rW  rY  r2  r  r  r  rj  rk  rl  r3  r4  rY  r  rH  r   s                           rQ   r  zQConvPointWisePT2E.createI  s   * 
4V5
1q <1>q1A=QRCS.M!mA.1>q1A=QRCS.M!mA.7D/)%#M2	)
 	
 #EMM5>>#BBBB #/M! '
 	
rR   rz  )r   r   r   rk  r  r3  r:  r   r   r  r4  r5  s   @rQ   r  r    s    
 	,>\4/l @
@
 @
 	@

 @
 @
 @
 @
 c@
 s)@
 9@
 @
 @
 @
 @
rR   r  c                   t     e Zd Z	 d fd	Zd Zedddddddddee   d	ee   d
ee   deddddfd       Z xZ	S )QConvPointWiseBinaryPT2Ec                     t        |      dk(  | _        t        |   |||ddd       d| _        d| _        d| _        y)	a~  
        Needs input/weight/output qparams
        if bias is not None
            - inputs = [x, w, b, accum, w_scale, w_zp]
            - const_args = [stride, padding, dilation, groups, x_scale, x_zp, accum_scale, accum_zp, o_inv_scale, o_zp,
            fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, accum, w_scale, w_zp]
            - const_args = const_args is: [bias, stride, padding, dilation, groups, x_scale, x_zp, accum_scale,
            accum_zp, o_inv_scale, o_zp, fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
           Nz)torch.ops.onednn.qconv2d_pointwise.binaryr  rS  rd  qconv2d_pointwise_binarya  
            at::Tensor(
                at::Tensor act,
                double act_scale,
                int64_t act_zero_point,
                at::Tensor accum,
                double accum_scale,
                int64_t accum_zero_point,
                at::Tensor weight,
                at::Tensor weight_scales,
                at::Tensor weight_zero_points,
                c10::optional<at::Tensor> bias,
                torch::List<int64_t> stride,
                torch::List<int64_t> padding,
                torch::List<int64_t> dilation,
                int64_t groups,
                double inv_output_scale,
                int64_t output_zero_point,
                c10::optional<c10::ScalarType> output_dtype,
                c10::string_view binary_attr,
                c10::optional<at::Scalar> alpha,
                c10::optional<c10::string_view> attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm))rc   r  r  rk  r  r  r  r  s       rQ   rk  z!QConvPointWiseBinaryPT2E.__init__  sU    " Fq(>2 	 	
 )1%8>rR   c                 P   | j                   D cg c]  }|j                          }}g }|j                  | j                                |d   }|d   }| j                  r|d   n|d   }|d   |d   |d   }	}}|dd  \  }
}}}}}}}}}}}}}}}|||||||||	||
|||||||||||f}|j                  | j                         | j                         || j                  | j                  | j                         t        | j                  t              r| j                  |       y y c c}w )Nr   r$   rv   r  r  i)rY  r   rw  r  r  r  r   r!  r  r  r  r=   r  r  r  )r   r  r   rZ   r  r  r/  accumr  r  r   r0  r1  r2  r  r  accum_scaleaccum_zpr  r  r  rh  alpharj  rk  rl  	conv_argss                              rQ   r  z QConvPointWiseBinaryPT2E.codegen  sw   /3{{;{!##%{;
$1134GQ--tAwZ]#Bxb48w$ st#	
 -
	0 	FFMMO$$&))	
 dkk6*%%g. +u <s   D#r   rG   r  ry  r/  rX  rW  rY  r2  r  r  c                 |   d}d }t        | |||
||||||
      \  }}}}| j                  ||      }|j                  |       |
|d   |d   c|d<   |d<   n|d   |d   c|d<   |d<   |j                          |	j                          |||	gz   }|||||||||||t	        |      |gz   }|||_        t        |||      S r  )rI  r  r  r   r   r   r  )r  r   r  r  r  r  r  ry  r  r  r/  rX  rW  rY  r2  r  r  r  rh  r  rj  rk  rl  r3  r4  rY  r  rH  rG  s                                rQ   r  zQConvPointWiseBinaryPT2E.create  s2   4 
 /
	
 ((0@Ae <1>q1A=QRCS.M!mA.1>q1A=QRCS.M!mA.7D/)%#M2)
 
 # #/M' '
 	
rR   rz  )
r   r   r   rk  r  r3  r   r   r  r4  r5  s   @rQ   r  r    s    
 	3>j=/~ P
P

 P
 P
 P
 cP
 s)P
 9P
 P
  !!P
" '#P
 P
rR   r  c                   ^     e Zd Z	 d fd	Zd Zedddededdddd	dd
ddedefd       Z xZ	S )QLinearPointwisePT2Ec                 r    t        |      dk(  | _        t        |   |||ddd       d| _        d| _        y)a  
        if bias is not None
            - inputs = [x, w, b, weight_scale, weight_zp]
            - const_args is: [x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, weight_scale, weight_zp]
            - const_args is: [bias, x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        rk  Nz"torch.ops.onednn.qlinear_pointwisezonednn::qlinear_pointwiserS  qlinear_pointwisea]  
            at::Tensor(
                at::Tensor act,
                double act_scale,
                int64_t act_zero_point,
                at::Tensor weight,
                at::Tensor weight_scales,
                at::Tensor weight_zero_points,
                c10::optional<at::Tensor> bias,
                double inv_output_scale,
                int64_t output_zero_point,
                c10::optional<c10::ScalarType> output_dtype,
                std::string post_op_name,
                torch::List<c10::optional<at::Scalar>> post_op_args,
                std::string post_op_algorithm)r  r  s       rQ   rk  zQLinearPointwisePT2E.__init__W  sM      Fq(72 	 	
 22rR   c                    | j                   D cg c]  }|j                          }}g }|j                  | j                                |d   }|d   }| j                  r|d   n|d   }|d   |d   }}|dd  \  }	}
}}}}}}||	|
||||||||||f}|j                  | j                         | j                         || j                  | j                         t        | j                  t              r| j                  |       y y c c}w )Nr   r$   rv   r  r  ir  )r   r  r   rZ   r  r  r/  r  r  r  r  r  r  r  rj  rk  rl  r  s                     rQ   r  zQLinearPointwisePT2E.codegen  s2   /3{{;{!##%{;
$1134GQ--tAwZ]R$r( rsO		
 
 	FFMMO$$&	
 dkk6*%%g. +S <s   Dr   rG   r  r  ry  r  r  r/  r  r  c           
         t        | |||      \  }}}}|j                          |j                          |||gz   }|||||	|
|t        |      |gz   }|
+|
t        j                  t        j
                  fv sJ |
|_        t        |||      S r\  )rM  r   r   rB   r  rZ  r   r  )r  r   r  r  ry  r  r  r/  r  r  r  rj  rk  rl  rY  r  rH  r   s                     rQ   r  zQLinearPointwisePT2E.create  s    " 5R	5
1q 	7D/)%#M2	)
 	
 #EMM5>>#BBBB #/M# '
 	
rR   rz  )
r   r   r   rk  r  r3  r:  r   r  r4  r5  s   @rQ   r  r  V  s    
 	(2T,/\ /
/
 /
 	/

 /
 /
 /
 /
 /
 /
 /
rR   r  c                   V    e Zd ZU dZeed<   d Zd ZddZe	d        Z
d Zd	 Zd
 ZeZy)r  zC
    TensorBox / StorageBox allow in-place mutation of Tensors
    r  c                     t        | j                  |      }t        |      r|S t        t	        | j                        j
                   d| d      )Nr   z not callable)rY   r  callableAttributeErrorrM   r   )r   r\   r]   s      rQ   __getattr__zMutableBox.__getattr__  sE    TYY%B<ITYY 8 894&NOOrR   c                 6    | j                   j                         S rX   r  r   s    rQ   r   zMutableBox.realize  r  rR   Nc                 8    | j                   j                  |      S rX   )r  r   r   s     rQ   r   zMutableBox.codegen_reference  s    yy**622rR   c                 .    | j                   j                  S rX   r  r   s    rQ   r  zMutableBox.layout  s    yyrR   c                     | j                   S rX   r3  r   s    rQ   r   zMutableBox.get_layout  r
  rR   c                 6    | j                   j                         S rX   )r  r   r   s    rQ   r   zMutableBox.get_size  r  rR   c                 t   t        | j                  t              rQt        |       j                   dt        | j                        j                   d}d}| j                  j                  }n&t        |       j                   d}| j                  }d}|t        t        |            |g}dj                  |      S )NrZ  z))rd  
)r=   r  r  rM   r   r   r^   r   )r   line0endlr  r   s        rQ   r   zMutableBox.__str__   s    dii,Dz**+1T$))_-E-E,FaHEDIINNEDz**+1-EIIED 3u:

 yyrR   rX   )r   r   r   r9  r   r   r  r   r   r  r  r   r   r   r2  rS   rR   rQ   r  r    sH     LP#3    $ " HrR   r  c                       e Zd Zed        Zy)rG   c                 *    t        t        |             S rX   )rG   r  )r  s    rQ   r  zTensorBox.create  s    D)**rR   N)r   r   r   r   r  rS   rR   rQ   rG   rG     s    + +rR   c                   J    e Zd Zd Zd Zd Zd Zd Zed        Z	ed        Z
y)	r  c                     t        | j                  t        t        f      r4| j                  j	                         t
        j                  j                  v S yr  )r=   r  r  r  r   r8   r   graph_inputsr   s    rQ   r  zStorageBox.is_input_buffer  s:    dii+!?@99%%'177+?+???rR   c           	      h   t        | j                  t        t        t        t
        t        f      r| j                  j                         S t        | j                  t        t        f      sJ t        | j                               | j                  j                         }| j                  j                         }t        d t        | j                  j                         | j                  j                         | j                  j!                               | j                        | _        t"        j$                  j'                  | j                        | j                  _        | j*                  | j                  _        || j                  _        || j                  _        | j                  j(                  S )Nr  r  )r=   r  r  rW  r  r  rU  r   r=  r  rM   r  r   r(  r   r   r   r8   r   r[  r\   r   r   r   )r   r   r   s      rQ   r   zStorageBox.realize   s*   II	
 99%%''$))i%;<Md499oM<ii//1II++-	"!yy++-ii))+YY'')
 
	 00;		 LL		 +		'		yy~~rR   c                     t        | j                  t        t        f      r6| j	                         dkD  r"| j                         r| j                          yyyy)zL
        Called on buffers we expect to be forced to realize later.
        r$   N)r=   r  r=  r  r  8is_pointwise_non_scalar_tensor_num_reads_larger_than_oner   r   s    rQ   r   zStorageBox.realize_hint>  sK    
 tyy9i"89 1$MMOLLN P % :rR   c                     t        | j                  t              xrD | j                         t        j
                  kD  xs! | j                         t        j                  kD  S rX   )r=   r  r=  r  r%   realize_acc_reads_thresholdr   realize_bytes_thresholdr   s    rQ   r   z!StorageBox.has_exceeded_max_readsI  sL    $))Y/ 
NNvAAA H$$&)G)GG	
rR   c                    dt         t        t        f   fd}|dkD  rt        | j                  t        t        f      r| j                         t        j                  kD  sQt        | j                               t        j                  kD  s't        | j                        r$ || j                        r| j                          yyyyy)zj
        A heuristic to decide if we should realize a tensor
        that is used multiple times.
        loopsc                 R    dg}| j                         t        fd|D              S )zW
            The heuristic for realizing reused result of heavy ops on cpu
            expc              3   ,   K   | ]  }|d z   v   yw)rZ  NrS   )r  rG  fn_strs     rQ   r  zGStorageBox.mark_reuse.<locals>.should_realize_on_cpu.<locals>.<genexpr>[  s     @iSV+is   )r   rg  )r  	heavy_opsr	  s     @rQ   should_realize_on_cpuz4StorageBox.mark_reuse.<locals>.should_realize_on_cpuU  s+     I'')F@i@@@rR   r$   N)r   r=  r  r=   r  r  r%   realize_reads_thresholdrc   r   r  r   r   )r   r  r  s      rQ   r   zStorageBox.mark_reuseO  s    	Ay)/C)D 	A AI499y)&<= 6#A#AAt((*+f.L.LL499%*?		*J LLN +K%	 > rR   c           	         | j                   }t        |t        t        t        f      ryt        |t
              r|j                         }nxt        |t        t        f      sJ t        |             t        d t        |j                         |j                         |j                               |      j                         }t        |j                        S )Nr$   r  r  )r  r=   rW  r  r  r  r  r=  r  rM   r(  r   r   r   rc   r+  )r   r  r  s      rQ   r  zStorageBox.num_readsh  s    yyd\;HIdN+..0KdY	$:;GT$ZG;(%??,..*
  o  ;$$%%rR   c                     t        | j                  t              rWt        d | j                  j	                         D              r-t        d | j                  j	                         D              dkD  S dS )Nc              3   R   K   | ]  }t        |t        j                          ! y wrX   )r=   r&   r  r  reads     rQ   r  zVStorageBox.is_pointwise_non_scalar_tensor_num_reads_larger_than_one.<locals>.<genexpr>  s(      1D t\%9%9::1s   %'c              3   :   K   | ]  }|j                   d k7    yw)r   N)re   r  s     rQ   r  zVStorageBox.is_pointwise_non_scalar_tensor_num_reads_larger_than_one.<locals>.<genexpr>  s     C-BTq-Bs   r$   T)r=   r  r=  r  r   rk  r   s    rQ   r   zCStorageBox.is_pointwise_non_scalar_tensor_num_reads_larger_than_one|  se    
 $))Y/  II//1  CTYY-@-@-BCCaG	
 	
rR   N)r   r   r   r  r   r   r   r   r,   r  r   rS   rR   rQ   r  r    sC    
<	
2 & && 

 

rR   r  c                        e Zd Ze ej
                  d      d               Z fdZdej                  j                  def fdZ fdZ xZS )InterpreterShimNc                  H    t         j                  j                  t              S rX   )rB   r   symbolic_tracer   rS   rR   rQ   	_dummy_gmzInterpreterShim._dummy_gm  s     xx&&x00rR   c                     t         |   | j                         d       | | _        || _        || _        d| _        |j                  | _        d | _	        y )NF)garbage_collect_values)
r  rk  r  moduler   
submodulesextra_tracebackr  
fetch_attrr  )r   r   r  r  s      rQ   rk  zInterpreterShim.__init__  sP     	)%H
$$$00 rR   r  r   c                 0    || _         t        | 	  |      S rX   )r  r  run_node)r   r  r  s     rQ   r  zInterpreterShim.run_node  s    w""rR   c                 x    t        j                  |       5  t        |   |i |cd d d        S # 1 sw Y   y xY wrX   )r8   set_interpreter_handlerr  run)r   rZ   r[   r  s      rQ   r"  zInterpreterShim.run  s.    &&t,7;// -,,s   09)r   r   r   r   r  	lru_cacher  rk  rB   r   r   r   r  r"  r4  r5  s   @rQ   r  r    sT    Y1  1	!#%((-- #C #0 0rR   r  c                        e Zd ZdZ fdZed        Zed        Zd Zde	j                  fdZd Zd	 Zd
 Zd Zd Z xZS )r8  z
    Captures the body of a Loops subclass into an FX graph.  Persists any
    indexing simplifications and makes it easier to analyze loop bodies.
    c                    t         |           || _        i | _        i | _        g | _        g | _        i | _        i | _        g | _	        d| j                  i| _        i | _        g | _        t        | ||      | _        d | _        y )N	get_index)r  rk  r>  r9  indexing_exprs_namer+  writesr:  r;  r  r&  r  	subblocksindirect_varsLoopBodyBlock
root_blockindexing)r   r]   rZ   r>  r  s       rQ   rk  zLoopBody.__init__  s    $ #% 
! "
&7'b$7rR   c                     t        j                  | j                  j                  fd | j                  j                         D              }|D cg c]  }|j                  D ]  }|  c}}S c c}}w )Nc              3   4   K   | ]  }|j                     y wrX   )r   )r  blocks     rQ   r  z%LoopBody.get_nodes.<locals>.<genexpr>  s     >&=UU[[&=   )r  chainr,  r   r)  rA   rN   )r   
all_graphsr   rO   s       rQ   	get_nodeszLoopBody.get_nodes  s[    ____""$>dnn&;&;&=>

 #-E**EEEs   A.c                     ddl m}  ||       S )Nr$   )	BoundVars)boundsr6  )r   r6  s     rQ   r7  zLoopBody.bounds  s     	&rR   c           	         dt        | j                         g}|j                  | j                  j	                         D cg c]  \  }}| d|  c}}       |j                  t        j                  d| j                  fg| j                  j	                               D cg c]  \  }}|j                  |       c}}       dj                  |      S c c}}w c c}}w )Nzvar_ranges = r  rC  r  )r@   r>  rw  r9  r<  r  r2  r,  r)  	debug_strr   )r   r   r\   r"  r0  s        rQ   r9  zLoopBody.debug_str  s     doo!6 7899L9L9R9R9TU9TID#c#'9TUV $-??doo./1E1E1G$$KD% %$	
 yy Vs   C
$C
r  c                    t        | |      j                  |       ||t        | | d      |<   || j                  vr6dt        | j                         }|| j                  |<   || j                  |<   | j                  |   S )N
_name2exprre   )rY   r  r'  rc   r9  )r   r  categorybuf_namer\   s        rQ   add_index_exprzLoopBody.add_index_expr  s    h&&t,?CGDXJj128<t///3t22345D-1D$$T*(,D%''--rR   c                     |d   j                         r|| j                  vr|}n| t        | j                         }|| j                  |<   |S )zaNot actually for nn.Modules, but subblocks in generated code are mapped to FX call_module opcodesr  )	isnumericr  rc   )r   r0  r:   r\   s       rQ   add_submodulezLoopBody.add_submodule  sM    ":!fDOO&CDXc$//234D %rR   c                     dt        | j                         }t        |      }| j                  j                  |       |S )Nindirect)rc   r*  r6   r  )r   r   r\   r  s       rQ   add_indirectzLoopBody.add_indirect  s=    #d001234 !!#&
rR   c           
          t        |      t        |      k(  ry| j                  J | j                  j                         D ci c]  \  }}|t        |||i       c}}| _        yc c}}w )z,Swap in a variable used in indirect indexingN)r^   r-  r<  r5   )r   r   newr  r%  s        rQ   replace_indirectzLoopBody.replace_indirect  sa    s8s3x}}(((BF--BUBUBWXBW$!QJq3*55BWXXs   A'c                 <    | j                   J | j                   |   S rX   )r-  r   s     rQ   r&  zLoopBody.get_index  s!    }}(((}}T""rR   c           	          t        t        j                  |       }t        |      t         j                        k(  sJ | j                  f       t         fd|D              sJ t        t         j                  j                         |            } j                  j                         D ci c]  \  }}|t        ||       c}} _         j                         }d  _        |S c c}}w )Nc              3   :   K   | ]  }|j                   v  y wrX   )r>  )r  r%  r   s     rQ   r  z$LoopBody.__call__.<locals>.<genexpr>  s     ;U1DOO+Us   )r>   r  r2  rc   r>  r  r@   rj   r  r9  r<  r5   r-  r,  )r   r  re   r  r\   r  r  s   `      rQ   __call__zLoopBody.__call__  s    Y__g./5zS11KE4??3KK1;U;;;;C 4 4 6>? #11779
9
d *T<009
 "
s   1C*)r   r   r   r9  rk  r,   r4  r7  r9  rH   r   r>  rA  rD  rG  r&  rK  r4  r5  s   @rQ   r8  r8    sf    
  F F   .5:: .Y#rR   r8  c                   B    e Zd ZdZdededef   dee   fdZd Z	d
dZ
y	)r+  a  
    Captures the body of a Loops subclass into an FX graph.
    In normal cases there will be a 1:1 mapping between LoopBody and
    LoopBodyBlock, hower in the case of ops.masked() the masked out
    operations will manifest as an extra LoopBodyBlock.
    rC  r]   .rZ   c                 V   	
 | _         d 
fd		 G 	 
fddt        j                        }t        j                  j                         
t        j                  j                  
j                        
_        
j                  dddi       }dd	l
m} dd
lm}  | ||       j                   j                        }t        j                   r ||      }t        j"                  |      5  t%        j&                   ||        d d d        
j                   _        y # 1 sw Y   xY w)Nc           	      d    j                  ddj                  j                  | ||      fi       S )Ncall_moduler&  )create_proxyrC  r>  )r  r<  r=  r   tracers      rQ   	add_indexz)LoopBodyBlock.__init__.<locals>.add_index  s8    &&))$(CE	 rR   c                      e Zd ZdW _        dedej                  f fdZd fd	Z fdZ	d Z
 fd	Zd
edej                  dej                  def fdZededef   ffd       Zedfd	       Zefd       Zy)/LoopBodyBlock.__init__.<locals>.CaptureIndexingCaptureIndexingr\   re   c                 P     |d|      }| j                   j                  ||      S )Nr+  )_innerr,  )r   r\   re   rR  s      rQ   r,  z4LoopBodyBlock.__init__.<locals>.CaptureIndexing.load  s(    !%$7{{''e44rR   Nc                 T     |d|      }| j                   j                  ||||      S Nr(  )rW  rD  )r   r\   re   r   rW  rR  s        rQ   rD  z5LoopBodyBlock.__init__.<locals>.CaptureIndexing.store"  s,    !%48{{((ueTBBrR   c                 R     |d|      }| j                   j                  |||      S rY  )rW  r  )r   r\   re   r   rR  s       rQ   r  z?LoopBodyBlock.__init__.<locals>.CaptureIndexing.store_reduction&  s*    !%48{{224FFrR   c                     | j                   j                  ||||      d|v rt        fdt        d      D              S S )Nra  c              3   (   K   | ]	  }|     y wrX   rS   )r  rf   r  s     rQ   r  zLLoopBodyBlock.__init__.<locals>.CaptureIndexing.reduction.<locals>.<genexpr>-  s      =HqHs   ru   )rW  r  r?   rd   )r   r   r  r  r   r  s        @rQ   r  z9LoopBodyBlock.__init__.<locals>.CaptureIndexing.reduction*  s@    ..uiQVW.  =E!H ===rR   c                     t        |t        t        j                  f      r%| j                  j                  t        |      |      S  |d      }| j                  j                  ||      S Nr  )r=   r   rH   r   rW  r9  r  )r   re   r   rR  s      rQ   r  z:LoopBodyBlock.__init__.<locals>.CaptureIndexing.index_expr0  sT    ec5==%9:;;//E
EBB!%1{{--eU;;rR   offsets_nameoffsets_sizeindexing_dtyperi  c                 T     |d      }| j                   j                  |||||      S r^  )rW  	bucketize)r   rA   r_  r`  ra  ri  rR  s         rQ   rc  z9LoopBodyBlock.__init__.<locals>.CaptureIndexing.bucketize6  s3      )w?{{,,L, rR   masked_body.c                     fd}j                   j                  |d      }t        j                   |g       j                   j                  |<   j	                  d|| |fi       S )zb
                Recursively capture the masked out body in another LoopBodyBlock
                c                 F    t         j                  j                  | |      S rX   )r8   r7   rE  )r}  r  subblocks     rQ   shimzDLoopBodyBlock.__init__.<locals>.CaptureIndexing.masked.<locals>.shimK  s    55<<h>>rR   masked_subblockrO  )rC  rA  r+  r)  rP  )
mask_proxyrd  other_proxyrh  r\   rg  r   rQ  s        @rQ   rE  z6LoopBodyBlock.__init__.<locals>.CaptureIndexing.maskedC  sh    ? yy..t5FG(KD,4		##D)**!4*k)BB rR   c                     j                   j                        fd}j                  dj                   j                  |d       | fi        S )z
                Flow data from tensors into indexing formulas.
                Introduce a call_module to update the indexing.
                c                 |    j                   j                  t        j                  j	                  |              y rX   )rC  rG  r8   r7   indirect_indexing)new_varcheckr   r   r  s    rQ   set_indirectzWLoopBodyBlock.__init__.<locals>.CaptureIndexing.indirect_indexing.<locals>.set_indirect^  s-    II..QUU44WdEJrR   rO  set_)rC  rD  rP  rA  )index_proxyr   rp  rq  r  r   rQ  s    `` @rQ   rn  zALoopBodyBlock.__init__.<locals>.CaptureIndexing.indirect_indexingU  sZ     ii,,T2
 ##!II++LD,G N	 
rR   c                 0    j                  dd| fi        y )Nr  )rP  )r  rQ  s    rQ   r  z6LoopBodyBlock.__init__.<locals>.CaptureIndexing.outputk  s    ##Hh	2FrR   rX   T)r   r   r   r\   r^   rH   r   r,  rD  r  r  r  rB   r   r   rc  r   r   r   rE  rn  r  )rR  r   rQ  s   rQ   rU  rT    s    )DI5 5UZZ 5CG< " $jj	
 !&  c0B  "  * G GrR   rU  )
tracer_clsplaceholderr7   rS   r$   )IndexPropagation)SimplifyIndexingrX   )rC  r8   WrapperHandlerrB   r   TracerGraphr  r   rP  index_propagationrx  r   ry  r>  r%   constant_and_index_propagationset_ops_handlerr7   r  )r   rC  r]   rZ   rU  	proxy_opsrx  ry  handlerrR  rQ  s   `        @@rQ   rk  zLoopBodyBlock.__init__  s    		R	G R	Ga.. R	Gh "xx~~1A1A~B''ub"E	7.'I&		(<(<
 00&w/Gw' JJr4y! ( \\
	 ('s   ,DD(c                     | j                   }| j                  j                  }t        ||      j	                  t        j                               S rX   )r   rC  r  r  r"  r8   get_ops_handler)r   r   r  s      rQ   rK  zLoopBodyBlock.__call__  s;    

YY))
uj155a6G6G6IJJrR   c           
         t         j                  j                  | j                  j                  | j
                        j                  }t        j                  dd|j                         j                  dd| d            S )Nz;[^\n]*r   zdef forward(zdef rZ  )rB   r   GraphModulerC  r  r   coderesubstripr  )r   r\   r  s      rQ   r9  zLoopBodyBlock.debug_str  sa    xx##DII$8$8$**EJJvvJJL  4vQ@	
 	
rR   N)r0  )r   r   r   r9  r8  r   r   r   rk  rK  r9  rS   rR   rQ   r+  r+    s>    p"X p"8CH+= p"T#Y p"dK
rR   r+  c                   P     e Zd ZdZ	 d fd	Zd Zd Zed	d       Zd Z	d Z
 xZS )
Waitz
    Wait should not be used by itself.  It should always be constructed in tandem
    with a collective op that produces a work to wait on.
    c                 (    t         |   |||       y rX   r  r  s       rQ   rk  zWait.__init__  s     	7rR   c                      yr  rS   r   s    rQ   r  zWait.should_allocate  r  rR   c                     ddl m} |j                  d       d | j                  D        \  }|j	                  | d| d       |j	                   ||| j                  d   | d	             y )
Nr$   )	ReuseLinezGfrom torch.distributed._functional_collectives_impl import _wait_tensorc              3   <   K   | ]  }|j                           y wrX   rW  rX  s     rQ   r  zWait.codegen.<locals>.<genexpr>  rq  rY  z = _wait_tensor(rd  r   F)
delete_old)r  r  add_import_oncerY  r  )r   r  r  input_collectives       rQ   r  zWait.codegen  so    .U	
 KdkkJ	-..>?O>PPQRS 	)GT[[^TeTUrR   c                 P    |j                          t        t        |      |g      S )N)r  rY  )r  r  r  )r  collective_ops     rQ   r  zWait.create  s*     	##% /!?
 	
rR   c                 >    | j                   d   j                         gS r   rY  r   r   s    rQ   r  zWait.get_alias_names      A00233rR   c                 >    | j                   d   j                         gS r   r  r   s    rQ   r  zWait.get_mutation_names  r  rR   rz  )r  rG   )r   r   r   r9  rk  r  r  r3  r  r  r  r4  r5  s   @rQ   r  r    s;     	8V 
 
44rR   r  c                   P     e Zd ZdZ fdZd Zd Zd Zd Ze	d        Z
d Z xZS )	CollectiveKernela  
    Each collective should follow the pattern:
    - extend InPlaceCollectiveKernel or OutOfPlaceCollectiveKernel.
    - the kernel delegates into c10d processgroup, which returns a 'work' obj
    - the work obj is registered via _register_tensor_work so it can be waited on later
    c                 r    t         |   d |||       t        j                  j	                  |       | _        y rX   rL  r  s       rQ   rk  zCollectiveKernel.__init__  s,    vv}=GG++D1	rR   c                      yrR  rS   r   s    rQ    should_emit_register_tensor_workz1CollectiveKernel.should_emit_register_tensor_work  rB  rR   c                      yrR  rS   r   s    rQ   should_emit_find_or_create_pgz.CollectiveKernel.should_emit_find_or_create_pg  rB  rR   c                     t        d      NzMust implementr  r   r  rF  input_namess       rQ   codegen_collectivez#CollectiveKernel.codegen_collective      !"233rR   c                     t        d      r  r  r  s       rQ   codegen_outputzCollectiveKernel.codegen_output  r  rR   c                 2    d }t        t        ||            S )Nc                     t        t        | j                         | j                         | j	                               |       }t
        j                  |      S rX   )InPlaceHintr(  r   r   r   rG   r  )r  rG  s     rQ   
wrap_inputz;CollectiveKernel.wrap_inputs_as_inplace.<locals>.wrap_input  sA    s~~/#,,.QSVB ##B''rR   )r>   r   )r  rY  r  s      rQ   wrap_inputs_as_inplacez'CollectiveKernel.wrap_inputs_as_inplace  s    	( C
F+,,rR   c           
         |j                  d       |j                  d       |j                  d       | j                  D cg c]  }|j                          }}| j                         }| j                  \  }}}| j                         r|j                  | d| d| d| d       | j                  |||       | j                  |||       | j                         r|j                  d| d| d	       y y c c}w )
Nz import torch.distributed as distz1import torch.distributed.distributed_c10d as c10dzEimport torch.distributed._functional_collectives_impl as fun_col_implz0_pg = c10d._find_or_create_pg_by_ranks_and_tag('rc  r  rd  z#fun_col_impl._register_tensor_work(z_work))
r  rY  r   r   r  r  r  r  r  r  )r   r  r   r  rF  tagranks
group_sizes           rQ   r  zCollectiveKernel.codegen  s    BC STS	
 7;kkBkq**,kBmmo!%!3!3UJ--/-OPSuTWX]W^^`ak`llmn 	G[+>kB0025k]"[MQWX 3 Cs   C8)r   r   r   r9  rk  r  r  r  r  r3  r  r  r4  r5  s   @rQ   r  r    s:    244 - -rR   r  c                   4     e Zd ZdZ fdZd Zd Zd Z xZS )InPlaceCollectiveKernelz
    InPlaceCollectiveKernel are those with in-out arguments such as all_reduce.
    Extend this kernel if your collective needs to modify its inputs in-place.
    c                 (    t         |   |||       y rX   r  r  s       rQ   rk  z InPlaceCollectiveKernel.__init__  s    7rR   c                      yr  rS   r   s    rQ   r  z'InPlaceCollectiveKernel.should_allocate
  r  rR   c                      yrR  rS   r   s    rQ   r:  z(InPlaceCollectiveKernel.has_side_effects  rB  rR   c                     t        |      dkD  r'|j                  | ddj                  |       d       y |j                  | d|d           y )Nr$    = [,z] r  r   )rc   r  r   r  s       rQ   r  z&InPlaceCollectiveKernel.codegen_output  sR    {aT#((;2G1HKLSQ0@ABrR   )	r   r   r   r9  rk  r  r:  r  r4  r5  s   @rQ   r  r    s    
8CrR   r  c                   V     e Zd ZdZ fdZd Zd Zd Zedd       Z	ed        Z
 xZS )	OutOfPlaceCollectiveKernelz
    OutOfPlaceCollectiveKernel are those that allocate their
    outputs and leave their inputs inplace, such as all_gather.
    c                     t         |   |||z   |       || _        || _        | j                  D ]5  }t        j
                  j                  j                  |j                         7 y rX   )	r  rk  r  original_inputsr8   r   never_reuse_buffersre  r\   )r   r  rY  r  r  r   r  s         rQ   rk  z#OutOfPlaceCollectiveKernel.__init__  sT    '!1=A%
 AGG''++AFF3 rR   c                      yr  rS   r   s    rQ   r  z*OutOfPlaceCollectiveKernel.should_allocate(  r  rR   c                      yrR  rS   r   s    rQ   r:  z+OutOfPlaceCollectiveKernel.has_side_effects+  rB  rR   c                    | j                   D cg c]  }|j                          }}|j                  | ddj                  |       d       |j                  | ddj                  d | j                  D               d       y c c}w )Nz_inputs = [r  r  r  c              3   4   K   | ]  }|j                     y wrX   r  )r  r   s     rQ   r  z<OutOfPlaceCollectiveKernel.codegen_output.<locals>.<genexpr>1  s     6T|!qvv|r1  )r  r   r  r   r  )r   r  rF  r  r   s        rQ   r  z)OutOfPlaceCollectiveKernel.codegen_output.  s    6:6J6JK6Jq**,6JK[MSXXk5J4K1MN[Mchh6Tt||6T.T-UUVWX Ls   Bc                     g }|D ]a  }|j                         }| ||       t        t        |j                         |j	                         |            }|j                  |       c |S )Nr  r3  )r   OutputBufferr(  r   r   r  )r  rY  size_cbr  r|  r  buffs          rQ   create_output_buffersz0OutOfPlaceCollectiveKernel.create_output_buffers3  sm    E~~'H"!  % ++-//+!D NN4   rR   c                 |    t        |      D cg c]   \  }}t        |j                  |d| d      " c}}S c c}}w )Nr  r  )rx   MultiOutputNoSizeAssertr  )r  colloutput_buffersrf   out_ts        rQ   create_output_nodesz.OutOfPlaceCollectiveKernel.create_output_nodesF  sP     &n5
 65 $A3a
 6
 	
 
s   %8rX   )r   r   r   r9  rk  r  r:  r  r3  r  r  r4  r5  s   @rQ   r  r    sE    
	4Y
  $ 
 
rR   r  c                   .     e Zd ZdZd Z fdZd Z xZS )r  a  
    Helper OP to encode an in/out argument that tries to make it inplace whenever possible.
    Wrap the input of your inplace op to enable this behavior.

    The design is based on two key decisions:
    - this node is responsible for allocating the in/out buffer used by the collective.
        This is controlled by the ``should_allocate`` method that returns True here and
        False for the collective node
    - The scheduler special-case this node and enable it to reuse its input.
    c                     | j                   d   j                         }| j                         }|j                  | | j                   d         s|j	                  | d| d       y y )Nr   z.copy_(z) #no reuse)rY  r   r   	did_reuser  )r   r  
input_namerF  s       rQ   r  zInPlaceHint.codegen^  s[    [[^557
mmo  t{{1~6WZLLM 7rR   c                     | j                  |      }t        | 	  d || j                  |g      d       t        j
                  j                  |       | _        y rG  )r  r  rk  rX  r8   r   r[  r\   )r   r  r|  r  s      rQ   rk  zInPlaceHint.__init__d  sI    ""5)vt':':E7'CRHGG++D1	rR   c                      yrR  rS   r   s    rQ   r  zInPlaceHint.should_allocatei  rB  rR   )r   r   r   r9  r  rk  r  r4  r5  s   @rQ   r  r  R  s    	N2
rR   r  c                   .     e Zd ZdZ fdZd Zd Z xZS )r  zO
    Represent the output buffer used by ops that require multiple of them
    c                 r    t         |   d |g        t        j                  j	                  |       | _        y )Nr  rL  )r   r  r  s     rQ   rk  zOutputBuffer.__init__r  s-    d6"=GG++D1	rR   c                      yrR  rS   r   s    rQ   r  zOutputBuffer.should_allocatev  rB  rR   c                 @    |j                  d| j                          y )Nz# collective out buffer )r  r\   r  s     rQ   r  zOutputBuffer.codegeny  s    4TYYK@ArR   )r   r   r   r9  rk  r  r  r4  r5  s   @rQ   r  r  m  s    2BrR   r  c                   (     e Zd ZdZ fdZd Z xZS )r  z
    Extract partial output from a multi-output OP.
    Works like MultiOutput but doesn't assert size. This must be a property guaranteed by the op emitting this.
    c                 6    t         |   ||g        || _        y rX   )r  rk  re   )r   r  r|  re   r  s       rQ   rk  z MultiOutputNoSizeAssert.__init__  s    +
rR   c                     |j                  | j                          d| j                  d   j                          | j                          y )Nr  r   )r  r   rY  re   r  s     rQ   r  zMultiOutputNoSizeAssert.codegen  s?    }}s4;;q>#:#:#<"=djj\J	
rR   )r   r   r   r9  rk  r  r4  r5  s   @rQ   r  r  }  s    

rR   r  c                   \     e Zd Z fdZd Zd Zedddedede	e   d	ef
d
       Z
d Z xZS )	Broadcastc                 6    t         |   |||       || _        y rX   )r  rk  r  )r   r  rY  r  r  r  s        rQ   rk  zBroadcast.__init__  s    7rR   c                 >    | j                   d   j                         gS r   rB  r   s    rQ   r  zBroadcast.get_mutation_names  rJ  rR   c                     i S rX   rS   r   s    rQ   r  z"Broadcast.get_unbacked_symbol_defs  r@  rR   r   rG   r  r  r  r  c                     | j                  |g      }t        t        |d   j                               ||||g|      }t	        ||d          |d   S )Nr   )r  rY  r  r  )r  r  r  r   r?  )r  r   r  r  r  r  inplace_inputsr  s           rQ   r  zBroadcast.create  sb     33QC8nQ/::<=!z2	
 	fnQ&78a  rR   c           
      R    |j                  | d| d| d| j                   d       y )Nz_work = dist.broadcast(, async_op=True, group=z	_pg, src=rd  )r  r  r  s       rQ   r  zBroadcast.codegen_collective  s7    m2m2;-y
RSU	
rR   )r   r   r   rk  r  r  r3  r   r^   r   r  r  r4  r5  s   @rQ   r  r    sW    + !!"%!,/!8<S	!OR! !
rR   r  c                   h     e Zd Z fdZd Zd Zd Zeded   de	de	d	ee
   d
e
f
d       Zd Z xZS )AllReduceCoalescedc                 6    t         |   |||       || _        y rX   r  rk  	reduce_opr   r  rY  r  r  r  s        rQ   rk  zAllReduceCoalesced.__init__      7"rR   c                      yr  rS   r   s    rQ   r  z"AllReduceCoalesced.should_allocate  r  rR   c                 >    | j                   d   j                         gS r   rB  r   s    rQ   r  z%AllReduceCoalesced.get_mutation_names  rJ  rR   c                     i S rX   rS   r   s    rQ   r  z+AllReduceCoalesced.get_unbacked_symbol_defs  r@  rR   rY  rG   r  r  r  r  c                     | j                  |      }t        t        |d   j                               ||||g|      }t	        ||d          |S Nr   )r  rY  r  r  )r  r  r  r   r?  )r  rY  r  r  r  r  r  r  s           rQ   r  zAllReduceCoalesced.create  s[     33F;#nQ/::<=!z2	
 	fnQ&78rR   c           
      d    |j                  | d| dt        | j                         d| d       y )Nz"_work = dist.all_reduce_coalesced(z%, op=fun_col_impl._str_to_reduce_op('
'), group=_pg, async_op=True)r  r^   r  r  s       rQ   r  z%AllReduceCoalesced.codegen_collective  sF    m=m 225dnn2E1F G M "	
rR   )r   r   r   rk  r  r  r  r3  r   r^   r   r  r  r4  r5  s   @rQ   r  r    si    #+ [!  	
 Cy  $
rR   r  c                   \     e Zd Z fdZd Zd Zedddededee	   d	e	f
d
       Z
d Z xZS )	AllReducec                 6    t         |   |||       || _        y rX   r  r  s        rQ   rk  zAllReduce.__init__  r  rR   c                 >    | j                   d   j                         gS r   rB  r   s    rQ   r  zAllReduce.get_mutation_names  rJ  rR   c                     i S rX   rS   r   s    rQ   r  z"AllReduce.get_unbacked_symbol_defs  r@  rR   r   rG   r  r  r  r  c                     | j                  |g      }t        t        |d   j                               ||||g|      }t	        ||d          |d   S r  )r  r  r  r   r?  )r  r   r  r  r  r  r  r  s           rQ   r  zAllReduce.create  sb     33QC8nQ/::<=!z2	
 	fnQ&78a  rR   c                 d    |j                  | d| d| dt        | j                         d       y )Nz_work = dist.all_reduce(r  (_pg, op=fun_col_impl._str_to_reduce_op(''))r  r  s       rQ   r  zAllReduce.codegen_collective  sB    m3m2;-?ghklplzlzh{g||A	
rR   )r   r   r   rk  r  r  r3  r^   r   r   r  r  r4  r5  s   @rQ   r  r    sW    #+ !!(+!25!>B3i!UX! !
rR   r  c            	       L     e Zd Z fdZedddedee   defd       Zd Z	 xZ
S )	AllGatherIntoTensorc                 *    t         |   ||||       y rX   r  r   r  rY  r  r  r  s        rQ   rk  zAllGatherIntoTensor.__init__      -@rR   r   rG   r  r  r  c                     | j                  |      g}fd}| j                  ||      }t        |d   j                               }t	        |||||g      }	| j                  |	|      d   S )Nc                      | dxx   z  cc<   y r   rS   r  r  s    rQ   compute_sizez0AllGatherIntoTensor.create.<locals>.compute_size      QK:%KrR   r   r  rY  r  r  )r  r  r  r   r  r  )
r  r   r  r  r  rY  r  r  r  r  s
       `     rQ   r  zAllGatherIntoTensor.create  s}    ##A&'	& ++FLA"6!9#7#7#9:$z2	
 &&vw7::rR   c           
      >    |j                  | d| d| d| d       y )Nz$_work = dist.all_gather_into_tensor([0], !_inputs[0], async_op=True, group=z_pg)r  r  s       rQ   r  z&AllGatherIntoTensor.codegen_collective  s3    m?m5-N{m[_a	
rR   r   r   r   rk  r3  r^   r   r   r  r  r4  r5  s   @rQ   r  r    sB    A ;{ ; ;T#Y ;C ; ;$
rR   r  c                   P     e Zd Z fdZedddededee   def
d       Zd	 Z	 xZ
S )
ReduceScatterTensorc                 8    t         |   ||||       || _        y rX   r  r   r  rY  r  r  r  r  s         rQ   rk  zReduceScatterTensor.__init__      -@"rR   r   rG   r  r  r  r  c                     | j                  |      g}fd}| j                  ||      }t        |d   j                               }	t	        |	||||g|      }
| j                  |
|      d   S )Nc                      | dxx   z  cc<   y r   rS   r  s    rQ   r  z0ReduceScatterTensor.create.<locals>.compute_size'      QKJ&KrR   r   r  rY  r  r  r  )r  r  r  r   r  r  )r  r   r  r  r  r  rY  r  r  r  r  s        `     rQ   r  zReduceScatterTensor.create  s     ##A&'	' ++FLA"6!9#7#7#9:$z2
 &&vw7::rR   c                 j    |j                  | d| d| d| dt        | j                         d
       y )Nz#_work = dist.reduce_scatter_tensor(r	  r
  r  r  r  r  s       rQ   r  z&ReduceScatterTensor.codegen_collective7  sK    m>m5 .$$/=0XY\]a]k]kYlXmmpr	
rR   r  r5  s   @rQ   r  r    sU    # ;; ; 	;
 Cy; ; ;4
rR   r  c            	       R     e Zd Z fdZeded   dedee   defd       Zd Z	 xZ
S )	AllGatherIntoTensorCoalescedc                 *    t         |   ||||       y rX   r  r   s        rQ   rk  z%AllGatherIntoTensorCoalesced.__init__@  r  rR   rY  rG   r  r  r  c                     |D cg c]  }| j                  |       }}fd}| j                  ||      }t        |d   j                               }t	        |||||g      }	|S c c}w )Nc                      | dxx   z  cc<   y r   rS   r  s    rQ   r  z9AllGatherIntoTensorCoalesced.create.<locals>.compute_sizeM  r  rR   r   r  )r  r  r  r   r  )
r  rY  r  r  r  r   r  r  r  r  s
       `     rQ   r  z#AllGatherIntoTensorCoalesced.createC  s}     1771###A&7	& ++FLA"6!9#7#7#9:-z2	
 ! 8s   A'c           
      >    |j                  | d| d| d| d       y )NzO_work = fun_col_impl._all_gather_into_tensor_coalesced_fallback(output_tensors=, input_tensors=z_inputs, group=r  r  r  s       rQ   r  z/AllGatherIntoTensorCoalesced.codegen_collective^  s=    m )] +(M * M "	
rR   r   r   r   rk  r3  r   r^   r   r  r  r4  r5  s   @rQ   r  r  ?  sQ    A [!  Cy	
  4
rR   r  c                   V     e Zd Z fdZeded   dededee   def
d       Zd	 Z	 xZ
S )
ReduceScatterTensorCoalescedc                 8    t         |   ||||       || _        y rX   r  r  s         rQ   rk  z%ReduceScatterTensorCoalesced.__init__i  r  rR   rY  rG   r  r  r  r  c                     |D cg c]  }| j                  |       }}fd}| j                  ||      }t        |d   j                               }	t	        |	||||g|      }
|S c c}w )Nc                      | dxx   z  cc<   y r   rS   r  s    rQ   r  z9ReduceScatterTensorCoalesced.create.<locals>.compute_sizex  r  rR   r   r  )r  r  r  r   r   )r  rY  r  r  r  r  r   r  r  r  r   s        `     rQ   r  z#ReduceScatterTensorCoalesced.createm  s     1771###A&7	' ++FLA"6!9#7#7#9:(z2
 # 8s   A(c                 j    |j                  | d| d| dt        | j                         d| d
       y )NzN_work = fun_col_impl._reduce_scatter_tensor_coalesced_fallback(output_tensors=r  z,_inputs, op=fun_col_impl._str_to_reduce_op('r  r  r  r  s       rQ   r  z/ReduceScatterTensorCoalesced.codegen_collective  sR    m )] +(M *225dnn2E1F G M "		
rR   r  r5  s   @rQ   r   r   h  sZ    # [!  	
 Cy  6
rR   r   c                   n    e Zd Zd Zd Zd Zedeee	e   f   ddfd       Z
edeee	e   f   fd       Zy)	_CollectiveKernelc                      yr  rS   r   s    rQ   r  z!_CollectiveKernel.should_allocate  r  rR   c                      yrR  rS   r   s    rQ   r:  z"_CollectiveKernel.has_side_effects  rB  rR   c                    ddl m} |j                  j                  | _        |j                  j
                  | _        | j                  j                  dd       d| j                   | _         ||      | _	        |j                  j                  D cg c]  }|j                  s|j                   c}| _        y c c}w )Nr$   r  r  r   )r  r  r  r\   r  r  r  r  r  r  r  r  r  )r   r  r  r   s       rQ   r  z _CollectiveKernel.set_cpp_kernel  s    6 ..--(.(D(D%&&tS12!D4Q4Q3RS 	 /v6"NN44.
4qAFF4.
* .
s   B;$B;rY  r   Nc                 H  
 t         j                  j                  5   | j                  ||g|i |\  }}}}d d d        D ]  }	|	j	                            | t        |d   j                               ||      
t        j                  
fd|       y # 1 sw Y   `xY w)Nr   c                 2    t        | j                  |       S rX   )rD  r  )r   r  s    rQ   r  z2_CollectiveKernel.create_inplace.<locals>.<lambda>  s    .1f"ErR   )	r8   r   r  r  r   r  r   r  tree_map)r  r  rY  rZ   r[   r  r  r  r  
tensor_argr  s             @rQ   create_inplacez _CollectiveKernel.create_inplace  s     WW #""66CDCFC  &J  & {1~0023
 	EvN# s   BB!c           
      *   t         j                  j                  5   | j                  ||g|i |\  }}}}d d d        D ]  }	|	j	                           t        t              rw| j                  ||      }
 | t        |
      ||      }t        |      D cg c](  \  }}t        | j                  |      |t        |fg      * c}}|_        |j                  S  | | j                  |      ||      }|g|_        |S # 1 sw Y   xY wc c}}w rX   )r8   r   r  r  r   r=   r>   r  r  rx   r  r  r  )r  r  rY  rZ   r[   r  r  r  r  r-  r   r  rf   r  s                 rQ   create_out_of_placez%_CollectiveKernel.create_out_of_place  s2    WW #""66CDCFC  &J  & nd+__[.AF!&)F "+>!: ";IAv ((0AYK
 ";FN >>!$$^4F %XFNMK &s   D-DD)r   r   r   r  r:  r  r3  r   rG   r   r.  r0  rS   rR   rQ   r&  r&    sr    

( O"9d9o#=>O	O OX ("9d9o#=>( (rR   r&  c                   >     e Zd Zd Zededdfd       Z fdZ xZS )_WaitKernelc                 
   | j                   d   }t        |t              r|j                   d   gS t        |t              rC|j                   d   }t        |t              sJ |j                  d   \  }}|j                   |   gS g S r   )rY  r=   r&  r  r  )r   r  r  r   ry   s        rQ   get_volatile_readsz_WaitKernel.get_volatile_reads  s|    kk!nc,-JJqM?"[)::a=Dd$5666[[^FAsKK$%% IrR   r  r   Nc                    t         j                  j                  5  | j                  ||      \  }}}}d d d         | t	        |j                               |      }t        |j                  ||       y # 1 sw Y   DxY wrX   )r8   r   r  r  r  r   rD  r  )r  r  r  r  r  r  r  r  s           rQ   create_waitz_WaitKernel.create_wait  sy    WW ""63/  s~~'(
 	szz3/ s   A66A?c                     t         |          }| j                         }|D ]>  }|j                  j	                  t        j                  |j                                      @ |S rX   )r  r  r4  r+  re  r&   r  r   )r   r  volatile_readsvrr  s       rQ   r  z_WaitKernel.get_read_writes.  sS    g-/002 B!!,"6"6r{{}"EF !rR   )	r   r   r   r4  r3  rG   r6  r  r4  r5  s   @rQ   r2  r2    s4      0i 0D 0 0" rR   r2  c                 $   t        | t        t        j                  f      rt	        |       S t        | t
        t        f      r!t               }| D ]  }|t        |      z  } |S t        | t        j                        rt	        |       S t               S rX   )r=   r    rH   r   r   r?   r>   r   r  rB   r  )r   r  r   s      rQ   r  r  :  st    !h

+,$Q''	At}	%EA,Q//A 	Au||	$$Q''urR   c                   r     e Zd Z fdZd Zedddeee      deee      de	dee
   d	e
fd
       Zd Z xZS )AllToAllSinglec                 F    t         |   ||||       || _        || _        y rX   )r  rk  output_split_sizesinput_split_sizes)r   r  rY  r  r  r>  r?  r  s          rQ   rk  zAllToAllSingle.__init__K  s(     	-@"4!2rR   c                     t               }| j                  |t        | j                        z  }| j                  |t        | j                        z  }|S rX   )r   r>  r   r?  )r   r  s     rQ   r  z'AllToAllSingle.get_unbacked_symbol_usesX  sR    E"".&t'>'>??A!!-&t'='=>>ArR   r   rG   r>  r?  r  r  r  c                     | j                  |      g}fd}| j                  ||      }	t        |d   j                               }
t	        |
||	|||g|      }| j                  ||	      d   S )Nc                 (    t              | d<   y y r   )rk  )r  r>  s    rQ   r  z+AllToAllSingle.create.<locals>.compute_sizel  s    !-!"45 .rR   r   )r  rY  r  r  r>  r?  )r  r  r  r   r<  r  )r  r   r>  r?  r  r  r  rY  r  r  r  r  s     `         rQ   r  zAllToAllSingle.create`  s     ##A&'	6 ++FLA"6!9#7#7#9:z21/
 &&vw7::rR   c                     | j                   \  }}}|j                  | d| d| d| j                   d| j                   d| d       y )Nz_work = dist.all_to_all_single(r	  z_inputs[0], output_split_sizes=z, input_split_sizes=z, group=r  )r  r  r>  r?  )r   r  rF  r  r  r  r  s          rQ   r  z!AllToAllSingle.codegen_collective~  sk    !%!3!3UJ 	m:m5 .""&"9"9!: ;!!%!7!7 8 9 M!4	6	
rR   )r   r   r   rk  r  r3  r   r   r   r^   r   r  r  r4  r5  s   @rQ   r<  r<  J  sv    3 ;; %T$Z0; $DJ/	;
 ; Cy; ; ;:
rR   r<  ru  )TFN)FN)r   rG   ry  rG   r/  rG   )ry  r   r  r  r  loggingr  textwrapr   r   enumr   r   inspectr   typingr   r   r	   r
   r   r   r   r   r   r   r   unittest.mockr   rH   r   r   torch._export.serde.schema_exportserder
  r  torch._loggingrB   torch.fxtorch.utils._pytreer  _pytreer  torch._dynamo.device_interfacer   torch._dynamo.utilsr   torch._export.serde.serializer   torch._prims_commonr   r   r   r   r   r   torch._subclasses.fake_tensorr   %torch.fx.experimental.symbolic_shapesr   r    torch.utils._sympy.functionsr!   r"   r#   r   r%   r&   codegen.commonr'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   virtualizedr7   r8   	getLoggerr   r  r   r  rU   r_   rl   ro   rt   r  r}   r   r   r   r   r   r   r   r   r  r   r;  r=  rR  rY  r_  r  r  r  rb  rd  r   r  r  r  r  r  rE   r  r  r  rK  r  r;  rE  rQ  rX  r  r  r(  r  r  r  r  r  rO  r  r  r  rU  rj  rl  rW  r  r  r~  r  r  r  r(  r?  rD  rS  rc  rm  r  r  rF   r  r  rH  rK  r   bmmrL  
_scaled_mmrepeat_interleaver  nonzeror  r  r  r  r  r   rI  rM  rO  rb  ro  r|  r  r  r  r  r  r  r  r  rG   r  r   Interpreterr  r8  r+  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r&  r2  r  r<  rS   rR   rQ   <module>r`     s^	         	   "           2 2   $ $ C ( ?  : Q L L " 4 
     g!			8??4	8yy~~D$6 ! (3- DI 2('U% U%p [
F [
 [
|&B B2 
i 
 
6D t  |$y!y!u=) /d H
 H
 H
V3|
y |
~< $59#4#44$  
 JLv JL JLZ 7 7 7t &( & &R79( 79t (  6 K; K K\ @
h @
 @
F/< /<d6 : 8| 8 8$ @| @ @ |TV |T |T~ &  FX7V X7vPF P2 *K*V K*\ KV K K\	& 	
[ 
&-6 -
F  S4V S4 S4l0
V 0
f	> 	M M$ .6 . .b 
}9 }@ }< } }@ "l " "J
/ 
 !" !"HY3l Y3x2+\ +*'| 'B.\ .<['l ['|-'| -'`K K>)7L )7X    	,,44JJNNHHLLGGKKOO!!LL
Y*& Y*x 7# 7 7t   .
, .
t *.KBKB KB 	KB
 #YKB IKB 3iKB KB KB T#Y'KB\5B5B 5B 	5BpC
( C
LU
) U
p` 0 ` F2
' 2
j<# <~B$ BJQ
 1 Q
hj& jZf
* f
RF
0 F
RI
, I
X . . .b+
 +m
 m
`0ehh** 04a aHG
 G
T/4 /4d:| :zC. C,8
!1 8
v, 6B< B 
k 
 
' 
@(
0 (
V
' 
B
4 
<%
4 %
P&
#= &
R)
#= )
Zs sl)# )^ ?
/ ?
rR   