
    %Vji	                   f   U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZmZmZmZmZ d dl m Z  d dl	m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 d dlm5Z5 d dl6Z6d dl7Z7d dl8m9c m:Z; d d	l<m=Z= d d
l>m?Z? d dl@mAZA d dlBmCZC d dl8mDZDmEZE ddgZFd dlGmHZHmIZImJZJmKZK e+rVd dlmLZLmMZMmNZN d dl7mOZOmPZPmQZQ d dlRmSZS d dlTmUZU d dlVmWZW ddlXmYZY ddlZm[Z[ ddl\m]Z] ddl^m_Z_m`Z`maZambZbmcZcmdZd ddlemfZf ddlgmhZhmiZi g dZj e,d          Zkejl        dd!            Zmd d"lnmoZo d d#lpmqZq d d$lrmsZs d d%ltmuZu d d&lvmwZw d d'lxmyZy d d(lzm{Z{m|Z|m}Z}m~Z~mZ d d)lmZmZ d d*lmZmZ dd+lmZ dd,lmZ ej        d-k    Z ej        e          Ze7j                            ed.          Z e,d/          Zee6j        e6j        f         Ze)e-e7j        ee7jQ        f                  Zd0d1d2Zd3Zd3Zd3Zd4Zd5Zeedz
  z  d k    red6k    s
J d7            dd:Zdd>Z G d? d@e6j                  Z ej        dAB           G dC dD                      Zdd dLZ	 dd dMZejl        d!dN            Zd"dRZd#dUZd$dYZd%d\Zd&d`Zd'dcZd(dgZd)djZd*dmZd+dpZdq fd,dvZd-d~Zd.d/dZ	 	 d0d1dZ	 	 	 	 	 d2d3dZd4dZd5dZd6dZd7dZd8dZ e1d          Z e,ddA          Z G d de*e&eef                   Zd9dZd:dZd;dZd<dZ	 d=d>dZd?dZÐd@dZĐdAdZŐdBdZƐdCdƄZǐdDdȄZȐdEdʄZɐdFd̈́ZʐdGdτZːdHdЄZ̐dIdԄZ͐dJdՄZΐdKdׄZd dlZАdLd؄Zg Zdzed<   dMdڄZԐdLdۄZej        	 	 	 dNdOd            ZeZeZeZڐdPdZېdQdZ ej        d6          dRd            Z G d de(          Zej         G d d                      Z G d d          Z G d de          Zej        dSd            Z G d d          Z G d de          Zejl        dTdUd            Zej        dVd            Zej        d!d            ZdVdZ	 d=dWdZdXd	ZdYdZdYdZdddAddZdZddd[dZddd[dZd\dZd]dZe-ee6j        f         ZdeӐd <   ejl        d^d!            Zejl        d^d"            Zejl        d_d#            Zejl        d`d$            Zejl        dad&            Zdbd'Zd\d(Zd\d)Zdbd*Zdbd+Zdcd/Z	 	 	 	 ddded4Zd!d5Z  G d6 d7          Zdfd<Zdfd=Zdgd?Zdhd@ZdidAZdidBZdjdDZej        dkdG            Z		 d=dldKZ
dmdMZdndNZdodPZdodQZdpdTZdqdVZej        drdY            ZdVdZZejl        dVd[            Zejl        dsd\            Zejl        dVd]            ZdVd^Zdtd`ZdudaZd!dbZd!dcZdvdfZdHdgZ G dh diej                  ZdwdmZdxdpZ dxdqZ!	 d=dyduZ"dzdwZ#d{dzZ$d{d{Z%d|d~Z&d}dZ'd fd~dZ(d fd~dZ)ddZ*ddZ+ej         G d d                      Z,ej        dd            Z-ddZ.ddZ/ddZ0ddZ1ddZ2ddZ3ddZ4ddZ5ddZ6ddZ7ddZ8ddZ9ddZ:	 d=ddZ;ddZ<ddZ=ddZ>d!dZ?ddZ@dÐdĐdŐdƐdǐdȐdȐdɜZAdʄ eAB                                D             ZC ejD        d˦          ZEdd̄ZFdd̈́ZGddЄZHddфZIejl        ddӄ            ZJej         G dԄ dզ                      ZKi ZLdeӐd<   ddۄZM eC            ZNdeӐd<   ddބZOdd߄ZPddZQ e,d          ZR e,d          ZS G d deeReSf                   ZT e0dA          d=dAdBdd            ZUddZV G d dej                  ZWejl        dd            ZXd!dZYddZZddZ[ddZ\d!dZ]ddZ^dZ_ddZ`ddZaddZb	 	 dddZcddZdd!d	ZeddZf	 	 dddZgddZh ej        dAB           G d d                      Zie$de#f         Zje$ejeigejf         Zk G d d          Zl el            ZmddZnddZodS (      )annotationsN)
Collection	GeneratorIteratorMappingMutableMapping
MutableSet)datetime)StringIO)AnyCallablecastGenericLiteral
NamedTupleOptionalProtocolTYPE_CHECKINGTypeVarUnion)Concatenatedataclass_transform	ParamSpecSelf	TypeAlias	TypeGuard)mock)datasheet_tops)DeviceProperties)dtype_abbrs)
OrderedSet)tree_flattentree_map_only!activation_quantization_aten_passinductor_autotune_lookup_table)free_symbolsfree_unbacked_symbolsIterateExprsShapeEnv)IterableSequence
ValuesView)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)Node   )WorkspaceArgPythonWrapperCodegenGraphLowering)BufferExternKernelIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpumtiaTreturnstrc                     d t           D             } t          |           dk    sJ t          |           dk    rdn|                                 }|S )Nc                `    g | ]+}t          t          |                                          )|,S  )getattrtorchis_available.0xs     O/root/voice-cloning/.venv/lib/python3.11/site-packages/torch/_inductor/utils.py
<listcomp>z get_gpu_type.<locals>.<listcomp>k   s3    KKK'%*;*;*H*H*J*JK!KKK    r3   r   rC   )	GPU_TYPESlenpop)
avail_gpusgpu_types     rS   get_gpu_typer[   i   sT    KKYKKKJz??aZA--vv:>>3C3CHOrU   )get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRangesconfig)ceildivwin32
perf_hints_Tz.cubinz.spv)rC   rE         @      zmust be power of 2nbytesintc                .    | t           z   dz
  t            z  S )z/Round up to the nearest multiple of ALIGN_BYTESr3   )ALIGN_BYTES)ru   s    rS   _alignry      s    [ 1$44rU   v
sympy.Exprboolc                   t          | t          j        t          j        f          r't	          t          t          | j                            S t          | t                    p"t          j	        | t                    t          k    S )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdrx   )rz   s    rS   r   r      s]    !ei+,, -3{AF++,,,aK59Q#<#<#KKrU   c                  2    e Zd ZdZdZdZed
d            Zd	S )r   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr3   Tvaluer{   rH   Optional[sympy.Expr]c                    t          |t          t          j        f          rt	          t          |                    S t          |          r|S d S N)r~   rv   r   Integerry   r   )clsr   s     rS   evalz
align.eval   sN    ec5=122 	&#e**%%%u 	L	 	rU   N)r   r{   rH   r   )__name__
__module____qualname____doc__nargs
is_integerclassmethodr   rL   rU   rS   r   r      sB        FFEJ   [  rU   r   Tfrozenc                  <    e Zd ZU dZded<   ded<   ded<   ded<   d	S )
GraphPartitionMapzP
    Mapping from the partition info (e.g., input/output) to the graph info
    rv   idzlist[Optional[int]]input_index_mappingoutput_index_mapping	list[str]constant_namesNr   r   r   r   __annotations__rL   rU   rS   r   r      sO          
 GGG -,,,---- rU   r      d   fnCallable[[], Any]warmuprepfloatc                    |              t           j                                         t          j        t	          d          t           j        d          }t           j                            d          }t           j                            d          }|                                 t          d          D ] }|	                                  |              !|                                 t           j                                         |
                    |          dz  }t          dt	          ||z                      }t          dt	          ||z                      }	t          |          D ]} |              d t          |	          D             }d	 t          |	          D             }t           j                            t           j        j        j        g
          5 }
t           j                                         t          |	          D ]}|	                                 ||                                          t           j        j                            d          5   |              ddd           n# 1 swxY w Y   ||                                          t           j                                         t          j        d t%          ||          D                       }ddd           n# 1 swxY w Y   t          j        |                                          }t*                              d           t*                              |
                                                    dd                     t3          d |
                                D                       }|r$|t7          j        d |D                       dz  z  }t*                              d|           |S )R  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
        ArC   dtypedeviceTenable_timing   r3   c                N    g | ]"}t           j                            d           #S Tr   rN   rC   EventrQ   _s     rS   rT   zfp8_bench.<locals>.<listcomp>   s+    QQQA5:##$#77QQQrU   c                N    g | ]"}t           j                            d           #S r   r   r   s     rS   rT   zfp8_bench.<locals>.<listcomp>   s+    OOO!!!!55OOOrU   
activitiesRunCudaModuleNc                >    g | ]\  }}|                     |          S rL   )elapsed_time)rQ   ses      rS   rT   zfp8_bench.<locals>.<listcomp>   s(    GGG41aQ^^AGGGrU   
raw eventsself_device_time_totalsort_by	row_limitc                p    g | ]3}|j         t          j        k    rt          j        d |j                  1|4S )zfused_abs_max_\d)device_typer^   CUDArematchnamerQ   events     rS   rT   zfp8_bench.<locals>.<listcomp>   sJ     	
 	
 	
!Z_44H0%*==I	  JIIrU   c              3  $   K   | ]}|j         V  d S r   device_time_totalr   s     rS   	<genexpr>zfp8_bench.<locals>.<genexpr>	  s%      QQE3QQQQQQrU        @@profiling results: %s ms)rN   rC   synchronizeemptyrv   float16r   recordrangezero_r   maxprofilerprofileProfilerActivityr   nvtxtensorzipmeanitemlogdebugkey_averagestabler_   events
statistics)r   r   r   cachestart_event	end_eventr   estimate_msn_warmupn_repeatpitimesresfiltered_eventss                  rS   	fp8_benchr      s    BDDD	JKJu}VLLLE *"""66K
  t 44I1XX  
	J**9559K 1c&;.//00H1c#+,,--H 8__  
QQxQQQKOOuXOOOI			N+0
 
  
 
 
 

   x 	" 	"AKKMMMN!!###&&77                aL!!!!
   GG3{I+F+FGGG
 

 
 
 
 
 
 
 
 
 
 
 
 
 
 
" *U


 
 
"
"CIIlIIann$$-EQS$TTUUU	
 	
	
 	
 	
	 	O  
OQQQQQQQ	

 II(#...Js8   (BK=*J5K=JK=J	A(K==LLc                    |              t           j                                         t          j        t	          d          t           j        d          }t           j                            d          }t           j                            d          }|                                 t          d          D ] }|                                  |              !|                                 t           j                                         |	                    |          dz  }t          dt	          ||z                      }t          dt	          ||z                      }	t          |          D ]} |              t           j                                         t           j                            t           j        j        j        g          5 }
t          |	          D ] }|                                  |              !t           j                                         d	d	d	           n# 1 swxY w Y   t                              d
           t                              |
                                                    dd                     t'          d |
                                D                       }t+          |          |	z  dk    rt-          dt+          |          |	          t+          |          |	z  t'          fdt/          |          D                       }|                                 |                                }t                              d           t                              |                    d                     t3          d |D                       dz  |	z  }t                              d|           |S )r   r   rC   r   Tr   r   r3   r   Nr   r   r   r   c                R    g | ]$}|j         t          j        k    |j        d k    "|%S )zContext Sync)r   r^   r   r   r   s     rS   rT   z,do_bench_using_profiling.<locals>.<listcomp>G  s>     	
 	
 	
 JO33
n8T8T 8T8T8TrU   r   zYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %sc                ,    g | ]\  }}|z  d k    |S r   rL   )rQ   r   r   num_event_per_groups      rS   rT   z,do_bench_using_profiling.<locals>.<listcomp>V  s8     	
 	
 	
5&&!++ +++rU   zprofiling time breakdown)r   c              3  $   K   | ]}|j         V  d S r   r   r   s     rS   r   z+do_bench_using_profiling.<locals>.<genexpr>b  s%      AA%e%AAAAAArU   r   r   )rN   rC   r   r   rv   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r_   r   rW   RuntimeError	enumerate_build_treesum)r   r   r   r   r   r   r   r   r   r   r   r   r   actual_eventsr   r   s                  @rS   do_bench_using_profilingr    s    BDDD	JKJuyHHHE *"""66K
  t 44I1XX  
	J**9559K 1c&;.//00H1c#+,,--H 8__  
	J			N+0
 
  
 
 ! 
x 	 	AKKMMMBDDDD
   ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! IIlIIann$$-EQS$TTUUU	
 	
	
 	
 	
 O ?h&!++-  	
 
 	
 o..9	
 	
 	
 	
%o66	
 	
 	
 M !..00MII()))IIm!!B!//000
AA=AAA
A
AF
JX
UCII(#...Js   AH00H47H4c                    	 ddl m}  t          j                            dd           | d uo(t          t          t          j        dd           d          S # t          $ r Y dS t          $ r}dt          |          v sJ Y d }~dS d }~ww xY w)	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr  Fztorchvision::nms does not exist)torchvision.opsr  rN   _C%_dispatch_has_kernel_for_dispatch_keyhasattrrM   opsImportErrorr   rI   )r  r   s     rS   has_torchvision_roi_alignr  g  s    ------667I6RRR$ 
EI}d33[*
 *
 	
    uu   0CFF::::uuuuus   AA 
B	"	B	+BB	r   "Union[Optional[torch.device], str]torch.devicec                :   | t          j        d          j        S t          | t                    rt          j        |           } | j        dvrM| j        Ft          | j                  }t          j        | j        |j        	                                          S | S )Ng        )cpumeta)index)
rN   r   r   r~   rI   typer  r\   Workercurrent_devicer   device_interfaces     rS   decode_devicer  w  s    ~|C  ''&# &f%%{/))fl.B3FK@@|FK/?/F/U/U/W/WXXXXMrU   itIterable[sympy.Expr]c                `    t          j        t          j        | t          j        j                  S r   )	functoolsreduceoperatormulr   SOner  s    rS   sympy_productr#    s    HL"egk:::rU   seq1Sequence[sympy.Expr]seq2c           	         t          |           t          |          k    sJ t          j        t          d t	          | |          D                                 S )Nc              3  &   K   | ]\  }}||z  V  d S r   rL   )rQ   abs      rS   r   zsympy_dot.<locals>.<genexpr>  s*      >>daAE>>>>>>rU   )rW   r   expandr   r   )r$  r&  s     rS   	sympy_dotr,    sN    t99D		!!!!<>>c$oo>>>>>???rU   Iterable[_T]ValuesView[_T]c                >    d | D                                              S )Nc                .    i | ]}t          |          |S rL   )r   rP   s     rS   
<dictcomp>zunique.<locals>.<dictcomp>  s     !!!BqEE1!!!rU   )valuesr"  s    rS   uniquer3    s"    !!b!!!((***rU   numberUnion[int, sympy.Expr]denomc           
        t          | t          j                  st          |t          j                  r4t          t          j        |           t          j        |                    S t          | t
                    rt          |t
                    s/J |  dt          |            d| dt          |                       t          | |          S )Nz: , )r~   r   Exprrb   sympifyrv   r  runtime_ceildiv)r4  r6  s     rS   rm   rm     s     &%*%% DE5:)F)F Du}V,,emE.B.BCCC fc"" z%'='=  ;;T&\\;;U;;d5kk;; = 65)))rU   keyOptional[torch.dtype]c                   | dS t          |                               d          d         }i dddddd	d
ddddddd	dddddddddddddddddd d!d"dd#d$d%d&}|                    d' t          |                                          D                        t          | t                     r| n
d(||          S ))Nz*i8.r   r|   i1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float8_e8m0fnuu8float4_e2m1fn_x2r   fp16bfloat16bf16float32fp32float64fp64int8i8int16i16int32i32int64i64u16u32u64)uint8uint16uint32uint64c                    i | ]}||S rL   rL   )rQ   rz   s     rS   r1  z_type_of.<locals>.<dictcomp>  s    1111111rU   *)rI   splitupdatelistr2  r~   )r<  	dtype_strtyss      rS   _type_ofrk    so   
 {uCs##B'Ii 	G 	z	
 	 	 	w 	$ 	D 	6 	F 	6 	6  	!" 	#$ 	%& 	'( /  C4 JJ11d3::<<00111222S#&&@33,@I,@,@@rU   lst"Iterable[Union[int, torch.SymInt]]list[sympy.Expr]c                    d | D             S )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    c                6    g | ]}t          j        |          S rL   )r   r:  rQ   r   s     rS   rT   z-convert_shape_to_inductor.<locals>.<listcomp>  s"    ***EM!***rU   rL   rl  s    rS   convert_shape_to_inductorrs    s     +*c****rU   r   Union[int, torch.SymInt]c                    ddl m} t          | t                    r| nNt          | t          j                  rt          |           n%|j        j        j        	                    | d          S )zL
    Like convert_shape_to_symint, but operates on a single expression.
    r3   VN)hint)
virtualizedrw  r~   rv   r   r   graphsizevars	shape_envcreate_symintnode)r   rw  s     rS   convert_to_symintr~    sr      a	
 !U]++LCFFF!+==ad=KKrU    Iterable[Union[int, sympy.Expr]]list[Union[int, torch.SymInt]]c                    d | D             S )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    c                ,    g | ]}t          |          S rL   )r~  rq  s     rS   rT   z+convert_shape_to_symint.<locals>.<listcomp>  s!    ...Qa  ...rU   rL   rr  s    rS   convert_shape_to_symintr    s     /.#....rU   optorch._ops.OpOverloadc                H    t          d | j        j        D                       S )z-
    Does this op overload have aliasing
    c              3  (   K   | ]}|j         d uV  d S r   )
alias_inforQ   r)  s     rS   r   zis_view.<locals>.<genexpr>  s)      FFAq|4'FFFFFFrU   )any_schema	argumentsr  s    rS   is_viewr    s&     FF1EFFFFFFrU   c                    dS NFrL   )r   s    rS   <lambda>r    s     rU   user2   is_pointwise_fn'Callable[[torch._ops.OpOverload], bool]c                   | j         dk    sdS t          | j        t          j        j                  s| j        t          j        u sdS t          t          j        j        | j                  }|t          j        u st          |          r t          fd| j        D                       S t          j        j        |j        v p
 |          S )z
    Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

    Uses in views ops will follow the views uses
    call_functionFc              3  8   K   | ]}t          |          V  d S r   )is_pointwise_use)rQ   ur  s     rS   r   z#is_pointwise_use.<locals>.<genexpr>  s.      KKA#A77KKKKKKrU   )r  r~   targetrN   _ops
OpOverloadr  getitemr   r  r   usersTag	pointwisetags)r  r  r  s    ` rS   r  r    s     6_$$u3:uz4559<xGW9W9Wu%*'44F!!!WV__!KKKKKKKKKK9&+-H1H1HHrU   r  r   r   	list[Any]kwargsdict[str, Any]&tuple[GraphModule, list[torch.Tensor]]c           	        t           j                                        g d	fd} j        | gt	          t           j        |||f          R  }t          | j        j                  dk    r+t          | j        j        d         j
                  dk    r|f}                    |           t           j                            i           }|fS )
Nargtorch.TensorrH   r2   c                x                         |                                dt                               S )Nr  )appendplaceholderrW   )r  g
graph_argss    rS   add_tensor_argz)gen_gm_and_inputs.<locals>.add_tensor_arg  s8    #}}43z??44555rU   r3   r   Tensor)r  r  rH   r2   )rN   fxGraphr  r#   r  rW   r  returnsrI   r  outputr1   )r  r   r  r  nodegmr  r  s         @@rS   gen_gm_and_inputsr    s     	A%'J6 6 6 6 6 6 6 1?u|^dF^LL  D 	FN"##q((&q).//8;;wHHTNNN			b!	$	$Bz>rU   rC   Nonec                    | dk    rd S t          |           }|                                r|                                 d S d S Nr  )r\   rO   r   r  s     rS   r   r      sT    /77$$&& '$$&&&&&' 'rU   modelCallable[..., Any]example_inputsSequence[Any]r   c                    t          |           t          j        d           t          j                    }t          |          D ]} | | }t          |           t          j                    }|J ||z
  S )Ni9  )r   rN   manual_seedtimeperf_counterr   )r  r  r   r   t0r   resultt1s           rS   timedr  (  s     	d				B5\\  'F				B7NrU   rL   
         ?repeatbaselinec                     t          j         fdt          |          D                       }t          j        |          z  }t	          ||z  d           |                                S )Nc                4    g | ]}t                    S rL   )r  )rQ   r   r   r  r  r   s     rS   rT   z%print_performance.<locals>.<listcomp>C  s'    LLLuneV	4	4LLLrU   z.6f)rN   r   r   medianprintr   )r  r  r   r  r  r   timingstooks   ```  `  rS   print_performancer  :  s{     lLLLLLLLeFmmLLL G <  5(D	TH_
"
"###99;;rU   objmethodc                `     t          | |                      t          | |fd           dS )zKReplace obj.method() with a new method that returns a precomputed constant.c                      S r   rL   )r  s   rS   r  z#precompute_method.<locals>.<lambda>M  s     rU   N)rM   setattr)r  r  r  s     @rS   precompute_methodr  J  s8    !WS&!!##FC(((((rU   methodsr   c                0    |D ]}t          | |           dS )zFReplace methods with new methods that returns a precomputed constants.N)r  )r  r  r  s      rS   precompute_methodsr  P  s.     ' '#v&&&&' 'rU   r)  r*  c                P    t          | |k              t          | |k               z
  S r   )rv   )r)  r*  s     rS   cmpr  V  s!    q1u::AE

""rU   rR   Union[int, Sequence[int]]sizeSequence[int]c                    t          | t                    r| g|z  S t          |           dk    r" t          |           | d         g          |z  S | S )Nr3   r   )r~   rv   rW   r  )rR   r  s     rS   pad_listliker  Z  sS    !S sTz
1vv{{tAww!v%%HrU   tuple[_T, ...]list[_T]c                V    t          |           dk    rg S dd}t          | |          S )	Nr   elemrp   rH   rI   c                    t          | t                    r| S ddlm} t          | |          sJ |                                 S )Nr3   )rA   )r~   rI   	schedulerrA   get_name)r  rA   s     rS   	sort_funcztuple_sorted.<locals>.sort_funcg  sP    dC   	K000000$ 122222}}rU   r<  )r  rp   rH   rI   )rW   sorted)rR   r  s     rS   tuple_sortedr  c  s?    
1vv{{	    !####rU   PRV)	covariantc                  .    e Zd Zedd            ZddZdS )CachedMethodr   r   rH   r  c                    d S r   rL   )r   s    rS   clear_cachezCachedMethod.clear_cachex  s    ),rU   r   P.argsr  P.kwargsr  c                    d S r   rL   selfr   r  s      rS   __call__zCachedMethod.__call__{  s      rU   N)r   r   rH   r  )r   r  r  r  rH   r  )r   r   r   staticmethodr  r  rL   rU   rS   r  r  w  s2        ,,, \,DDDDDDrU   r  !Callable[Concatenate[Any, P], RV]CachedMethod[P, RV]c           	         | j         }d| dd| i}t          d| d d d                                |            t          j        |           || d                   }dfd}||_        |S )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        _cache_on_selfr  r   rH   r  c                L    t          |           rt          |            d S d S r   )r	  delattrr  r<  s    rS   r  z"cache_on_self.<locals>.clear_cache  s5    4 	D#	 	rU   )r  r   rH   r  )r   execlstripr  wrapsr  )r   r   ctxwrapperr  r<  s        @rS   cache_on_selfr    s    ;D
t


C *C				 		 !		 		 (+		 		 		 FHH   "iob!!#&=&=&=">??G      &GNrU   node_schedule0Union[Sequence[BaseSchedulerNode], ExternKernel]OrderedSet[Node]c                    ddl m} t          | t                    r6t	          j        t          j        d | D             t                                S t          | |j	                  r| j
        S t                      S )Nr3   irc                T    g | ]%}t          |d           |j        |j        j        &S )r  )r	  r  originsrQ   r  s     rS   rT   z%aggregate_origins.<locals>.<listcomp>  sI       4(( .2Y	!  rU   ) r  r~   rh  r  r  r  or_r!   r:   r  )r	  r  s     rS   aggregate_originsr    s     -&& L )  
 LL
 
 	
 
M2?	3	3 $$||rU   Sequence[BaseSchedulerNode]descriptive_names8Literal[True, 'torch', 'original_aten', 'inductor_node']c                8   t          |           }|dk    r)d |D             }t          t          |                    }n|dk    rg }|D ]}|j        dk    rsd|j        v rj|j        d         d         }t          |d         t                    r|                    |d                    `|                    |d         j                   t          t          |                    }n|dk    rd	 |D             }nt          |}d

                    dg|z             S )Noriginal_atenc                ~    g | ]:}|j         d k    r-d|j        v r$|j        d         #|j        d         j        j        ;S )r  r  )r  r  _overloadpacketr   rQ   origins     rS   rT   z)get_fused_kernel_name.<locals>.<listcomp>  sX     
 
 
yO++6;..O,8	 K(8A 988rU   rN   r  source_fn_stackr   r3   inductor_nodec                2    g | ]}|j         d k    |j        S r  )r  r   r  s     rS   rT   z)get_fused_kernel_name.<locals>.<listcomp>  s-     
 
 
"VY/5Q5QFK5Q5Q5QrU   r   fused)r  r  r!   r  r  r~   rI   r  r   NotImplementedErrorjoin)r	  r  all_originssourcesr  	source_fns         rS   get_fused_kernel_namer(    sO    $M22KO++
 
%
 
 
 G,,--	g	%	%! 	: 	:FyO++0AV[0P0P"K(9:2>	ilC00 :NN9Q<0000NN9Q<#8999G,,--	o	-	-
 
&1
 
 
 "!G88WI'(((rU   r  r6   tuple[str, str]c                (	   t          |           }d |D             }t          j        t                    }t          j        t                    }dt	          |          rt          d |D                       }t	          |          dk    r[|d         j        t          d          s%d t          j	                  D             }|_
        |                    fd	           |D ]}d
|j        v rL|j        d
         ?t          |j        d
         j                  }	||	                             |j                   d|j        v r8|j        d         d         j        }	||	                             |j                   dnd}
|j         d|
 dd                    |                                           dd                    |                                           d}|j         dg}t)          |                                          D ]G\  }}|                    |j         d| dd                    t)          |                                Hddlm |                    |j         d           t                      }g }t1          | j                  slddlm} d2fd}d3d"d4fd%}| D ]R}t          |d&          r|j        t          |j        d'          r|j        j        |j        j        D ]}|j        |v r|                    |j                   |j                            |j                  }|H |||j                  \  }}|                    |j         d(| d) ||           d*| d           t          |j        d+          ri|j        j         ]|j        j         D ]P}|j                            |j                  }|$ |||j                  \  }}|                    d,|z              QT|D ]5}|                    |j         d|!                    d-.                      6|                    |j         d/d0                    |                      |d1                    |          fS )5aH  
    Retrieves metadata information for a kernel.
    Args:
        node_schedule (Union[Sequence[BaseSchedulerNode], ExternKernel]):
            Either a sequence of BaseSchedulerNode objects or an ExternKernel instance.
        wrapper (PythonWrapperCodegen):
            An instance of PythonWrapperCodegen, used to define the code comment format.
    Returns:
        tuple[str, str]:
            A tuple containing two strings:
                - The first string represents the kernel's metadata.
                - The second string represent the kernel's detailed metadata.
    c                (    g | ]}|j         d k    |S r!  r  r  s     rS   rT   z'get_kernel_metadata.<locals>.<listcomp>  s$    WWW&):V:Vf:V:V:VrU   Nc              3  $   K   | ]}|j         V  d S r   )rz  )rQ   ns     rS   r   z&get_kernel_metadata.<locals>.<genexpr>  s$      "C"Cq17"C"C"C"C"C"CrU   r3   r   )_inductor_kernel_metadata_node_to_idx_mapc                    i | ]\  }}||	S rL   rL   )rQ   idxr-  s      rS   r1  z'get_kernel_metadata.<locals>.<dictcomp>  s    "V"V"Vfc11c"V"V"VrU   c                    j         |          S r   )r.  )r-  single_graphs    rS   r  z%get_kernel_metadata.<locals>.<lambda>  s    lTUVW rU   r  r  	from_nodezTopologically SortedUnsorted z Source Nodes: [r8  z], Original ATen: []z" Source node to ATen node mapping:z   z => r  z Graph fragment:rv  buffer2Union[ir.TensorBox, ir.Buffer, ir.TorchBindObject]rw_namerI   rH   tuple[str, ir.Layout | None]c                   t          | j                  r,t          | j        j                  r| j        j        j        }n| j        }||}n|j        }	 |                                 }n# t          $ r d }Y nw xY w||fS r   )r~   	TensorBoxdata
StorageBoxorigin_noder   
get_layoutr#  )r7  r9  r?  r   layoutr  s        rS   get_buffer_infoz,get_kernel_metadata.<locals>.get_buffer_info  s     fbl33 5
K9 9 5 #)+"2">KK"("4K&"DD&+D"#..00FF* " " "!FFF"V|#s   A, ,A;:A;shapeIterable[int]c                H    dd                     d | D                        dS )N[r8  c                ,    g | ]}t          |          S rL   )rI   rP   s     rS   rT   z@get_kernel_metadata.<locals>.stringify_shape.<locals>.<listcomp>0  s    %<%<%<c!ff%<%<%<rU   r6  )r$  )rC  s    rS   stringify_shapez,get_kernel_metadata.<locals>.stringify_shape/  s.    @499%<%<e%<%<%<==@@@@rU   rA  ir.Layout | Nonec                    | dS  | j                    } | j                   }| j         }dt          | j                  | | | dS )Nr  ")r  strider   r    r   )rA  shape_annotationstride_annotationdevice_annotationrH  s       rS   stringfy_layoutz,get_kernel_metadata.<locals>.stringfy_layout2  s|    >2&5ofk&B&B#D '6v}'E'E$G!'-}$6!?FL1 ?3C ?(?*;? ? ?rU   read_writesreadsz   %z
 : Tensor z = PlaceHolder[target=writes%T)include_tensor_metadataz
   return ,
)r7  r8  r9  rI   rH   r:  )rC  rD  rH   rI   )rA  rI  rH   rI   )"r  collectionsdefaultdictrh  rW   r!   rz  r	  r   nodesr.  sortr  rI   r  r  r   commentr$  keysr  itemsr  r  r~   r:   ry  rw  rQ  rR  addtry_get_bufferrS  format_node)r	  r  r%  inductor_nodesfrom_node_dictoriginal_aten_dictunique_graphsnode_to_idx_mapr  r<  sort_strmetadatadetailed_metadataoriginal_noderZ  	all_reads
all_writesrw  rB  rP  r-  rr7  
input_namerA  woutput_namer   r  r2  rH  s                               @@@rS   get_kernel_metadatarq    s   $ $M22KWW;WWWN ,T22N$066
 L
> 
""C"CN"C"C"CCC}"")!,2L<)TUU Y"V"V	,BT8U8U"V"V"VIXFWWWW       2 2di''DIo,F,Rdi0@AACs#**49555$)##)K(+0C3&&ty111)5)A%%zH? 	C 	CX 	C 	CtyyATATAVAV7W7W 	C 	C99%7%<%<%>%>??	C 	C 	C  $OOOOP &~';';'='= > > 
 
u  PP=PPdiiu6N6NPP	
 	
 	
 	

   GO!E!E!EFFF%/\\	 "
-99 B	=&&&&&&$ $ $ $ $ $(A A A A
 
 
 
 
 
 # = =q-00 AM4I1='22 q}7J7V]0  6Y..$!af---!"!7!7!?!?!>$-<_VQV-L-L*
F)00& \ \J \ \.v66\ \NX\ \ \    AM844
=,8]1 = =!"!7!7!?!?!>$)8)H)HQ"))#*;<<<<" 	 	D$$?WWt'7'7PT'7'U'UWW    	  GO!U!Usxx
?S?S!U!UVVVTYY01111rU   initial_queueIterable[torch.fx.Node]skip_filterOptional[Callable[[Any], bool]]OrderedSet[torch.fx.Node]c                    t          |           } t          |           }| r\|                                 }|j        D ]>}|r ||          r||vr*|                    |           |                     |           ?| \|S )zJReturns the set of nodes whose values depend on those within initial_queue)rh  r!   rX   r  r_  r  )rr  rt  dominated_setr  users        rS   dominated_nodesrz  f  s    
 ''M}--M
 +  ""J 	+ 	+D {{400 =((!!$'''$$T***  + rU   Sequence[IRNode]dict[str, IRNode]c                    ddl m d
fdt          |          \  }}fd|D             }t          |           \  }}fd	|D             }t          t	          j        g ||R            S )Nr3   r  r-  r;   rH   r|   c                   t          | j                  r | j                  S t          | j                  r | j                  S t          | j                  o(t          | j        j        j        j        f           S r   )	r~   r<  r=  r>  r;   ComputedBufferInputsKernelInputBufferTemplateBuffer)r-  r  is_unrealized_nodes    rS   r  z*gather_origins.<locals>.is_unrealized_node  s    a&& 	.%%af---a'' 	.%%af---!RY'' 

!!	1
 1
 -
 	
rU   c                4    g | ]} |          |j         S rL   r  rQ   valr  s     rS   rT   z"gather_origins.<locals>.<listcomp>  s-    WWWc?Q?QRU?V?VWckWWWrU   c                4    g | ]} |          |j         S rL   r  r  s     rS   rT   z"gather_origins.<locals>.<listcomp>  s,    SSSC;M;Mc;R;RSCKSSSrU   )r-  r;   rH   r|   )r  r  r"   r!   	itertoolschain)	r   r  kwargs_flattenr   kwargs_originsargs_flattenargs_originsr  r  s	          @@rS   gather_originsr  z  s     
 
 
 
 
 
 
" %V,,NAWWWW^WWWN"4((OL!SSSS<SSSLioE|EnEEEFFFrU   exprc                P    d
ddfddfddfd	 |           S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    r  r{   rH   r|   c                    t          | t          j                  o(t          | j                  dk    o| j        d         dk    S )N   r   r   )r~   r   MulrW   r   )r  s    rS   is_neg_leadzsympy_str.<locals>.is_neg_lead  s9    tUY''VC	NNa,?VDIaLTVDV	
rU   rI   c                n   t          | t          j                  rt          | j                  dk    rP | j        d                   r: | j        d                    d | j        d         j        d                    S d                    t          | j                            S  |           S )Nr  r3   r   z - z + )r~   r   r   rW   r   r$  r   )r  r  sympy_str_muls    rS   sympy_str_addz sympy_str.<locals>.sympy_str_add  s    dEI&& 	' 49~~""{{49Q<'@'@"'-	!55__--	RSHYZ[H\:]:]___zz#mTY"?"?@@@ =&&&rU   c                    t          | t          j                  rL |           rd | j        d                    S d                    t          | j                            S  |           S )N-r3   z * )r~   r   r  r   r$  r   )r  r  sympy_str_atoms    rS   r  z sympy_str.<locals>.sympy_str_mul  sx    dEI&& 	({4   B :>>$)A,77999zz#ndi"@"@AAA!>$'''rU   c                   t          | t          j                  r| j        S t          | t          j        t          j        f          rd |            dS t          | t          t          t          t          f          r=| j
        j         dd                    t          t          | j                             dS t!          |           S )N()r8  )r~   r   Symbolr   r   r  rf   rc   rd   re   funcr   r$  r   	sympy_strr   rI   )r  r  s    rS   r  z!sympy_str.<locals>.sympy_str_atom  s    dEL)) 	9uy%)455 	-}}T**----(HMNN 	i(RR499SDI5N5N+O+ORRRRt99rU   )r  r{   rH   r|   r  r{   rH   rI   rL   )r  r  r  r  r  s    @@@@rS   r  r    s    
 
 
 

	' 	' 	' 	' 	' 	' 	'	( 	( 	( 	( 	( 	( 	(      =rU   r  ValueRanges[Any]c                    ddl m} t          j        r2t	          |j        dd           x}r|j        dk    rt          |           S t          j	                    S )Nr3   rv  current_node
index_expr)
ry  rw  rl   compute_all_boundsrM   interpreterr  ri   rj   unknown)r  rw  fx_nodes      rS   get_bounds_index_exprr    sh     	!%~tDDDW% Nl**5!!!"$$$rU   prefixc                    | d         dk    S )Nr   rm  rL   )r  s    rS   prefix_is_reductionr    s    !9rU   rh   r0  sympy.Symbolc                L    | t           j        k    sJ t          | |dd          S )9
    Used to generate an integer-nonnegative symbol.
    Tintegernonnegative)rh   SIZErg   )r  r0  s     rS   sympy_index_symbol_with_prefixr    s0     TY vsDdCCCCrU   checkc                6    | st           j        ot           j        S r   )rl   debug_index_assertsassert_indirect_indexing)r  s    rS   generate_assertr    s    /V/TV5TTrU   r   c                L    | d         dk    sJ t          j        | dd          S )r  r   r   Tr  )r   r  r   s    rS   sympy_index_symbolr    s.     7c>>>> <d====rU   replacementsdict[sympy.Expr, Any]c                    d	dt          j        |                               fd|                                D                       S )
z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    replacedr{   replacementUnion[sympy.Expr, str]rH   r  c                    t          | t          j                  sJ t          |t                    r!t          j        || j        | j                  S |S )Nr  )r~   r   r9  rI   r  r   is_nonnegative)r  r  s     rS   	to_symbolzsympy_subs.<locals>.to_symbol  s^     (EJ/////k3'' 	< +$3    rU   c                0    i | ]\  }}| ||          S rL   rL   )rQ   krz   r  s      rS   r1  zsympy_subs.<locals>.<dictcomp>	  s)    ===1IIaOO===rU   )r  r{   r  r  rH   r  )r   r:  xreplacer^  )r  r  r  s     @rS   
sympy_subsr    s^        =''====(:(:(<(<===  rU   ,TypeGuard[Union[torch.SymInt, torch.Tensor]]c                
   t          | t          j                  pit          | t          j                  oOt	          d t          j        |                                 |                                           D                       S )Nc              3  4   K   | ]}t          |          V  d S r   is_symbolicrP   s     rS   r   zis_symbolic.<locals>.<genexpr>  s(      NN1ANNNNNNrU   )	r~   rN   r/   r  r  r  r  r  rL  )r)  s    rS   r  r    sf    a&& 1el## 	ONN	!((**(M(MNNNNNrU   c                 4    t          d | D                       S )Nc              3  4   K   | ]}t          |          V  d S r   r  r  s     rS   r   z"any_is_symbolic.<locals>.<genexpr>  s(      ,,!{1~~,,,,,,rU   r  )r   s    rS   any_is_symbolicr    s    ,,t,,,,,,rU   r  torch.fx.GraphModuleOptional[torch.fx.Node]c                   ddl m} t          g d          }t          j                    r|                    d           | j        j        D ]}t          |j	                  |v r|c S t          j
        j        j        sJt          |j	        t          j        j                  r&t          j        j        j        |j	        j        v r|c S |j                            d          x} ||          r|c S d S )Nr   )r'   )z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultrun_and_save_rng_staterun_with_rng_statezaten._local_scalar_densezaten._assert_scalar)zaten._unsafe_index_put.defaultz0aten._unsafe_masked_index_put_accumulate.defaultzaten.index_put.defaultzaten.index_put_.defaultzaten.scatter.srczaten.scatter.reducezaten.scatter.value_reducezaten.scatter_add_zaten.scatter_add.defaultzaten.scatter_reduce.twozaten.scatter_reduce_.twozaten.scatter_reduce.two_outr  )%torch.fx.experimental.symbolic_shapesr'   r!   rN   $are_deterministic_algorithms_enabledrg  rz  rZ  rI   r  	_inductorrl   graph_partitionr~   r  r  r  r  cudagraph_unsafer  r  get)r  r'   forbidden_setr  r  s        rS   %get_first_incompatible_cudagraph_noder    s     LKKKKK	
 	
 	
 M  133 
	
 	
 	
"   t{},,KKK &6	4;
(=>>	 -1AAA
 KKK9=='''C49N9Ns9S9S4KKK4rU   c                    t          t          t          | j        j                                      }|j        dk    sJ |S )z$Get the output node from an FX graphr  )nextiterreversedrz  rZ  r  )r  	last_nodes     rS   output_noder  S  s<    T(28>223344I<8####rU   OrderedSet[torch.device]c                   | j                             d          }t          d |D                       }t          |           j        d         }t          |t                    r|n|f}t          d |D                       }||z  S )Nr  r  c              3     K   | ]H}t          |j                            d           t          j                  4|j        d          j        V  IdS r  N)r~   r  r  rN   r  r   r  s     rS   r   z"get_all_devices.<locals>.<genexpr>\  s`       9 9dimmE**EL999	%9 9 9 9 9 9rU   r   c              3     K   | ]g}t          |t          j        j                  !t          |j                            d           t          j                  S|j        d          j        V  hdS r  )r~   rN   r  r2   r  r  r  r   )rQ   r  s     rS   r   z"get_all_devices.<locals>.<genexpr>d  sw       7 7c58=))7 sx||E**EL99	77 7 7 7 7 7rU   )rz  
find_nodesr!   r  r   r~   tuple)r  placeholder_nodesinput_devicesout_argout_argsout_devicess         rS   get_all_devicesr  Z  s    ++}+==.8 9 9%9 9 9 / /M "oo"1%G$We44Dww7*H,6 7 77 7 7 - -K ;&&rU   c                    t          t          j                                                  D ]} |                     d          st          j        |          }|j                                        D ]}|                    d          rt          ||          }t          |t          j	        j
        j        j                  rV|j        D ]N}t          |t          j	        j
        j        j                  r#|j        j        j                                         Ot          j        | = dt          j        v r<t          j        d         }t'          |j        j        j                  `|j        j        `t1          j                     d S )Nz&torch._inductor.runtime.compile_tasks.triton_ztriton.runtime.driver)rh  sysmodulesr]  
startswith__dict__rM   r~   rN   r  runtimetriton_heuristicsCachingAutotunercompile_resultsTritonCompileResultkernelrunmod__del__r  driveractiveutilsinstancegccollect)module_namem	attr_namer   r  r  s         rS   unload_xpu_triton_pydsr  p  s]   CK,,..// % %%%&NOO 	K$** 	< 	<I##I.. 
< I..EO3EV  < #)"8 < <%"!O3EY  < #M-199;;;K$$ #+--k12"())2J#JLLLLLrU   _registered_cachesc                    t          | d          rt          | j                  st          |  d          t                              |            | S )zh
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_cache().
    cache_clearz# does not have a cache_clear method)r	  callabler  AttributeErrorr  r  r  s    rS   clear_on_fresh_cacher    sY    
 3&& Jhs.G.G JHHHIIIc"""JrU   c                 B    t           D ]} |                                  dS )z&
    Clear all registered caches.
    N)r  r  r  s    rS   clear_cachesr    s0     "   rU   cache_entriesOptional[dict[str, Any]]dirOptional[str]deleteIterator[None]c              #  |  K   t                       ddlm}  |t          j        |                    	 t
          j                            t          j	        di          5  t                              d            |t          j                            d                    t
          j                            t          j	        di          5  dV  t          | t                    rqt          |           dk    s
J d	            t          j                                      r5t          j                  }|                     fd
|D                        ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   |r`t'                      r,t(          j                                        rt/                       t1          j        t'                      fd           n*# t4          $ r t                              d            w xY wt                       dS # t                       w xY w)z
    Contextmanager that provides a clean tmp cachedir for pt2 caches.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    r   )normalize_path_separator)r  TORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNz!expected empty cache_entries dictc           	         i | ]D}d |v|t           j                            t           j                            |                    ES )z.lock)ospathgetsizer$  )rQ   ftriton_cache_dirs     rS   r1  zfresh_cache.<locals>.<dictcomp>  sP       $%#*!#3#3 !"27??27<<@PRS3T3T#U#U#3#3#3rU   c                >    t                               d|          S )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)r  r$  r)  inductor_cache_dirs      rS   r  zfresh_cache.<locals>.<lambda>  s$    S[[@&% 6A 6 6 rU   )ignore_errorsonerrorz(on error, temporary cache dir kept at %s)r  torch._inductor.cpp_builderr  tempfilemkdtempr   patchdictr#  environr   r   r$  r$  r~   rW   existslistdirrg  
is_windowsrN   rE   rO   r  shutilrmtree	Exceptionr*  )r  r  r  r  filesr+  r'  s        @@rS   fresh_cacher;    s      NNNDDDDDD11(2Bs2K2K2KLL)Z__J24FG
 
 	 	 II35GHHH77/::    .@BR-STT  mT22 
}--2224W222w~~&677  "
+; < <%,,   ).                  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	(  	|| )	 6 6 8 8 )&(((M" )ll          >@RSSS 	sh   ,G1 $A1FBE, F,E0	0F3E0	4F7G1 FG1 
FA%G1 0H+ 1'HH+ +H;seq	list[int]c           	         | j         }t          t          |                     }t          t	          t          ||d                              S )NT)r<  reverse)__getitem__r   rW   rh  r  r  )r<  gettera_rs      rS   argsortrC    s?    _F
C//C>>>??@@@rU   r|  r)   .Sequence[Union[int, torch.SymInt, sympy.Expr]]c                     d
 fd}d t          |          D             }t          |t          j        |                    }d	 |D             }|S )Nr)  tuple[int, sympy.Expr]r*  rH   rv   c                    | \  }}|\  }}d	fd} |||k               rdS  |||k              rdS ||k     rdS ||k    rdS dS )
Nr  %Union[bool, torch.SymInt, sympy.Expr]rH   r|   c                `    t          | t                    r| S                     | d          S )NT)size_oblivious)r~   r|   evaluate_expr)r  r|  s    rS   evaluatez*argsort_sym.<locals>.cmp.<locals>.evaluate  s3    $%% **4*EEErU   r   r3   r   )r  rH  rH   r|   rL   )r)  r*  a_idxa_valb_idxb_valrL  r|  s          rS   r  zargsort_sym.<locals>.cmp  s    uu	F 	F 	F 	F 	F 	F
 8EEM"" 	28EEM"" 	1
 5==15==2qrU   c                h    g | ]/\  }}|t          |t          j                  r|j        j        n|f0S rL   )r~   rN   r/   r  r  )rQ   r0  r   s      rS   rT   zargsort_sym.<locals>.<listcomp>	  sJ       C 
Z5<88?afkka@  rU   r  c                    g | ]\  }}|S rL   rL   )rQ   r0  r   s      rS   rT   zargsort_sym.<locals>.<listcomp>  s    &&&fc1c&&&rU   )r)  rF  r*  rF  rH   rv   )r   r  r  
cmp_to_key)r|  r<  r  exprsr  s   `    rS   argsort_symrU    s|         0 nn  E 5i2377888E&&&&&FMrU   r   torch.dtypec                v    | t           j        k    rdS t          j        d|                                           S )Nrt   rL   r   )rN   rc  r   element_sizerX  s    rS   get_dtype_sizerZ    s7     q;r'''44666rU   c                      e Zd ZU ded<   dS )LineContextr   contextNr   r   r   r   rL   rU   rS   r\  r\    s         LLLLLrU   r\  c                  $    e Zd ZU ded<   ded<   dS )ValueWithLineMaprI   r   zlist[tuple[int, LineContext]]line_mapNr^  rL   rU   rS   r`  r`    s'         JJJ++++++rU   r`  c                      e Zd ZdZd1d2dZej        d3d
            Zd4dZd5dZ	d5dZ
d6dZd7dZd5dZd6dZd8dZd9dZd:d;dZd:d<dZd:d<d Z	 d=d>d%Zd?d(Zd5d)Zd@d,ZdAd/Zd0S )BIndentedBuffer   r   initial_indentrv   rH   r  c                "    g | _         || _        d S r   )_lines_indent)r  re  s     rS   __init__zIndentedBuffer.__init__(  s    GI%rU   tabwidthr  c              #  V   K   | j         }	 || _         d V  || _         d S # || _         w xY wr   )rj  )r  rj  prevs      rS   set_tabwidthzIndentedBuffer.set_tabwidth,  s@      }	!$DMEEE DMMMDDM    s    	(r`  c                   t                      }d}g }| j        D ]}t          |t                    r |            }|$n4t          |t                    r|                    ||j        f           W|}t          |t                    sJ |                    |           |                    d           |d|	                    d          z   z  }t          |                                |          S )Nr3   rW  )r   rg  r~   DeferredLineBaser\  r  r]  rI   writecountr`  getvalue)r  bufr   linemaplilines         rS   getvaluewithlinemapz"IndentedBuffer.getvaluewithlinemap5  s    jj13+ 	& 	&B".// rtt<  B,, 2:///dC(((((IIdOOOIIdOOOTZZ%%%%AA888rU   rI   c                4    |                                  j        S r   )rw  r   r  s    rS   rr  zIndentedBuffer.getvalueI  s    ''))//rU   c                   t                      }| j        D ]}t          |t                    r |            }|$nt          |t                    r;|}t          |t
                    sJ |                    d          r|                    |d d                    |                    |           |                    d           |                                S )N\r   rW  )	r   rg  r~   ro  r\  rI   endswithrp  rr  )r  rs  ru  rv  s       rS   getrawvaluezIndentedBuffer.getrawvalueL  s    jj+ 	  	 B".// rtt<  B,, dC(((((}}T""  		$ss)$$$$		$		$||~~rU   c                8    | j                                          d S r   )rg  clearry  s    rS   r  zIndentedBuffer.clear`  s    rU   r|   c                *    t          | j                  S r   )r|   rg  ry  s    rS   __bool__zIndentedBuffer.__bool__c  s    DK   rU   c                &    d| j         | j        z  z  S )Nr5  )rh  rj  ry  s    rS   r  zIndentedBuffer.prefixf  s    dlT]233rU   c                0    |                      d           d S )NrW  	writelinery  s    rS   newlinezIndentedBuffer.newlinei  s    trU   rv  )Union[LineContext, DeferredLineBase, str]c                   t          |t                    r| j                            |           d S t          |t                    rA| j                            |                    |                                                      d S |                                r2| j                            |                                  |            d S | j                            d           d S Nr  )r~   r\  rg  r  ro  with_prefixr  stripr  rv  s     rS   r  zIndentedBuffer.writelinel  s    dK(( 	#Kt$$$$$.// 	#Kt//>>?????ZZ\\ 	#K$++--77788888Kr"""""rU   lines3Sequence[Union[LineContext, DeferredLineBase, str]]c                :    |D ]}|                      |           d S r   r  )r  r  rv  s      rS   
writelineszIndentedBuffer.writelinesv  s2      	! 	!DNN4    	! 	!rU   r3   offset'contextlib.AbstractContextManager[None]c                L     t           j        d fd            } |            S )NrH   r  c               3     K   xj          z  c_         	 d V  xj          z  c_         d S # xj          z  c_         w xY wr   rh  )r  r  s   rS   r  z"IndentedBuffer.indent.<locals>.ctx}  sQ      LLF"LL'&&s   + =rH   r  )
contextlibcontextmanager)r  r  r  s   `` rS   indentzIndentedBuffer.indent|  sB    		"	' 	' 	' 	' 	' 	' 
#	"	' suurU   c                &    | xj         |z  c_         d S r   r  r  r  s     rS   	do_indentzIndentedBuffer.do_indent      rU   c                &    | xj         |z  c_         d S r   r  r  s     rS   do_unindentzIndentedBuffer.do_unindent  r  rU   F
other_codeUnion[IndentedBuffer, str]r  c           	        t          |t                    rt          d          }|j        D ]X}t          |t                    sA|r?t          |t          |          t          |                                          z
            }Yt          j	        |          rd}|j        D ]b}t          |t                    r| j        
                    |           2t                              | |t          |          d                     cd S t          j        |          }|r|                                }|sd S |                                }|                    d          D ]}|                     |           d S )Ninfr   rW  )r~   rc  r   rg  r\  minrW   r  mathisinfr  r  rv   textwrapdedentrstriprf  )r  r  r  r  rv  r   s         rS   splicezIndentedBuffer.splice  s    j.11 	"5\\F") I I!$44 I I TS5G5G)GHHFz&!! ") H HdK00 HK&&t,,,,",,T4F3FGGGG	H H "44J 1'..00
 #**,,J%%d++ " "q!!!!" "rU   r  Callable[[Any], Any]c                b    t          | j                  }fd| j        D             |_        |S )Nre  c                &    g | ]} |          S rL   rL   )rQ   rv  r  s     rS   rT   z&IndentedBuffer.map.<locals>.<listcomp>  s!    999Tdd4jj999rU   )rc  rh  rg  )r  r  r   s    ` rS   r   zIndentedBuffer.map  s7    DL9999999T[999

rU   c                P    t          |            d|                                  dS )Nr  r  )r  rr  ry  s    rS   __repr__zIndentedBuffer.__repr__  s'    t**11t}}1111rU   otherr   c                    | j         |j         k    sJ t          | j                   }|                    | j                   |                    |j                   |S )Nr  )rh  rc  r  rg  )r  r  r   s      rS   __add__zIndentedBuffer.__add__  sV    |u},,,,DL999t{###u|$$$
rU   new_line)Union[DeferredLineBase, LineContext, str]c                    || j         v S r   )rg  )r  r  s     rS   containszIndentedBuffer.contains  s    4;&&rU   Nr   )re  rv   rH   r  )rj  rv   rH   r  )rH   r`  rH   rI   rH   r  rH   r|   )rv  r  rH   r  )r  r  rH   r  r   )r  rv   rH   r  )r  rv   rH   r  )F)r  r  r  r|   rH   r  )r  r  rH   rc  )r  r   rH   rc  )r  r  rH   r|   )r   r   r   rj  ri  r  r  rm  rw  rr  r}  r  r  r  r  r  r  r  r  r  r  r   r  r  r  rL   rU   rS   rc  rc  %  s       H& & & & & ! ! ! !9 9 9 9(0 0 0 0   (   ! ! ! !4 4 4 4   # # # #! ! ! !	 	 	 	 	         EJ" " " " "2   
2 2 2 2   ' ' ' ' ' 'rU   rc  c                  (     e Zd Zd fdZd	dZ xZS )
FakeIndentedBufferrH   r  c                H    t                                                       d S r   )superri  )r  	__class__s    rS   ri  zFakeIndentedBuffer.__init__  s    rU   r   rI   r   c                j    |dk    rt                               | |          S t          d| d          )Nr  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )r  r   s     rS   r  z#FakeIndentedBuffer.__getattribute__  sK    ;**4666=$ = = =
 
 	
rU   r  )r   rI   rH   r   )r   r   r   ri  r  __classcell__r  s   @rS   r  r    sQ             
 
 
 
 
 
 
 
rU   r  c               #     K   t           j        t           j        }} 	 d V  | |ct           _        t           _        d S # | |ct           _        t           _        w xY wr   )r  stdoutstderr)initial_stdoutinitial_stderrs     rS   restore_stdout_stderrr    sR      %(ZNN@!/
CJJJ
CJ????s	   ; Ac                  R    e Zd ZdZddZddZdd	ZddZddZddZ	ddZ
ddZdS )ro  z.A line that can be 'unwritten' at a later timerv  rI   c                @    |                                 sd}|| _        d S r  )r  rv  r  s     rS   ri  zDeferredLineBase.__init__  s"    zz|| 	D			rU   rH   Union[str, None]c                    t           )zJReturns either self.line or None to indicate the line has been 'unwritten'r#  ry  s    rS   r  zDeferredLineBase.__call__      !!rU   r   c                    t           )z3Returns a new deferred line with the same conditionr  r  s     rS   	_new_linezDeferredLineBase._new_line  r  rU   r  c                >    |                      | | j                   S r   r  rv  )r  r  s     rS   r  zDeferredLineBase.with_prefix  s!    ~~444555rU   c                Z    |                      | j                                                  S r   )r  rv  r  ry  s    rS   r  zDeferredLineBase.lstrip  s"    ~~di..00111rU   r  Union[int, slice]c                B    |                      | j        |                   S r   r  )r  r  s     rS   r@  zDeferredLineBase.__getitem__  s    ~~di.///rU   r|   c                *    t          | j                  S r   )r|   rv  ry  s    rS   r  zDeferredLineBase.__bool__  s    DIrU   rv   c                *    t          | j                  S r   )rW   rv  ry  s    rS   __len__zDeferredLineBase.__len__  s    49~~rU   N)rv  rI   )rH   r  )rv  rI   rH   r   )r  rI   rH   r   )rH   r   )r  r  rH   r   r  rH   rv   )r   r   r   r   ri  r  r  r  r  r@  r  r  rL   rU   rS   ro  ro    s        88   
" " " "" " " "6 6 6 62 2 2 20 0 0 0        rU   ro  c                  4     e Zd ZdZd fdZdd	Zdd
Z xZS )DelayReplaceLinez6At end of codegen call `line.replace(key, value_fn())`r<  rI   value_fnCallable[[], str]rv  c                f    t                                          |           || _        || _        d S r   )r  ri  r<  r  )r  r<  r  rv  r  s       rS   ri  zDelayReplaceLine.__init__  s-     rU   rH   c                f    | j                             | j        |                                           S r   )rv  replacer<  r  ry  s    rS   r  zDelayReplaceLine.__call__  s$    y  4==??;;;rU   c                8    t          | j        | j        |          S r   )r  r<  r  r  s     rS   r  zDelayReplaceLine._new_line  s    $->>>rU   )r<  rI   r  r  rv  rI   r  )rv  rI   rH   r  )r   r   r   r   ri  r  r  r  r  s   @rS   r  r    sk        @@! ! ! ! ! !
< < < <? ? ? ? ? ? ? ?rU   r  index_or_deviceUnion[int, torch.device]c                   t          | t          j                  r| }n!t          j        t                      |           }t	          j        |          }t          j        j        r=|j        J |j        dk     s|j        dk    rt          
                    d           dS dS |j        dk    rdnd}|j        }||k     r!t          
                    d	||d
           dS dS )N	   r  z6GPU arch does not support max_autotune_gemm mode usageFTrE   rq   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)r~   rN   r   r[   r   createversionhipmajorr   r*  r  multi_processor_count)r  r   propr  r  s        rS   
is_big_gpur    s    /5<00 ? lnno>>"6**D } z%%%:>>TZ2--KKPQQQ5tK5((bbbG*I7:%I>> 	 	
 	
 	
 u4rU   c                     t           j                                        r#t           j                                        j        S t           j                            d          j        S )NrC   )rN   rE   rO   get_device_propertiesgpu_subslice_countrC   r  rL   rU   rS   get_max_num_smsr    sI    y Dy..00CC:++F33IIrU   c                     t           j                                        sdS t           j                            t           j                                                  } | j        dk    S )zEReturns true if the device is a NVIDIA B200, otherwise returns false.Fr  )rN   rC   rO   r  r  r  )device_propertiess    rS   
using_b200r  %  sQ     :""$$ u
889R9R9T9TUU"b((rU   c                     t           j                                        rt                      S t           j                                        } t                      | | ndz
  S )zFHandle experimental carveout if set otherwise return hardware SM countNr   )rN   rE   rO   r  r  _get_sm_carveout_experimental)carveouts    rS   get_num_smsr  /  sS     y !   x5577HH,@aHHrU   num_tma_descriptorsnum_programsOptional[int]r4   c                    ddl m}m} |t                      }|                    d          }|| z  t
          z  } |||| |j                              S )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r3   )r4   WorkspaceZeroModeNF)rq  	zero_moder   
outer_name)codegen.commonr4   r  r  	from_boolTMA_DESCRIPTOR_SIZEunique_name)r  r   r  r4   r  r  r  s          rS   get_tma_workspace_argr	  8  s     @???????"}}!++E22I--0CCD<+<+--	   rU   rA  r<   allowed_layout_dtypeslist[torch.dtype]c                    | j         |vr!t                              d| j         |           t          | j        j                  o| j         |v ot          | j                  S )NzDNot using template since dtype %s is not in allowed layout dtypes %s)r   r   r   is_gpur   r  r  )rA  r
  s     rS   _use_template_for_gpur  L  sl     |000		RL!	
 	
 	
 	v}!"" 	&L11	&v}%%rU   backendc                    |                                  d t          j                                                             d          D             v S )Nc                6    g | ]}|                                 S rL   r  rP   s     rS   rT   z)_use_autotune_backend.<locals>.<listcomp>]  -       		  rU   rV  )upperrl   max_autotune_gemm_backendsrf  r  s    rS   _use_autotune_backendr  \  P    ==??  !<BBDDJJ3OO    rU   c                    |                                  d t          j                                                             d          D             v S )Nc                6    g | ]}|                                 S rL   r  rP   s     rS   rT   z._use_conv_autotune_backend.<locals>.<listcomp>c  r  rU   rV  )r  rl   max_autotune_conv_backendsrf  r  s    rS   _use_conv_autotune_backendr  b  r  rU   F)enable_int32enable_float8check_max_autotuner  r  r  c                  ddl m}m} t          j        t          j        t          j        g}|r.t          j        t          j        t          j        t          j        g}|r+|                    t          j	        t          j
        g           t          | j        j                  ot          | |          p| j        j        dk    o| j        |v o?t           j        pt           j        p| o$t'          d          o || j        |j                  S )Nr3   )BackendFeaturehas_backend_featurer  TRITON)r  r!  r"  rN   r   rO  rQ  rY  extendrI  rJ  r  r   r  r  r   rl   max_autotunemax_autotune_gemmr  TRITON_TEMPLATES)rA  r  r  r  r!  r"  layout_dtypess          rS   use_triton_templater)  h  s    DCCCCCCC]ENEMBM Tu{S Ge153DEFFF v})** A)&-@@O "e+M0M
	P  VF$<VDV@V
	P "(++
	P  ~/NOOrU   )
add_guardsmatricesr;   r*  c                     ddl m} ddlm dfd	d fddfd |            ot	          fd|D                       S )u  
    Return True iff *all* supplied tensors satisfy the CUDA-12.9 TMA constraints
    that Triton relies on today.
    * https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

    A tensor is accepted when:
      * 2 ≤ rank ≤ 5
      * dtype ∈ {FP16, BF16, FP8-E4M3FN}
      * Every logical size ≥ 2
      * Base pointer 16-byte aligned
      * All "outer" dims have 16-byte aligned strides
      * The “inner” dim has stride 1 (contiguous)
      * For FP8 tensors, inner dim ≥ 32
    r   )has_triton_tma_devicer3   rv  
expr_bytesr5  rH   r|   c                N    j         j                            | t                    S r   )rz  r{  statically_known_multiple_ofTMA_ALIGNMENT)r.  rw  s    rS   _alignedzcan_use_tma.<locals>._aligned  s    w<<ZWWWrU   rR   r;   c                   |                                  }|                                 }t          |          }|                                 }|j        }|dk     s|dk    rdS |t
          j        t
          j        t
          j        fvrdS | 	                                j
        j        v rdS r?j
        j                            |          }j
        j                            |          }nfd|D             }fd|D             }t          fd|D                       rdS fdt          |          D             }t          |          dk    rdS |d	         }	t          |          D ]\  }
}|
|	k    r ||z            s dS ||	         } ||z            sdS |t
          j        k    r"j
        j                            |d
          sdS dS )Nr  r   Fc                N    g | ]!}j         j                            |          "S rL   rz  r{  symbolic_hintrQ   r   rw  s     rS   rT   zCcan_use_tma.<locals>._is_tma_compatible_default.<locals>.<listcomp>  s,    HHHQqw'55a88HHHrU   c                N    g | ]!}j         j                            |          "S rL   r5  rQ   strw  s     rS   rT   zCcan_use_tma.<locals>._is_tma_compatible_default.<locals>.<listcomp>  s,    NNN)77;;NNNrU   c              3  Z   K   | ]%}j         j                            |d            V  &dS r  N)rz  r{  statically_known_geqr7  s     rS   r   zBcan_use_tma.<locals>._is_tma_compatible_default.<locals>.<genexpr>  s;      PP117#88A>>>PPPPPPrU   c                Z    g | ]'\  }}j         j                            |d           %|(S r   rz  r{  statically_known_equalsrQ   r   r:  rw  s      rS   rT   zCcan_use_tma.<locals>._is_tma_compatible_default.<locals>.<listcomp>  H     
 
 
2w77A>>

 
 
rU   r3   r       T)get_size
get_striderW   	get_dtypeitemsizerN   r   rO  rI  r  rz  unaligned_buffersr{  guard_int_seqr  r   r=  )rR   sizesstridesrankr   rG  sizes_i	strides_iinner	inner_idxr   r:  	inner_dimrw  r2  r*  s                rS   _is_tma_compatible_defaultz/can_use_tma.<locals>._is_tma_compatible_default  s)   

,,..5zz> !88taxx5 8KLLL5 ::<<174445 	Og&44U;;G(66w??IIHHHH%HHHGNNNNgNNNI PPPPPPPPP 	5
 
 
 
"9--
 
 

 u::??5!H	 y)) 	 	EArI~~8BM** uu I&	x	H,-- 	5 E'''0@0U0Ur1
 1
' 5trU   c                    |                                  }fd|D             }fdt          |          D             }t          |          dk    rdS dS )Nc                N    g | ]!}j         j                            |          "S rL   r5  r9  s     rS   rT   z?can_use_tma.<locals>._is_tma_compatible_xpu.<locals>.<listcomp>  s,    JJJBQW%33B77JJJrU   c                Z    g | ]'\  }}j         j                            |d           %|(S r   r?  rA  s      rS   rT   z?can_use_tma.<locals>._is_tma_compatible_xpu.<locals>.<listcomp>  rB  rU   r3   FT)rE  r   rW   )rR   rK  rN  rO  rw  s       rS   _is_tma_compatible_xpuz+can_use_tma.<locals>._is_tma_compatible_xpu  st    ,,..JJJJ'JJJ	
 
 
 
"9--
 
 

 u::??5trU   c              3     K   | ];}|                                 xj        dk    r |          n
 |          V  <d S )NrE   )
get_devicer  )rQ   r  rR  rV  m_devices     rS   r   zcan_use_tma.<locals>.<genexpr>  sv       + +  &H/8=E3I3I 	#"1%%%##A&&+ + + + + +rU   )r.  r5  rH   r|   rR   r;   rH   r|   )torch.utils._tritonr-  ry  rw  r   )r*  r+  r-  rw  r2  rR  rV  rY  s   `  @@@@@rS   can_use_tmar\    s     :99999X X X X X X: : : : : : : :x      ! "" s + + + + + + 	+ + + ( ( rU   c                n    t          d |D                       ot          |d| iot          j        j        S )Nc              3  `   K   | ])}t          |                                          d k    V  *dS r<  )rW   rD  )rQ   r  s     rS   r   z*use_triton_tma_template.<locals>.<genexpr>  s7      55qC

"555555rU   r*  )r   r\  rl   r   enable_persistent_tma_matmul)r*  r+  s     rS   use_triton_tma_templater`    sC    55H55555 	79j99	7M6rU   r  r-  r  c                   ddl m} |j        j                            ||z  |z  d          }|dk    s|t
          j        j        k     rdS ddlm	} t          j        j        rdS t          j        t          j        t          j        g}t!          | |          o&t
          j        pt
          j        ot'          d          }|r6 |            s,t(                              d	t
          j        j                   dS |S )
Nr3   rv  r   fallbackr   F)try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir %s is set correctly. Skipping CUTLASS backend for now.)ry  rw  rz  r{  	size_hintrl   rC   cutlass_backend_min_gemm_sizecodegen.cuda.cutlass_utilsrd  rN   r  r  r   rO  rY  r  r%  r&  r  r   r*  cutlass_dir)	rA  r  r-  r  rw  	gemm_sizerd  r(  r   s	            rS   use_cutlass_templaterk    s    **1q519r*BBIA~~V[%NNNu>>>>>> } u ]ENEK@Mfm44 	- <F$<	-!),,   !!## 	KK4 '	   5JrU   op_namec                    t           j        j                                        }|dk    rdS |                                 d |                    d          D             v S )z8Check if CUTLASS should be used for the given operation.ALLTc                6    g | ]}|                                 S rL   r  rP   s     rS   rT   z'_use_cutlass_for_op.<locals>.<listcomp>  s     IIIQqwwyyIIIrU   rV  )rl   rC   cutlass_enabled_opsr  rf  )rl  enabled_opss     rS   _use_cutlass_for_oprr    sX    +17799Ket==??II+2C2CC2H2HIIIIIrU   r   _IntLikec           
     D   ddl m} t          j        j        }t
          j        j         ow|j        j	        
                    t          j        t          j        ||| z            t          j        |||z                                o|j        j         o|j        j         S )Nr   rv  )torch._inductor.virtualizedrw  rl   r   decompose_k_thresholdrN   r  r  rz  r{  statically_known_truer   AndGeaot_modecpp_wrapper)r  r-  r  rw  rv  s        rS   use_decompose_k_choicer|     s    ------"M? M 	$G22I1A5661A566 
 
	$   	$ ##
rU   c           
     \   t           j        j        }ddlm} t          t          j        j                  ow|j	        j
                            t          j        t          j        ||| z            t          j        |||z                                o|j	        j         o|j	        j         S )z
    Check if we should use the contiguous subgraph transform.
    This transform makes the second matrix contiguous before the matmul.
    r   rv  )rl   rocmcontiguous_thresholdru  rw  r|   rN   r  r  rz  r{  rw  r   rx  ry  rz  r{  )r  r-  r  r  rw  s        rS   use_contiguousr  3  s     ";; .----- 	U] 	$G22I0145501455 
 
	$   	$ ##
rU   c                   t           j        j        }g d}t          |t          j                  r	|j        s|S |dk    rg S t          | t          j                  r| j        r!t          |t          j                  r
|j        sdnt          || z  ||z            dt	          j        |          }fd|D             }g g g }}}|D ]j}	||	z  }
|
dk     r|
|
dz
  z  dk    r|
dk    r|	                    |	           6|
dz  dk    r|	                    |	           U|	                    |	           kt           j
        d	k    r||z   |z   S ||z   |z   }|d |         S )
N)rq   rC  rs   rr      r   r  r  c                ,    g | ]}|k    |k    |S rL   rL   )rQ   divisormax_k_splitmin_k_splits     rS   rT   z get_k_splits.<locals>.<listcomp>c  s8       k!!g&<&< 	&<&<&<rU   rr   r3   rC  
EXHAUSTIVE)rl   r   num_decompose_k_splitsr~   r   r9  	is_numberr  divisorsr  max_autotune_gemm_search_space)r  r-  r  k_splits_limitdefault_k_splitsr  pow_of_2_divisorsmul_of_32_divisorsrest_of_splitsdkPartbest_splitsr  r  s               @@rS   get_k_splitsr  K  s    ]9N .--!UZ    	1			1ej!! *!+ *1ej!!**++* !q&!q&))K~a  H      H =?B>) % %Q 3;; EAI!##$$Q''''RZ1__%%a(((( !!!$$$$,<< #55FF#&88>IK''rU   c                J    t           j                            |           j        S r   )rN   rC   r  gcnArchNamer   s    rS   _rocm_native_device_arch_namer    s    :++F33??rU   Qtuple[Optional[str], Callable[[], list[Any]], Callable[[], list[Any]], type[Any]]c                     	 dd l } ddlm}m} ddlm} t          j                            | j	                  }n'# t          $ r d
d}d
d} G d d	          }d }Y nw xY w||||fS )Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationrH   r  c                     g S r   rL   rL   rU   rS   r  z*try_import_ck_lib.<locals>.gen_ops_library      IrU   c                     g S r   rL   rL   rU   rS   r  z.try_import_ck_lib.<locals>.gen_ops_preselected  r  rU   c                      e Zd ZdS )*try_import_ck_lib.<locals>.CKGemmOperationN)r   r   r   rL   rU   rS   r  r    s        DrU   r  )rH   r  )ck4inductor(ck4inductor.universal_gemm.gen_instancesr  r  ck4inductor.universal_gemm.opr  r#  r$  dirname__file__r  )r  r  r  r  package_dirnames        rS   try_import_ck_libr    s    	
 	
 	
 	
 	
 	
 	
 	
	
 	
 	
 	
 	
 	
 '//+*>??   	 	 	 		 	 	 		 	 	 	 	 	 	 	  O-@/QQs   69 !AAc                H   t           j        st           j        sdS t          j        j        sdS | j        j        dk    sdS t          | j                  }d t           j	        j
        D             p|                    d          d         |ifd                                t           j	        j        z  D             }|sdS | j        t          j        t          j        t          j        fvrdS t%                      \  }}}}|st&                              d           dS t          j                    r|t           j	        _        t           j	        j        st&                              d           dS |t           j	        j        k    rt&                              d	           dS d
S )NFrC   c                F    i | ]}|                     d           d         |S ):r   )rf  )rQ   r  s     rS   r1  z#use_ck_template.<locals>.<dictcomp>  s(    DDDaqwws||ADDDrU   r  r   c                     g | ]
}|         S rL   rL   )rQ   r  requested_archss     rS   rT   z#use_ck_template.<locals>.<listcomp>  s.     ! ! ! 	! ! !rU   z,Please pip install Composable Kernel packagez,Please set TORCHINDUCTOR_CK_DIR env variablezInvalid path to CK libraryT)rl   r%  r&  rN   r  r  r   r  r  r~  archrf  r]  ck_supported_archr   r   rO  rQ  r  r   r*  	is_fbcodeck_dir)rA  native_archrequested_supported_archsck_package_dirnamer   r  s        @rS   use_ck_templater    s    6#; u= u=''u 0>>KDD6;3CDDD #q!;IO! ! ! ! %%''&+*GG! ! ! % u|EM5>5=IIIu"3"5"51a BCCCu 0/; BCCCuV[///0111u4rU   c                    ddl m} t          d          o9t          |           o*|j        j                            ||z  |z  d          dk    S )Nr3   rv  CKr   rb  r   ry  rw  r  r  rz  r{  rf  rA  r  r-  r  rw  s        rS   use_ck_gemm_templater    se     	d## 	CF##	CG&&q1uqy2&>>BrU   c                    ddl m} t          d          o9t          |           o*|j        j                            ||z  |z  d          dk    S )Nr3   rv  CKTILEr   rb  r   r  r  s        rS   use_ck_tile_gemm_templater    se     	h'' 	CF##	CG&&q1uqy2&>>BrU   c                >    t          d          ot          |           S )Nr  )r  r  rA  s    rS   use_ck_conv_templater    s    %d++G0G0GGrU   c                R    t           j        pt           j        o| j        j        dk    S r  )rl   r%  r&  r   r  r  s    rS   _use_template_for_cpur    s'    7v7&
-

%&rU   mat1Union[ReinterpretView, Buffer]mat2c                    ddl m} t          |j        |          sJ t	          | ||d          o|j                                        S )Nr3   )r<   F)require_constant_mat2)r  r<   r~   rA  use_cpp_gemm_templateis_contiguous)rA  r  r  r<   s       rS   use_cpp_bmm_templater    s_     dk6***** 	fdDNNN 	(K%%''rU   mat2_transposedr  is_woq_int4q_group_sizec                `   ddl m} ddlm} ddlm}	 ddlm}
 t          |           rt          d          sdS t          j        j        sdS |                                t          j        t          j        fv }t          j        t          j        t          j        t          j        g} |
|||r| j        nd ||          \  }}}} }}t+          ||f          rdS t-          ||j                  r|                                } |	|                                          \  }} |d	||||                                |                                |t3                      | |

  
        }dd}| j        |v o:|d uo6 ||          o+t-          ||j                  o|                                p| S )Nr3   r  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtyper  use_4x2_dim
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsuse_refr  rR   r;   rH   r|   c                f    |                                   |                                 d         dk    S )Nr   r3   )freeze_layoutrE  rR   s    rS   is_last_dim_stride1z2use_cpp_gemm_template.<locals>.is_last_dim_stride13  s*    	||~~b!Q&&rU   rZ  )r  r  codegen.cpp_micro_gemmr  codegen.cpp_utilsr  kernel.mm_commonr  r  r  rl   cppweight_prepackrF  rN   r`  rU  rQ  rO  halfr   has_free_symbolsr~   BaseViewunwrap_viewparallel_num_threadsr>  is_module_buffer)rA  r  r  r  r  r  r  r  r  r  r  	int8_gemmr(  r  r-  r  r  r   r  r  s                       rS   r  r     s     999999MMMMMM)))))) (( 0Ee0L0L u:$ u  U[%*$==I]ENEJLM")'"+5&,,'# # #Aq!VT4 A u$$$ "!!@@AQAQRROL!""			NN$$^^%%!(**!  J' ' ' '
 	% 	Cd"	C%%	C tR]++	C ""$$A,A(ArU   c                 R    t           j        pt           j         pt          d          S )NATEN)rl   r%  r&  r  rL   rU   rS   use_aten_gemm_kernelsr  @  s*    7v7 '	v	&	&'rU   c                  R    e Zd ZU  ej        d          Zded<   ddZddZdd
Z	dS )DebugDirManagerr   rI   prev_debug_namerH   r  c                B    t          t          j                  | _        d S r   )r  r  counterr   ry  s    rS   ri  zDebugDirManager.__init__J  s    .//rU   c                    t           j        j        j        | _        | j         d| j         | _        | j        t           j        j        _        d S )N_tmp_)rN   _dynamorl   debug_dir_rootr  r   new_namery  s    rS   	__enter__zDebugDirManager.__enter__M  sA    $}3B/??dg??.2m+++rU   r   r   c                n    t          j        | j                   | j        t          j        j        _        d S r   )r7  r8  r  r  rN   r  rl   r  )r  r   s     rS   __exit__zDebugDirManager.__exit__R  s*    dm$$$.2.B+++rU   Nr  )r   r   rH   r  )
r   r   r   r  rq  r  r   ri  r  r  rL   rU   rS   r  r  F  su         ioa  G0 0 0 0< < < <
C C C C C CrU   r  Callable[P, _T]r  r  tuple[_T, list[str]]c                    ddl m} g d	fd}t          j                            |d|          5  t
          j                                          | |i |}d d d            n# 1 swxY w Y   |fS )
Nr3   r7   coderI   rH   r  c                2                         |            d S r   r  r  source_codess    rS   save_output_codez*run_and_get_code.<locals>.save_output_code`      D!!!!!rU   r  r  rI   rH   r  rz  r8   r   r1  r  rN   r  reset)r   r   r  r8   r  r  r  s         @rS   run_and_get_coder	  W  s    
 %$$$$$ L" " " " " " 
		=*<>N	O	O % %T$V$$% % % % % % % % % % % % % % % <s   'A$$A(+A(c                    t          | g|R i |\  }}g }|D ]5}|                    t          j        d|t          j                             6||fS )Nz	'''.*?''')r	  r$  r   findallDOTALL)r   r   r  r  r  kernelsr  s          rS   run_and_get_kernelsr  i  sk     ,B@@@@@@FLG B Brz,bi@@AAAA7?rU   tuple[Any, list[str]]c                .     d fd}t          |          S )NrH   r   c                 h                 } |                                                                   | S r   )r   backward)r  r   s    rS   run_with_backwardz1run_fw_bw_and_get_code.<locals>.run_with_backwardt  s-    

rU   )rH   r   )r	  )r   r  s   ` rS   run_fw_bw_and_get_coder  s  s2         
 -...rU   c                b   ddl m} g dfddfd}t          j                            |d|          5  t          j                            |d          5  t
          j                                          | |i |}ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   S )zLGet the inductor-generated code, but skip any actual compilation or running.r3   r7   r  rI   rH   r  c                2                         |            d S r   r  r  s    rS   r  z"get_code.<locals>.save_output_code  r  rU   r  r8   r   c                     G d d          }| j         r|                                 n|                                 \  }} |j                   |r |j                    |            S )Nc                  "    e Zd ZdZd
dZddZd	S )@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton modulerH   r  c                    d S r   rL   ry  s    rS   ri  zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__  s    rU   r   r   r  c                    d S r   rL   r  s      rS   callzEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.call  s    rU   Nr  r   r   r  r   rH   r  )r   r   r   r   ri  r  rL   rU   rS   DummyModuler    sB        FF        rU   r  )r{  codegen_with_cpp_wrappercodegenr   )r  r  wrapper_codekernel_coder  s       rS   patched_compile_to_modulez+get_code.<locals>.patched_compile_to_module  s    	 	 	 	 	 	 	 	 04/?SD))+++T\\^^ 	"k 	+,,, 	0[.///{}}rU   compile_to_moduler  Nr  )r  r8   rH   r   r  )r   r   r  r8   r#  r   r  r  s         @@rS   get_coder%  |  st   $$$$$$ L" " " " " "     , 	
.0I	
 	
    	
-);=MNN	    	B                                                            s5   "B$'BB$B	B$B	B$$B(+B(c                    t          | g|R i |}dt          |          cxk    rdk    sn J dt          |                       |d         S Nr3   r  z%expected one or two code outputs got r   )r%  rW   )r   r   r  r  s       rS   get_triton_coder(    ss    B000000LL!!&&&&Q&&&&&CL0A0ACC '&& ?rU   c                    t          | g|R i |\  }}dt          |          cxk    rdk    sn J dt          |                       |d         S r'  )r	  rW   )r   r   r  r   r  s        rS   run_and_get_triton_coder*    sy     'r;D;;;F;;OA|L!!&&&&Q&&&&&CL0A0ACC '&& ?rU   tuple[Any, list[GraphLowering]]c                    ddl m ddlm} |j        g dfd	}t
          j                            |d
|          5   | |i |}d d d            n# 1 swxY w Y   |fS )Nr   r7   r?   r   r   r  rH   r  c                 v     | i | | d         }t          |          sJ                     |           d S )Nr  )r~   r  )r   r  rz  r8   graph_lowerings	real_inits      rS   	fake_initz-run_and_get_graph_lowering.<locals>.fake_init  sQ    	4"6"""Q%/////u%%%%%rU   ri  r  )torch._inductor.graphr8   torch._inductor.output_coder@   ri  r   r1  r  )	r   r   r  r@   r0  r  r8   r.  r/  s	         @@@rS   run_and_get_graph_loweringr3    s     433333;;;;;;(IO& & & & & & & & 
		?J		B	B % %T$V$$% % % % % % % % % % % % % % % ?""s   	AAAaten_opoverride_fnc              #     K   ddl m} |j        |          }	 t          j        ||          |j        | <   dV  ||j        | <   dS # ||j        | <   w xY w)z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)torch._inductorr7  	loweringsr  partial)r4  r5  r7  orig_fns       rS   override_loweringr<    s{       )((((( )G.&/&7W&M&M7#&-7###g7#----s   !A Apre_fnpost_fnOptional[Callable[..., Any]]c                ~     ddl m} |j        d	 fd}t          j        j                            |d|          S )
zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerr  r   rZ  rH   c                T     | |            | |          }r | |           |S r   rL   )r  rZ  outr;  r>  r=  s      rS   r  z(add_scheduler_init_hook.<locals>.wrapper  sE    y%   gi'' 	&GIu%%%
rU   ri  )r  r   rZ  r   rH   r   )torch._inductor.schedulerrA  ri  unittestr   r1  r  )r=  r>  rA  r  r;  s   ``  @rS   add_scheduler_init_hookrF    sh     433333 G        =%%iWEEErU   msgc                    t           j        rt                              |            dS t                              |            dS )z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)rl   developer_warningsr   r*  info)rG  s    rS   developer_warningrK    s:       CrU   c                    	 t           j                            d          } | dz   t          t           j                  k     rZt          t           j        | dz                      dk    r4t           j        | dz            d         dk    rt           j        | dz            S n# t          $ r Y nw xY wt           j        D ]0}|                    d          r|t          d          d         c S 1dS )a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr3   r   r  z--only=N)r  argvr  rW   
ValueErrorr  )r0  r  s     rS   get_benchmark_namerO  	  s    	hnnX&&!Gc#(mm##CHS1W%&&**q!!$++8C!G$$    x ) )>>)$$ 	)s9~~''((((	) 4s   BB 
B)(B)r^  c                4    t          d | D                       S )Nc              3  "   K   | ]
}|d k    V  dS r3   NrL   rP   s     rS   r   zis_ones.<locals>.<genexpr>%	  &      %%!qAv%%%%%%rU   r   r^  s    rS   is_onesrV  $	      %%u%%%%%%rU   c                4    t          d | D                       S )Nc              3  "   K   | ]
}|d k    V  dS )r   NrL   rP   s     rS   r   zis_zeros.<locals>.<genexpr>)	  rS  rU   rT  rU  s    rS   is_zerosrZ  (	  rW  rU   inputsSequence[torch.Tensor]c                4    t          d | D                       S )Nc              3     K   | ];}t          |t          j                  |j        t          j        d           k    V  <dS )r  N)r~   rN   r  r   )rQ   r   s     rS   r   z is_cpu_device.<locals>.<genexpr>-	  sY        dEL))u|E***     rU   rT  )r[  s    rS   is_cpu_devicer_  ,	  s0           rU   r  c                    t          | t          j                  s
J d            | j        rt          j        S t          j        S )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)r~   r   r9  r   rN   r[  rS  )r  s    rS   get_sympy_Expr_dtypera  4	  sG    c5:&&  B & ~ {}rU   should_profileIterator[Any]c              /     K   | r5t          j        j        |i |5 }|V  d d d            d S # 1 swxY w Y   d S d V  d S r   )rN   r   r   )rb  r   r  r   s       rS   maybe_profilere  >	  s       ^#T4V44 	GGG	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   -11c                 Z    t           j        j        } | dk     rt          j                    } | S Nr3   )rl   r  threadsrN   get_num_threads)rh  s    rS   r  r  G	  s(    j G{{'))NrU   c                 t    ddl m}   |             }|                    dt          j        j        rdnd          S )Nr3   )get_backend_options
num_stagesr     )runtime.triton_helpersrk  r  rN   r  r  )rk  optionss     rS   get_backend_num_stagesrp  N	  sD    ;;;;;;!!##G;;|%-*;%BQQCCCrU   c                j   t          | t          j        j        j        j                  }||S ddlm}m} t          j        	                                o!t          j        
                                dk    }| t          j        t          j        t          j        fv sJ t          j        |          j                            d          rddlm}  |            }| t          j        t          j        fv r|r || |          S t          j        j        j        j        r |t          j        |          S  |t          j        |          S | t          j        t          j        fv r|r ||           S t          j        j        j        j        r |t          j                  S  |t          j                  S )z
    We don't want to throw errors in this function. First check to see if the device is in device_info.py,
    then fall back to the inaccurate triton estimation.
    )is_tf32Nr   )get_max_simd_tflopsget_max_tensorcore_tflops)rt   r   
clock_rate)max_clock_rate)r   rN   backendsrC   matmul
allow_tf32triton.testingrs  rt  rO   get_device_capabilityr   rO  rQ  inspect	signature
parametersr  torch._utils_internalrv  )r   ds_topsrs  rt  SM80OrLaterrv  sm_clocks          rS   get_device_tflopsr  V	  s    UEN,?,F,QRRRGMMMMMMMM*))++ 
0P0P0R0R W 1K
 U]ENEMBBBBB,--8<<\JJ 6888888!>##U]EN3333,,UH===>%0 	@,,U]HEEE&&u}h???U]EN3333,,U333>%0 	6,,U];;;&&u}555rU   c                 "    ddl m}   |             S )Nr   get_dram_gbps)rz  r  r  s    rS   get_gpu_dram_gbpsr  	  s     ,,,,,,=??rU   c                 t    ddl m}  | j        j                            d                              dd          S )Nr   r  max_shared_mem)triton.runtimer  r  r  r  r  r  s    rS   get_gpu_shared_memoryr  	  s>    %%%%%%=44Q77;;<LaPPPrU   reduction_typec                ,    |                      d          S )Nwelford)r  r  s    rS   is_welford_reductionr  	  s    $$Y///rU   c                8    t          |           rdS | dk    rdS dS )Nrm  online_softmax_reducer  r3   )r  r  s    rS   reduction_num_outputsr  	  s-    N++ q	2	2	2qqrU   c                 0    t          j                    dk    S )NLinux)platformsystemrL   rU   rS   is_linuxr  	  s    ?''rU   c                 "    t           j        dk    S )Nrn   )r  r  rL   rU   rS   r6  r6  	  s    <7""rU   itrIterable[Any]c                4    t          d | D                       S )Nc              3  Z   K   | ]&}t          |t          j                  o|j         V  'd S r   )r~   r   r9  r  rP   s     rS   r   z#has_free_symbols.<locals>.<genexpr>	  s7      JJz!UZ((<_JJJJJJrU   r  )r  s    rS   r  r  	  s    JJcJJJJJJrU   c            	     t   ddl m} | D ]}t          ||j        |j        |j        |j        |j        f          rJt          |	                                pd          s#t          |
                                pd          r dS zt          ||j                  st          dt          |                     dS )Nr3   r  rL   Tzunexpected type for is_dynamic F)r  r  r~   r<  r>  r  r  r9   r  maybe_get_sizemaybe_get_strider;   	TypeErrorr  )r   r  ts      rS   
is_dynamicr  	  s     I IbmR[":KRYW
 
 
	I   0 0 2 2 8b99 =M""$$*> >  tt Ary)) 	IGd1ggGGHHH5rU   c                      e Zd ZdZdZdS )PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)r   r   r   r  r  rL   rU   rS   r  r  	  s          K *rU   r  r  r1   inpc                   ddl m} t          j        ddd          5 }t	          j                    }t	          j                    } t          |t          |                    j        |  t          d|j
         |	           t          |j
        |	           t          j                    }t          ||          5   | |j
                   d d d            n# 1 swxY w Y   t          j                    |z
  }	 ||j
                   |j
                                         |                                 t          d
|j
         |	           t          |j
        |	           |                                |                                k    }
t"                              d||j        |
|	           d d d            d S # 1 swxY w Y   d S )Nr3   )stable_topological_sortro  zutf-8F)modeencodingr  )r  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherr  r/  NamedTemporaryFileior   ra   r]   	propagater  rz  r
   nowr`   lint	recompilerr  r   rJ  r   )r  r  r  rG  r  r&  	before_ioafter_io
start_timetime_elapsedr  s              rS   pass_execution_and_saver  	  sD    988888		$
 
 
 
 
KMM	;==C	R#3C#8#8999CSII$"($$1----bhY''''\^^
#B,, 	 	DNNN	 	 	 	 	 	 	 	 	 	 	 	 	 	 	|~~
2)))

###!,,,,bhX&&&&  H$5$5$7$77hF	
 	
 	
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
s7   BG=CGC	G!C	"CGGG	input_buf"Optional[Union[Buffer, Operation]]c                l    ddl m} t          | |j                  ot          | j        |j                  S )zB
    Check if input buffer is a multi-outputs template buffer
    r3   r  )r  r  r~   CppTemplateBufferrA  MultiOutputLayoutr  r  s     rS   is_multi_outputs_templater  	  sG     i!566 :".< < rU   c                    ddl m} t          | |j                  o1t	          | j                  dk    ot          | j        d                   S )zL
    Check if input buffer is a output of multi-outputs template buffer
    r3   r  r   )r  r  r~   MultiOutputrW   r[  r  r  s     rS   #is_output_of_multi_outputs_templater  	  s\      	9bn-- 	;	 !!Q&	;%i&6q&9::rU   r   Optional[Union[Node, Operation]]!Optional[torch._ops.OperatorBase]c                H   | dS ddl m} t          | |j                  o"t          | |j                   o|d u p| j        |u pt          |           |j        k    ot          t          j
        j        d          o#| j        t          j
        j        j        j        k    pt          t          j
        j        d          o#| j        t          j
        j        j        j        k    pBt          t          j
        j        d          o#| j        t          j
        j        j        j        k    S )NFr3   r  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  r~   _CollectiveKernel_WaitKernelop_overloadr  FallbackKernelr	  rN   r
  torchrecr  defaultr  r  r  r  r  s      rS   is_collectiver  
  s%    |u 	4-.. 	34000	34Z14+r1 	T

b'' 	

 	*,?@@ U$	(:(L(TT
 	*,DEE E$9%<DE 	*,CDD Y$	(:(P(XX/rU   "Optional[Union[IRNode, Operation]]c                >    ddl m} t          |           |j        k    S Nr3   r  )r  r  r  r  r  r  s     rS   is_waitr  '
  s'    ::''rU   snoderA   c                    ddl m} t          | |          rt          d | j        D                       S t          | j                  S )Nr   GroupedSchedulerNodec              3  4   K   | ]}t          |          V  d S r   )contains_collectiverP   s     rS   r   z&contains_collective.<locals>.<genexpr>1
  s+      @@a&q))@@@@@@rU   )rD  r  r~   r  snodesr  r  r  r  s     rS   r  r  -
  sW    >>>>>>%-.. A@@5<@@@@@@$$$rU   c                    ddl m} t          | |          rt          d | j        D                       S t          | j                  S )Nr   r  c              3  4   K   | ]}t          |          V  d S r   )contains_waitrP   s     rS   r   z contains_wait.<locals>.<genexpr>:
  s*      ::=##::::::rU   )rD  r  r~   r  r  r  r  r  s     rS   r  r  6
  sV    >>>>>>%-.. #::U\::::::uz"""rU   Optional[Operation]?Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]]c                    ddl m} t          |t          j        j                  r|g}t          | |j                  o| j        |v S r  )r  r  r~   rN   r  r  r  r  r  s      rS   is_fallback_opr  ?
  sT     "ej+,, TdB-..I43Cr3IIrU   buf_namename_to_bufname_to_fused_nodec                L    |||          j                                                  S r   )defining_opr  )r  r  r  s      rS   buf_name_to_fused_snoder  J
  s#     k(3?HHJJKKrU   c                    dS r  rL   r  s    rS   r  r  U
      u rU   collected_node_setMutableSet[BaseSchedulerNode]dict[str, SchedulerBuffer]dict[str, BaseSchedulerNode]criteria_cbCallable[[Any], bool]c                     ||           rd S |                     |            | j        D ]1}t          |j        ||          }||v rt	          |||||           2d S )Nr  )r_  unmet_dependenciesr  r   find_recursive_deps_of_node)r  r  r  r  r  depdefining_op_for_deps          rS   r  r  P
  s     {5 5!!!' 
 
5Hk#5
 
 "444##	
 	
 	
 	
 	

 
rU   c                    dS r  rL   r  s    rS   r  r  n
  r  rU   c           	         ||           rd S |                     |            |                                 D ]}|j        D ]}}|j        J |j                                        dk    r)|j                                        |vrE||j                                                 }||v rit          |||||           ~d S )NOUTPUTr  )r_  get_outputsr  r  r  find_recursive_users_of_node)r  r  r  r  r  ory  user_ops           rS   r  r  i
  s    {5 5!!!    G 	 	D9(((y!!##x//y!!##+===(););)=)=>G,,,(""'    	 rU   dynamo_gm_num_inputsaot_fw_gm_num_inputsc                F    t           j        j        j        rdnd}|| z
  |z
  S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)r  r   )rN   
_functorchrl   functionalize_rng_ops)r  r  num_rng_seed_offset_inputss      rS   num_fw_fixed_argumentsr
  
  s2     $:A   "669SSSrU   fx_gc                   d	d}d}g }| j         j        D ]2}|j        dk    r% ||          r|                    |           |dz  }3|t	          t          t          |                              k    sJ t          |          S )
z>
    Infers which inputs are static for a backwards graph
    rR   r2   rH   r|   c                J    d| j         vod| j         vod| j         vod| j         vS )Ntangentsbwd_seedbwd_base_offsetbwd_rng_stater  r  s    rS   is_saved_tensorz'count_tangents.<locals>.is_saved_tensor
  s@    af$ .!&(.!/.  qv-		
rU   r   r  r3   )rR   r2   rH   r|   )rz  rZ  r  r  rh  r   rW   )r  r  	arg_countstatic_arg_idxsr-  s        rS   count_tangentsr  
  s    

 
 
 
 IOZ  4=  q!! 2&&y111NId5_)=)=#>#>??????rU   c                  :    e Zd ZU ded<   d
dZedd            Zd	S )	BoxedBoolr|   r   rH   c                    | j         S r   )r   ry  s    rS   r  zBoxedBool.__bool__
  s
    zrU   r  r   Union[BoxedBool, bool]c                B    t          | t                    r	d| _        | S dS r  )r~   r  r   r  s    rS   disablezBoxedBool.disable
  s%    c9%% 	CIJurU   Nr  )r  r   rH   r  )r   r   r   r   r  r  r  rL   rU   rS   r  r  
  sS         KKK       \  rU   r  kernel_listc              #      K   ddl m} |j        	 	 	 dd fd}t          j                            |d|          5  d V  d d d            d S # 1 swxY w Y   d S )Nr3   r5   Tr  r6   kernel_namerI   r"  rh  r  gpur|   cpp_definitionrH   r   c                N                         |            | |||||          S r   r  )r  r  r"  rh  r  r   r  orig_define_kernels         rS   define_kernelz.collect_defined_kernels.<locals>.define_kernel
  s;     	;'''!!+{Hc>
 
 	
rU   r#  )NTN)r  r6   r  rI   r"  rI   rh  r  r  r|   r   r  rH   r   )codegen.wrapperr6   r#  r   r1  r  )r  r6   r#  r"  s   `  @rS   collect_defined_kernelsr%  
  s      555555-; #'(,
 
 
 
 
 
 
 
 
		/-	P	P                   s   AAAc                    | dz   S )N__original__rL   r  s    rS    get_cloned_parameter_buffer_namer(  
  s    .  rU   c                    | t           v S r   )rV   r  s    rS   r  r  
  s    YrU   c                ,    | dk    ot          |           S )NrD   )r  r  s    rS   device_need_guardr+  
  s    U?-vf~~-rU   c                H   t          j                    r^| t          j        k    rNt          j                                        r0t          j                                        dk    rt           j        rdS | t          t          j	        t          j
        t          j        g          v S )N)r  r   F)rl   r  rN   rO  rC   rO   r{  bfloat16_atomic_adds_enabledr!   r[  r|   rX  s    rS   ,needs_fallback_due_to_atomic_add_limitationsr.  
  s    
 		NU^##J##%% $J,,..&88/ 9 u
EKU^#LMMMMrU   r  
self_dtype	src_dtypesrc_device_typesrc_is_tensorc                *   | j         t          j        j        j        t          j        j        j        fv r|dS | j         t          j        j        j        k    rdnd}|d |fvp|ot          |          ot          |          p| j         t          j        j        j        k    oA|dk    o;|o9|dk    o3t          j
        j        o"t          j
        j        pt                      dk    p2||k    o|t          j        t          j        fv pt          j                    S )NFr_  r   r  r3   )overloadpacketrN   r
  atenscatter_reduce_scatter_reducescatter_r  r.  rl   r  fallback_scatter_reduce_sumdynamic_threadsr  r|   r[  r  )r  r  r/  r0  r1  r2  	reduce_tys          rS   use_scatter_fallbackr<  
  s=    	"IN*EIN,IJ	K 	K"u +uy~/FFFE 
 	tY// 	8 H''H<YGG		8 &%).*HH L%'LL  5(L 
6	L
 +J/C/E/E/J	8 i'SJ5:u{:S,S	8 577!rU   c                   ddl m}m} ddlm} t          dt          |            d           t          |           D ]!\  }}t          d|dd           ||u rt          d	           .||u rt          d
           Bt          ||          r|	                                }t          |rdnd d           |r*|j
        J t          d|j
        j        j                    t          d           |j        j        D ]}t          |           t          d           |j        j        D ]}t          |           t!          dt#          |                     dS )z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr5  3r  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdr>  r?  rD  r@  r  rW   r   r~   is_reductionr  r=  reduction_hintrQ  rR  rS  r   r  )r	  r>  r?  r@  r0  r  is_redr  s           rS   dump_node_schedulerH    s   
 ONNNNNNN777777	
:M 2 2
:
:
:;;;}-- H H	Tl#llll?""$%%%%%%%%&&&&m,, 	H&&((Ff.UU$???@@@ Ry,,,P1NPPQQQ*'-  c



+'.  c



 F$t**FFGGG'H HrU   r   r  c                    ddl m}  ||                                 t          | j                  z  t
          z  dk              S )Nr   )rw  )r  rw  storage_offsetrZ  r   GPU_ALIGN_BYTES)r   rw  s     rS   tensor_is_alignedrL  2  sU     LKKKKK  				 	 >&,#?#?	??RVWW  rU   example_inputc                n    t          | j        j                  sdS t          j        pt          |           S r  )r  r   r  rl   assume_aligned_inputsrL  )rM  s    rS   should_assume_input_alignedrP  @  s6     -&+,, u'K+<]+K+KKrU   r  c                     t           j        j                                        } | st	          j                    S | j        r| j        j        st	          j                    S | j        j        }|                                S r   )	rN   _guardsTracingContexttry_getr  nullcontextr  r|  suppress_guards)tracing_contextr|  s     rS   #maybe_get_suppress_shape_guards_ctxrX  I  sw    
 m2::<<O (%''' $ (O,E,O (%''')3I$$&&&rU   tuple[_T, str]c                   t           j        j                            t          dd          5  t
          j                                         dd l}dd l	} |j
                    } |j        |          }ddlm} |                    |           |j        }|                    |j                    | |i |}	|                                }
|                    |           |                    |           d d d            n# 1 swxY w Y   |	|
fS )Nr   Tr   )output_code_log)rE  r   r1  r  rl   rN   r  r  r  loggingr   StreamHandlertorch._inductor.codecacher[  
addHandlerlevelsetLevelDEBUGrr  removeHandler)r   r   r  r  r\  log_capture_stringchr[  
prev_levelr  r   s              rS   run_and_get_cpp_coderg  Y  sZ    
		#	#FGT	:	: * *			(R[]]"W"#566======""2&&&$*
  ///T$V$$''))  ,,,%%b)))* * * * * * * * * * * * * * *  19s   CD  DDSequence[InputType]Optional[ShapeEnv]c                    t          |           }||j        S | D ]*}t          |t          j                  r|j        j        c S +d S r   )r]   r|  r~   rN   r/   r  )r[  r  inputs      rS   shape_env_from_inputsrl  r  sb     ((I ""  ( (eU\** 	(:''''	( 4rU   Callable[[list[InputType]], _T]inputs_to_checkmutated_input_idxsOrderedSet[int]c                F     t                    dk    r S d fd}|S )Nr   
new_inputslist[InputType]rH   r   c                    t          |           \  }} |           }t          |          rt          j        ||           |S r   )copy_misaligned_inputsrW   rN   _foreach_copy_)rr  old_tensorsnew_tensorsrC  rn  r  ro  s       rS   r  z)align_inputs_from_check_idxs.<locals>.run  sZ    #9);$
 $
 [ eJ { 	; k:::
rU   )rr  rs  rH   r   )rW   )r  rn  ro  r  s   ``` rS   align_inputs_from_check_idxsry    sN    
 ?q          JrU   c                   d|                                  v rd}nNt          d t          |                                  |                                           D                       dz   }t	          j        | |fd                                          }t	          j        ||                                  |                                           S )Nr   c              3  ,   K   | ]\  }}|d z
  |z  V  dS rR  rL   )rQ   rC  rL  s      rS   r   z)clone_preserve_strides.<locals>.<genexpr>  s/      TTf$TTTTTTrU   r3   r   )r  r   r   rL  rN   
as_stridedclone)rR   needed_sizer7  s      rS   clone_preserve_stridesr    s    AFFHH}} TT#affhh

:S:STTTTTWXX 	 a+66<<>>FFAFFHHahhjj999rU   rr  rs  check_inputs_idxsreturn_pair_idxsOptional[OrderedSet[int]]-tuple[list[torch.Tensor], list[torch.Tensor]]c                d   g }g }|du}|D ]}| |         }t          |t          j                  sJ dt          |                       |                                t
          z  rHt          |          | |<   |r4||v r0|                    |           |                    | |                    ||fS )z
    Clones misaligned tensors which we inferred were aligned. Returns a tuple of [old_tensors], [new_tensors] for every
    cloned tensor which is in `return_pair_idxs`.
    Nz Expected tensors only, but got: )r~   rN   r  r  data_ptr	ALIGNMENTr  r  )rr  r  r  rw  rx  ret_pair_definedr   _inps           rS   ru  ru    s     ')K&(K (t3 
2 
2!}$-- 	
 	
;tDzz;;	
 	
- ==??Y& 	22488JqM 2A)9$9$9""4((("":a=111##rU   static_input_idxsc                   g }|D ]Y}| |         }t          |t          j                  r5|                                t          z  dk    r|                    |           Zt          |          t          |          k    r|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )r~   rN   r  r  r  r  rW   )r[  r  aligned_static_input_idxsr0  rk  s        rS   remove_unaligned_input_idxsr    s     !#  2 2seU\** 	20@0@90LQR/R/R%,,S111
$%%->)?)???((rU   r   c                x   ddl m} t          j        t          j                  j        }|j        j        j        }|j        j        j	        j
        }|j        j                            | |k              rdS |j        r%|j        j                            | dk               rdS  ||           o ||           |k    S )Nr3   rv  Tg@xDF)ry  rw  rN   iinforY  r   rz  r{  rf  r|  has_hintrw  aot_compilation)r   rw  int_maxrf  r  s        rS   expr_fits_within_32bitr    s    k%+&&*G *Iw)2H 	w--a7l;; t 	  711!d(;; 	 5 8A;;299Q<<722rU   compiled_graphr@   c                   t           j        j                                        }||j        t          |j                  dk    sJ t          |           |j        J |j        D ]}||j                            d            dt           j        j                                        x}r|j        d	fd|j                            t          fd|D                                  d S d S d S )
Nr   Fr   r   rH   ,Union[float, int, SymInt, SymFloat, SymBool]c                ~    t          |           S r                    |           S                     |           S r   )rv   deserialize_symexprevaluate_symexpr)r   fakify_first_callr|  s    rS   map_exprz4set_tracing_context_output_strides.<locals>.map_expr  sE     ("1vv( @(<<Q???$55a888rU   c              3  .   K   | ]} |          V  d S r   rL   )rQ   r   r  s     rS   r   z5set_tracing_context_output_strides.<locals>.<genexpr>  s+      55!((1++555555rU   )r   r   rH   r  )
rN   rR  rS  rT  output_stridesrW   rl  r  r  r  )r  r  r]  rT  r  r  r  r|  s        @@@rS   "set_tracing_context_output_stridesr    s>    m*2244Gw5A7)**a////).99	,888#2 	 	E}&--d3333$)!-6>>@@@3 >(+(=%9 9 9 9 9 9 9 &--5555u55555   ' AA	 	rU   c                    t           j        t           j        S t          j                    sdS t          j                                        rdS 	 ddlm}  n# t          $ r Y dS w xY w| t          j        	                    d          k    S )NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
rl   fx_graph_remote_cacher  rN   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacher  ModuleNotFoundErrorjustknobs_getval_intr  s    rS    should_use_remote_fx_graph_cacher    s    #/++ u,,.. uHHHHHHH   uu  5#8#M#M8$ $  s   A 
A$#A$c                .    t          j        dd|           S )Nz[^a-zA-Z0-9_]r   )r   subr  s    rS   normalize_namer  -  s    6"C...rU   ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuztl.float4_e2m1fn_x2c                    i | ]\  }}||	S rL   rL   )rQ   r  rz   s      rS   r1  r1  =  s    GGG$!QAGGGrU   z^.*[.]c                    t                               dt          |                     }t                              ||          S )z"Convert torch.dtype to triton typetl.)_triton_type_rer  rI   _triton_type_mappingr  )r   triton_type_names     rS   triton_typer  C  s6    &**5#e**==##$46FGGGrU   c                    t                               | |           }|                    dd          }t          t          |          }t          |t          j                  sJ |S )Nr  r  )_torch_triton_mappingr  r  rM   rN   r~   r   )r   adjusted_type	type_namer  s       rS   triton_type_to_torchr  I  sY    )--eU;;M%%eR00Iy))Ii-----rU   r=  r   c                   | j          o|                                 |                                k    o|                                 |                                k    o| j        |j        k    o| j        |j        k    ow|                                                                 |                                                                k    o)|                                 |                                k    S r   )	is_mkldnnr  rL  r   r   untyped_storager  rJ  r=  r   s     rS   is_same_tensorr  Q  s    N 	<IIKK5::<<'	<KKMMU\\^^+	< J%+%	< K5<'		<
   ""++--1F1F1H1H1Q1Q1S1SS	< !!U%9%9%;%;;rU   c                8   | j         o|                                 |                                k    oi| j        |j        k    oY| j        |j        k    oIt          j        j                            |           t          j        j                            |          k    S r   )r  r  r   r   rN   r
  mkldnnr  r  s     rS   is_same_mkldnn_tensorr  ]  s     	PIIKK5::<<'	PJ%+%	P K5<'	P I%%d++uy/?/H/H/O/OOrU   tuple[str, ...]c                     dS )N)r  isnanlogical_notlogical_andsignbitand_leltgegteqner  xorrL   rL   rU   rS   boolean_opsr  g  s     rU   c                  $    e Zd ZU ded<   ded<   dS )OpDtypeRuler0   type_promotion_kindr=  override_return_dtypeNr^  rL   rU   rS   r  r  {  s*         8888000000rU   r  zdict[str, OpDtypeRule]op_dtype_propagation_rulesr  r0   r  c                6    t          ||          t          | <   d S r   )r  r  )r   r  r  s      rS   #register_op_dtype_propagation_rulesr    s%    
 (32( (t$$$rU   zOrderedSet[str]op_requires_libdevice_fp64c                :    t                               |            d S r   )r  r_  r  s    rS   #register_op_requires_libdevice_fp64r    s    ""4(((((rU   c                     ddl m}  | j                                        j        }|dk    rt
          j        S |dk    rdS t
          j        S )Nr   rv  r  rD   )ru  rw  rz  get_current_device_or_throwr  rl   cpu_backendcuda_backend)rw  
device_strs     rS   get_current_backendr    sW    ------4466;JU!!	u		u""rU   c                    | t           j        t           j        fv r/t          j        j        rt                      dk    rt           j        S | S )z"Maybe upcast [b]float16 to float32r   )rN   r   rO  rl   r   codegen_upcast_to_fp32r  rQ  rX  s    rS   upcast_compute_typer    sC     	%-000M0 	1!!X--}LrU   KeyTypeValTypec                  \    e Zd ZdZddZdd	ZddZddZdddZd dZ	d!dZ
d"dZd#dZdS )$
ScopedDictz
    A dictionary-like object that allows for scoped updates. It maintains
    an original dictionary and a set of new items that can override
    the original items within the scope.  The original dictionary is
    unmodified.
    original_dictMapping[KeyType, ValType]c                "    || _         i | _        d S r   r  	new_items)r  r  s     rS   ri  zScopedDict.__init__  s    *13rU   r<  r  rH   r  c                H    || j         v r| j         |         S | j        |         S r   r  r  r  s     rS   r@  zScopedDict.__getitem__  s*    $.  >#&&!#&&rU   r   r  c                    || j         |<   d S r   )r  )r  r<  r   s      rS   __setitem__zScopedDict.__setitem__  s    #srU   r  r|   c                &    || j         v p|| j        v S r   r  r  s     rS   __contains__zScopedDict.__contains__  s    dn$At/A(AArU   Nr  Optional[ValType]c                d    || j         v r| j         |         S | j                            ||          S r   )r  r  r  )r  r<  r  s      rS   r  zScopedDict.get  s5    $.  >#&&!%%c7333rU   rv   c                ^    t          | j                  }| j        D ]}|| j        vr|dz  }|S rg  )rW   r  r  )r  r-  r  s      rS   r  zScopedDict.__len__  s@    "## 	 	A***QrU   Iterator[KeyType]c              #  R   K   | j         E d {V  | j        D ]}|| j         vr|V  d S r   r  )r  r  s     rS   __iter__zScopedDict.__iter__  sT      %%%%%%%% 	 	A***	 	rU   c                8    t          | j        p| j                  S r   )r|   r  r  ry  s    rS   r  zScopedDict.__bool__  s    D&8$.999rU   c                    t           r   r  r  s     rS   __delitem__zScopedDict.__delitem__  s    !!rU   )r  r  )r<  r  rH   r  )r<  r  r   r  rH   r  )r<  r  rH   r|   r   )r<  r  r  r  rH   r  r  )rH   r  r  )r<  r  rH   r  )r   r   r   r   ri  r@  r  r  r  r  r  r  r  rL   rU   rS   r  r    s         4 4 4 4' ' ' '
$ $ $ $B B B B4 4 4 4 4
      : : : :" " " " " "rU   r  )frozen_defaultr   Optional[type[Any]]r   c              .    dfd}| |S  ||           S )Nr   rp   rH   c                ~    t           j        dk    rt          j        | d          S t          j        |           S )N)rm  r  T)kw_onlyr   r   )r  version_infodataclasses	dataclass)r   r   s    rS   wrapzir_dataclass.<locals>.wrap  sA    w&&(d6JJJJ (V<<<<rU   )r   rp   rH   rp   rL   )r   r   r  s    ` rS   ir_dataclassr    s;    = = = = = = {499rU   Optional[list[int]]c                 v    t           j        j                                        } | | j        r| j        j        S d S r   )rN   rR  rS  rT  fw_metadatabw_donated_idxs)rW  s    rS   get_donated_idxsr    s7    m2::<<O"'B"*::4rU   c                  "    e Zd ZdZdZdZdZdZdS )TritonAttrsDescriptorVersionr   r3   r  rm  rd  N)r   r   r   V0_NO_TRITONV1_COMPILERV2_BACKENDSV3_BACKENDS_TUPLEV4_DICTrL   rU   rS   r	  r	    s,        LKK	  GGGrU   r	  c                    t           j                            d          t          j        S dd l} dd l} t          | j        j	        d          rt          j
        S t          | j	        j	        d          rt          j        S t          j        S )Nr   r   AttrsDescriptor)	importlibutil	find_specr	  r
  triton.backends.compilertriton.compiler.compilerr	  rw  compilerr  r  r  )r   s    rS   #get_triton_attrs_descriptor_versionr     s    ~))1+88########v'):;; 4 ,77	)+<	=	= 4+77 ,33rU   c                 :    t                      t          j        k    S r   )r  r	  r  rL   rU   rS   triton_version_uses_attrs_dictr    s    .004P4XXXrU   r=   c                    ddl m} t          | |j                  sdS t          | j        t
          j        j                  r$t
          j        j	        j
        | j        j        v rdS dS )zq
    Returns True if the node is an op that is not cudagraphable.
    Usually only custom ops have this tag.
    r3   r  FT)r  r  r~   r  r  rN   r  r  r  r  r  r  r  s     rS   is_cudagraph_unsafe_opr    sn    
 dB-.. u 	4#UZ%:;;HL)T-=-BBBt5rU   c                    t           j                            dd          } t          j                    rXddlm}  |            }|rFt           j                            |dd          }| r!t           j	                            || g          n|} | S )NLD_LIBRARY_PATHr  r   )get_runtime_pathr  lib)
r#  r3  r  rl   r  libfb.py.parutilr  r$  r$  pathsep)r$  r  runtime_pathlib_paths       rS   get_ld_library_pathr$  1  s    :>>+R00D K555555'')) 	Kw||L)UCCH8<J2:??Hd#3444(DKrU   c                @    ddl m} t          | |          o| j        d uS )Nr   )SubgraphPythonWrapperCodegen)torch._inductor.codegen.wrapperr&  r~   partition_signatures)r  r&  s     rS   #is_codegen_graph_partition_subgraphr)  >  s9    LLLLLL 	7899 	5(4rU   c                     t           j        j        j        j        pt
          j        d uot           j        j        j        S r   )rN   r  rl   r   
cudagraphs&_unstable_customized_partition_wrapperr  r  rL   rU   rS   is_using_cudagraph_partitionr-  G  s8    %0 	F19E1 /
 
01rU   c                    ddl m} |j        j                            | d          r,|j        j                            | d          rt          j        S t          j        S )Nr3   rv  l        i   )	ry  rw  rz  r{  statically_known_ltr=  rN   rY  r[  )r  rw  s     rS   dtype_from_sizer0  N  se    w++e  
'

/
/h
?
? {{rU   )r  rE   r   c                d    | dk    r#t           j        j                                        S d| v rdS dS )z;
    Returns True if the device supports MKL-DNN BF16.
    r  rE   TF)rN   r
  r  _is_mkldnn_bf16_supportedr   s    rS   is_mkldnn_bf16_supportedr4  \  ;     ey99;;;	+		t5rU   c                d    | dk    r#t           j        j                                        S d| v rdS dS )z;
    Returns True if the device supports MKL-DNN FP16.
    r  rE   TF)rN   r
  r  _is_mkldnn_fp16_supportedr3  s    rS   is_mkldnn_fp16_supportedr8  h  r5  rU   elementsSequence[Sequence[T]]headersSequence[T]c           
        d |D             }| D ]l}t          |          t          |          k    sJ t          |          D ]8\  }}t          ||         t          t          |                              ||<   9mg }|                    d                    d t          ||          D                                  t          |          t          |          dz  z   t          |          dz
  z   }|                    d|z             | D ]B}|                    d                    d t          ||          D                                  Cd                    |          S )	Nc                F    g | ]}t          t          |                    S rL   )rW   rI   )rQ   r   s     rS   rT   ztabulate_2d.<locals>.<listcomp>u  s$    +++ac#a&&kk+++rU   |c              3  ,   K   | ]\  }}d || d V  dS r5  NrL   )rQ   hro  s      rS   r   ztabulate_2d.<locals>.<genexpr>{  s2      HH41a,a,,,,HHHHHHrU   r  r3   r  c              3  ,   K   | ]\  }}d || d V  dS rA  rL   )rQ   r   ro  s      rS   r   ztabulate_2d.<locals>.<genexpr>  s2      HHtq!l!QllllHHHHHHrU   rW  )rW   r   r   rI   r  r$  r   r   )r9  r;  widthsrowr   r   r  total_widths           rS   tabulate_2drG  t  s]   ++7+++F 4 43xx3w<<''''cNN 	4 	4DAqF1Is3q66{{33F1II	4E	LLHH3w3G3GHHHHHIIIf++Vq1S[[1_EK	LL{"### J JSXXHHs37G7GHHHHHIIII99UrU   dict1r  dict2
d1_defaultValType | None
d2_defaultEGenerator[tuple[KeyType, ValType | None, ValType | None], None, None]c              #    K   t          |                                           t          |                                          z  }|D ];}|                     |          }|                    |          }|||n|||n|fV  <dS )a  
    Zip two dictionaries together, replacing missing keys with default values.

    Args:
        dict1 (dict): The first dictionary.
        dict2 (dict): The second dictionary.
        d1_default (Any): the default value for the first dictionary
        d2_default (Any): the default value for the second dictionary

    Yields:
        tuple: A tuple containing the key, the value from dict1 (or d1_default if missing),
               and the value from dict2 (or d2_default if missing).
    N)r!   r]  r  )rH  rI  rJ  rL  all_keysr<  value1value2s           rS   	zip_dictsrR    s      ( %**,,''*UZZ\\*B*BBH  	
 	
33 (FFj(FFj
 	
 	
 	
 	
	
 	
rU   config_patchesc                   dd	}|                      d
t          j        j                  }|                                 } |rD || dd            || dd            || dt
          j        j                     || dd           | S )a1  
    Ensures the configuration is internally consistent for standalone AOTInductor.

    If `aot_inductor.compile_standalone` is set to True in the provided
    `config_patches` (or falls back to the global config), this function ensures
    that the following configs are also enabled:
        - `aot_inductor.package_cpp_only`

    Args:
        config_patches (dict[str, Any]): A dictionary of user-provided config
            overrides for AOTInductor compilation.

    Returns:
        dict[str, Any]: The possibly-updated `config_patches` dictionary.
    rS  r  config_namerI   config_valuer   rH   r  c                    |                      |t          t          |                    }||| |<   d S |s||k    rt          d| d| d          d S d S )NzInvalid config: =z. when aot_inductor.compile_standalone is True.)r  rM   rl   r   )rS  rU  rV  r   s       rS   patch_configz2maybe_aoti_standalone_config.<locals>.patch_config  s     "";0L0LMM=*6N;''' 	5L00m;mmmmm  	 	00rU   zaot_inductor.compile_standalonezaot_inductor.package_cpp_onlyTz aot_inductor.embed_kernel_binaryz#aot_inductor.emit_multi_arch_kernelz+aot_inductor.model_name_for_generated_files
aoti_model)rS  r  rU  rI   rV  r   rH   r  )r  rl   aot_inductorcompile_standalonecopyrN   r  r  )rS  rY  r\  s      rS   maybe_aoti_standalone_configr^    s    "	 	 	 	 (++)6+>+Q  $((**N 
^%DdKKK^%GNNNAu}GXCX	
 	
 	
 	I<	
 	
 	
 rU   c                     ddl m}  | j        j        }|dS t	          |t
                    st          d          |dk    rdS t          j        d|          st          d          dS )	zL
    Validates if a model name is suitable for use in code generation.

    r   rk   NTz4Invalid AOTI model name: Model name must be a stringr  z^[a-zA-Z_][a-zA-Z0-9_]*$zVInvalid AOTI model name: Model name can only contain letters, numbers, and underscores)	r8  rl   r[  model_name_for_generated_filesr~   rI   rN  r   r   )rl   
model_names     rS   is_valid_aoti_model_namerb    s    
 '&&&&&$CJtj#&& QOPPPRt 8/<< 
d
 
 	
 4rU   r(   unbacked_onlyOrderedSet[sympy.Symbol]c                B    |rt          |           S t          |           S r   )r'   r&   )rR   rc  s     rS   get_free_symbolsrf    s$     $Q'''ArU   cudagraph partition due to Optional[BaseSchedulerNode]c                    t           j        j        sdS | |  }|rC|j        x}r:|                                x}r$|j                            dd          x}r| d| }t                              |           dS )z
    Cudagraph partition may lead to extra memory overhead so we
    log partition reasons to help users understand the overhead.
    Nstack_tracez. Found from : 
 )	rl   r   r+  r  get_origin_noder  r  perf_hint_logr*  )rG  r  r  warning_msgir_noder  rj  s          rS   maybe_log_cudagraph_partitionro    s     =# "S""K 	F	!WF  //111WF $L,,]DAAA[	F %EEEE+&&&&&rU   dict[str, str]c                    i t           j        dt           j                            dt           j                            t
          j                            i} t          j                    rt          j
        d          | d<   | S )zA
    Get a base environment for running Python subprocesses.
    
PYTHONPATHTORCH_CUSTOM_PYTHONPATHr=  
PYTHONHOME)r#  r3  r  r!  r$  r  r$  rl   r  	sysconfigget_path)envs    rS   python_subprocess_envrx    sv    

* 	bjnn%rzsx'@'@
 
	 C   7%.v66LJrU   c                  (    e Zd ZU dZded<   ded<   dS )CUDAGraphWrapperMetadataz
    Metadata for Customized CUDAGraphWrapper.

    Currently assumes there is 1 dynamo graph and will extend to
    multiple graphs in the future.
    rv   num_partitionspartition_indexNr   rL   rU   rS   rz  rz  .  s8            rU   rz  .c                      e Zd ZU dZded<   dS )CUDAGraphWrapperNzOptional[CUDAGraphWrapperType]r  )r   r   r   r  r   rL   rU   rS   r~  r~  E  s#         .2G222222rU   r~  CUDAGraphWrapperTypec                    | t           _        d S r   )r,  r  )r  s    rS   !set_customized_partition_wrappersr  W  s    5<*222rU    tuple[list[Any], dict[str, Any]]c                \   | j         j        }| j                             g || j         j        | j         j                  }| j         j        }t          j        ||f          \  }}ddfd|D             }dddfd	fd
|D             }t          j        ||          \  }}||fS )NrH   r|   c                    t          | t          j        j        j                  o$t          | t          j        j        j                   S r   )r~   rN   r  r  r;   GeneratorStater  s    rS   _is_tensor_irz(snode_args_kwargs.<locals>._is_tensor_ird  sA    !U_/677 

u!0A
 A
 =
 	
rU   c                v    g | ]5} |          r&t           j        j                            |d           n|6S )F)guard_shape)rN   r  r  ir_node_to_tensor)rQ   r)  r  s     rS   rT   z%snode_args_kwargs.<locals>.<listcomp>i  sZ         =	,,QE,BBB  rU   r  c                0    t          j        | ||          S )Nr   )rN   r   )r  r   r   s      rS   _tensorz"snode_args_kwargs.<locals>._tensorp  s    {4uV<<<<rU   r   r   c                    t          | t          j                  s| S  |                                 | j        | j                  }|S r   )r~   rN   r  r  r   r   )r   rC  r  s     rS   to_real_tensorz)snode_args_kwargs.<locals>.to_real_tensors  s@    !U\** 	Hgaffhh22
rU   c                &    g | ]} |          S rL   rL   )rQ   r)  r  s     rS   rT   z%snode_args_kwargs.<locals>.<listcomp>y  s#    666q""666rU   r  )rH   r  )r   r   rH   r   )r  r[  fill_non_provided_argsconstant_argsr  pytreer"   tree_unflatten)r  r   r  	flat_argsflat_args_pytree_specr  r  r  s        @@@rS   snode_args_kwargsr  [  s   :D:,,*$*)*
 D ZF'-':D&>'J'J$I$
 
 
 

    	  I= = = =      7666I666I(4IJJLD&<rU   r  )ru   rv   rH   rv   )rz   r{   rH   r|   )r   r   )r   r   r   rv   r   rv   rH   r   r  )r   r  rH   r  )r  r  rH   r{   )r$  r%  r&  r%  rH   r{   )r  r-  rH   r.  )r4  r5  r6  r5  rH   r5  )r<  r=  rH   rI   )rl  rm  rH   rn  )r   r5  rH   rt  )rl  r  rH   r  )r  r  rH   r|   )r  r2   r  r  rH   r|   )r  r   r   r  r  r  rH   r  )rC   )r   rI   rH   r  )r3   rC   )
r  r  r  r  r   rv   r   rI   rH   r   )rL   r  r  r  rC   )r  r  r  r  r   rv   r  rv   r  r   r   rI   rH   r   )r  r   r  rI   rH   r  )r  r   r  r   rH   r  )r)  rv   r*  rv   rH   rv   )rR   r  r  rv   rH   r  )rR   r  rH   r  )r   r  rH   r  )r	  r
  rH   r  )r	  r  r  r  rH   rI   )r	  r
  r  r6   rH   r)  r   )rr  rs  rt  ru  rH   rv  )r   r{  r  r|  rH   rv  r  )r  r{   rH   r  )r  rI   rH   r|   )r  rh   r0  rv   rH   r  )r  r|   rH   r|   )r   rI   rH   r  )r  r{   r  r  rH   r{   )r)  r   rH   r  )r   r   rH   r|   )r  r  rH   r  )r  r  rH   r2   )r  r  rH   r  r  )r  r   rH   r   )NNT)r  r  r  r  r  r|   rH   r  )r<  r  rH   r=  )r|  r)   r<  rD  rH   r=  )r   rV  rH   rv   r  r   )r  r  rH   r|   r  )r  rv   r   r  r  r   rH   r4   )rA  r<   r
  r  rH   r|   )r  rI   rH   r|   )
rA  r<   r  r|   r  r|   r  r|   rH   r|   )r+  r;   r*  r|   rH   r|   )
rA  r<   r  rv   r-  rv   r  rv   rH   r|   )rl  rI   rH   r|   )r  rs  r-  rs  r  rs  rH   r|   )r  rs  r-  rs  r  rs  rH   r=  )r   rI   rH   rI   )rH   r  )rA  r<   rH   r|   )rA  r<   r  r  r  r;   rH   r|   )FTFN)rA  r<   r  r;   r  r;   r  r|   r  r|   r  r|   r  r   rH   r|   )r   r  r   r  r  r  rH   r  )r   r  rH   r  )r   r  r   r  r  r  rH   r   )r   r  r   r  r  r  rH   rI   )r   r  r   r  r  r  rH   r+  )r4  r  r5  r  rH   r  )r=  r  r>  r?  rH   r   )rG  rI   rH   r  )rH   r  )r^  r  rH   r|   )r[  r\  rH   r|   )r  r{   rH   rV  )rb  r|   r   r   r  r   rH   rc  )r   rV  rH   r   )r  rI   rH   r|   )r  rI   rH   rv   )r  r  rH   r|   )
r  r  r  r1   r  r  rG  rI   rH   r  )r  r  rH   r|   )r  r  r  r  rH   r|   )r  r  rH   r|   )r  rA   rH   r|   )r  r  r  r  rH   r|   )r  rI   r  r  r  r  rH   r   )r  rA   r  r  r  r  r  r  r  r  rH   r  )r  rv   r  rv   rH   rv   )r  r  rH   rv   )r  r   rH   r  )r   rI   rH   rI   )r   r  rH   r|   )r   rI   rH   r|   )r   rV  rH   r|   )r  r  r  r  r/  rV  r0  rV  r1  rI   r2  r|   rH   r|   )r	  r  rH   r  )r   r  rH   r|   )rM  r  rH   r|   )rH   r  )r   r  r   r  r  r  rH   rY  )r[  rh  rH   ri  )r  rm  rn  r  ro  rp  rH   rm  )rR   r  rH   r  )rr  rs  r  r  r  r  rH   r  )r[  rh  r  r  rH   r  )r   r{   rH   r|   )r  r  r  r@   rH   r  )r   rV  rH   rI   )r   rI   rH   rV  )r=  r  r   r  rH   r|   )rH   r  )r   rI   r  r0   r  r=  rH   r  )r   rI   rH   r  )r   rV  rH   rV  )r   r  r   r|   rH   r   )rH   r  )rH   r	  )r  r=   rH   r|   )r  r6   rH   r|   )r  rv   rH   rV  )r   rI   rH   r|   )r9  r:  r;  r<  rH   rI   )NN)
rH  r  rI  r  rJ  rK  rL  rK  rH   rM  )rS  r  rH   r  )rR   r(   rc  r|   rH   rd  )rg  N)rG  rI   r  r  r  rh  rH   r  )rH   rp  )r  r  rH   r  )r  rA   rH   r  (p  
__future__r   rX  r  r  enumr  r  r|  r  r  r\  r  r  r#  r  r   r7  r   r  ru  r/  r  r  rE  collections.abcr   r   r   r   r   r	   r
   r   typingr   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r   r   r   r   rN   torch.utils._pytreer  _pytreer  $torch._inductor.analysis.device_infor   torch._inductor.runtime.hintsr   torch.utils._dtype_abbrsr    torch.utils._ordered_setr!   r"   r#   OPTIMUS_EXCLUDE_POST_GRADr  r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   torch._prims_commonr0   torch.fxr1   torch.fx.noder2   r  r4   r$  r6   rz  r8   r  r9   r:   r;   r<   r=   r>   output_coder@   r  rA   rB   rV   rG   r   r[   torch._dynamo.device_interfacer\   torch._dynamo.utilsr]   torch.autogradr^   torch.autograd.profiler_utilr_   (torch.fx.passes.graph_transform_observerr`   torch.fx.passes.shape_propra   torch.utils._sympy.functionsrb   rc   rd   re   rf   torch.utils._sympy.symbolrg   rh   torch.utils._sympy.value_rangesri   rj   r  rl   runtime.runtime_utilsrm   r;  _IS_WINDOWS	getLoggerr   r   _logginggetArtifactLoggerrl  rp   r2  r9  	VarRangesr  rv   	InputTypeGPU_KERNEL_BIN_EXTSrK  r  r1  r  rx   ry   r   Functionr   r   r   r   r  r  r  r#  r,  r3  rk  rs  r~  r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r(  rq  rz  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r;  clear_on_fresh_inductor_cacheclear_inductor_cachesfresh_inductor_cacherC  rU  	lru_cacherZ  r\  r`  rc  r  r  ro  r  r  r  r  r  r	  r  r  r  r)  r\  r`  rk  rr  rs  r|  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r  r  r%  r(  r*  r3  r<  rF  rK  rO  rV  rZ  r_  ra  re  r  rp  r  r  r  r  r  r  r6  r  r  Enumr  r  r  r  r  r  r  r  r  r  r  r  r
  r  r  r%  r(  r  r+  r.  r<  rH  rL  rP  rX  rg  rl  ry  r  ru  r  r  r  r  r  r  r^  r  compiler  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r  r  r  r$  r)  r-  r0  SUPPORTED_MKLDNN_DEVICESr4  r8  rG  rR  r^  rb  rf  ro  rx  rz  PartitionFnTyper  r~  r,  r  r  rL   rU   rS   <module>r     s   " " " " " " "                       				        				  				      



                                                                                    $ $ $ $ $ $ $ $ $ ? ? ? ? ? ? : : : : : : 0 0 0 0 0 0 / / / / / / ; ; ; ; ; ; ; ; ($ 
             >>>>>>>>>>>//////////CCCCCC$$$$$$"""""",,,,,,555555$$$$$$TTTTTTTTTTTTTTTT,,,,,,======== +**	GCLL
     D C C C C C 0 0 0 0 0 0 % % % % % % 2 2 2 2 2 2 K K K K K K 0 0 0 0 0 0              8 7 7 7 7 7 7 7 D D D D D D D D       = = = = = = lg%g!!00<HH WT]]UZ'(	U5<el:;<	'77 	 {Q'A--+2B2B2BDX2B2BB5 5 5 5
L L L L    EN    d###       $#"G G G G GV 9<S S S S Sl       ; ; ; ;@ @ @ @
+ + + +* * * *#A #A #A #AL+ + + +   "/ / / /G G G G @OI I I I I0   0' ' ' ' ' 	    ( %'     ) ) ) )' ' ' '# # # #   $ $ $ $  IcNNWTT"""E E E E E8WQU^ E E E   :   *!) !) !) !)HN2 N2 N2 N2f 48    (G G G G:, , , ,^% % % %   	D 	D 	D 	DU U U U	> 	> 	> 	>   2   - - - -8 8 8 8v   ' ' ' '& 
			   : !#  " " " "	 	 	 	    .29 9 9 9 9z !5 $ " A A A A! ! ! !H Q7 7 7 7    *    , , , , , , , ,
R' R' R' R' R' R' R' R'j
 
 
 
 
 
 
 
 @ @ @ @       @? ? ? ? ?' ? ? ?     8 J J J J ) ) ) )I I I I #'    (           #     : 7< d d d d d dN CH         BJ J J J CO, , , , ,    $    . 5( 5( 5( 5(p @ @ @ @ R R R R:+ + + +\      H H H H& & & &
 
 
 
" ""&"&= = = = =@' ' ' 'C C C C C C C C"       $   / / / /( ( ( (V      # # # #* . . . .$ IMF F F F F*	 	 	 	   B& & & && & & &              D D D D %6 %6 %6 %6P    Q Q Q Q0 0 0 0   ( ( ( (# # # #K K K K   (* * * * *$) * * *!
 !
 !
 !
H      " -1# # # # #L( ( ( (% % % %# # # #J J J JL L L L *=)<
 
 
 
 
< *=)<    :T T T T       2            ,! ! ! !   . . . .N N N N $ $ $ $NH H H H>   L L L L' ' ' '    2   (   0	: 	: 	: 	: 37$ $ $ $ $<   $ 3  3  3  3F   :   &/ / / / '#)* $%
 
  HG*>*D*D*F*FGGG  "*Y''H H H H   	 	 	 	       & 1 1 1 1 1 1 1 1
 68  7 7 7 7    /9jll  : : : :) ) ) )	# 	# 	# 	#    ')


')

-" -" -" -" -" 01 -" -" -"` D)))t      *)       49    4 4 4 42Y Y Y Y   &
 
 
 
   1 1 1 1    * 	 	 	 		 	 	 	   & "&!%	 
  
  
  
  
F. . . .b   6    :(,' ' ' ' '2   6 d###       $# 38$./@ 3 3 3 3 3 3 3 3 *:)9);); &= = = =           rU   