a
    h                    @   s  U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Zd dl
mZ d dlmZmZ d dlmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlm  mZ d dlmZm Z  d dl!m"Z" d dl#m$Z% d dl&m'Z' d d	l(m)Z) d d
l*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z> ddl?m$Z$ ddl@mAZA ddlBmCZCmDZDmEZE ddlFmGZG ddlHmIZI ddlJmKZKmLZL ddlMmNZNmOZOmPZP erd dlQZQe$jRZSeTeUd< eVeWZXejYeUd< ejZj[Z[ejZj\Z\eG dd dZ]eG dd dZ^eG dd dZ_ej`eTd d!d"ZaejbeTd#d$d%ZcejbeTd#d&d'Zdej`eed d(d)ZfG d*d+ d+Zgeg Zhdejiejej` ejej` eek ejid,d-d.Zlej`eTd d/d0Zmej`eTd d1d2Znej`eTd d3d4Zoej`eTd d5d6Zpej`eTd d7d8Zqej`eTd d9d:Zrej`eTd d;d<Zsej`eTd d=d>Ztejbeuejej` ejej` f d?d@dAZvejej` ekdBdCdDZweejej` euej` f eedEdFdGZxdejjiejj`eyeydJdKdLZzejjiejj`ejj`ej{eyeyejj`dMdNdOZ|ej}eydPdQdRZ~ejej{ dSdTdUZejj`eTd dVdWZej{dSdXdYZej{eudZd[d\Zejjidd]d^d_Zejjidd]d`daZdejej` ejbejbee<ej`  ddbdcddZddeejbejej` ejej` eeee<ej`  euejbejbf dfdgdhZdddiejbeejee  ee<ej`  euejbejbf djdkdlZeedmZeeeedndodpZej`eed dqdrZejidsdtduZejdvdw Zeej`eef ejeuej`eef  dxdydzZejbejbd{d|d}Zejjbejjbejj`ejj`ejeeejj`ejj`d~ddZejbejbejbeeeuejbejbf dddZejbdd?ddZejbejbd?ddZdejie^e_ee<ej`  dddZdd Ze]dSddZejidsddZejiejey ejey eye^ejej` eueyejee ejee f dddZd dlmZ ej}eeej}dddZdd Zdejie^ejej` dddZejjiejejj` dddZdddejbeejee  euejbejbf dddZdejjbekekeTeeekejek f  eTeek ddddZdS )    Ndefaultdict)	dataclassreplace)AnyCallableOptionalTYPE_CHECKINGUnion)countersis_node_meta_valid)(create_structured_trace_for_min_cut_info)config)trace_structured)extract_tensor_metadata)BackwardState)is_sym_nodepy_sym_types)magic_methodsmethod_to_operator)find_symbol_binding_fx_nodesfree_symbolshint_intis_symbol_binding_fx_nodestatically_known_falsestatically_known_true)graph_drawer)
OrderedSet)CheckpointPolicy   )GraphInfoProvider)dp_knapsackgreedy_knapsackilp_knapsack)KnapsackEvaluator)get_aot_graph_name)get_cuda_generator_meta_valis_with_effects)fx_graph_cseget_aten_targetraise_getitemsAOT_PARTITIONER_DEBUGlogc                   @   s   e Zd ZU dZee ed< ee ed< ee ed< ee ed< ee ed< ejddd	Z	ejdd
dZ
ejdddZejdddZejdddZdS )OpTypesz8Class for keeping track of different operator categoriesfusible_opscompute_intensive_ops
random_opsview_opsrecomputable_opsnodec                 C   s   t || jv S N)r)   r.   selfr4    r8   K/var/www/auris/lib/python3.9/site-packages/torch/_functorch/partitioners.py
is_fusibleM   s    zOpTypes.is_fusiblec                 C   s   t || jv S r5   )r)   r/   r6   r8   r8   r9   is_compute_intensiveP   s    zOpTypes.is_compute_intensivec                 C   s   t || jv S r5   )r)   r0   r6   r8   r8   r9   	is_randomS   s    zOpTypes.is_randomc                 C   s   t || jv S r5   )r)   r1   r6   r8   r8   r9   is_viewV   s    zOpTypes.is_viewc                 C   s   t || jv S r5   )r)   r2   r6   r8   r8   r9   is_recomputableY   s    zOpTypes.is_recomputableN)__name__
__module____qualname____doc__r   r   __annotations__fxNoder:   r;   r<   r=   r>   r8   r8   r8   r9   r-   C   s   
r-   c                   @   s   e Zd ZU eej ed< eej ed< eej ed< eej ed< eeje	f ed< eej ed< e
jeej ddd	Zejed
ddZejed
ddZejed
ddZeje	d
ddZdS )NodeInfoinputs_required_fw_nodesrequired_bw_nodesunclaimed_nodesfw_orderstatic_lifetime_input_nodesreturnc                    s    t dd  jD  fdddS )Nc                 s   s   | ]
}|V  qd S r5   r8   .0nr8   r8   r9   	<genexpr>l       z-NodeInfo.required_fw_nodes.<locals>.<genexpr>c                    s
    j |  S r5   )rK   rQ   r7   r8   r9   <lambda>l   rS   z,NodeInfo.required_fw_nodes.<locals>.<lambda>key)sortedrH   rU   r8   rU   r9   required_fw_nodesi   s    zNodeInfo.required_fw_nodes)rQ   rN   c                 C   s
   || j v S r5   )rH   r7   rQ   r8   r8   r9   is_required_fwo   s    zNodeInfo.is_required_fwc                 C   s
   || j v S r5   )rI   r[   r8   r8   r9   is_required_bwr   s    zNodeInfo.is_required_bwc                 C   s
   || j v S r5   )rJ   r[   r8   r8   r9   is_unclaimedu   s    zNodeInfo.is_unclaimedc                 C   s$   || j v sJ d| d| j| S )NNode z not in fw nodes!)rH   rK   r[   r8   r8   r9   get_fw_orderx   s    zNodeInfo.get_fw_orderN)r?   r@   rA   listrD   rE   rC   r   dictint	functoolscached_propertyrZ   boolr\   r]   r^   r`   r8   r8   r8   r9   rF   ]   s   
rF   c                   @   s6   e Zd ZU eed< eed< eed< eed< eed< dS )MinCutOptionsban_if_used_far_apartban_if_long_fusible_chainsban_if_materialized_backwardban_if_not_in_allowlistban_if_reductionN)r?   r@   rA   rf   rC   r8   r8   r8   r9   rg   }   s
   
rg   )r4   rN   c                 C   s   | j dd tjtjfv S )N	recompute)metagetr   ZMUST_RECOMPUTEZPREFER_RECOMPUTEr3   r8   r8   r9   must_recompute   s    rp   )fx_grN   c                 C   s    | j jD ]}t|r dS qdS )NTF)graphnodesrp   rq   r4   r8   r8   r9   has_recomputable_ops   s    ru   c                 C   s<   | j jD ].}t|rt|jdrtjj|jjv r dS qdS )NtagsTF)	rr   rs   rp   hasattrtargettorchTagnondeterministic_seededrv   rt   r8   r8   r9   has_recomputable_rng_ops   s    
r|   c                 C   s6   t | jd tjtjfrdS t | jd tjs2J dS )Nvalr      )
isinstancern   ry   SymIntSymBoolSymFloatr3   r8   r8   r9   sym_node_size   s    r   c                   @   s   e Zd Zdd ZdS )InvalidNodeBasec                 C   s   dS )NzInvalid Noder8   rU   r8   r8   r9   __repr__   s    zInvalidNodeBase.__repr__N)r?   r@   rA   r   r8   r8   r8   r9   r      s   r   )joint_graphrG   outputssubgraphrN   c           
         s  t  }i  |D ] }||j}|j|_| |< q| jD ]}t|rV|dkrVt |< q8| v rbq8q8|jdkrvt |< q8|jdkrt	j
|ji |j} fdd|D }t|rt |< q8|| fdd |< q8|jdkr|| fd	d |< q8|jd
kr8q8g }|D ]h}	t|	t jrf|	 vr6td|	 dt |	 trVJ d|	 d| |	  n
||	 q
|t| |  |  |S )a  
    Given a graph, extracts out a subgraph that takes the specified nodes as
    inputs and returns the specified outputs.

    This includes specifying non-placeholder nodes as inputs.

    The general strategy is to initialize all inputs with proxies as we
    encounter them, and trace through the graph, only keeping values which take
    in valid proxies. Then, all dead code is eliminated.
    backwardplaceholdercall_functionc                    s&   g | ]}t |tjrt  | tqS r8   )r   rD   rE   r   rP   xenvr8   r9   
<listcomp>   s   z6_extract_graph_with_inputs_outputs.<locals>.<listcomp>c                    s    |  S r5   r8   r   r   r8   r9   rV      rS   z4_extract_graph_with_inputs_outputs.<locals>.<lambda>get_attrc                    s    |  S r5   r8   r   r   r8   r9   rV      rS   outputr_   z couldn't be found in envz was invalid, but is output)rD   Graphr   namern   rs   _must_be_in_backwardInvalidNodeoppytreearg_tree_leavesargskwargsany	node_copyr   rE   RuntimeErrorr   appendr   tupleeliminate_dead_codeZlint)
r   rG   r   r   	new_graphr4   new_nodeall_argsZoutput_valuesr   r8   r   r9   "_extract_graph_with_inputs_outputs   sV    









r   c                 C   s,   | j dko*dt| jvo*t|  o*t|  S Nr   tangents)r   strrx   _is_bwd_seed_offset_is_fwd_seed_offsetr3   r8   r8   r9   
_is_primal   s    
r   c                 C   s   | j dkodt| jv S r   r   r   rx   r3   r8   r8   r9   _is_tangent   s    r   c                 C   s&   | j dko$dt| jv p$dt| jv S )Nr   Zbwd_seedZbwd_base_offsetr   r3   r8   r8   r9   r     s    
r   c                 C   s&   | j dko$dt| jv p$dt| jv S )Nr   Zfwd_seedZfwd_base_offsetr   r3   r8   r8   r9   r     s    
r   c                 C   s   | j dkot| jdtS )Nr   r}   )r   r   rn   ro   r   r3   r8   r8   r9   _is_backward_state  s    r   c                 C   s   | j dd dkS )Npartitioner_tagZis_backwardrn   ro   r3   r8   r8   r9   _has_tag_is_backward  s    r   c                 C   s   | j dd dkS )Nr   Zmust_be_in_backwardr   r3   r8   r8   r9   _has_tag_must_be_in_backward  s    r   c                 C   s   t | pt| ot| S r5   )r   r   r'   r3   r8   r8   r9   r     s    r   )joint_modulerN   c                C   s>   t jdd | jjddD  }|d | }||d  }||fS )Nc                 s   s   | ]}|j V  qd S r5   r   rP   r4   r8   r8   r9   rR   $  rS   z+_extract_fwd_bwd_outputs.<locals>.<genexpr>r   r   )r   r   rr   
find_nodes)r   num_fwd_outputsr   fwd_outputsbwd_outputsr8   r8   r9   _extract_fwd_bwd_outputs   s    r   )saved_valuesr   c                 C   s&   | D ]}|j |kr| |  q"qd S r5   )r   remove)r   r   Zsaved_valuer8   r8   r9   _remove_by_name+  s    

r   )fwd_module_outputsrN   c                 C   s>   t | }tt | d ddD ]}t| | s|d } q:q|S )Nr   )lenranger   )r   idxir8   r8   r9   find_first_sym_node2  s    r         @-q=)rr   r4   maxminc                 C   s\  |  |X | jtjjjj|fd}tjjj|jd |jd< t|jd |jd< W d    n1 sh0    Y  |  |d | jtjjj	j|dgdfd}tjjj	|jd dgd|jd< t|jd |jd< W d    n1 s0    Y  |  |` | jtjj
jj|tjfd}tjj
j|jd tj|jd< t|jd |jd< W d    n1 sb0    Y  |  |\ | jtjjjj||fd}tjjj|jd ||jd< t|jd |jd< W d    n1 s0    Y  |  |X | jtjjjj|fd}tjjj|jd |jd< t|jd |jd< W d    n1 sN0    Y  |  |\ | jtjjjj||fd}	tjjj|jd ||	jd< t|	jd |	jd< W d    n1 s0    Y  |  |	l | jtjj
jj|	tjfdt|j d}
tjj
j|	jd tj|
jd< t|
jd |
jd< W d    n1 sN0    Y  |
S )Nr   r}   tensor_metar   T
fp8_scale_r   r   )inserting_afterr   ry   opsatenabsdefaultrn   r   amaxprimsconvert_element_typefloat64	clamp_min
reciprocalmulTensorfloat32r   r   )rr   r4   r   r   Zabs_nodeZ	amax_nodeZamax_64_nodeZclamp_min_nodeZreciprocal_nodeZmul_node
scale_noder8   r8   r9   calculate_quantization_scaling=  s    
2



2


*



*


*



4


4r   )rr   r4   r   
quant_typer   	clamp_maxrN   c                 C   sn  |  |` | jtjjjj|tjfd}tjjj|jd tj|jd< t	|jd |jd< W d    n1 sp0    Y  |  |b | jtjj
jj||fd}tjj
j|jd |jd |jd< t	|jd |jd< W d    n1 s0    Y  |  |\ | jtjj
jj||fd}tjj
j|jd ||jd< t	|jd |jd< W d    n1 sd0    Y  |  |\ | jtjj
jj||fd}	tjj
j|jd ||	jd< t	|	jd |	jd< W d    n1 s0    Y  |  |	h | jtjjjj|	|fdt|j d}
tjjj|	jd ||
jd< t	|
jd |
jd< W d    n1 s`0    Y  |
S )Nr   r}   r   
fp8_quant_r   )r   r   ry   r   r   r   r   r   rn   r   r   r   r   r   r   r   r   )rr   r4   r   r   r   r   Ztarget_node_32Zscaled_target_nodeZclamp_min_scaled_nodeZclamp_max_scaled_nodeZquant_activation_noder8   r8   r9   perform_quantization  sx    


(


(



*



*


*r   )tensorrN   c                 C   s   |   }|  }|| d S )z
    Calculate the size of a PyTorch tensor in megabytes (MB).

    Args:
        tensor (torch.Tensor): Input tensor

    Returns:
        float: Memory size in MB
    i   )numelelement_size)r   Znum_elementsr   r8   r8   r9   calculate_tensor_size  s    r   rM   c                  C   s.   t jjjd dd} dd | dD } | S )N!activation_quantization_aten_passallowed_dtypesztorch.bfloat16c                 S   s    g | ]}t t|d d qS ).r   )getattrry   split)rP   dtyper8   r8   r9   r     s   z&get_allowed_dtypes.<locals>.<listcomp>;)ry   	_inductorr   post_grad_fusion_optionsro   r   )r   r8   r8   r9   get_allowed_dtypes  s    r   c                 C   s   t  }t| r| jd j|vr"dS tjjjd dd}t	| jd }tjjjd ddsd||kS tjjjd ddrt
||kpt||k S t
||kS d S )Nr}   Fr   
size_in_mbd   Zskip_dynamo_guardsZquantize_dynamic_shape)r   r   rn   r   ry   r   r   r   ro   r   r   r   )r4   r   Zsize_thresholdr   r8   r8   r9   should_quantize  s4    r   c                  C   s*   t jjjd dd} tt | dd S )Nr   r   ztorch.float8_e5m2r   r   )ry   r   r   r   ro   r   r   )r   r8   r8   r9   get_quant_type  s    r   )r   rN   c                 C   s   t | }|j|jfS )z
    Calculate the range of values for a given torch.dtype.
    Args:
        dtype (torch.dtype): The input dtype.
    Returns:
        tuple: A tuple containing the minimum and maximum values.
    )ry   Zfinfor   r   )r   infor8   r8   r9   calculate_range  s    
r   )rr   rN   c              	      s  | j ddd }|jd }t }t|\}}t  g g  }}|D ]}|jddr@tjj	j
d ddrt| ||d	}	t| ||	|||}
t|	s||	 n
||	 n| |h | jtjjjj||fd
t|j d}
tjjj|jd ||
jd< t|
jd |
jd< W d    n1 s"0    Y  |
 |< q@ fdd|D }t|}|| }|rz|d | | ||d   }|dt| td d  d7  < d S )Nr   r   r   saved_for_quantizationFr   use_scalingTr   r   r   r}   r   c                    s    g | ]}| v r | n|qS r8   r8   r   Znode_to_quantr8   r9   r   D  s   z*quantize_activation_fw.<locals>.<listcomp>inductorZ%activation_quantization_fwd_aten_passr   )r   r   r   r   rb   rn   ro   ry   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   Z
update_argr   r   )rr   r   r   r   r   r   Ztensor_scale_nodesZsym_scale_nodesr4   r   Z
quant_nodeZoutput_updated_argsr   Zscale_nodesr8   r   r9   quantize_activation_fw  s`    




*

r   c           	   	      s  dd | j D }d }|D ]}|jddr|jd |jd}tjjjd ddr| |8 d|j	
d	d
  t fdd|D }W d    n1 s0    Y  | |\ | jtjjjj||fd}tjjj|jd ||jd< t|jd |jd< W d    n1 s0    Y  | |b | jtjjjj||fd}tjjj|jd |jd |jd< t|jd |jd< W d    n1 s0    Y  | |\ | jtjjjj||fd}tjjj|jd ||jd< t|jd |jd< W d    n1 s0    Y  n| |h | jtjjjj||fdt|j	 d}tjjj|jd ||jd< t|jd |jd< W d    n1 s0    Y  t|j D ]&}||kr||kr||| qqtd d  d7  < d S )Nc                 S   s   g | ]}|j d kr|qS )r   r   r   r8   r8   r9   r   T  rS   z*quantize_activation_bw.<locals>.<listcomp>r   Fdequant_typer   r   r   r    c                 3   s   | ]}|j  kr|V  qd S r5   r   )rP   	bwd_inputZ
scale_namer8   r9   rR   b  s   
z)quantize_activation_bw.<locals>.<genexpr>r   r}   r   Zdequant_r   r   Z%activation_quantization_bwd_aten_passr   )rs   rn   ro   popry   r   r   r   r   r   r   nextr   r   r   r   r   r   r   divr   r   ra   userskeysZreplace_input_withr   )	rr   Z	bw_inputsZactivation_noder4   r   r   Zdivided_target_node_32Zdequant_nodeuserr8   r   r9   quantize_activation_bwS  s    
&


*


"


,


*r  )r   
fwd_module
bwd_modulerL   rN   c              	      s   t jdd d u rd S |r(dd |D ng }dd | D }tjjjd ddr^dd | D }jjd	d
d jd }dd  jjdd
D }|D ]t}|j	|v rt
|r|j	|v rtd|j	 qd|jd< |jd j|jd< d||j	 jd< |jd j||j	 jd< qtddd fddd tj tddd fddd tddd  fddd jjd	d
d jd }	|	D ]}
d|
j	v r|||
j	dd } j|   jj|
j	d}W d    n1 s0    Y  |jd }|j|
j d|jd< ||jd< ||  j| q|tjjjd ddrt jjdd
}|d  }t|D ]}t|s\|} qxq\jjd	d
d jd }|D ]b}
d!|
j	v r j|   jj|
j	d}W d    n1 s0    Y  |j|
j |}qt j tdd"d  fd#dd d S )$Nr   c                 S   s   g | ]
}|j qS r8   r   r   r8   r8   r9   r     rS   z2enable_activation_quantization.<locals>.<listcomp>c                 S   s   i | ]}|j |qS r8   r   r   r8   r8   r9   
<dictcomp>  rS   z2enable_activation_quantization.<locals>.<dictcomp>Zexclude_primalsFc                 S   s   i | ]}d |j vr|j |qS )Zprimalsr   r   r8   r8   r9   r	    s   r   r   r   c                 S   s   i | ]}|j |qS r8   r   r   r8   r8   r9   r	    s   r   z*Skipping quantization of static input %s: Tr   r}   r   Zartifactc                   S   s
   dddS )NZ,before_activation_quantization_fwd_aten_passstringr   encodingr8   r8   r8   r8   r9   rV     s    z0enable_activation_quantization.<locals>.<lambda>c                      s    j ddddS NFT)Zprint_outputZinclude_strideZinclude_deviceZprint_readabler8   r  r8   r9   rV     s   )Zmetadata_fnZ
payload_fnc                   S   s
   dddS )NZ+after_activation_quantization_fwd_aten_passr
  r  r8   r8   r8   r8   r9   rV     s    c                      s    j ddddS r  r  r8   r  r8   r9   rV     s   c                   S   s
   dddS )NZ,before_activation_quantization_bwd_aten_passr
  r  r8   r8   r8   r8   r9   rV     s    c                      s    j ddddS r  r  r8   r  r8   r9   rV     s   r   r   r   r   r   r   c                   S   s
   dddS )NZ+after_activation_quantization_bwd_aten_passr
  r  r8   r8   r8   r8   r9   rV     s    c                      s    j ddddS r  r  r8   r  r8   r9   rV     s   )inductor_configr   ro   ry   r   r   rr   r   r   r   r   r,   debugrn   r   r   r   r   r   r   updatereplace_all_uses_with
erase_nodera   reversedr   r  )r   r  r  rL   Zstatic_input_namessaved_values_namesr   Zbwd_module_inputsr4   Zquant_fwd_module_outputsZfwd_noder   Zquant_bwd_inputr   Zquant_bwd_module_inputsZbwd_input_locZbw_inputZscaled_fwd_module_outputsZscale_bwd_inputr8   )r  r  r9   enable_activation_quantization  s    





0





0

r  )rL   )r   r   saved_sym_nodesr   rL   rN   c                C   sT  t | |d\}}| jjdd}g tt|}g tt|}	g tt|}
g tt|}g tt|}t	| j|| |	 | |d}t
j }|jddD ]r}|jst||j t||j q|rtdd |jD rt||j t||j qt|rt||j |sJ qt }g }g }|D ]4}t|}|rD|| || n
|| qt| j}t|||	D ]d}d|jvr~qjt|jd | }t|dd	 d
D ]"}||vrq|||  q||O }qj|  |||  t	| j||
 || | d}t	| j|| |	 | | |d}tj| |}tj| |}t |||| ||fS )Nr   r   r   r   c                 s   s.   | ]&}|j tjjjju o$t|jd kV  qdS r   N)rx   ry   r   _c10d_functionalZwait_tensorr   r   r  rO   r8   r8   r9   rR   C  s   z+_extract_fwd_bwd_modules.<locals>.<genexpr>r}   c                 S   s   | j S r5   r   )sr8   r8   r9   rV   i  rS   z*_extract_fwd_bwd_modules.<locals>.<lambda>rW   forward)!r   rr   r   filterr   r   r   r   r   r   ry   distributedis_availabler  r   r   allr   r   addr   r   	itertoolschainrn   r   rY   clearextendrD   Z_lazy_graph_moduleZ_make_graph_moduler  )r   r   r  r   rL   r   r   Zplaceholdersprimal_inputstangent_inputsfwd_seed_offset_inputsZbwd_seed_offset_inputsZbackward_state_inputsZ	bwd_graphZdistributed_enabledr4   Zsaved_symbolsZsaved_sym_nodes_bindingZsaved_sym_nodes_derivedsymbolZsymbol_bindingsZnew_symbolsr  Z	fwd_graphr  r  r8   r8   r9   _extract_fwd_bwd_modules  s    






r,  )static_lifetime_input_indicesrL   )r   r-  rL   rN   c                   sj  t | rt| |||dS ttt| jj}ttt| jj}|| }t| |d\}}	t	| j||d}
t
dd |
jD  g }g }| jjD ]}|j vrqt|r|| qd|jvr|jdkr|j}tdd |D sJ || q fd	d
|jD }d|jv r(tdd |D r(|| q|| qtt| }tt| }t| ||||dS )a  
    Partitions the :attr:`joint_module` in a manner that closely resembles the
    behavior observed in the original ``.forward()`` and ``.backward()`` of the
    callable, i.e., the resulting forward graph contains those operators that
    are executed in the original ``.forward()`` callable passed to
    :func:`aot_function`.

    The default partitioner collects the operators that are between the forward
    inputs and the forward outputs. This helps in finding the tensors which have
    to be stashed for the backward pass. These stashed tensors become the output
    of the generated forward graph. The remaining operators are then placed in
    the backward graph.

    .. warning::
        This API is experimental and likely to change.

    Args:
        joint_module(fx.GraphModule): The joint forward and backward graph. This
            is the result of AOT Autograd tracing.

    Returns:
        Returns the generated forward and backward Fx graph modules.
    )r   r-  r  r  c                 s   s   | ]}|j d kr|jV  qdS r   Nr   r   r   r8   r8   r9   rR     s   z$default_partition.<locals>.<genexpr>r   r   c                 s   s   | ]}|j tjkV  qd S r5   )rx   operatorgetitemrP   r  r8   r8   r9   rR     rS   c                    s   g | ]}|j  vr|qS r8   r   rO   Zforward_node_namesr8   r9   r     s   z%default_partition.<locals>.<listcomp>c                 s   s   | ]}t |V  qd S r5   r   rO   r8   r8   r9   rR     s   r  r   rL   )ru   #min_cut_rematerialization_partitionra   r  r   rr   rs   r   r   r   r   r   r   r   rn   r   r  r"  r'  rb   fromkeysr  r,  )r   _joint_inputsr   r-  rL   r(  r*  rG   r   r   forward_only_graphr   r  r4   r  Zbackward_usagesr8   r3  r9   default_partition  s^    




r:  g    .A)r   rN   c                 C   s
   | |j  S r5   )itemsize)r   r   r8   r8   r9   _tensor_nbytes  s    r<  c                    s   t ddd d| jv r| jd }t|tr0dS t|ttfrTt fdd|D S t|trxt fdd| D S t|t	j
r |S td	t| d
|  | jdks| jt	jjjju rdS td|  dd S )NrM   c                 S   s(   t | tjsdS tt|  dd| jS )Nr      fallback)r   ry   r   r<  r   r   r   r   r8   r8   r9   object_nbytes  s    z_size_of.<locals>.object_nbytesr}   r   c                 3   s   | ]} |V  qd S r5   r8   rO   r@  r8   r9   rR     rS   z_size_of.<locals>.<genexpr>c                 3   s   | ]\}} |V  qd S r5   r8   )rP   _rQ   rA  r8   r9   rR   	  rS   zUnknown metadata type z	 on node r   r   r_   zO didn't have `val` metadata; we should always have `val` metadata on the nodes.)rc   rn   r   r   ra   r   sumrb   itemsry   r   r   typer   rx   r   r   Z_assert_scalarr   )r4   r}   r8   rA  r9   _size_of  s"    




rF  )rr   c                 C   sb   ddl m} |t}| jD ]"}|jdkr||jj  d7  < qtdt	|
 tddd d S )Nr   r   r   r   z%sTrX   reverse)collectionsr   rc   rs   r   rx   r?   r,   r   rY   rD  r0  
itemgetter)rr   r   Zcntr4   r8   r8   r9   
_count_ops  s    

rK  c                  C   sl   g } t tjjD ]V}ttjj|}t|tjjs2q| D ]*}t||}tj	j
|jv r:| |  qq:q| S r5   )dirry   r   r   r   r   _opsZOpOverloadPacketZ	overloadsrz   Z	pointwiserv   r   )r   	attr_nameZopoverloadpacketoverloadZop_overloadr8   r8   r9   pointwise_ops   s    

rP  )	depth_maprN   c                    s*    fdd| D }t | tdddS )Nc                    s&   i | ]}t |tjjjr| | qS r8   )r   ry   rD   r4   rE   )rP   argrQ  r8   r9   r	  3  s   zsort_depths.<locals>.<dictcomp>r   TrG  )rY   rD  r0  rJ  )r   rQ  Z
arg_depthsr8   rS  r9   sort_depths2  s    
rT  )gmrN   c           	         s0  t  i  | jjddD ]}| fdd |< qdd t| jjD  fdd}ttt	| jj}d	}t
j}|D ](}|jD ]}| |k r| }|}qq~|d	u r| S t| jjd	|  D ](}|jd
kr|jtjjjjkr|| qt| jj| d	 D ]}|| qtj | }|S )a  
    This pass finds the first bwd node in the graph (by looking at users of
    tangents) and then reorders the graph by walking from this node to all the
    way to the end of the graph. At each op in this traveral, we insert this op
    in a new graph and try to bring only the relevant subgraph from the other
    non-bwd edges relevant for this op. This closely mimics the behavior of
    autograd engine.

    Why is this pass required in the first place?

    This is an artifact of how partitioners work today. The starting point of
    partitioner is a joint graph, which is fwd and then bwd graph. In the case
    of checkpointing, we keep portions of fwd graph in their original place in
    the joint graph, while obtaining a bwd graph. As a result, the resulting bwd
    graph has copies of recomputed fwd subgraphs followed by the original bwd
    graph. If we run this naively, this leads to bad memory footprint, because
    the fwd subgraphs are live for way longer duration than necessary. This pass
    reorders the operations such that we prioritize the ops for the original bwd
    graph while only realizing those ops from the fwd graph that are necessary
    at any given point in the graph.
    r   r   c                    s    |  S r5   r8   r   r   r8   r9   rV   U  rS   z5reordering_to_mimic_autograd_engine.<locals>.<lambda>c                 S   s   i | ]\}}||qS r8   r8   rP   r   r4   r8   r8   r9   r	  W  rS   z7reordering_to_mimic_autograd_engine.<locals>.<dictcomp>c                    s   | g}t  }t|dkrH| } | |v s|  v r2q||  || j7 }qt|fddd}|D ]} |  fdd | < q`d S )Nr   c                    s    |  S r5   r8   rT   )orderr8   r9   rV   f  rS   zSreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph.<locals>.<lambda>rW   c                    s    |  S r5   r8   r   r   r8   r9   rV   h  rS   )r   r   r   r#  Zall_input_nodesrY   r   )r4   	cur_nodesZinsertable_nodesr   r   rW  r8   r9   insert_node_in_graphY  s    
zAreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graphNr   )rD   r   rr   r   r   	enumeraters   ra   r  r   mathinfr  r   rx   ry   r   r   Zcopy_r   GraphModule)	rU  r4   rZ  r)  Zfirst_node_in_bwdZminimum_ordertangentr  Znew_gmr8   rY  r9   #reordering_to_mimic_autograd_engine9  s.    

r`  )	fw_module	bw_modulefw_nodebw_nodedevice	rng_countlast_fwd_inputlast_bwd_inputc                 C   s  |j }|dusJ | j}	|j}
tjjj}| j|4 | jd| }t||j	d< |}W d   n1 sn0    Y  |j|4 |jd| }t||j	d< |}W d   n1 s0    Y  t
|j}||d< | j|. |	jd||jg|jR |d}W d   n1 s0    Y  || |	| t
|j}||d< |
|B |
jd||jg|jR |d}|| |
| W d   n1 s0    Y  ||fS )a%  
    Note [CUDA Graph Safe RNG Functionalization]

    CUDA Graph capture doesn't work with get_rng_state and set_rng_state because these functions operate on CPU values,
    while CUDA Graph RNG capture uses on-device CUDA tensors. To solve this, we use graphsafe_set_state with a
    CUDA Generator registered to the CUDA Graph before capture begins. graphsafe_set_state updates the generator's pointer
    to reference a different GeneratorImpl, ensuring subsequent calls are correctly forwarded to the desired generator
    (and its cuda-tensor RNG state during graph capture).

    For each RNG operation's forward/backward pair:

    - We create two generators initialized with identical values
    - Each forward and backward call advances its respective generator equally
    - This keeps generators synchronized so forward and backward operations use matching RNG values

    When forward is called multiple times before backward (causing desynchronization):

    - We save the forward RNG state
    - We update the backward Generator's state before executing backward

    Before each CUDA Graph replay, replay_prologue updates captured RNG pointers with current states, ensuring backward Generator
    changes are reflected during replay.

    This function modifies both forward and backward computation graphs by:

    Creating RNG state placeholders for both passes
    Updating the forward node to use graph-safe RNG state
    Updating the backward node to use graph-safe RNG state

    For more details: https://github.com/pytorch/pytorch/issues/113541
    NZfwd_rng_state_r}   Zbwd_rng_state_Z	rng_stater   r   r   )indexrr   ry   _prims	rng_primsgraphsafe_run_with_rng_stater   r   r&   rn   rb   r   create_noderx   r   r  r  inserting_before)ra  rb  rc  rd  re  rf  rg  rh  Z
device_idxfw_graphbw_graphrm  Zfwd_rng_stateZbwd_rng_stateZ	fw_kwargsfunctional_fw_nodeZ
bwd_kwargs
rng_outputr8   r8   r9   %apply_graphsafe_rng_functionalization  sH    )
""
&



*rt  )r   ra  rb  num_sym_nodesrN   c           '   
      s  t  }dd }ttj ddd ttj ddd}|| }||}||}	i }
| jjD ]T}t|r\t|j	d	r\tj
j|j	jv r\||j }||j }|	|j }||d
|
|< q\tjjj}tjjj}d }|jjddD ]}d|jv r|} qq|d u rtdg }tt|jjdd}tt|jjdd}t fdd|
 D }|td t|dk}tjj}tjo| o|j p|jj}t |
! D ]\}\}}|d }|d } |}|j}|j}|r
|d ur
|j"dkr
t#||||||||\}}q|$|~ |j%d||j	g|j&R |j'd}|j%dt(j)|dfi d}|j%dt(j)|dfi d} |*|  |+| |,| W d    n1 s0    Y  |$|6 dt| }!|-|!}"|||"j.d< W d    n1 s0    Y  |$|F |j%d||"|j	g|j&R |j'd} |*|  |+| W d    n1 sN0    Y  q|rtt/|jjdd}#|#j&d }$t|$| }%|$d |% t0| |$|%d   }&|j1|& |j+|# |2  |2  ||fS )Nc                 S   sF   i }| j jD ]4}|jdkrt|jdrtjj|jjv r|||j	< q|S )Nr   rv   )
rr   rs   r   rw   rx   ry   rz   r{   rv   r   )ZgmodZrandom_nodesr4   r8   r8   r9   get_rng_ops  s    
z*functionalize_rng_ops.<locals>.get_rng_opsrM   c                 S   s^   d| j vrdS | j d }t|ts(|f}|D ]&}t|tjr,|jjdkr,|j  S q,tdS )zV
        Check the example value of the node outputs to find the device type.
        r}   Ncudacpu)rn   r   r   ry   r   re  rE  )r4   
candidates	candidater8   r8   r9   
get_device	  s    


z)functionalize_rng_ops.<locals>.get_devicere  c                 S   s$   | d ur| j dkrtj S t S )Nrw  )rE  ry   rw  Zget_rng_stater|  r8   r8   r9   get_sample_rng_state  s    
z3functionalize_rng_ops.<locals>.get_sample_rng_staterv   )fwdbwdr   r   r_  zaCouldn't find tangent node in graph inputs. This is unexpected, please file a bug if you see thisc                 3   s   | ]} |d  V  qdS )r~  Nr8   )rP   	node_pairr{  r8   r9   rR   B  s   z(functionalize_rng_ops.<locals>.<genexpr>rx  r   r~  r  rw  r   ri  r   Zrng_state_output_r}   r   )3r$  countr   ry   re  rr   rs   rp   rw   rx   rz   r{   rv   r   rk  rl  Zrun_and_save_rng_staterun_with_rng_stater   r   r  r  r   valuesdiscardr   r   r   Zgraphsafe_rng_functionalizationZfallback_randomZtest_configsZ*graphsafe_rng_func_ignores_fallback_randomr[  rD  rE  rt  ro  rn  r   r   r0  r1  r  r  r   r   rn   iterr   r   	recompile)'r   ra  rb  ru  uidrv  r}  Zjoint_graph_rng_opsZfw_graph_rng_opsZbw_graph_rng_opsZrecomputable_rng_ops_mapr4   Z	base_noderc  rd  Zrun_and_save_rngr  Zbw_tangent_start_nodeZfw_rng_state_outputsrg  rh  ZdevicesZmulti_cuda_devicesZ
ind_configZ'use_rng_graphsafe_rng_functionalizationrf  r  re  rp  rq  rr  staters  Z
state_nameZbw_rng_state_nodeZfw_output_nodeZ
fw_outputsZsym_node_start_idxr   r8   r  r9   functionalize_rng_ops  s    








		

*
.
.


r  c                 C   sB   | j jD ]4}t|jtjjr|jjdkrt|st	j
|jd< qdS )z
    By default, the partitioner is not allowed to recompute collectives
    unless they come from a user-annotated AC region.
    See Note [Recomputing collectives in the partitioner]
    r  rm   N)rr   rs   r   rx   ry   rM  
OpOverload	namespacerp   r   	MUST_SAVErn   )r   r4   r8   r8   r9   force_save_collectives  s    
r  c                 C   s|   | j jD ]n}t|r|jD ],}t|r|jd |jd krtj|jd< q|jddrtdd |jD stj|jd< q| S )a  
    If there are two consecutive checkpointed blocks with no operator in
    between, we would still want to stash the tensor at the boundary of
    checkpointed blocks. The following pass makes the last output node
    non-recomputable to allow for that.
    Zac_graph_idrm   Zhas_backward_hookFc                 s   s   | ]}t |V  qd S r5   )rp   r2  r8   r8   r9   rR     s   z)cleanup_recompute_tags.<locals>.<genexpr>)	rr   rs   rp   r  rn   r   r  ro   r   )r   r4   r  r8   r8   r9   cleanup_recompute_tags  s    
r  )r   	node_infomin_cut_optionsdont_banc           $         s  d u rt  t trPt dd | jD }|t dd jD  }td| dd dd fd	d
zdd l}W n. ty } zt	d|W Y d }~n
d }~0 0 
fddfdd}fddt
dfdd}	| t    fdd}
| jD ]}|jdkr(q|
jv rt|
jvr\j|jd dtjd qj|jd dtjd t|rj|jd dtjd qt|st|r|
| 
|r||r|
| d|jvrd|jvp
d|jv o
t|jd tj }t|r$t
t|}n2|rJt|jdtrBdntj}n|	|
j}j|jd |jd |d |j D ]$}j|jd |jd tjd qxqt!t"j# t$t$d 
fd!d"}j%r
j&D ]}
fd#d$|j D }
fd%d$|j D }t'|dkr||t(|}t)|j D ]d}
|r 
*||kr ||r | v rZq td&|
*|||
*| |
| q qΈj+rt  }| jD ]}
|sq
*||fg}
*|}t'|dkrt,-|\}}||v rq|.| 
*||d' krLt'|dkrLtd(||
*|
*| |
| q|j D ]>}
|rR||rR| vrRt,/|
*||f qRq̐qz|0d)d\}}W nB t1y   td* td+2|j3j45 t6  Y n0 |\}t  }fd,d|D D ]$\}|7fd-d|D  qt  }|D ]>\} }!| d d. |!d d/ ksjJ | d d. }"|.|" qDt8| d0d1 t9| jD 	t:fd2d|D 	fd3d4d5}#|# fS )6Nc                 s   s0   | ](}|j d krt|jdrt|jjV  qdS )r   _overloadpacketN)r   rw   rx   r   r  r   r8   r8   r9   rR     s   z solve_min_cut.<locals>.<genexpr>c                 s   s   | ]}t |V  qd S r5   )r   rP   r   r8   r8   r9   rR     s   z&Ops banned from re-materialization: %sc                 S   sn   |j tjjjkrdS |jd }tjj|\}}|D ]4}|j	| }| |u rP dS t
|tr4| |v r4 dS q4dS NFr   T)rx   ry   r   higher_orderZauto_functionalizedr   Z_higher_order_opsZauto_functionalizeZget_mutable_argsr   r   ra   )abZ
mutable_opmutable_arg_namesrB  r   rR  r8   r8   r9   !can_fuse_into_auto_functionalized  s    


z8solve_min_cut.<locals>.can_fuse_into_auto_functionalizedc                 S   sH   |j tjjjkrdS |jd }|D ] }|jd | }| |u r" dS q"dS )NFZtensors_to_cloner   T)rx   ry   r   r   triton_kernel_wrapper_functionalr   )r  r  r  r   rR  r8   r8   r9   .can_fuse_into_triton_kernel_wrapper_functional	  s    
zEsolve_min_cut.<locals>.can_fuse_into_triton_kernel_wrapper_functionalc                    sh   t |tjkrdS  | |r dS | |r.dS | jtju rT| jd jtjj	j
u rTdS | of|S )NTr   F)r)   r   catrx   r0  r1  r   ry   r   r  r  r:   )r  r  )r  r  op_typesr8   r9   r:     s    



z!solve_min_cut.<locals>.is_fusibler   zANeed networkx installed to perform smart recomputation heuristicsc                    sl    | rdS t| g}t|dkrh| }|jD ]2}|sP ||sP dS  |r2|| q2qdS r  )r=   r   r   r   r  r\   r#  )r4   rX  curr  )r:   r  r  r8   r9   is_materialized_backwards-  s    



z0solve_min_cut.<locals>.is_materialized_backwardsc                    s  | j dkrdS | jtjkrdS | jdd tjkr6dS tj	rJ
| rJdS | jtjjtjjfv rddS jrz| sdS n| s| rdS jr | rtd| t| j dS | jdk r| jtjkrdS jrtdd | jD }t| }|d	 |k S dS )
Nr   Frm   Tzmaterialized backwards: %s %si  c                 s   s"   | ]}t |tjrt|V  qd S r5   )r   rD   rE   rF  r  r8   r8   r9   rR   f  s   zBsolve_min_cut.<locals>.should_ban_recomputation.<locals>.<genexpr>r~   )r   rx   r0  r1  rn   ro   r   r  r   recompute_viewsr=   r   lift_fresh_copyr   Z
lift_freshrk   r>   r<   r;   rj   r,   r  r   r  dist_from_bwZmax_dist_from_bwrl   rC  r   rF  )r4   Zinput_tensors_sizeZoutput_size)r  r  r  r8   r9   should_ban_recomputation;  s:    

z/solve_min_cut.<locals>.should_ban_recomputationc                    s*    j dkrdS t fdd jD  S )Nr   Tc                 3   s   | ]} |V  qd S r5   r8   r2  )r:   r4   r8   r9   rR   q  rS   z9solve_min_cut.<locals>.is_materialized.<locals>.<genexpr>)r   r"  r  r3   )r:   r3   r9   is_materializedm  s    
z&solve_min_cut.<locals>.is_materializedrM   c                    s   t jr| |v rdS t| }t jr0| r0tjS t| jd t	rVt| jd t
jsVtS t|dtt| jdd  } | r|S |d S d S )Nr   r}   g?r   r      )r   Z treat_parameters_as_free_to_saverF  r  r=   r\  r]  r   rn   r   ry   r   INT_INFrc   r   r   r  )r4   rL   Zmem_sz)r  r  r8   r9   get_node_weights  s    z&solve_min_cut.<locals>.get_node_weightc                    s    | rdS | v r@t| jtjjo0| jjdk}tjs<|s@dS t	| rLdS d| j
v rlt| j
d tjrldS  |  jd| jd tjd dS )NFr  r}   source_incapacityT)r=   r   rx   ry   rM  r  r  r   (unsafe_allow_optimization_of_collectivesrp   rn   r   r#  add_edger   r\  r]  )r4   Zis_collective)banned_nodesr  nx_graphr  r8   r9   ban_recomputation_if_allowed  s    



z3solve_min_cut.<locals>.ban_recomputation_if_allowedr   r  Zsinkr  Z_outr}   r           )start_nodes	max_rangerN   c           	         s   g }| D ]}t |||df qt|dkrt |\}}}|sP|S |jD ]H}|rV||krtqV|| ||f}||vrVt || qVq&|S )z
        Finds the first unfusible node in the chain of nodes starting from
        `start_nodes` and returns its position.
        Tr   )heapqheappushr`   r   heappopr  r\   )	r  r  Zsorted_nodesrQ   rB  r4   Znode_is_fusibler  r}   )r:   r  r8   r9   find_first_unfusible  s$    


z+solve_min_cut.<locals>.find_first_unfusiblec                    s    g | ]}  |r |qS r8   )r\   r`   r2  r  r8   r9   r     s   
z!solve_min_cut.<locals>.<listcomp>c                    s   g | ]}  |r|qS r8   )r\   r2  r  r8   r9   r     s   z1used above/below fusible %s:(%s) -> %s -> %s:(%s)r   ztoo long %s %s %s %sr  z-Failed to compute min-cut on following graph:
c                 3   s   | ]}| | fV  qd S r5   r8   rO   )r  r8   r9   rR   g  rS   c                 3   s   | ]}| v r|fV  qd S r5   r8   )rP   v)non_reachableur8   r9   rR   h  rS   c                 S   s   i | ]\}}||qS r8   r8   rV  r8   r8   r9   r	  r  rS   z!solve_min_cut.<locals>.<dictcomp>c                 3   s   | ]} | V  qd S r5   r8   r   name_to_noder8   r9   rR   t  rS   c                    s    |  S r5   r8   r   )node_idxr8   r9   rV   t  rS   zsolve_min_cut.<locals>.<lambda>rW   );r   get_default_op_listr+   rs   r2   r,   r   networkxImportErrorr   floatZDiGraphr   rI   rG   r  r   r\  r]  rp   r   r   r\   rn   r   ry   r   r   r   ro   r   rL   r  ra   rD   rE   rc   rh   rZ   r   r   r   r`   ri   r  r  r#  r  Zminimum_cut	ExceptionjoinZ	readwriteZedgelistZgenerate_edgelistvisualize_min_cut_graphr  get_name_to_noder[  rY   )$r   r  r  r  Zjoint_module_opsZops_ignorednxer  r  r  r4   Zis_non_tensor_nodeweightr  r  Z	used_nodeZordersZfw_usersZfirst_unfusible_usevisitedZ
start_nodeZfusibleZstart_orderrB  r  Z	cut_value	partitionZ	reachableZcutsetZnbrsZ	cut_nodesZnode_inZnode_outZ	node_namer   r8   )r  r  r  r  r:   r  r  r  r  r  r  r  r  r  r  r9   solve_min_cut  s   


2	


&









"
r  c                 C   s   dd l }dd l}|j|  }||d }| D ]@}| |  |  d }|	t
| |tdkr6|d q6td |d d S )Nr   r  r]  Zredz2Visualizing the failed graph to min_cut_failed.svgzmin_cut_failed.svg)r  pydotZnx_pydotZto_pydotZ	to_stringZgraph_from_dot_dataZ	get_edges
get_sourceZget_destinationZ	set_labelr   r  Z	set_colorr,   r   Z	write_svg)r  r  r  Z
dot_formatZ	dot_graphZedger  r8   r8   r9   r  y  s    
r  c               K   C   s  t jt jt jt jt jt jt jt jt j	t j
t jt jt jt jt jt jt jt jt jt jt jt jt jt jt jt jt jt jt jt jt jt j t j!t j"t j#t j$t j%t j&t j't j(t j)t j*t j+t j,t j-t j.t j/t j0t j1t j2t j3t j4t j5t j6t j7t j8t j9t j:t j;t j<t j=t j>t j?t j@t jAt jBt jCt jDt jEt jFtGjHt jIt jJt jKt jLgK} t jIt jJt jMg}|t jNt jOt jPtQjRt jSt jTt jUt jVt jWg	7 }|}| tQjtQjXt jYt jLt jZtQj[tQj@t j[t j\tQjRt jVt j]t jNt jSt jOt j^t j_t j`t jat jbt jct jdt jet jft jgt jht jit jTt jjt jkt jlt jmt jntQjotQjpg#7 } | t jqt jrg7 } | |7 } | ts 7 } | t jtg7 } | dd tuD 7 } tv| }tvtwdtxf  t jyt jzt j{g}t j|t j}t j~t jt jt jt jt jt jt jt jg}||B }t|tv||tv||S )Nc                 S   s   g | ]}t |qS r8   )r   )rP   mr8   r8   r9   r     rS   z'get_default_op_list.<locals>.<listcomp>.)r   r#  subr  atan2r   r   r   pow	remainderfmod__and____or____xor__
__lshift__
__rshift__eqnegegtleltr   Zbitwise_notceilfloorfracnegZreluroundZsilutruncr,   log10log1plog2lgammaexpexpm1erferfccosacoscoshsinasinsinhtanatantanhatanhsqrtZrsqrtr   ZsigmoidZsoftplus	thresholdZthreshold_backwardclampwhereZlerpZaddcmulZgeluZgelu_backwardrC  meanZ_grad_sum_to_sizeZsum_to_sizer   toZtype_asr0  r1  ZsqueezeZ	unsqueezeZrsubZ_to_copyaliasviewslicetr   Zbroadcast_in_dimexpandZ
as_stridedZpermuteselectr   r   cloneZ	full_likevarZstdZ_unsafe_viewZreshapeZbroadcast_tensorsZscalar_tensorZonesZ	new_zerosr  ZarangeZtriuZvar_meanisinfr   fullzerosemptyZ
empty_likeZargmaxmaximumiotaZ'_low_memory_max_pool_offsets_to_indicesrj  ZgatherrP  Z
zeros_liker   r   r   r   Znative_dropoutZ	rand_likeZ
randn_likemmZconvolutionZconvolution_backwardZbmmZaddmmZ#_scaled_dot_product_flash_attentionZ'_scaled_dot_product_efficient_attentionZ_flash_attention_forwardZ_efficient_attention_forwardZupsample_bilinear2dZ
_scaled_mmr-   )Zdefault_recomputable_opsZrecomputable_view_opsr1   r2   r0   r/   r.   r8   r8   r9   r    s:   M&
r  c                 C   s   i }| j D ]}|||j< q
|S r5   )rs   r   )rr   r  r4   r8   r8   r9   r  2  s    
r  )r   memoryruntimes
max_memoryr  all_recomputable_banned_nodesrN   c           
      C   s   t j}|dkrt|||S |dkr.t|||S |dkrBt|||S |dkrtd tj| |||d}t||t	|dj
t|dS t|r||| |||\}}	d	||	fS td
| d S )NZgreedyZilpZdpZdynamic_memory_budget_dpzdynamic_memory_budget_dp is an experimental solver. It does not guarantee performance improvements. Additionally, it is not guaranteed to be stable.)r   r  Z recorded_knapsack_input_memoriesZ recorded_knapsack_input_runtimes)graph_info_provider)Zknapsack_algoZmax_mem_budgetr  z,Not aware of memory budget knapsack solver: )r   Zactivation_memory_budget_solverr"   r#   r!   r,   warningr    Zinialize_from_graphr$   Zget_knee_point_memory_budgetcallabler   )
r   r  r  r  r  r  ZSOLVERr  Zsaved_node_idxZrecomp_node_idxr8   r8   r9   #_optimize_runtime_with_given_memory9  sB    


r  no_dispatch)r   r?  rN   c                    sL   t | j} fddfdd|D }fdd|  D }| j||dS )Nc                    s   t |  dS )Nr>  )r   )dr>  r8   r9   realize_symboln  s    z8_remove_symbols_without_guarding.<locals>.realize_symbolc                    s   g | ]} |qS r8   r8   rP   r  r  r8   r9   r   q  rS   z4_remove_symbols_without_guarding.<locals>.<listcomp>c                    s   g | ]} |qS r8   r8   r  r  r8   r9   r   r  rS   )stride)ra   shaper  Znew_empty_strided)r   r?  r   r  r8   )r?  r  r9    _remove_symbols_without_guardingk  s
    
r!  c                    s  t j}dd }|dkrdS |dkrt N ddlm} t|jjf\ |	 fdd	}|W  d    S 1 s|0    Y  n|d
krddl
m} t|jjf\ |dd }j i  W d    n1 s0    Y  | }t|dS td| d S )Nc                 S   s   t | tjr0t | jd tjr0t| jd ddS t | tjr`t | jd tjr`t| jd ddS t | tjrt | jd tj	rdS t | tjrt | jd tj
rdS | S d S )Nr}   r=  r>        ?T)r   rD   rE   rn   ry   r   r!  r   r   r   r   r   r8   r8   r9   materialize_argy  s    z)estimate_runtime.<locals>.materialize_argZtestingr   Zprofiler   )benchmarkerc                      s   j  i S r5   )rx   r8   r   r   r4   r8   r9   rV     rS   z"estimate_runtime.<locals>.<lambda>Zflops)FlopCounterModeF)Zdisplayz Not aware of runtime estimator: )r   Z*activation_memory_budget_runtime_estimatorr  Z$torch._inductor.runtime.benchmarkingr$  r   Ztree_mapr   r   Zbenchmark_gpuZtorch.utils.flop_counterr&  rx   Zget_total_flopsr   r   )r4   ZRUNTIME_MODEr#  r$  msr&  modeZcounted_flopsr8   r%  r9   estimate_runtimev  s$    $
.
r)  )r   r  rN   c                    sl  |dks|dk rt d| ttjtjtjtjtjd}tjrRt	|ddddd}|dkr`j
S t|\}}|dkr||S ttj tddd	j
|		kr|S 	fd
dttj d	fddt	|dddd}t|\}}||k r|S t	|dd t \}}	||k r:|S ddlm tfddj
D ttj ttj dfdd}
|
|	}dd |D fdd|D }t|tddtdkrΈj
 S fddD 
dd D ddlm  
fdd tjrZfd!d"}|d#|d$g}|d dd  |d dd  kr|d |d fg}|r| \}}|d |d  d%k r|| || qn||d |d  d& }|dd  |dd  kr|||f |dd  |dd  krn|||f qn|  dd lm} d'd |D }d(d |D }|jd)d* |j||d+d, t |D ]*\}}|j!|d-||| fd.d/d0d1 qn|"d2 |#d3 |$d4 |%d |& }|'  t() }tj*d urtj*}t(j+|dd5 d6}t,j-. r&t,j-/ r&d7t,j-0  }t(j12|d8| d9t3  d:}|4| t56d;| |d<d S )=Nr   r   zJThe valid ranges for memory budget are 0 <= m <= 1. The provided value is )rh   ri   rj   rk   rl   F)rh   ri   rj   rk   )r   rN   c                 S   s   t tt| d S N    eA)rC  maprF  )r   r8   r8   r9   estimate_activations_size  s    z:choose_saved_values_set.<locals>.estimate_activations_sizec                    s   | d    S r*  r8   )sz)max_act_sizemin_act_sizer8   r9   get_normalized_size  s    z4choose_saved_values_set.<locals>.get_normalized_sizeZactivationsc                    s    |    S r5   r8   r2  )r-  r/  r0  r8   r9   get_mem_ratio  s    
z.choose_saved_values_set.<locals>.get_mem_ratio)rh   ri   rj   )rk   get_node_storagec                 3   s   | ]} |V  qd S r5   r8   r   r4  r8   r9   rR     rS   z*choose_saved_values_set.<locals>.<genexpr>)r  rN   c                    s    fdd| D S )Nc                    s*   g | ]"}|j td k r |vr|qS )r+  )r  rc   r  r5  input_storagesr8   r9   r     s   zRchoose_saved_values_set.<locals>.get_recomputable_banned_nodes.<locals>.<listcomp>r8   )r  r6  r8   r9   get_recomputable_banned_nodes  s    z>choose_saved_values_set.<locals>.get_recomputable_banned_nodesc                 S   s$   g | ]}|j d dtjkr|qS )rm   F)rn   ro   r   r  r  r8   r8   r9   r     s   z+choose_saved_values_set.<locals>.<listcomp>c                    s   g | ]}| vr|qS r8   r8   r  )must_save_nodesr8   r9   r   	  s   TrG  c                    s   g | ]} t |qS r8   rF  r  )r1  r8   r9   r   	  s   c                 S   s   g | ]}t |qS r8   )r)  r   r8   r8   r9   r   	  s   r  c           
   
      s    . t |t| d|\}}}W d    n1 s:0    Y  t }|D ]*}z||  W qN tyv   Y qN0 qN|sJ t|| |\}}	trt|||||d ||fS )Nr   )r   r  saved_node_idxsrecomputable_node_idxsexpected_runtimememories_banned_nodesruntimes_banned_nodesZmin_cut_saved_values)	r  r   r   r#  BaseExceptionissubsetr  r+   r   )
memory_budgetr  r   r=  r;  r<  r  r   r   rB  )aggressive_optionsr  r>  r  r?  r8   r9   get_saved_values_knapsack	  sN    
z:choose_saved_values_set.<locals>.get_saved_values_knapsackc                    s(   | d\}}| t |  |fS )N)r  r   )rC  )r  r   r=  )r3  rD  r   r  r?  r8   r9   estimate_for_budgetC	  s    

z4choose_saved_values_set.<locals>.estimate_for_budgetr  r"  gMbP?r  c                 S   s   g | ]}|d  qS )r  r8   rP   itemr8   r8   r9   r   `	  rS   c                 S   s   g | ]}|d  qS )r   r8   rF  r8   r8   r9   r   a	  rS   )
      )Zfigsizeo)markerz.4fzoffset points)r   rH  center)Z
textcoordsZxytextZhazMemory Budgetz Runtime of Recomputed Componentsz:Pareto Frontier of Memory Budget vs. Recomputation Runtime)exist_okr   Z_rank_Zmemory_budget_paretorB  z.svgz%Generated Pareto frontier curve at %s)rB  r  r   )7r   rg   r   Zban_recompute_used_far_apartZ!ban_recompute_long_fusible_chainsZ#ban_recompute_materialized_backwardZban_recompute_not_in_allowlistZban_recompute_reductionsZaggressive_recomputationr   rG   r  ra   rD   rE   r  Ztorch._inductor.fx_utilsr5  r   rY   rF  r   torch.utils._mode_utilsr  Zvisualize_memory_budget_paretor   r   sortZmatplotlib.pyplotZpyplotfigureZplotr[  ZannotateZxlabelZylabeltitleZgridZgcfshowosgetcwdZmemory_budget_pareto_dirmakedirsry   r   r!  is_initializedZget_rankpathr  r%   Zsavefigr,   r  )r   r  rB  r  Zruntime_optimized_saved_valuesrB  Zmore_aggressive_optionsZmore_aggressive_saved_valuesZ%aggressive_recomputation_saved_valuesr  r8  Zrecomputable_banned_nodesrE  optionsZbisectslhsrhsZmidZpltZx_valuesZy_valuesr   txtZfigZfig_dirZrank_suffixZfig_namer8   )rC  r  r-  r3  r5  r1  rD  r7  r   r/  r>  r0  r9  r  r  r?  r9   choose_saved_values_set  s
   



+
"








r\  )r   r   c              	      s   ddl m dd }fdd}tj rtj rtj dkr|| r|| rt x  R dd	 |D g}tjj|dd
 |d }t	|   fdd	|D }W d    n1 s0    Y  W d    n1 s0    Y  |S )Nr   unset_fake_temporarilyc                 S   s2   | j D ]&}t|jtjjr|jjdv r dS qdS )N>   r  Zc10d_functionalTF)rs   r   rx   ry   rM  r  r  )r   r4   r8   r8   r9   has_collectives	  s    


z2_broadcast_rank0_decision.<locals>.has_collectivesc              	      s   d dd | jD }t|d }dd ttj	 D  t
 D   tj | W d    n1 st0    Y  W d    n1 s0    Y  t fdd D S )N/c                 s   s   | ]}|j V  qd S r5   r   r   r8   r8   r9   rR   	  rS   zD_broadcast_rank0_decision.<locals>.has_same_nodes.<locals>.<genexpr>zutf-8c                 S   s   g | ]}d qS r5   r8   )rP   rB  r8   r8   r9   r   	  rS   zE_broadcast_rank0_decision.<locals>.has_same_nodes.<locals>.<listcomp>c                 3   s   | ]} d  |kV  qdS r  r8   r   Z
all_inputsr8   r9   rR   	  rS   )r  rs   hashlibsha256encode	hexdigestr   ry   r   get_world_sizer  Zall_gather_objectr"  )r   Znode_strrG   r]  ra  r9   has_same_nodes	  s    Jz1_broadcast_rank0_decision.<locals>.has_same_nodesr   c                 S   s   g | ]
}|j qS r8   r   r   r8   r8   r9   r   	  rS   z-_broadcast_rank0_decision.<locals>.<listcomp>)srcc                    s   g | ]} | qS r8   r8   rO   r  r8   r9   r   	  rS   )
torch._subclasses.fake_tensorr^  ry   r   r!  rV  rf  r  Zbroadcast_object_listr  )r   r   r_  rg  Zobjectsr  r8   )r  r^  r9   _broadcast_rank0_decision	  s(    Nrj  r   r-  )r   r-  rN   c                   s  | j   |   | j }tjr,t|}|| _ | j }t| }t| }	|rNt| } tj	s\t
|   fdd}
|du rtg }|
| |}t|jdkrt| | ||jdS t| j jD ]V}|jdkrtd|_q||sd|_qtd|_|jD ]}t|j|jd |_qqtj}|jD ],}t|jd	dtr|jd	 } q>qt|||d
}tjr^t||}ttt |}ttdd |}t!| || |jd\}}|r|	rt"| ||t|\}}t#|}t$|}t$|}t%rt&dd |D }t'dd |D d }t()d| t()d| t*dd |j jD }t*dd |j jD }||@ }t+t}|j jD ]8}|j,|v rZt-|j.drZ|t/|j.j0  d7  < qZt()dt|t|t| t&|1 t23ddd}t()d| ||fS )ax  
    Partitions the joint graph such that the backward recomputes the forward.
    Recomputing helps in trading off memory bandwidth with computation.

    To create the fwd and bwd graph, we copy the joint graph, manually set the
    outputs to just original forward or backward outputs. And then we run the
    resulting graphs through dead code elimination.

    .. warning::
        This API is experimental and likely to change.

    Args:
        joint_module(fx.GraphModule): The joint forward and backward graph. This
            is the result of AOT Autograd tracing.
        _joint_inputs: The inputs to the joint graph. This is unused.
        compiler: This option determines the default set of recomputable ops.
            Currently, there are two options: ``nvfuser`` and ``inductor``.
        recomputable_ops: This is an optional set of recomputable ops. If this
            is not None, then this set of ops will be used instead of the
            default set of ops.
        num_fwd_outputs: The number of outputs from the forward graph.

    Returns:
        Returns the generated forward and backward Fx graph modules.
    c                    sV  t | j t | jjD ]J}|jdkr<d|jv r<| nt|rN| |v r|j	 qt
tt| jj}t
tt| jj}|| }t| d\}}dd |D  t| j||d}t fdd|jD tfdd| jjD }	tfd	dt|D }
d
}i }| jjD ] }|v r"|||< |d7 }q"t||	||
S )Nr   r   r  c                 s   s$   | ]}|d ur|j dkr|V  qd S )Nr   r   )rP   rJ  r8   r8   r9   rR   
  s   zNmin_cut_rematerialization_partition.<locals>.classify_nodes.<locals>.<genexpr>r  c                 3   s"   | ]}|j d kr |j V  qdS r.  r/  r   r  r8   r9   rR   	
  s   
c                 3   s"   | ]}|vr| vr|V  qd S r5   r8   r   )rI   rZ   r8   r9   rR   
  s   c                 3   s   | ]\}}| v r|V  qd S r5   r8   )rP   r   prk  r8   r9   rR   
  s   r   r   )r  rr   r   rs   r   rx   r#  r   r  r  ra   r  r   r   r   r   r[  rF   )r   r-  r4   r(  r*  rG   r   r   r9  rJ   rL   Zfw_cntrK   r  )r  rI   rZ   r-  r9   classify_nodes	  s\    





z;min_cut_rematerialization_partition.<locals>.classify_nodesNr   )r   r-  rL   r   r+  r   rB  )rB  c                 S   s
   t |  S r5   r4  rT   r8   r8   r9   rV   M
  rS   z5min_cut_rematerialization_partition.<locals>.<lambda>r5  c                 S   s   g | ]}t |t|fqS r8   )rF  r   r  r8   r8   r9   r   e
  rS   z7min_cut_rematerialization_partition.<locals>.<listcomp>c                 s   s   | ]}t |V  qd S r5   r:  r  r8   r8   r9   rR   h
  rS   z6min_cut_rematerialization_partition.<locals>.<genexpr>z'Theoretical Activations Stored: %.2f GBz,Theoretical Per Activation Storage Sizes: %sc                 s   s   | ]}|j d kr|jV  qdS r   Nr/  r   r8   r8   r9   rR   m
  s   c                 s   s   | ]}|j d kr|jV  qdS rn  r/  r   r8   r8   r9   rR   p
  s   r  z# remat/fw/bw: %d/%d/%dTrG  zCount of Ops Rematerialized: %s)4rr   r   r  r   Zcser(   ru   r|   r  r  r  r   rI   r:  rL   r  rs   r   rc   r  r\   r  r   Zactivation_memory_budgetr   rn   ro   r  r\  rj  ra   r  r   r,  r  r`  r*   r+   rY   rC  r,   r   r   r   r   rw   rx   r   r  rD  r0  rJ  )r   r8  compilerr   r-  rq   Z	cse_graphr   Zgraph_has_recomputable_opsZgraph_has_recomputable_rng_opsrm  r  r4   r  rB  r   r  ra  rb  Zsorted_sizesZtotal_activations_size_gbZfw_module_nodesZbw_module_nodesZremat_nodescountsZrematerialized_opsr8   r  r9   r6  	  s    "
6









r6  fx_graphTF)tracedfnamefigname
clear_metaprogparse_stack_tracedot_graph_shaperN   c                 C   s   |r0t | j}t| |} | jjD ]
}i |_q$tj	|\}	}
|
sNdt
j }
td|	|
 tj| |||d}| }t|d|
d }|	 |
 }|d u r|| n|||d d S )Nr   zWriting FX graph to file: %s%s)rw  rx  Zwrite_)rv  )copydeepcopyrr   rD   r^  rs   rn   rS  rW  splitextr   Ztorch_compile_graph_formatr,   r   r   ZFxGraphDrawerZget_main_dot_graphr   lstrip)rr  rs  rt  ru  rv  rw  rx  r   r4   baseextgr   Zwrite_methodr8   r8   r9   
draw_graph
  s*    	

r  )N)r   r   )N)N)r   )r   )rq  TNFN)ry  rd   rb  r  r$  loggingr\  r0  rS  os.pathrI  r   Zdataclassesr   r   typingr   r   r   r	   r
   ry   Ztorch._inductor.inductor_primsZtorch.distributedZtorch.fxrD   Ztorch.utils._pytreeutilsZ_pytreer   Ztorch._dynamo.utilsr   r   Z;torch._functorch._activation_checkpointing.ac_logging_utilsr   Ztorch._inductorr   r  Ztorch._loggingr   ri  r   Z%torch.fx.experimental._backward_stater   Z"torch.fx.experimental.proxy_tensorr   r   Ztorch.fx.experimental.sym_noder   r   Z%torch.fx.experimental.symbolic_shapesr   r   r   r   r   r   Ztorch.fx.passesr   Ztorch.utils._ordered_setr   Ztorch.utils.checkpointr   r   Z-_activation_checkpointing.graph_info_providerr    Z"_activation_checkpointing.knapsackr!   r"   r#   Z,_activation_checkpointing.knapsack_evaluatorr$   Z_aot_autograd.logging_utilsr%   Z_aot_autograd.utilsr&   r'   Zcompile_utilsr(   r)   r*   ZsympyZdebug_partitionerr+   rf   rC   	getLoggerr?   r,   Loggerr   r   r   r-   rF   rg   rE   rp   r^  ru   r|   rc   r   r   r   r   ra   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r   r   r   r   r   r   r   r   r   r   r  r  r,  r:  r  r<  rF  rK  cacherP  rb   rT  r`  re  rt  r  r  r  r  r  r  r  r  rN  r  r!  r)  r\  rj  r6  r  r8   r8   r8   r9   <module>   s  
  G	  NE
;T  y
_

(N^ L*     ).*  t/ 
 O     