o
    Zh9,                     @   s,  d dl Z d dlZd dlmZ d dlmZ d dlmZmZ d dl	m
Z
 zd dlmZmZmZmZmZmZmZmZmZmZ W n eyM Z zededZ[ww e eZee j 			dd	ed
ededeee   deee   de!e"e ef eef fddZ#G dd deZ$de
d
edee fddZ%dS )    N)IntEnum)Optional)Graphis_submodule)SACStats)
lpDot	LpInteger
LpMaximize
LpMinimize	LpProblemLpStatuslpSum
LpVariablePULP_CBC_CMDvaluezBPlease install pulp package. See: https://github.com/coin-or/pulp.   graphmemory_budget
world_sizeac_units
fsdp_unitsreturnc           %         s  t  j}d}d}tdt}tdtt|ddt}	tdtt|dd}
tdtt|d}td	tt|d}td
tt|d}tdtt|d}tdtt|d}tdd}|rt	|}t|D ] j d |vr||	 dk7 }qm|rt|D ]t
 fdd|D r||	 dk7 }qt|D ]!td |D ]} j | dkr||	 |	|  dk7 }qqt|D ] j d r||	 dk7 }qt|D ]v j d s. j d  j d k r.td j d  td j d  td j d  td td  j d  j d<  j d | } j d | }|| ||
  || |	   k7 }qt|D ]9||	 |
 k7 } j d roqY j d | } j d | }||
 || | |	  k7 }qYt|D ]F j d | } j d | } j d }dg| }t|D ]} j j|  d }d||< q|| || t|| k7 }q jd d | }t|D ] j d | }|| | || |  k7 }qt|D ]||| k7 }qt|D ]0t j d D ]#} j d  | } j d! | }|| ||
  | k7 }q'qt|D ]6 j d" }|| ||	  k7 }|| ||  k7 }|| ||  |d|	    k7 }qQ|||k7 }|t|7 }td#d$dd%} || }!|!dkrtd&t|!  i dd'fS i }"t|D ]t|	 jdkrt|
 jd(|" j d < qtt|jd)}#t|j| }$|"|#|$fS )*a6  
    MILP to decide which modules to AC and how much memory to discard.
    The objective is to minimize recomputation time.
    The constraint is to ensure peak memory is under budget.

    Args:
        graph: graph representation of the model as a module submodule tree
            where each node is a submodule with memory & runtime stats
        memory_budget: memory budget in GiB
        world_size: number of GPUs. In the case of FSDP, world_size will be
            used to compute the amount of parameter and gradient memory on each rank
        ac_units: a list of user-specified AC units.
        fsdp_units: a list of FSDP units. AC units cannot be supermodules of FSDP units.

    Returns:
        Dict[str, float]: the optimal SAC solution, mapping from module fqn to
            the percentage of activation memory to **discard**
        float: the recomputation time of the optimal SAC solution
        int: upper bound on the peak memory of the optimal SAC solution.
            note that value of -1 means that the ILP solver failed to find a solution.

    d   i   @ZSACyr   r   rdamrcprctmax_mfqnc                 3   s$    | ]}t | j d  V  qdS )r!   N)r   nodes).0Z	fsdp_unitr   i O/var/www/auris/lib/python3.10/site-packages/torch/distributed/_tools/sac_ilp.py	<genexpr>d   s
    
zsac_milp.<locals>.<genexpr>Zis_leafZ
sac_memoryZact_fw_per_modulezFor module {%s}: z.activation memory from memory tracker is {%d},z-activation memory from SAC estimator is {%d}.z!Something is wrong. Please check!z&Overriding the latter with the former.Zact_grad_per_moduleZ	act_totalZpos_fw_post_orderindexZparam_per_moduleZ
grad_totalZ
n_segmentsZslopesZ
interceptsZsac_runtime皙?   ZgapRelZ	timeLimitmsg$Solver failed to find a solution: %s      )lenr"   r   r
   r   matrixlistranger   setanyZ	ad_matrixloggerwarningZ	name2nodeZfw_post_orderr   r   r   solveerrorr   roundvarValuer   Z	objective)%r   r   r   r   r   Z	num_nodesMZMEM_MULTIPLIERprobr   r   r   r   r   r   r   r    Zac_units_setjZACM_iZIA_iZAG_iZTA_iposZcoeffpZP_1ZTG_isZslopeZ	interceptZACT_isolverstatusZac_decisionsZrecomputation_timeZpeak_memr&   r$   r'   sac_milp#   s   




*$
"$ ,



rF   c                   @   s   e Zd ZdZdZdS )SACDecisionr   r   N)__name__
__module____qualname__	RECOMPUTESAVEr&   r&   r&   r'   rG      s    rG   	sac_statsc                    s  d|  kr
dksn t d| dt| j}tdt}tdtt|ddt	 | j
r>| jD ]}| | tjjk7 }q0nt| jdd | jdd D ]\}}| |  | k7 }qM| jD ]}| | tjjk7 }q_| jD ]\}}||kr| |  | k7 }qo| | tjjk7 }qot|t| j }	|t | j|	k7 }|t | j7 }td	d
dd}
||
}|dkrtdt|  g S  fddt|D S )aB  
    This is adapted from --
    https://github.com/facebookresearch/xformers/blob/c6c0ac31f1b08542a0bc27278c6ed10f825f6963/xformers/checkpoint.py#L375

    Given the SACStats of a module, including list of operators, their memory, runtimes, and metadata,
    decide via MILP an optimal set of operators to checkpoint under a given ``memory_budget``.

    Args:
        sac_stats: the SACStats object of the module
        memory_budget: a float between zero and one

    Returns:
        List[int]: the decision whether each operator should be saved (1) or recomptued (0).
    r   r   z5`memory_budget` must be a float between 0 and 1. Got .zSAC-per-modulexNr/   r*   
   r,   r.   c                    s   g | ]	}t  | jqS r&   )r<   r=   )r#   r%   rO   r&   r'   
<listcomp>'  s    z?get_optimal_checkpointing_policy_per_module.<locals>.<listcomp>)
ValueErrorr2   Z
func_namesr   r	   r   r3   r4   r5   r   Zforce_store_randomZrand_opsrG   rL   r   zipZview_like_opsrK   Zinplace_opsmathceilsumZmemoryr   Zruntimesr   r:   r8   r;   r   )rM   r   Znum_opsr?   r%   i1i2opZ	op_parentZ
max_memoryrD   rE   r&   rQ   r'   +get_optimal_checkpointing_policy_per_module   s8   



&

r[   )r   NN)&loggingrU   enumr   typingr   Z"torch.distributed._tools.ilp_utilsr   r   Z&torch.distributed._tools.sac_estimatorr   Zpulpr   r   r	   r
   r   r   r   r   r   r   ImportErrorerr	getLoggerrH   r8   setLevelINFOfloatintr4   strtupledictrF   rG   r[   r&   r&   r&   r'   <module>   sV    4



 :