
    eThn2                         S SK r SSKJrJrJr  \" 5       (       a  S SKr\R                  " \5      r " S S5      r	S r
S r " S S	\5      rg)
    N   )ExplicitEnumis_torch_availableloggingc                   z    \ rS rSrSrS/ S4S jrSS jrS rS rS	 r	S
 r
S rS rS rS rS rS rS rS rSrg)DebugUnderflowOverflow   a  
This debug class helps detect and understand where the model starts getting very large or very small, and more
importantly `nan` or `inf` weight and activation elements.

There are 2 working modes:

1. Underflow/overflow detection (default)
2. Specific batch absolute min/max tracing without detection

Mode 1: Underflow/overflow detection

To activate the underflow/overflow detection, initialize the object with the model :

```python
debug_overflow = DebugUnderflowOverflow(model)
```

then run the training as normal and if `nan` or `inf` gets detected in at least one of the weight, input or output
elements this module will throw an exception and will print `max_frames_to_save` frames that lead to this event,
each frame reporting

1. the fully qualified module name plus the class name whose `forward` was run
2. the absolute min and max value of all elements for each module weights, and the inputs and output

For example, here is the header and the last few frames in detection report for `google/mt5-small` run in fp16
mixed precision :

```
Detected inf/nan during batch_number=0
Last 21 forward frames:
abs min  abs max  metadata
[...]
                  encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
2.17e-07 4.50e+00 weight
1.79e-06 4.65e+00 input[0]
2.68e-06 3.70e+01 output
                  encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
8.08e-07 2.66e+01 weight
1.79e-06 4.65e+00 input[0]
1.27e-04 2.37e+02 output
                  encoder.block.2.layer.1.DenseReluDense.wo Linear
1.01e-06 6.44e+00 weight
0.00e+00 9.74e+03 input[0]
3.18e-04 6.27e+04 output
                  encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
1.79e-06 4.65e+00 input[0]
3.18e-04 6.27e+04 output
                  encoder.block.2.layer.1.dropout Dropout
3.18e-04 6.27e+04 input[0]
0.00e+00      inf output
```

You can see here, that `T5DenseGatedGeluDense.forward` resulted in output activations, whose absolute max value was
around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have `Dropout` which
renormalizes the weights, after it zeroed some of the elements, which pushes the absolute max value to more than
64K, and we get an overflow.

As you can see it's the previous frames that we need to look into when the numbers start going into very large for
fp16 numbers.

The tracking is done in a forward hook, which gets invoked immediately after `forward` has completed.

By default the last 21 frames are printed. You can change the default to adjust for your needs. For example :

```python
debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
```

    To validate that you have set up this debugging feature correctly, and you intend to use it in a training that
    may take hours to complete, first run it with normal tracing enabled for one of a few batches as explained in
    the next section.


    Mode 2. Specific batch absolute min/max tracing without detection

    The second work mode is per-batch tracing with the underflow/overflow detection feature turned off.

    Let's say you want to watch the absolute min and max values for all the ingredients of each `forward` call of a
given batch, and only do that for batches 1 and 3. Then you instantiate this class as :

```python
debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3])
```

And now full batches 1 and 3 will be traced using the same format as explained above. Batches are 0-indexed.

This is helpful if you know that the program starts misbehaving after a certain batch number, so you can
fast-forward right to that area.


Early stopping:

You can also specify the batch number after which to stop the training, with :

```python
debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3)
```

This feature is mainly useful in the tracing mode, but you can use it for any mode.


**Performance**:

As this module measures absolute `min`/``max` of each weight of the model on every forward it'll slow the training
down. Therefore remember to turn it off once the debugging needs have been met.

Args:
    model (`nn.Module`):
        The model to debug.
    max_frames_to_save (`int`, *optional*, defaults to 21):
        How many frames back to record
    trace_batch_nums(`List[int]`, *optional*, defaults to `[]`):
        Which batch numbers to trace (turns detection off)
    abort_after_batch_num  (`int``, *optional*):
        Whether to abort after a certain batch number has finished
   Nc                     Xl         X0l        X@l        [        R                  " / U5      U l        / U l        SU l        SU l        SU l	        SU l
        U R                  5         U R                  5         g )Nr   Fz                 )modeltrace_batch_numsabort_after_batch_numcollectionsdequeframesframebatch_numbertotal_callsdetected_overflowprefixanalyse_modelregister_forward_hook)selfr   max_frames_to_saver   r   s        P/var/www/auris/envauris/lib/python3.13/site-packages/transformers/debug_utils.py__init__DebugUnderflowOverflow.__init__   sh    
 0%:" "'',>?
!&)""$    c                     Ub  U R                  U5        U R                  R                  SR                  U R                  5      5        / U l        g N
)expand_framer   appendjoinr   )r   r   s     r   
save_frame!DebugUnderflowOverflow.save_frame   s<    e$499TZZ01
r   c                 :    U R                   R                  U5        g N)r   r#   )r   lines     r   r"   #DebugUnderflowOverflow.expand_frame   s    

$r   c                 Z    [        SR                  U R                  5      5        / U l        g r    )printr$   r   r   s    r   trace_frames#DebugUnderflowOverflow.trace_frames   s    dii$%r   c                     / U l         g r(   )r   r-   s    r   reset_saved_frames)DebugUnderflowOverflow.reset_saved_frames   s	    r   c                 
   [        SU R                   35        [        S[        U R                  5       S35        [        SS SSS S35        [        S	R	                  U R                  5      5        [        S
5        / U l        g )Nz&
Detected inf/nan during batch_number=zLast z forward frames:abs min8 abs max	 metadatar!   

)r,   r   lenr   r$   r-   s    r   dump_saved_frames(DebugUnderflowOverflow.dump_saved_frames   sq    78I8I7JKLc$++&''7891Qym956dii$%fr   c                 x    U R                   R                  5        VVs0 s H  u  pX!_M	     snnU l        g s  snnf r(   )r   named_modulesmodule_names)r   namems      r   r   $DebugUnderflowOverflow.analyse_model   s1    
 59JJ4L4L4NO4NQW4NOOs   6c                    [         R                  " U5      (       a3  U R                  [        X5      5        [	        X5      (       a  SU l        g g Uc  U R                  SS SU 35        g U R                  SS SU 35        g )NTNonez>17r6   znot a tensor)torch	is_tensorr"   get_abs_min_maxdetect_overflowr   )r   varctxs      r   analyse_variable'DebugUnderflowOverflow.analyse_variable   sx    ??3oc78s(()-& )[AcU34 4AcU;<r   c                     U R                  SU R                   SU R                   S35        U R                  SS SSS S35        g )	Nr9   z *** Starting batch number=z ***r4   r5   r6   r7   r8   r"   r   r   r-   s    r   batch_start_frame(DebugUnderflowOverflow.batch_start_frame   sK    D-HIZIZH[[_`aYqM9Q-yABr   c                 `    U R                  U R                   SU R                  S-
   S35        g )Nz *** Finished batch number=r   z ***

rN   r-   s    r   batch_end_frame&DebugUnderflowOverflow.batch_end_frame   s0    T[[M)DTEVEVYZEZD[[cder   c           
         U R                  U R                   SU R                  U    SUR                  R                   35        UR                  SS9 H  u  pEU R                  XT5        M     [        U[        5      (       a+  [        U5       H  u  pgU R                  USU S35        M     OU R                  US5        [        U[        5      (       ao  [        U5       H_  u  pg[        U[        5      (       a/  [        U5       H  u  pU R                  U	SU SU S35        M      MI  U R                  USU S35        Ma     OU R                  US	5        U R                  5         g )
Nr6   F)recursezinput[]inputzoutput[z][output)r"   r   r?   	__class____name__named_parametersrK   
isinstancetuple	enumerater%   )
r   modulerW   rX   r@   pixjys
             r   create_frame#DebugUnderflowOverflow.create_frame   sD   T[[M4+<+<V+D*EQvGWGWG`G`Fabc ..u.=GD!!!* > eU##!%(%%a6!A7 ) !!%1 fe$$!&)a'' )!--a71#Rs!1DE !- ))!wqc^< * !!&(3r   c                 N    U R                   R                  U R                  5        g r(   )r   apply_register_forward_hookr-   s    r   r   ,DebugUnderflowOverflow.register_forward_hook   s    

445r   c                 :    UR                  U R                  5        g r(   )r   forward_hook)r   r_   s     r   ri   -DebugUnderflowOverflow._register_forward_hook   s    $$T%6%67r   c                    SnU R                   U R                  ;   a  SOSnU(       a  U R                  5         U R                  S:X  a  U R	                  5         U =R                  S-  sl        XR
                  :X  a  U =R                   S-  sl         SnU R                  XU5        U(       a  U R                  5         U(       a  U R	                  5         U R                  (       a"  U(       d  U R                  5         [        S5      eU R                  bA  U R                   U R                  :  a&  [        SU R                    SU R                   S35      eg g )	NFTr   r   zDebugUnderflowOverflow: inf/nan detected, aborting as there is no point running further. Please scroll up above this traceback to see the activation values prior to this event.z'DebugUnderflowOverflow: aborting after z' batches due to `abort_after_batch_num=z` arg)r   r   r1   r   rO   r   re   r.   r   r;   
ValueErrorr   )r   r_   rW   rX   last_frame_of_batch
trace_modes         r   rl   #DebugUnderflowOverflow.forward_hook   s<    $!..$2G2GGTU
##%q ""$A ZZ""&&0
 ""$!!*""$ j  %%1d6G6G$JdJd6d9$:K:K9L M++/+E+E*FeM  7e1r   )
r   r   r   r   r   r   r?   r   r   r   r(   )rZ   
__module____qualname____firstlineno____doc__r   r%   r"   r.   r1   r;   r   rK   rO   rR   re   r   ri   rl   __static_attributes__ r   r   r   r      s\    sj 24b`d %" P=Cf868-r   r   c                 r    U R                  5       nUR                  5       S SUR                  5       S SU 3$ )Nz8.2er6   )absminmax)rI   rJ   abs_vars      r   rG   rG   %  s6    ggiGkkmD!7;;="6au==r   c                 ,   Sn[         R                  " U 5      R                  5       R                  5       (       a  Sn[	        U S35        [         R
                  " U 5      R                  5       R                  5       (       a  Sn[	        U S35           U$ )a  
Report whether the tensor contains any `nan` or `inf` entries.

This is useful for detecting overflows/underflows and best to call right after the function that did some math that
modified the tensor in question.

This function contains a few other helper features that you can enable and tweak directly if you want to track
various other things.

Args:
    var: the tensor variable to check
    ctx: the message to print as a context

Return:
    `True` if `inf` or `nan` was detected, `False` otherwise
FTz	 has nansz	 has infs)rE   isnananyitemr,   isinfgerz   numelr{   r|   rI   mean)rI   rJ   detectedn100n1000n10000s         r   rH   rH   *  s    " H{{3""$$Y {{3""$$Y  	 	 	 Or   c                       \ rS rSrSrSrSrg)DebugOptioniX  underflow_overflowtpu_metrics_debugrx   N)rZ   rs   rt   ru   UNDERFLOW_OVERFLOWTPU_METRICS_DEBUGrw   rx   r   r   r   r   X  s    -+r   r   )r   utilsr   r   r   rE   
get_loggerrZ   loggerr   rG   rH   r   rx   r   r   <module>r      sT     < <  
		H	%G GT>
+\,, ,r   