
"""
Integration with Deepspeed
"""

import copy
import importlib.metadata as importlib_metadata
import importlib.util
import weakref
from functools import partialmethod

from ..dependency_versions_check import dep_version_check
from ..utils import is_accelerate_available, is_torch_available, logging


if is_torch_available():
    import torch
    from torch import nn


logger = logging.get_logger(__name__)


def is_deepspeed_available():
    package_exists = importlib.util.find_spec("deepspeed") is not None

    # Check that we are looking at the installed library and not a stray `deepspeed`
    # directory, by making sure its distribution metadata is present.
    if package_exists:
        try:
            _ = importlib_metadata.metadata("deepspeed")
            return True
        except importlib_metadata.PackageNotFoundError:
            return False
    return False


if is_deepspeed_available() and is_accelerate_available():
    from accelerate.utils.deepspeed import HfDeepSpeedConfig as DeepSpeedConfig
else:
    # Inherit from a dummy `object` when deepspeed or accelerate is not available, so
    # that this module can still be imported.
    from builtins import object as DeepSpeedConfig


class HfDeepSpeedConfig(DeepSpeedConfig):
    """
    This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage.

    A `weakref` of this object is stored in the module's globals to be able to access the config from areas where
    things like the Trainer object is not available (e.g. `from_pretrained` and `_get_resized_embeddings`). Therefore
    it's important that this object remains alive while the program is still running.

    [`Trainer`] uses the `HfTrainerDeepSpeedConfig` subclass instead. That subclass has logic to sync the configuration
    with values of [`TrainingArguments`] by replacing special placeholder values: `"auto"`. Without this special logic
    the DeepSpeed configuration is not modified in any way.

    Args:
        config_file_or_dict (`Union[str, Dict]`): path to DeepSpeed config file or dict.
    """

    def __init__(self, config_file_or_dict):
        # set global weakref object
        set_hf_deepspeed_config(self)
        dep_version_check("accelerate")
        dep_version_check("deepspeed")
        super().__init__(config_file_or_dict)
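

# Usage sketch (illustrative; the model name and config values below are hypothetical).
# Because only a `weakref` is stored in this module's globals, the config object must be
# created *before* the model and kept alive; otherwise `from_pretrained` cannot detect
# ZeRO-3 and will not load the weights in sharded form:
#
#     from transformers import AutoModel
#     from transformers.integrations.deepspeed import HfDeepSpeedConfig
#
#     ds_config = {"zero_optimization": {"stage": 3}, "train_batch_size": 8}
#     dschf = HfDeepSpeedConfig(ds_config)  # keep this reference alive
#     model = AutoModel.from_pretrained("gpt2")  # parameters can now load pre-sharded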


class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
    """
    The `HfTrainerDeepSpeedConfig` object is meant to be created during `TrainingArguments` object creation and has the
    same lifespan as the latter.
    """

    def __init__(self, config_file_or_dict):
        super().__init__(config_file_or_dict)
        self._dtype = None
        self.mismatches = []  # keeps track of hf args / ds config mismatches

    def dtype(self):
        if self._dtype is None:
            raise ValueError("trainer_config_process() wasn't called yet to tell dtype")
        return self._dtype

    def is_auto(self, ds_key_long):
        val = self.get_value(ds_key_long)
        if val is None:
            return False
        else:
            return val == "auto"

    def fill_match(self, ds_key_long, hf_val, hf_key=None, must_match=True):
        """
        A utility method that massages the config file and can optionally verify that the values match.

        1. Replace "auto" values with `TrainingArguments` value.

        2. If it wasn't "auto" and `must_match` is true, then check that DS config matches Trainer
        config values and if mismatched add the entry to `self.mismatched` - will assert during
        `trainer_config_finalize` for one or more mismatches.
        """
        config, ds_key = self.find_config_node(ds_key_long)
        if config is None:
            return

        if config.get(ds_key) == "auto":
            config[ds_key] = hf_val
            return

        if not must_match:
            return

        ds_val = config.get(ds_key)
        if ds_val is not None and ds_val != hf_val:
            self.mismatches.append(f"- ds {ds_key_long}={ds_val} vs hf {hf_key}={hf_val}")

    fill_only = partialmethod(fill_match, must_match=False)
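
    # Behavior sketch for `fill_match` (illustrative, hypothetical values; not executed):
    #
    #     # DS config: {"optimizer": {"params": {"lr": "auto"}}}
    #     self.fill_match("optimizer.params.lr", 3e-5, "learning_rate")
    #     # -> placeholder replaced: {"optimizer": {"params": {"lr": 3e-05}}}
    #
    #     # DS config: {"optimizer": {"params": {"lr": 0.001}}}
    #     self.fill_match("optimizer.params.lr", 3e-5, "learning_rate")
    #     # -> mismatch recorded: "- ds optimizer.params.lr=0.001 vs hf learning_rate=3e-05"
    #     #    and trainer_config_finalize() raises if any mismatches accumulated
    #
    # `fill_only` is `fill_match` with `must_match=False`: it replaces "auto" but never
    # records a mismatch.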

    def trainer_config_process(self, args, auto_find_batch_size=False):
        """
        Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object
        creation.
        """
        # DeepSpeed does:
        # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps
        train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps
        self.fill_match(
            "train_micro_batch_size_per_gpu",
            args.per_device_train_batch_size,
            "per_device_train_batch_size",
            not auto_find_batch_size,
        )
        self.fill_match(
            "gradient_accumulation_steps", args.gradient_accumulation_steps, "gradient_accumulation_steps"
        )
        self.fill_match(
            "train_batch_size", train_batch_size, "train_batch_size (calculated)", not auto_find_batch_size
        )
        self.fill_match("gradient_clipping", args.max_grad_norm, "max_grad_norm")

        self.fill_match("optimizer.params.lr", args.learning_rate, "learning_rate")
        self.fill_match("optimizer.params.betas", [args.adam_beta1, args.adam_beta2], "adam_beta1+adam_beta2")
        self.fill_match("optimizer.params.eps", args.adam_epsilon, "adam_epsilon")
        self.fill_match("optimizer.params.weight_decay", args.weight_decay, "weight_decay")

        self.fill_only("scheduler.params.warmup_min_lr", 0)  # not a trainer arg
        self.fill_match("scheduler.params.warmup_max_lr", args.learning_rate, "learning_rate")
        # total_num_steps - will get set in trainer_config_finalize

        # fp16
        if args.fp16 or args.fp16_full_eval:
            fp16_backend = "apex" if args.fp16_backend == "apex" else "amp"
        else:
            fp16_backend = None

        if args.save_on_each_node:
            # deepspeed uses shared storage by default, override this setting if save_on_each_node == True
            self.config["checkpoint"] = self.config.get("checkpoint", {})
            self.config["checkpoint"]["use_node_local_storage"] = args.save_on_each_node

        # amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set
        # any here unless the user did the work
        self.fill_match(
            "fp16.enabled",
            ((args.fp16 or args.fp16_full_eval) and fp16_backend == "amp"),
            "fp16|fp16_full_eval+fp16_backend(amp)",
        )

        # apex: delegates amp work to apex (which needs to be available), but it cannot be used with any
        # ZeRO features
        self.fill_match("amp.enabled", fp16_backend == "apex", "fp16+fp16_backend(apex)")
        self.fill_match("amp.opt_level", args.fp16_opt_level, "fp16_opt_level")

        self.fill_match("bf16.enabled", (args.bf16 or args.bf16_full_eval), "bf16|bf16_full_eval")

        # deepspeed's default mode is fp16 unless there is a config that says differently
        if self.is_true("bf16.enabled"):
            self._dtype = torch.bfloat16
        elif self.is_false("fp16.enabled"):
            self._dtype = torch.float32
        else:
            self._dtype = torch.float16

    def trainer_config_finalize(self, args, model, num_training_steps):
        """
        This stage is run after we have the model and know num_training_steps.

        Now we can complete the configuration process.
        """
        # zero

        # deal with config keys that use `auto` value and rely on model's hidden_size
        hidden_size_based_keys = [
            "zero_optimization.reduce_bucket_size",
            "zero_optimization.stage3_prefetch_bucket_size",
            "zero_optimization.stage3_param_persistence_threshold",
        ]
        hidden_size_auto_keys = [x for x in hidden_size_based_keys if self.is_auto(x)]

        if len(hidden_size_auto_keys) > 0:
            if hasattr(model.config, "hidden_size"):
                hidden_size = model.config.hidden_size
            elif hasattr(model.config, "hidden_sizes"):
                # if there are many hidden sizes pick the largest one
                hidden_size = max(model.config.hidden_sizes)
            elif hasattr(model.config, "text_config") and hasattr(model.config.text_config, "hidden_size"):
                hidden_size = model.config.text_config.hidden_size
            elif hasattr(model.config, "text_config") and hasattr(model.config.text_config, "hidden_sizes"):
                hidden_size = max(model.config.text_config.hidden_sizes)
            else:
                raise ValueError(
                    "The model's config file has neither `hidden_size` nor `hidden_sizes` entry, therefore it's not"
                    " possible to automatically fill out the following `auto` entries in the DeepSpeed config file:"
                    f" {hidden_size_auto_keys}. You can fix that by replacing `auto` values for these keys with an"
                    " integer value of your choice."
                )

            self.fill_only("zero_optimization.reduce_bucket_size", hidden_size * hidden_size)
            if self.is_zero3():
                # automatically assign the optimal config values based on model config
                self.fill_only("zero_optimization.stage3_prefetch_bucket_size", int(0.9 * hidden_size * hidden_size))
                self.fill_only("zero_optimization.stage3_param_persistence_threshold", 10 * hidden_size)

        # scheduler
        self.fill_match("scheduler.params.total_num_steps", num_training_steps, "num_training_steps (calculated)")
        self.fill_match("scheduler.params.warmup_num_steps", args.get_warmup_steps(num_training_steps), "warmup_steps")

        if len(self.mismatches) > 0:
            mismatches = "\n".join(self.mismatches)
            raise ValueError(
                "Please correct the following DeepSpeed config values that mismatch TrainingArguments"
                f" values:\n{mismatches}\nThe easiest method is to set these DeepSpeed config values to 'auto'."
            )


# keep the config object global to be able to access it anywhere during TrainingArguments life-cycle
_hf_deepspeed_config_weak_ref = None


def set_hf_deepspeed_config(hf_deepspeed_config_obj):
    # this is a special weakref global object to allow us to get to Deepspeed config from APIs
    # that don't have an easy way to get to the Deepspeed config outside of the Trainer domain.
    global _hf_deepspeed_config_weak_ref
    # will go away automatically when HfDeepSpeedConfig is destroyed (when TrainingArguments is destroyed)
    _hf_deepspeed_config_weak_ref = weakref.ref(hf_deepspeed_config_obj)


def unset_hf_deepspeed_config():
    # useful for unit tests to ensure the global state doesn't leak - call from `tearDown` method
    global _hf_deepspeed_config_weak_ref
    _hf_deepspeed_config_weak_ref = None


def is_deepspeed_zero3_enabled():
    if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None:
        return _hf_deepspeed_config_weak_ref().is_zero3()
    else:
        return False


def deepspeed_config():
    if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None:
        return _hf_deepspeed_config_weak_ref().config
    else:
        return None


def _load_state_dict_into_zero3_model(model_to_load, state_dict, assign_to_params_buffers=False):
    """
    Loads state dict into a model specifically for Zero3, since DeepSpeed does not support the `transformers`
    tensor parallelism API.

    Nearly identical code to PyTorch's `_load_from_state_dict`
    """
    # shallow-copy state_dict so that `load` can modify its metadata without side effects
    metadata = getattr(state_dict, "_metadata", None)
    state_dict = state_dict.copy()
    if metadata is not None:
        state_dict._metadata = metadata

    error_msgs = []

    # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants,
    # so we need to apply the function recursively.
    def load(module: nn.Module, state_dict, prefix="", assign_to_params_buffers=False):
        local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
        local_metadata["assign_to_params_buffers"] = assign_to_params_buffers
        args = (state_dict, prefix, local_metadata, True, [], [], error_msgs)
        # Parameters of module and children will start with prefix. We can exit early if there are none in this
        # state_dict
        if len([key for key in state_dict if key.startswith(prefix)]) > 0:
            import deepspeed

            # In sharded models, each shard has only part of the full state_dict, so only gather
            # parameters that are in the current state_dict.
            named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
            params_to_gather = [named_parameters[k] for k in state_dict.keys() if k in named_parameters]
            if len(params_to_gather) > 0:
                # because zero3 puts placeholders in model params, this context
                # manager gathers (unpartitions) the params of the current layer, then loads from
                # the state dict and then re-partitions them again
                with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
                    if torch.distributed.get_rank() == 0:
                        module._load_from_state_dict(*args)

        for name, child in module._modules.items():
            if child is not None:
                load(child, state_dict, prefix + name + ".", assign_to_params_buffers)

    load(model_to_load, state_dict, assign_to_params_buffers=assign_to_params_buffers)

    return error_msgs
JWRYY W W4 	UCr$   c                   ^ ^ SSK JnJn  UR                  nSnSU;   a#  UR                  (       a  [        S5      eU" US9nO?UR                  5       (       a  [        R                  S5        T R                  5       nSUS	'   Sn	S
U;   a  U" U5      n	X4$ [        X5      (       a  UU 4S jn
U" XS9n	X4$ T R                  TUS9n	X4$ )zQ
A convenience wrapper that deals with optimizer and lr scheduler configuration.
r   )
DummyOptimDummySchedulerN	optimizerz|--adafactor was passed, but also found `optimizer` configured in the DeepSpeed config. Only one optimizer can be configured.)paramszDetected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the custom optimizer has both CPU and GPU implementation (except LAMB)Tzero_allow_untested_optimizer	schedulerc                 b   > [         R                   " T5      nS Ul        UR                  TU S9nU$ )Nr   r   )r   lr_schedulercreate_scheduler)r   trainer_copyr   r   trainers      r   _lr_scheduler_callable5deepspeed_optim_sched.<locals>._lr_scheduler_callable  s=    #yy1 -1)+<<'9Y  =   $#r$   )lr_scheduler_callabler   )accelerate.utilsr   r   rI   	adafactorr7   
is_offloadloggerinfocreate_optimizer
isinstancer   )r   hf_deepspeed_configrm   r   model_parametersr   r   rI   r   r   r   s   `  `       r   deepspeed_optim_schedr   b  s     < ''F If>>8  &67	))++KKV ,,.	26./Lf%i0& ""# i,,	$ *)bL "" #33GYen3oL""r$   c                    SSK Jn  U R                  nU R                  nU R                  R
                  R                  R                  nUR                  XTU5        UR                  UR                  5       5        U(       aK  UR                  5       (       d  [        S5      eUR                  S5        UR                  S5        Su  pxSn	Xx4$ SU l        UR                  R!                  S0 5      R!                  S	S5      n
U
S
:  a"  SSKnUR%                  XJUR'                  5       S9n[)        [+        S UR-                  5       5      5      n	[/        XXQU	5      u  pxXx4$ )a  


def deepspeed_init(trainer, num_training_steps, inference=False):
    """
    Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args.

    If `resume_from_checkpoint` was passed then an attempt to resume from a previously saved checkpoint will be made.

    Args:
        trainer: Trainer object
        num_training_steps: per single gpu
        resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load
        inference: launch in inference mode (no optimizer and no lr scheduler)
        auto_find_batch_size: whether to ignore the `train_micro_batch_size_per_gpu` argument as it's being
            set automatically by the auto batch size finder

    Returns: optimizer, lr_scheduler

    We may use `deepspeed_init` more than once during the life of Trainer, when we do - it's a temp hack based on:
    https://github.com/deepspeedai/DeepSpeed/issues/1394#issuecomment-937405374 until Deepspeed fixes a bug where it
    can't resume from a checkpoint after it did some stepping https://github.com/deepspeedai/DeepSpeed/issues/1612
    """
    from deepspeed.utils import logger as ds_logger

    model = trainer.model
    args = trainer.args

    hf_deepspeed_config = trainer.accelerator.state.deepspeed_plugin.hf_ds_config

    # resume config update - some bits like `model` and `num_training_steps` only become available during train
    hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps)

    # set the Deepspeed log level consistent with the Trainer
    ds_logger.setLevel(args.get_process_log_level())

    if inference:
        # only Z3 makes sense for the inference
        if not hf_deepspeed_config.is_zero3():
            raise ValueError("ZeRO inference only makes sense with ZeRO Stage 3 - please adjust your config")

        # in case the training config is re-used for inference
        hf_deepspeed_config.del_config_sub_tree("optimizer")
        hf_deepspeed_config.del_config_sub_tree("lr_scheduler")
        optimizer, lr_scheduler = None, None
        model_parameters = None
    else:
        trainer.optimizer = None  # important for when deepspeed_init is used as re-init
        autotp_size = hf_deepspeed_config.config.get("tensor_parallel", {}).get("autotp_size", None)
        if autotp_size is not None and autotp_size > 1:
            import deepspeed

            model = deepspeed.tp_model_init(model=model, tp_size=autotp_size, dtype=hf_deepspeed_config.dtype())
        model_parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
        optimizer, lr_scheduler = deepspeed_optim_sched(
            trainer, hf_deepspeed_config, args, num_training_steps, model_parameters
        )

    return optimizer, lr_scheduler


def deepspeed_load_checkpoint(deepspeed_engine, checkpoint_path, load_module_strict=True):
    # it's possible that the user is trying to resume from model_path, which doesn't necessarily
    # contain a deepspeed checkpoint: e.g. examples just check if the dir exists and assume it's
    # a resume from a checkpoint and not just a local pretrained weights path. So we check here
    # whether the path contains what looks like a deepspeed checkpoint
    import glob

    deepspeed_checkpoint_dirs = sorted(glob.glob(f"{checkpoint_path}/global_step*"))

    if len(deepspeed_checkpoint_dirs) > 0:
        logger.info(f"Attempting to resume from {checkpoint_path}")
        # this magically updates self.optimizer and self.lr_scheduler
        load_path, _ = deepspeed_engine.load_checkpoint(
            checkpoint_path,
            load_module_strict=load_module_strict,
            load_optimizer_states=True,
            load_lr_scheduler_states=True,
        )
        if load_path is None:
            raise ValueError(f"[deepspeed] failed to resume from checkpoint {checkpoint_path}")
    else:
        raise ValueError(f"Can't find a valid checkpoint at {checkpoint_path}")