import math
from functools import wraps
from typing import Optional

from .configuration_utils import PretrainedConfig
from .utils import is_torch_available, logging


logger = logging.get_logger(__name__)


if is_torch_available():
    import torch


def dynamic_rope_update(rope_forward):
    """
    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).

    Args:
        rope_forward (Callable):
            The forward pass of the RoPE implementation.

    Returns:
        The decorated forward pass.
    """

    def longrope_frequency_update(self, position_ids, device):
        """Longrope uses long factor if sequence is larger than original pretraining length, short otherwise."""
        seq_len = torch.max(position_ids) + 1
        if hasattr(self.config, "original_max_position_embeddings"):
            original_max_position_embeddings = self.config.original_max_position_embeddings
        else:
            original_max_position_embeddings = self.config.max_position_embeddings
        if seq_len > original_max_position_embeddings:
            if not hasattr(self, "long_inv_freq"):
                self.long_inv_freq, _ = self.rope_init_fn(
                    self.config, device, seq_len=original_max_position_embeddings + 1
                )
            self.register_buffer("inv_freq", self.long_inv_freq, persistent=False)
        else:
            # The buffer is moved automatically when the model changes device, but the original copy is not --
            # bring it to the right device before restoring it.
            self.original_inv_freq = self.original_inv_freq.to(device)
            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)

    def dynamic_frequency_update(self, position_ids, device):
        """
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        """
        seq_len = torch.max(position_ids) + 1
        if seq_len > self.max_seq_len_cached:  # growth
            inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
            self.register_buffer("inv_freq", inv_freq, persistent=False)
            self.max_seq_len_cached = seq_len

        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
            self.original_inv_freq = self.original_inv_freq.to(device)
            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
            self.max_seq_len_cached = self.original_max_seq_len

    @wraps(rope_forward)
    def wrapper(self, x, position_ids):
        if "dynamic" in self.rope_type:
            dynamic_frequency_update(self, position_ids, device=x.device)
        elif self.rope_type == "longrope":
            longrope_frequency_update(self, position_ids, device=x.device)
        return rope_forward(self, x, position_ids)

    return wrapper


def _compute_default_rope_parameters(
    config: Optional[PretrainedConfig] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies according to the original RoPE implementation
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    """
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        base = rope_kwargs["base"]
        dim = rope_kwargs["dim"]
    elif config is not None:
        base = config.rope_theta
        partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
        head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
        dim = int(head_dim * partial_rotary_factor)

    attention_factor = 1.0  # Unused in this type of RoPE

    # Compute the inverse frequencies
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim))
    return inv_freq, attention_factor


def _compute_linear_scaling_rope_parameters(
    config: Optional[PretrainedConfig] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    """
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        factor = rope_kwargs["factor"]
    elif config is not None:
        factor = config.rope_scaling["factor"]

    # Gets the default RoPE parameters
    inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)

    # Scaling the inverse frequencies is equivalent to scaling the position ids, since the embeddings are computed
    # from their product.
    inv_freq /= factor
    return inv_freq, attention_factor


def _compute_dynamic_ntk_parameters(
    config: Optional[PretrainedConfig] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length, used to update the dynamic RoPE at inference time.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    """
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        base = rope_kwargs["base"]
        dim = rope_kwargs["dim"]
        max_position_embeddings = rope_kwargs["max_position_embeddings"]
        factor = rope_kwargs["factor"]
    elif config is not None:
        base = config.rope_theta
        partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        dim = int(head_dim * partial_rotary_factor)
        max_position_embeddings = config.max_position_embeddings
        factor = config.rope_scaling["factor"]

    attention_factor = 1.0  # Unused in this type of RoPE

    # seq_len: default to max_position_embeddings, e.g. at init time
    seq_len = seq_len if seq_len is not None and seq_len > max_position_embeddings else max_position_embeddings

    # Compute the inverse frequencies from a base that grows with the current sequence length
    base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2))
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim))
    return inv_freq, attention_factor


def _compute_yarn_parameters(
    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with NTK scaling. Please refer to the
    [original paper](https://arxiv.org/abs/2309.00071)
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    """
    if len(rope_kwargs) > 0:
        raise ValueError(
            f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}"
        )

    base = config.rope_theta
    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    dim = int(head_dim * partial_rotary_factor)
    factor = config.rope_scaling["factor"]
    attention_factor = config.rope_scaling.get("attention_factor")
    mscale = config.rope_scaling.get("mscale")
    mscale_all_dim = config.rope_scaling.get("mscale_all_dim")

    # Some models store an `original_max_position_embeddings` field inside `rope_scaling`; when present, the scaling
    # factor is derived from the ratio between the current and the original maximum sequence lengths.
    if "original_max_position_embeddings" in config.rope_scaling:
        original_max_position_embeddings = config.rope_scaling["original_max_position_embeddings"]
        factor = config.max_position_embeddings / original_max_position_embeddings
    else:
        original_max_position_embeddings = config.max_position_embeddings

    def get_mscale(scale, mscale=1):
        if scale <= 1:
            return 1.0
        return 0.1 * mscale * math.log(scale) + 1.0

    # Sets the attention factor as suggested in the paper
    if attention_factor is None:
        if mscale and mscale_all_dim:
            attention_factor = float(get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dim))
        else:
            attention_factor = get_mscale(factor)

    # Optional config options
    # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
    beta_fast = config.rope_scaling.get("beta_fast") or 32
    beta_slow = config.rope_scaling.get("beta_slow") or 1

    # Compute the inverse frequencies
    def find_correction_dim(num_rotations, dim, base, max_position_embeddings):
        """Inverse dimension formula to find the dimension based on the number of rotations"""
        return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))

    def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings):
        """Find dimension range bounds based on rotations"""
        low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
        high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
        return max(low, 0), min(high, dim - 1)

    def linear_ramp_factor(min, max, dim):
        if min == max:
            max += 0.001  # Prevent singularity

        linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
        ramp_func = torch.clamp(linear_func, 0, 1)
        return ramp_func

    # "Interpolation" below means applying the scaling factor to the position ids, "extrapolation" means using the
    # unscaled frequencies; the ramp blends the two per dimension.
    pos_freqs = base ** (torch.arange(0, dim, 2).to(device=device, dtype=torch.float) / dim)
    inv_freq_extrapolation = 1.0 / pos_freqs
    inv_freq_interpolation = 1.0 / (factor * pos_freqs)

    low, high = find_correction_range(beta_fast, beta_slow, dim, base, original_max_position_embeddings)

    # Get n-dimensional rotational scaling corrected for extrapolation
    inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).to(device=device, dtype=torch.float)
    inv_freq = (
        inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
        + inv_freq_extrapolation * inv_freq_extrapolation_factor
    )
    return inv_freq, attention_factor


def _compute_longrope_parameters(
    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
    [original implementation](https://github.com/microsoft/LongRoPE)
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    """
    if len(rope_kwargs) > 0:
        raise ValueError(
            f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got {rope_kwargs}"
        )

    base = config.rope_theta
    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    dim = int(head_dim * partial_rotary_factor)
    long_factor = config.rope_scaling["long_factor"]
    short_factor = config.rope_scaling["short_factor"]
    factor = config.rope_scaling.get("factor")
    attention_factor = config.rope_scaling.get("attention_factor")

    # Some models set `original_max_position_embeddings` on the config itself and derive the scaling factor from the
    # ratio between the current and the original maximum sequence lengths.
    if hasattr(config, "original_max_position_embeddings"):
        original_max_position_embeddings = config.original_max_position_embeddings
        factor = config.max_position_embeddings / config.original_max_position_embeddings
    else:
        original_max_position_embeddings = config.max_position_embeddings

    # Sets the attention factor as suggested in the paper
    if attention_factor is None:
        if factor <= 1.0:
            attention_factor = 1.0
        else:
            attention_factor = math.sqrt(1 + math.log(factor) / math.log(original_max_position_embeddings))

    # Compute the inverse frequencies -- scaled based on the target sequence length
    if seq_len and seq_len > original_max_position_embeddings:
        ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device)
    else:
        ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device)
    inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim
    inv_freq = 1.0 / (ext_factors * base**inv_freq_shape)

    return inv_freq, attention_factor


def _compute_llama3_parameters(
    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies for llama 3.1.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    """
    # Gets the default RoPE parameters
    inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)

    factor = config.rope_scaling["factor"]
    low_freq_factor = config.rope_scaling["low_freq_factor"]
    high_freq_factor = config.rope_scaling["high_freq_factor"]
    old_context_len = config.rope_scaling["original_max_position_embeddings"]

    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor

    wavelen = 2 * math.pi / inv_freq
    # wavelen < high_freq_wavelen: do nothing; wavelen > low_freq_wavelen: divide by factor
    inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
    # otherwise: interpolate between the two, using a smooth factor
    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
    smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
    is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
    inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)

    return inv_freq_llama, attention_factor


# Maps a "rope_type" string (from the `rope_scaling` dict in the model config) to the function that computes the
# corresponding RoPE parameters from the model config.
ROPE_INIT_FUNCTIONS = {
    "default": _compute_default_rope_parameters,
    "linear": _compute_linear_scaling_rope_parameters,
    "dynamic": _compute_dynamic_ntk_parameters,
    "yarn": _compute_yarn_parameters,
    "longrope": _compute_longrope_parameters,
    "llama3": _compute_llama3_parameters,
}


def _check_received_keys(
    rope_type: str,
    received_keys: set,
    required_keys: set,
    optional_keys: Optional[set] = None,
    ignore_keys: Optional[set] = None,
):
    """Compare the received keys in `config.rope_scaling` against the expected and optional keys"""
    # BC: "rope_type" was originally "type" -- check for "rope_type" when "type" is present
    if "type" in received_keys:
        received_keys -= {"type"}
        required_keys.add("rope_type")

    # Some models need to store model-specific keys, and we don't want to throw a warning at them
    if ignore_keys is not None:
        received_keys -= ignore_keys

    missing_keys = required_keys - received_keys
    if missing_keys:
        raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}")

    if optional_keys is not None:
        unused_keys = received_keys - required_keys - optional_keys
    else:
        unused_keys = received_keys - required_keys
    if unused_keys:
        logger.warning(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}")


def _validate_default_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys)


def _validate_linear_scaling_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")


def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor"}
    optional_keys = {"original_max_position_embeddings"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")


def _validate_yarn_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor"}
    optional_keys = {
        "attention_factor",
        "beta_fast",
        "beta_slow",
        "original_max_position_embeddings",
        "mscale",
        "mscale_all_dim",
    }
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

    attention_factor = rope_scaling.get("attention_factor")
    if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0):
        logger.warning(
            f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
        )
    beta_fast = rope_scaling.get("beta_fast")
    if beta_fast is not None and not isinstance(beta_fast, float):
        logger.warning(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}")
    beta_slow = rope_scaling.get("beta_slow")
    if beta_slow is not None and not isinstance(beta_slow, float):
        logger.warning(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}")

    if (beta_fast or 32) < (beta_slow or 1):
        logger.warning(
            f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} "
            f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)"
        )


def _validate_longrope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "short_factor", "long_factor"}
    optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys)

    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    dim = int(head_dim * partial_rotary_factor)

    short_factor = rope_scaling.get("short_factor")
    if not (isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor)):
        logger.warning(f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}")
    if not len(short_factor) == dim // 2:
        logger.warning(f"`rope_scaling`'s short_factor field must have length {dim // 2}, got {len(short_factor)}")

    long_factor = rope_scaling.get("long_factor")
    if not (isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor)):
        logger.warning(f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}")
    if not len(long_factor) == dim // 2:
        logger.warning(f"`rope_scaling`'s long_factor field must have length {dim // 2}, got {len(long_factor)}")

    # Prefer `factor` (and `attention_factor`) over a config-level `original_max_position_embeddings`, which lives
    # outside `rope_scaling` and is specific to this rope type.
    if hasattr(config, "original_max_position_embeddings"):
        logger.warning_once(
            "This model has set a `original_max_position_embeddings` field, to be used together with "
            "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`"
            "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, "
            "as it is compatible with most model architectures."
        )
    else:
        factor = rope_scaling.get("factor")
        if factor is None:
            logger.warning("Missing required keys in `rope_scaling`: 'factor'")
        elif not isinstance(factor, float) or factor < 1.0:
            logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

        attention_factor = rope_scaling.get("attention_factor")
        if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0.0):
            logger.warning(
                f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
            )


def _validate_llama3_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

    low_freq_factor = rope_scaling["low_freq_factor"]
    high_freq_factor = rope_scaling["high_freq_factor"]
    if low_freq_factor is None or not isinstance(low_freq_factor, float):
        logger.warning(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}")
    if high_freq_factor is None or not isinstance(high_freq_factor, float):
        logger.warning(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}")
    if high_freq_factor <= low_freq_factor:
        logger.warning(
            "`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor="
            f"{high_freq_factor} and low_freq_factor={low_freq_factor}"
        )

    original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]
    if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int):
        logger.warning(
            "`rope_scaling`'s original_max_position_embeddings field must be an integer, got "
            f"{original_max_position_embeddings}"
        )
    if original_max_position_embeddings >= config.max_position_embeddings:
        logger.warning(
            "`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got "
            f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}"
        )


# Like `ROPE_INIT_FUNCTIONS`, maps a "rope_type" string to the function that validates the corresponding
# `rope_scaling` dictionary in the model config.
ROPE_VALIDATION_FUNCTIONS = {
    "default": _validate_default_rope_parameters,
    "linear": _validate_linear_scaling_rope_parameters,
    "dynamic": _validate_dynamic_scaling_rope_parameters,
    "yarn": _validate_yarn_parameters,
    "longrope": _validate_longrope_parameters,
    "llama3": _validate_llama3_parameters,
}


def rope_config_validation(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    """
    Validate the RoPE config arguments, given a `PretrainedConfig` object
    """
    rope_scaling = getattr(config, "rope_scaling", None)  # not a default parameter in `PretrainedConfig`
    if rope_scaling is None:
        return

    # BC: "rope_type" was originally "type"
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", "default"))
    validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type)
    if validation_fn is not None:
        validation_fn(config, ignore_keys=ignore_keys)
    else:
        logger.warning(
            f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'"
        )
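

# ------------------------------------------------------------------------------------------------------------------
# Illustrative usage sketch -- an assumption added for clarity, not part of the upstream module. It shows how the
# pieces above are typically consumed: `rope_config_validation` checks `config.rope_scaling`, `ROPE_INIT_FUNCTIONS`
# supplies the inverse-frequency init function, and `dynamic_rope_update` wraps a rotary-embedding `forward` so that
# "dynamic"/"longrope" variants can refresh `inv_freq` at inference time. `_ExampleRotaryEmbedding` and
# `_example_config` are hypothetical names, loosely modeled on the rotary-embedding layers that use this module.
if is_torch_available():

    class _ExampleRotaryEmbedding(torch.nn.Module):
        def __init__(self, config: PretrainedConfig, device=None):
            super().__init__()
            rope_scaling = getattr(config, "rope_scaling", None) or {}
            self.rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", "default"))
            self.config = config
            self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
            # `inv_freq` is a buffer so it follows the module across devices; keep an untouched copy for resets.
            inv_freq, self.attention_scaling = self.rope_init_fn(config, device)
            self.register_buffer("inv_freq", inv_freq, persistent=False)
            self.original_inv_freq = self.inv_freq
            self.max_seq_len_cached = config.max_position_embeddings
            self.original_max_seq_len = config.max_position_embeddings

        @torch.no_grad()
        @dynamic_rope_update  # refreshes `inv_freq` for "dynamic"/"longrope" rope types before computing cos/sin
        def forward(self, x, position_ids):
            # x: [batch, seq, hidden] (used only for dtype), position_ids: [batch, seq]; buffers and inputs are
            # assumed to live on the same device in this sketch.
            inv_freq = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
            angles = (inv_freq @ position_ids[:, None, :].float()).transpose(1, 2)  # [batch, seq, dim // 2]
            emb = torch.cat((angles, angles), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling
            return cos.to(x.dtype), sin.to(x.dtype)

    def _example_config() -> PretrainedConfig:
        # Hypothetical Llama-like settings, chosen only to make the sketch runnable.
        config = PretrainedConfig()
        config.rope_theta = 10000.0
        config.hidden_size = 4096
        config.num_attention_heads = 32  # head_dim = 128, so inv_freq has 64 entries
        config.max_position_embeddings = 4096
        config.rope_scaling = {"rope_type": "linear", "factor": 2.0}
        rope_config_validation(config)  # warns (or raises) if the `rope_scaling` dict is malformed
        return config

    # Example call:
    #   rope = _ExampleRotaryEmbedding(_example_config())
    #   cos, sin = rope(torch.zeros(1, 10, 4096), torch.arange(10)[None, :])  # each of shape [1, 10, 128]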