
    fTh5-                     `    S r SSKJr  SSKJr  \R
                  " \5      r " S S\5      rS/r	g)zPhi-3 model configuration   )PretrainedConfig)loggingc                      ^  \ rS rSrSrSrS/rSSSSS.rS/S	/4S
S/S
/4S
/S
/4S.r                       SU 4S jjr	S r
S rSrU =r$ )
Phi3Config   at  
This is the configuration class to store the configuration of a [`Phi3Model`]. It is used to instantiate a Phi-3
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the
[microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct).

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 32064):
        Vocabulary size of the Phi-3 model. Defines the number of different tokens that can be represented by the
        `inputs_ids` passed when calling [`Phi3Model`].
    hidden_size (`int`, *optional*, defaults to 3072):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 8192):
        Dimension of the MLP representations.
    num_hidden_layers (`int`, *optional*, defaults to 32):
        Number of hidden layers in the Transformer decoder.
    num_attention_heads (`int`, *optional*, defaults to 32):
        Number of attention heads for each attention layer in the Transformer decoder.
    num_key_value_heads (`int`, *optional*):
        This is the number of key_value heads that should be used to implement Grouped Query Attention. If
        `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
        converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
        by meanpooling all the original heads within that group. For more details checkout [this
        paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
        `num_attention_heads`.
    resid_pdrop (`float`, *optional*, defaults to 0.0):
        Dropout probability for mlp outputs.
    embd_pdrop (`int`, *optional*, defaults to 0.0):
        The dropout ratio for the embeddings.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio after computing the attention scores.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the decoder.
    max_position_embeddings (`int`, *optional*, defaults to 4096):
        The maximum sequence length that this model might ever be used with.
    original_max_position_embeddings (`int`, *optional*, defaults to 4096):
        The maximum sequence length that this model was trained with. This is used to determine the size of the
        original RoPE embeddings when using long scaling.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    rms_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon value used for the RMSNorm.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models). Only
        relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not.
    tie_word_embeddings (`bool`, *optional*, defaults to `False`):
        Whether to tie weight embeddings
    rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings.
    rope_scaling (`dict`, *optional*):
        The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
        contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and
        the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
        divided by the number of attention heads divided by 2.
    partial_rotary_factor (`float`, *optional*, defaults to 1.0):
        Percentage of the query and keys which will have rotary embedding. Must be between 0.0 and 1.0.
    bos_token_id (`int`, *optional*, defaults to 1):
        The id of the "beginning-of-sequence" token.
    eos_token_id (`int`, *optional*, defaults to 32000):
        The id of the "end-of-sequence" token.
    pad_token_id (`int`, *optional*, defaults to 32000):
        The id of the padding token.
    sliding_window (`int`, *optional*):
        Sliding window attention window size. If `None`, no sliding window is applied.

Example:

```python
>>> from transformers import Phi3Model, Phi3Config

>>> # Initializing a Phi-3 style configuration
>>> configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

>>> # Initializing a model from the configuration
>>> model = Phi3Model(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```phi3past_key_valuescolwise_reprowwise_rep)zlayers.*.self_attn.qkv_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormc                 f  > Xl         X l        X0l        X@l        XPl        Uc  UnX`l        Xpl        Xl        Xl        Xl	        Xl
        Xl        Xl        Xl        Xl        UU l        UU l        UU l        U R%                  5         U R'                  5         UU l        [*        TU ]X  " SUUUUS.UD6  g )N)bos_token_ideos_token_idpad_token_idtie_word_embeddings )
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_headsresid_pdrop
embd_pdropattention_dropout
hidden_actmax_position_embeddings original_max_position_embeddingsinitializer_rangerms_norm_eps	use_cache
rope_thetarope_scalingpartial_rotary_factor_rope_scaling_adjustment_rope_scaling_validationsliding_windowsuper__init__)selfr   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r   r(   r)   r*   r   r   r   r-   kwargs	__class__s                            c/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/phi3/configuration_phi3.pyr/   Phi3Config.__init__|   s    6 %&!2!2#6 &"5#6 &$!2$'>$0P-!2("$(%:"%%'%%', 	
%%% 3		

 	
    c                     U R                   c  gU R                   R                  SS5      nUb  US;   a  SU R                   S'   ggg)zS
Adjust the `type` of the `rope_scaling` configuration for backward compatibility.
Ntype)suyarnlongrope)r)   get)r0   rope_scaling_types     r3   r+   #Phi3Config._rope_scaling_adjustment   sU     $ --11&$? (->.-P(2Df% .Q(r5   c                    U R                   c  g[        U R                   [        5      (       a  [        U R                   5      S:w  a  [	        SU R                    35      eU R                   R                  SS5      nU R                   R                  SS5      nU R                   R                  SS5      nUb  US;  a  [	        SU 35      e[        U[        5      (       a  [        S	 U 5       5      (       d  [	        S
U 35      e[        U R                  U R                  -  U R                  -  5      n[        U5      US-  :X  d  [	        SUS-   S[        U5       35      e[        U[        5      (       a  [        S U 5       5      (       d  [	        SU 35      e[        U5      US-  :X  d  [	        SUS-   S[        U5       35      eg)z,
Validate the `rope_scaling` configuration.
Nr   ze`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, got r7   short_factorlong_factor)r:   z=`rope_scaling`'s type field must be one of ['longrope'], got c              3   N   #    U  H  n[        U[        [        45      v   M     g 7fN
isinstanceintfloat.0xs     r3   	<genexpr>6Phi3Config._rope_scaling_validation.<locals>.<genexpr>   s      S9RAJq3,//9R   #%zC`rope_scaling`'s short_factor field must be a list of numbers, got    z5`rope_scaling`'s short_factor field must have length z, got c              3   N   #    U  H  n[        U[        [        45      v   M     g 7frB   rC   rG   s     r3   rJ   rK      s      R9QAJq3,//9QrL   zB`rope_scaling`'s long_factor field must be a list of numbers, got z4`rope_scaling`'s long_factor field must have length )r)   rD   dictlen
ValueErrorr;   listallrE   r   r   r*   )r0   r<   rope_scaling_short_factorrope_scaling_long_factorrotary_ndimss        r3   r,   #Phi3Config._rope_scaling_validation   s    $$++T22c$:K:K6LPQ6Q(()+  !--11&$?$($5$5$9$9.$$O!#'#4#4#8#8#M $(9(M\]n\opqq0$77S9RSSSUVoUpq  4++t/G/GG$JdJdde,-1BBGXYHYGZZ`ade~a  aA  B  /66R9QRRRTUmTno  +,0AAF|WXGXFYY_`cd|`}_~  Br5   )r!   r    r"   r   r%   r   r#   r   r   r   r$   r*   r   r&   r)   r(   r-   r'   r   )i@}  i   i        rX   N        rY   rY   silu   r[   g{Gz?gh㈵>TFg     @Ng      ?    }  r]   N)__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planr/   r+   r,   __static_attributes____classcell__)r2   s   @r3   r   r      s    Rh J#4"5'4%2%2"/	 &(9:#%568IJ!"_$56   $)-!!1;
z3' 'r5   r   N)
rb   configuration_utilsr   utilsr   
get_loggerr^   loggerr   __all__r   r5   r3   <module>rn      s;       3  
		H	%T! Tn .r5   