from typing import Callable, Optional, Tuple

import torch
import torch.nn as nn
import torch.utils.checkpoint

from ...cache_utils import Cache, HybridCache
from ...configuration_utils import PretrainedConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast
from ...modeling_rope_utils import rope_config_validation
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import logging
from ...utils.deprecation import deprecate_kwarg
from ..cohere.modeling_cohere import (
    CohereAttention,
    CohereDecoderLayer,
    CohereForCausalLM,
    CohereLayerNorm,
    CoherePreTrainedModel,
    CohereRotaryEmbedding,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from ..gemma2.modeling_gemma2 import Gemma2Model


logger = logging.get_logger(__name__)


class Cohere2Config(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`Cohere2Model`]. It is used to instantiate a Cohere2
model according to the specified arguments, defining the model architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.


Args:
    vocab_size (`int`, *optional*, defaults to 256000):
        Vocabulary size of the Cohere2 model. Defines the number of different tokens that can be represented by the
        `input_ids` passed when calling [`Cohere2Model`]
    hidden_size (`int`, *optional*, defaults to 8192):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 22528):
        Dimension of the MLP representations.
    logit_scale (`float`, *optional*, defaults to 0.0625):
        The scaling factor for the output logits.
    num_hidden_layers (`int`, *optional*, defaults to 40):
        Number of hidden layers in the Transformer decoder.
    num_attention_heads (`int`, *optional*, defaults to 64):
        Number of attention heads for each attention layer in the Transformer decoder.
    num_key_value_heads (`int`, *optional*):
        This is the number of key_value heads that should be used to implement Grouped Query Attention. If
        `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
        converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
        by mean-pooling all the original heads within that group. For more details, check out [this
        paper](https://arxiv.org/pdf/2305.13245.pdf). If not specified, it will default to
        `num_attention_heads`.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the decoder.
    max_position_embeddings (`int`, *optional*, defaults to 8192):
        The maximum sequence length that this model might ever be used with.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models). Only
        relevant if `config.is_decoder=True`.
    pad_token_id (`int`, *optional*, defaults to 0):
        Padding token id.
    bos_token_id (`int`, *optional*, defaults to 5):
        Beginning of stream token id.
    eos_token_id (`int`, *optional*, defaults to 255001):
        End of stream token id.
    tie_word_embeddings (`bool`, *optional*, defaults to `True`):
        Whether to tie weight embeddings
    rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings.
    rope_scaling (`Dict`, *optional*):
        Dictionary containing the scaling configuration for the RoPE embeddings (a hypothetical sketch follows the
        argument list below). NOTE: if you apply a new rope type and expect the model to work on a longer
        `max_position_embeddings`, we recommend you update this value accordingly.
        Expected contents:
            `rope_type` (`str`):
                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                'llama3'], with 'default' being the original RoPE implementation.
            `factor` (`float`, *optional*):
                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                original maximum pre-trained length.
            `original_max_position_embeddings` (`int`, *optional*):
                Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                pretraining.
            `attention_factor` (`float`, *optional*):
                Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                computation. If unspecified, it defaults to value recommended by the implementation, using the
                `factor` field to infer the suggested value.
            `beta_fast` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                ramp function. If unspecified, it defaults to 32.
            `beta_slow` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                ramp function. If unspecified, it defaults to 1.
            `short_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `long_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `low_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
            `high_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    attention_bias (`bool`, *optional*, defaults to `False`):
        Whether to use a bias in the query, key, value and output projection layers during self-attention.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    sliding_window (`int`, *optional*, defaults to 4096):
        Size of the sliding window attention context.
    sliding_window_pattern (`int`, *optional*, defaults to 4):
        Pattern for interleaving attention layer types: every `sliding_window_pattern`-th layer uses global
        attention, while all other layers use sliding window attention.
    cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
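
A hypothetical `rope_scaling` sketch (illustrative YaRN values, not checkpoint defaults; the dictionary is
checked by `rope_config_validation` at init time):

```python
>>> rope_scaling = {"rope_type": "yarn", "factor": 4.0, "beta_fast": 32.0, "beta_slow": 1.0}
>>> configuration = Cohere2Config(rope_scaling=rope_scaling) # doctest: +SKIP
```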

```python
>>> from transformers import Cohere2Model, Cohere2Config

>>> # Initializing a Cohere2 configuration
>>> configuration = Cohere2Config()

>>> # Initializing a model from the Cohere2 configuration
>>> model = Cohere2Model(configuration) # doctest: +SKIP

>>> # Accessing the model configuration
>>> configuration = model.config # doctest: +SKIP
```
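
A minimal sketch of the interleaved attention settings (the overridden values are illustrative; per the
implementation below, every `sliding_window_pattern`-th layer attends globally, and only the sliding
layers apply rotary position embeddings):

```python
>>> configuration = Cohere2Config(sliding_window=2048, sliding_window_pattern=4)
>>> model = Cohere2Model(configuration) # doctest: +SKIP
```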
cohere2past_key_valuescolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormc                 T  > Xl         Xl        X l        X@l        X0l        XPl        X`l        Uc  UnXpl        Xl        Xl	        Xl
        Xl        UU l        UU l        UU l        UU l        UU l        UU l        X&-  U l        UU l        [)        U 5        [*        TU ]X  " SUUUUS.UD6  g )N)pad_token_idbos_token_ideos_token_idtie_word_embeddings )
vocab_sizemax_position_embeddingshidden_sizelogit_scaleintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_heads
hidden_actinitializer_rangelayer_norm_eps	use_cache
rope_thetarope_scalingattention_biasattention_dropoutsliding_windowsliding_window_patternhead_dimcache_implementationr   super__init__)selfr/   r1   r3   r2   r4   r5   r6   r7   r0   r8   r9   r:   r*   r+   r,   r-   r;   r<   r=   r>   r?   r@   rB   kwargs	__class__s                            c/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/cohere2/modular_cohere2.pyrD   Cohere2Config.__init__   s    6 %'>$&&!2!2#6  &"5#6 $!2,"$(,!2,&<##:$8! 	t$ 	
%%% 3		

 	
    )r=   r>   rB   rA   r7   r1   r8   r3   r9   r2   r0   r5   r4   r6   r<   r;   r?   r@   r:   r/   )i      i X  g      ?(   @   NsilurK   g{Gz?gh㈵>Tr      i Tg     @NF        i      hybrid)__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planrD   __static_attributes____classcell__rG   s   @rH   r   r   /   s    ob J#4"5%.%.%.%."+ )"+ &(9:#%568IJ!"_$56   $  %1?
 ?
rJ   r   c                       \ rS rSrSrg)Cohere2RotaryEmbedding   r.   NrS   rT   rU   rV   r\   r.   rJ   rH   r`   r`          rJ   r`   c                       \ rS rSrSrg)Cohere2LayerNorm   r.   Nrb   r.   rJ   rH   re   re      rc   rJ   re   c                   B   \ rS rSrSrSS\S\\   4S jjr  SS\	R                  S\\	R                  \	R                  4   S	\\	R                     S
\\   S\\	R                     S\\   S\\	R                  \\	R                     \\\	R                        4   4S jjrSrg)Cohere2Attention   z=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                    [         R                  R                  5         Xl        X l        [        USUR                  UR                  -  5      U l        UR                  UR                  -  U l
        U R                  S-  U l        UR                  U l        SU l        [         R                  " UR                  UR                  U R                  -  UR                  S9U l        [         R                  " UR                  UR                  U R                  -  UR                  S9U l        [         R                  " UR                  UR                  U R                  -  UR                  S9U l        [         R                  " UR                  U R                  -  UR                  UR                  S9U l        U R                  S-   U R                  R(                  -  S:w  a  UR*                  U l        g S U l        g )NrA   g      T)bias   r   )nnModulerD   rj   rk   getattrr1   r5   rA   r6   num_key_value_groupsscalingr>   	is_causalLinearr=   q_projk_projv_projo_projr@   r?   )rE   rj   rk   s      rH   rD   Cohere2Attention.__init__   s   
		"
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 '+nnq&8DKK<^<^%^bc%cF!! 	im 	rJ   r$   position_embeddingsr%   past_key_valuecache_positionrF   returnc                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUu  pU R                  b  [        XX5      u  pUb}  UUU R                  US.nUR                  XU R                  U5      u  pUbJ  U R                  R                  S:X  a0  UR                   S   nU
S S 2S S 2S U2S S 24   US S 2S S 2S U2S S 24   p[        nU R                  R                  S:w  ad  U R                  R                  S:X  a-  UR                  SS	5      (       a  [        R!                  S
5        O["        U R                  R                     nU" U U	U
UU4U R$                  (       d  SOU R&                  U R(                  U R                  S.UD6u  nnUR*                  " / UQSP76 R-                  5       nU R/                  U5      nUU4$ )Nrn   r   )sincosr?   r}   flash_attention_2eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.rP   )dropoutrs   r?   )shaperA   rv   view	transposerw   rx   r?   r   updaterk   rj   _attn_implementationr   getloggerwarning_oncer   trainingr>   rs   reshape
contiguousry   )rE   r$   r{   r%   r|   r}   rF   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsseq_lenattention_interfaceattn_outputattn_weightss                      rH   forwardCohere2Attention.forward  sT    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&*';LVY'_$L%"&"5"5"0	L (6'<'<ZW[WeWegs't$J )dkk.N.NRe.e(..r2+5aHWHa6G+H,WXZ[]e^e]eghWhJiL(?;;++w6{{//69fjjI\^c>d>d##L
 '>dkk>^>^&_#$7
%
  $}}C$2H2HLL..
%
 
%
!\ "));;;;FFHkk+.L((rJ   )r>   rj   rA   rt   rw   rk   rr   ry   rv   rs   r?   rx   N)NN)rS   rT   rU   rV   rW   r   r   intrD   torchTensorr   r   
LongTensorr   r
   r   r\   r.   rJ   rH   rh   rh      s    G
} 
# 
> +/59:)||:) #5<<#=>:) !.	:)
 !:) !!1!12:) -.:) 
u||Xell3XeELL>Q5RR	S:) :)rJ   rh   c                   l  ^  \ rS rSrS\S\4U 4S jjr\" SSS9     SS\R                  S	\
\R                  \R                  4   S
\\R                     S\\   S\\   S\\   S\\R                     S\\   S\
\R"                  \\
\R"                  \R"                  4      4   4S jj5       rSrU =r$ )Cohere2DecoderLayeriV  rj   rk   c                    > [         TU ]  X5        [        X5      U l        Xl        US-   U R                  R
                  -  S:g  U l        UR                  U l        g )Nrn   r   )rC   rD   rh   	self_attnrj   r@   
is_slidingr?   )rE   rj   rk   rG   s      rH   rD   Cohere2DecoderLayer.__init__W  sO    +)&<$q=DKK,N,NNRSS$33rJ   last_cache_positionz4.53.0)versionr$   r{   r%   r|   r   r:   r}   rF   r~   c                 J   U R                   (       Ga8  UGb4  [        UR                  S   U R                  5      n	U R                  R
                  S:X  a  USS2U	* S24   nO[        R                  " UR                  5      R                  n
[        R                  " [        R                  " U[        R                  S9U R                  * S9n[        R                  " XU5      nUS   U	-
  S-   n[        R                  " USS9n[        R                  " [        XR                  S   5      UR                   S	9nX-  nUSS2SS2SS2U4   nUnU R#                  U5      nU R$                  " SUUUUUUUS
.UD6u  nnU R'                  U5      nX-   U-   nU4nU(       a  UU4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`):
        Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
        with `head_dim` being the embedding dimension of each attention head.
    attention_mask (`torch.FloatTensor`, *optional*):
        attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
        query_sequence_length, key_sequence_length)` if default attention is used.
    past_key_value (`Tuple[torch.FloatTensor]`, *optional*): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence
        """
        if self.is_sliding and attention_mask is not None:  # efficient SDPA and no padding
            # In prefill, we may be larger than the sliding window
            effective_seq_len = max(cache_position.shape[0], self.sliding_window)
            # For FA2, the mask is 2D and we keep only the last `effective_seq_len` tokens
            if self.config._attn_implementation == "flash_attention_2":
                attention_mask = attention_mask[:, -effective_seq_len:]
            # Otherwise, the mask is 4D and everything outside the window must be masked out
            else:
                min_dtype = torch.finfo(hidden_states.dtype).min
                sliding_window_mask = torch.tril(
                    torch.ones_like(attention_mask, dtype=torch.bool), diagonal=-self.sliding_window
                )
                attention_mask = torch.where(sliding_window_mask, min_dtype, attention_mask)
                # When beyond the sliding window, the mask slicing must be offset accordingly
                offset = cache_position[-1] - effective_seq_len + 1
                # The offset should only be positive when beyond the sliding window
                offset = torch.clamp(offset, min=0)
                # Equivalent to `attention_mask = attention_mask[:, :, :, offset : offset + effective_seq_len]`,
                # but without data-dependent slicing, which keeps it `torch.compile`-friendly
                mask_indexes = torch.arange(
                    min(effective_seq_len, attention_mask.shape[-1]), device=attention_mask.device
                )
                mask_indexes += offset
                attention_mask = attention_mask[:, :, :, mask_indexes]

        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states_attention, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        # Fully Connected
        hidden_states_mlp = self.mlp(hidden_states)

        # The attention and MLP branches run in parallel off the same normalized input
        hidden_states = residual + hidden_states_attention + hidden_states_mlp

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


class Cohere2PreTrainedModel(CoherePreTrainedModel):
    config_class = Cohere2Config


class Cohere2Model(Gemma2Model):
    def __init__(self, config: Cohere2Config):
        super().__init__(config)
        self.norm = Cohere2LayerNorm(hidden_size=config.hidden_size, eps=config.layer_norm_eps)
        self.rotary_emb = Cohere2RotaryEmbedding(config=config)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None and not self.training:
            batch_size, seq_len, _ = inputs_embeds.shape
            past_key_values = HybridCache(
                self.config,
                max_batch_size=batch_size,
                max_cache_len=seq_len,
                dtype=inputs_embeds.dtype,
                device=self.device,
            )

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # Create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # Decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                position_embeddings=position_embeddings,
                attention_mask=causal_mask,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                **flash_attn_kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


class Cohere2ForCausalLM(CohereForCausalLM):
    def __init__(self, config: Cohere2Config):
        super().__init__(config)

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten: has a special cache type, `HybridCache`

        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
        # Exception 1: when passing input_embeds, input_ids may be missing entries
        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
        if past_key_values is not None:
            if inputs_embeds is not None:  # Exception 1
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no-op, is Exception 2)
                input_ids = input_ids[:, cache_position]

        if attention_mask is not None and position_ids is None:
            # Create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]
                # This `clone` call is needed to avoid recompilation in `torch.compile`
                position_ids = position_ids.clone(memory_format=torch.contiguous_format)

        # If `inputs_embeds` are passed, we only want to use them in the first generation step
        if inputs_embeds is not None and cache_position[0] == 0:
            model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
        else:
            # The clone here is for the same reason as for `position_ids`
            model_inputs = {
                "input_ids": input_ids.clone(memory_format=torch.contiguous_format),
                "inputs_embeds": None,
            }

        if (
            isinstance(past_key_values, HybridCache)
            and attention_mask.ndim == 2
            and not self.config._attn_implementation == "flash_attention_2"
        ):
            if model_inputs["inputs_embeds"] is not None:
                batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
                device = model_inputs["inputs_embeds"].device
            else:
                batch_size, sequence_length = model_inputs["input_ids"].shape
                device = model_inputs["input_ids"].device

            attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position(
                attention_mask,
                sequence_length=sequence_length,
                target_length=past_key_values.get_max_cache_shape(),
                dtype=self.lm_head.weight.dtype,
                device=device,
                cache_position=cache_position,
                batch_size=batch_size,
            )

        if logits_to_keep is not None:
            model_inputs["logits_to_keep"] = logits_to_keep

        model_inputs.update(
            {
                "position_ids": position_ids,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
            }
        )
        return model_inputs


__all__ = ["Cohere2Config", "Cohere2ForCausalLM", "Cohere2Model", "Cohere2PreTrainedModel"]