
    fTh.                        S r SSKJrJrJrJrJr  SSKrSSKrSSKJ	r	  SSK
Jr  SSKJrJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJrJrJrJrJrJrJ r J!r!J"r"J#r#  SSK$J%r%  \RL                  " \'5      r( " S S\	RR                  5      r* " S S\5      r+ " S S\5      r, " S S\!5      r- " S S\ 5      r. " S S\5      r/ " S S\5      r0 " S S \5      r1 " S! S"\5      r2/ S#Qr3g)$zPyTorch Starcoder2 model.    )CallableListOptionalTupleUnionN)nn   )ACT2FN)CacheDynamicCache)FlashAttentionKwargs)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)logging   )
MistralAttentionMistralDecoderLayerMistralForCausalLM MistralForSequenceClassificationMistralForTokenClassificationMistralModelMistralPreTrainedModelMistralRotaryEmbeddingapply_rotary_pos_embeager_attention_forward   )Starcoder2Configc                   v   ^  \ rS rSrS\4U 4S jjrS\\\R                        S\R                  4S jr
SrU =r$ )Starcoder2MLP5   configc                 D  > [         TU ]  5         UR                  n[        R                  " X!R
                  UR                  S9U l        [        R                  " UR
                  X!R                  S9U l        [        UR                     U l        UR                  U l        g N)bias)super__init__hidden_sizer   Linearintermediate_sizeuse_biasc_fcc_projr
   
hidden_actactresidual_dropout)selfr"   	embed_dim	__class__s      i/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/starcoder2/modular_starcoder2.pyr'   Starcoder2MLP.__init__6   sq    &&	IIi)A)AX	ii 8 8)//Z&++, & 7 7    hidden_statesreturnc                     U R                  U5      nU R                  U5      nU R                  U5      n[        R                  R                  XR                  U R                  S9nU$ )Nptraining)r,   r/   r-   r   
functionaldropoutr0   r<   )r1   r7   s     r4   forwardStarcoder2MLP.forward>   sX    		-0/M2--m?T?T_c_l_l-mr6   )r/   r,   r-   r0   )__name__
__module____qualname____firstlineno__r   r'   r   r   torchFloatTensorr?   __static_attributes____classcell__r3   s   @r4   r    r    5   s>    8/ 8XeE4E4E.F%G EL]L]  r6   r    c                   L  ^  \ rS rSrSS\S\\   4U 4S jjjr  SS\R                  S\
\R                  \R                  4   S\\R                     S\\   S	\\R                     S
\\   S\
\R                  \\R                     \\
\R                        4   4S jjrSrU =r$ )Starcoder2AttentionF   r"   	layer_idxc                 t  > [         TU ]  5         UR                  U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l	        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR                  U R                  -  UR
                  UR                  S9U l        g r$   )r&   r'   r0   r   r)   r(   num_attention_headshead_dimr+   q_projnum_key_value_headsk_projv_projo_projr1   r"   rM   r3   s      r4   r'   Starcoder2Attention.__init__G   s     & 7 7ii 2 2F4N4NQUQ^Q^4^eketetuii 2 2F4N4NQUQ^Q^4^eketetuii 2 2F4N4NQUQ^Q^4^eketetuii : :T]] JFL^L^eketetur6   r7   position_embeddingsattention_maskpast_key_valuecache_positionkwargsr8   c           
      T   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  ad  U R                  R                  S:X  a-  UR                  SS5      (       a  [        R                  S	5        O[         U R                  R                     nU" U U	U
UU4U R"                  (       d  S
OU R$                  U R&                  [)        U R                  SS 5      S.UD6u  nnUR*                  " / UQSP76 R-                  5       nU R/                  U5      n[0        R2                  R5                  UU R6                  U R"                  S9nUU4$ )Nr   r   )sincosr[   eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        sliding_window)r>   scalingre   r:   )shaperP   rQ   view	transposerS   rT   r   updaterM   r   r"   _attn_implementationgetloggerwarning_oncer   r<   attention_dropoutrf   getattrreshape
contiguousrU   r   r=   r>   r0   )r1   r7   rX   rY   rZ   r[   r\   input_shapehidden_shapequery_states
key_statesvalue_statesr`   r_   cache_kwargsattention_interfaceattn_outputattn_weightss                     r4   r?   Starcoder2Attention.forwardO   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[ %#&nUL'5'<'<ZW[WeWegs't$J(?;;++w6{{//69fjjI\^c>d>d##L
 '>dkk>^>^&_#$7
%
  $}}C$2H2HLL"4;;0@$G
%
 
%
!\ "));;;;FFHkk+.mm++4004== , 
 L((r6   )rS   rU   rQ   r0   rT   )N)NN)rA   rB   rC   rD   r   r   intr'   rE   Tensorr   r   
LongTensorr   r   r?   rG   rH   rI   s   @r4   rK   rK   F   s    v/ vHSM v v +/594)||4) #5<<#=>4) !.	4)
 !4) !!1!124) -.4) 
u||Xell3XeELL>Q5RR	S4) 4)r6   rK   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )Starcoder2DecoderLayer   r"   rM   c                   > [         TU ]  U 5        [        XS9U l        [	        U5      U l        [        R                  " UR                  UR                  S9U l
        [        R                  " UR                  UR                  S9U l        g )N)r"   rM   eps)r&   r'   rK   	self_attnr    mlpr   	LayerNormr(   norm_epsiloninput_layernormpost_attention_layernormrV   s      r4   r'   Starcoder2DecoderLayer.__init__   sf    ,FP (!||F,>,>FDWDWX(*V5G5GVM`M`(a%r6   )r   r   r   r   )	rA   rB   rC   rD   r   r}   r'   rG   rH   rI   s   @r4   r   r      s     b/ bC b br6   r   c                       \ rS rSrSrg)Starcoder2RotaryEmbedding    NrA   rB   rC   rD   rG   r   r6   r4   r   r          r6   r   c                       \ rS rSrS rSrg)Starcoder2PreTrainedModel   c                    U R                   R                  n[        U[        R                  5      (       aW  UR
                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR
                  R                  R                  SUS9  UR                  b2  UR
                  R                  UR                     R                  5         g g [        U[        R                  5      (       aJ  UR
                  R                  R                  S5        UR                  R                  R                  5         g g )Nrd   )meanstdg      ?)r"   initializer_range
isinstancer   r)   weightdatanormal_r%   zero_	Embeddingpadding_idxr   fill_)r1   moduler   s      r4   _init_weights'Starcoder2PreTrainedModel._init_weights   s   kk++fbii((MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> .--MM$$S)KK""$ .r6   r   N)rA   rB   rC   rD   r   rG   r   r6   r4   r   r      s    %r6   r   c                   @  ^  \ rS rSrS\4U 4S jjr         SS\\R                     S\\R                     S\\R                     S\\
\\\R                     4      S\\R                     S	\\   S
\\   S\\   S\\R                     S\\   S\4S jjrSrU =r$ )Starcoder2Model   r"   c           	      <  > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        R                  " UR                  UR                  S9U l        UR                  U l        g s  snf )Nr   )r&   r'   r   
ModuleListrangenum_hidden_layersr   layersr   r(   r   normembedding_dropoutrV   s      r4   r'   Starcoder2Model.__init__   sy     mmHMfNfNfHghHg9#F6Hgh
 LL!3!39L9LM	!'!9!9 is   B	input_idsrY   position_idspast_key_valuesinputs_embeds	use_cacherc   output_hidden_statesr[   flash_attn_kwargsr8   c
                 X   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUS L US L-  (       a  [	        S5      eU R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nU(       a  Uc
  [        5       nU	cD  Ub  UR                  5       OSn[        R                  " XUR                  S   -   UR                  S9n	Uc  U	R!                  S5      nU R#                  X%XU5      nUn[$        R&                  R)                  XR*                  U R                  S9nU R-                  X5      nU(       a  SOS nU(       a  SOS nU R.                  S U R                   R0                    H7  nU(       a  X4-  nU" U4UUUUUU	US	.U
D6nUS   nU(       d  M.  UUS   4-  nM9     U R3                  U5      nU(       a  X4-  n[5        UU(       a  UOS UUS
9$ )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r   )devicer:   r   )rY   r   rZ   rc   r   r[   rX   )last_hidden_stater   r7   
attentions)r"   rc   r   r   
ValueErrorgradient_checkpointingr<   rm   rn   embed_tokensr   get_seq_lengthrE   arangerg   r   	unsqueeze_update_causal_maskr   r=   r>   r   
rotary_embr   r   r   r   )r1   r   rY   r   r   r   r   rc   r   r[   r   past_seen_tokenscausal_maskr7   rX   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                      r4   r?   Starcoder2Model.forward   sM    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M0*nO!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L..>L]
 &--33dmm . 

 #oomJ #7BD0d![[)H4;;+H+HIM#!%55!)
*)."3#-$7
 $
M *!,M  =#3"55' J* 		-0  !11&+/8Od+%	
 	
r6   )r   r   r   )	NNNNNNNNN)rA   rB   rC   rD   r   r'   r   rE   r   r~   r   r   r   rF   boolr   r   r   r?   rG   rH   rI   s   @r4   r   r      s   :/ : 151537KO59$(,0/359[
E,,-[
 !.[
 u//0	[

 "%tE4E4E/F(F"GH[
   1 12[
 D>[
 $D>[
 'tn[
 !!1!12[
 $$89[
 
![
 [
r6   r   c                       \ rS rSrSrg)Starcoder2ForCausalLMi
  r   Nr   r   r6   r4   r   r   
  r   r6   r   c                       \ rS rSrSrg)#Starcoder2ForSequenceClassificationi  r   Nr   r   r6   r4   r   r     r   r6   r   c                       \ rS rSrSrg) Starcoder2ForTokenClassificationi  r   Nr   r   r6   r4   r   r     r   r6   r   )r   r   r   r   r   )4__doc__typingr   r   r   r   r   rE   torch.utils.checkpointr   activationsr
   cache_utilsr   r   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr   utilsr   mistral.modeling_mistralr   r   r   r   r   r   r   r   r   r   configuration_starcoder2r   
get_loggerrA   rm   Moduler    rK   r   r   r   r   r   r   r   __all__r   r6   r4   <module>r      s   (   9 9    ! . B 7 5 &    7 
		H	%BII "=)* =)@b0 b	 6 	% 6 % d
l d
N	. 		*J 		'D 	r6   