
    fTh-                     Z   S SK JrJrJrJr  S SKrS SKrS SKJr  SSKJ	r	J
r
  SSKJr  SSKJrJr  SSKJr  SS	KJrJr  S
SKJrJrJrJrJr  SSKJr  \R:                  " \5      r " S S\5      r  " S S\5      r! " S S\5      r" " S S\5      r# " S S\\5      r$ " S S\5      r%/ SQr&g)    )ListOptionalTupleUnionN)nn   )CacheDynamicCache)FlashAttentionKwargs)BaseModelOutputWithPastCausalLMOutputWithPast)Unpack)
LossKwargslogging   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaPreTrainedModel   )GraniteConfigc                   B   ^  \ rS rSrSrSS\S\\   4U 4S jjjrSr	U =r
$ )GraniteAttention(   z=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                 F   > [         TU ]  X5        UR                  U l        g N)super__init__attention_multiplierscalingselfr   r   	__class__s      c/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/granite/modular_granite.pyr!   GraniteAttention.__init__+   s    +22    )r#   r   )__name__
__module____qualname____firstlineno____doc__r   r   intr!   __static_attributes____classcell__r&   s   @r'   r   r   (   s"    G3} 3# 3 3r)   r   c                   v  ^  \ rS rSrS\S\4U 4S jjr       SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\	\R                     S\	\\R                  \R                  4      S\\R                  \	\\R                  \R                  4      4   4S jjrSrU =r$ )GraniteDecoderLayer0   r   r   c                 b   > [         TU ]  X5        UR                  U l        [        XS9U l        g )N)r   r   )r    r!   residual_multiplierr   	self_attnr$   s      r'   r!   GraniteDecoderLayer.__init__1   s*    +#)#=#= )Mr)   hidden_statesattention_maskposition_idspast_key_valueoutput_attentions	use_cachecache_positionposition_embeddingsreturnc	                    Un
U R                  U5      nU R                  " SUUUUUUUUS.U	D6u  pXU R                  -  -   nUn
U R                  U5      nU R	                  U5      nXU R                  -  -   nU4nU(       a  X4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*):
        attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
        query_sequence_length, key_sequence_length)` if default attention is used.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence
    position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
        Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
        with `head_dim` being the embedding dimension of each attention head.
    kwargs (`dict`, *optional*):
        Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
        into the model
)r:   r;   r<   r=   r>   r?   r@   rA    )input_layernormr8   r7   post_attention_layernormmlp)r%   r:   r;   r<   r=   r>   r?   r@   rA   kwargsresidualself_attn_weightsoutputss                r'   forwardGraniteDecoderLayer.forward6   s    D !,,]; ,0>> 
,
')%)/) 3
,
 
,
( !43K3K#KK !55mD/ 43K3K#KK "++Gr)   )r7   r8   )NNNFFNN)r*   r+   r,   r-   r   r/   r!   torchTensorr   
LongTensorr	   boolr   FloatTensorrL   r0   r1   r2   s   @r'   r4   r4   0   s   N} N N 2637*.,1$)59KO?||? !.? u//0	?
 !? $D>? D>? !!1!12? &eELL%,,,F&GH? 
u  (51B1BEDUDU1U+V"WW	X? ?r)   r4   c                       \ rS rSrSrg)GranitePreTrainedModelx   rD   Nr*   r+   r,   r-   r0   rD   r)   r'   rT   rT   x   s    r)   rT   c                     ^  \ rS rSrS\4U 4S jjr         SS\\R                     S\\R                     S\\R                     S\\
   S\\R                     S	\\   S
\\   S\\   S\\R                     S\\   S\4S jjrSrU =r$ )GraniteModel|   r   c           	         > [         TU ]  U5        UR                  U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        g s  snf r   )	r    r!   embedding_multiplierr   
ModuleListrangenum_hidden_layersr4   layersr$   s      r'   r!   GraniteModel.__init__}   sV     $*$?$?!mmEJ6KcKcEdeEd	 3Ede
es   	A+	input_idsr;   r<   past_key_valuesinputs_embedsr?   r>   output_hidden_statesr@   flash_attn_kwargsrB   c
                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUS L US L-  (       a  [	        S5      eU R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nXPR                  -  nU(       a  Uc
  [        5       nU	cD  Ub  UR                  5       OSn[        R                  " XUR                  S   -   UR                   S9n	Uc  U	R#                  S5      nU R%                  X%XU5      nUnU R'                  X5      nU(       a  SOS nU(       a  SOS nU R(                  S U R                   R*                    H7  nU(       a  X4-  nU" U4UUUUUU	US.U
D6nUS   nU(       d  M.  UUS   4-  nM9     U R-                  U5      nU(       a  X4-  n[/        UU(       a  UOS UUS	9$ )
Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r   )devicerD   )r;   r<   r=   r>   r?   r@   rA   )last_hidden_staterb   r:   
attentions)r   r>   rd   r?   
ValueErrorgradient_checkpointingtrainingloggerwarning_onceembed_tokensr[   r
   get_seq_lengthrN   arangeshaperg   	unsqueeze_update_causal_mask
rotary_embr_   r^   normr   )r%   ra   r;   r<   rb   rc   r?   r>   rd   r@   re   past_seen_tokenscausal_maskr:   rA   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                      r'   rL   GraniteModel.forward   s3    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M%(A(AA0*nO!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L..>L]
 & #oomJ #7BD0d![[)H4;;+H+HIM#!%55!)
*)."3#-$7
 $
M *!,M  =#3"55' J* 		-0  !11&+/8Od+%	
 	
r)   )r[   r_   )	NNNNNNNNN)r*   r+   r,   r-   r   r!   r   rN   rP   rO   r	   rR   rQ   r   r   r   rL   r0   r1   r2   s   @r'   rX   rX   |   s    
} 
 151537+/59$(,0/359Z
E,,-Z
 !.Z
 u//0	Z

 "%Z
   1 12Z
 D>Z
 $D>Z
 'tnZ
 !!1!12Z
 $$89Z
 
!Z
 Z
r)   rX   c                       \ rS rSrSrg)KwargsForCausalLM   rD   NrV   rD   r)   r'   r   r      s    3r)   r   c                   h   \ rS rSr           SS\\R                     S\\R                     S\\R                     S\\\	\
\R                     4      S\\R                     S\\R                     S	\\   S
\\   S\\   S\\R                     S\\\R                  4   S\\   S\4S jjrSrg)GraniteForCausalLM   Nra   r;   r<   rb   rc   labelsr?   r>   rd   r@   logits_to_keeprH   rB   c                     Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	U R                  " SUUUUUUUU	U
S.	UD6nUR                  n[        U[        5      (       a  [        U* S 5      OUnU R                  US S 2US S 24   5      nUU R                   R                  -  nS nUb)  U R                  " SUX`R                   R                  S.UD6n[        UUUR                  UR                  UR                  S9$ )N)	ra   r;   r<   rb   rc   r?   r>   rd   r@   )logitsr   
vocab_size)lossr   rb   r:   ri   rD   )r   r>   rd   modelrh   
isinstancer/   slicelm_headlogits_scalingloss_functionr   r   rb   r:   ri   )r%   ra   r;   r<   rb   rc   r   r?   r>   rd   r@   r   rH   rK   r:   slice_indicesr   r   s                     r'   rL   GraniteForCausalLM.forward   s+    2C1N-TXT_T_TqTq$8$D $++JjJj 	
 ,0:: ,
)%+'/!5),
 ,
  118B>SV8W8W~ot4]kmA}a,?@A$++444%%pVF{{OeOepiopD%#33!//))
 	
r)   rD   )NNNNNNNNNNr   )r*   r+   r,   r-   r   rN   rP   rO   r   r	   r   rR   rQ   r/   r   r   r   rL   r0   rD   r)   r'   r   r      s)    151537KO59-1$(,0/359342
E,,-2
 !.2
 u//0	2

 "%tE4E4E/F(F"GH2
   1 122
 ))*2
 D>2
 $D>2
 'tn2
 !!1!122
 c5<</02
 *+2
 
 2
 2
r)   r   )r   rX   rT   )'typingr   r   r   r   rN   torch.utils.checkpointr   cache_utilsr	   r
   modeling_flash_attention_utilsr   modeling_outputsr   r   processing_utilsr   utilsr   r   llama.modeling_llamar   r   r   r   r   configuration_graniter   
get_loggerr*   rm   r   r4   rT   rX   r   r   __all__rD   r)   r'   <module>r      s     0 /    . B O & (  1 
		H	%3~ 3E+ EP	1 	b
: b
J ?,j >3
) 3
l Kr)   