from typing import Optional, Tuple

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...utils import logging
from ..granitemoe.modeling_granitemoe import (
    GraniteMoeDecoderLayer,
    GraniteMoeForCausalLM,
    GraniteMoeModel,
    GraniteMoePreTrainedModel,
)
from .configuration_granitemoeshared import GraniteMoeSharedConfig


logger = logging.get_logger(__name__)


class GraniteMoeSharedMLP(nn.Module):
    """
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
    """
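
    # A GLU-style feed-forward block: `input_linear` packs the gate and the
    # up-projection into a single matmul of width 2 * shared_intermediate_size,
    # forward() splits the result in half, combines the halves as
    # activation(gate) * up, and `output_linear` maps the product back to
    # hidden_size.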

    def __init__(self, config: GraniteMoeSharedConfig):
        super(GraniteMoeSharedMLP, self).__init__()

        self.input_size = config.hidden_size
        self.hidden_size = config.shared_intermediate_size
        self.activation = ACT2FN[config.hidden_act]
        self.input_linear = nn.Linear(self.input_size, self.hidden_size * 2, bias=False)
        self.output_linear = nn.Linear(self.hidden_size, self.input_size, bias=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.input_linear(hidden_states)
        chunked_hidden_states = hidden_states.chunk(2, dim=-1)
        hidden_states = self.activation(chunked_hidden_states[0]) * chunked_hidden_states[1]
        hidden_states = self.output_linear(hidden_states)
        return hidden_states


class GraniteMoeSharedDecoderLayer(GraniteMoeDecoderLayer):
    def __init__(self, config: GraniteMoeSharedConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        self.shared_mlp = None if config.shared_intermediate_size == 0 else GraniteMoeSharedMLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        output_router_logits: Optional[bool] = False,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss
                and should not be returned during inference.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code into the model.
        """
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
        )
        hidden_states = residual + hidden_states * self.residual_multiplier

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        moe_hidden_states, router_logits = self.block_sparse_moe(hidden_states)

        if self.shared_mlp is None:
            hidden_states = moe_hidden_states
        else:
            hidden_states = moe_hidden_states + self.shared_mlp(hidden_states)
        del moe_hidden_states

        hidden_states = residual + hidden_states * self.residual_multiplier

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        if output_router_logits:
            outputs += (router_logits,)

        return outputs


class GraniteMoeSharedPreTrainedModel(GraniteMoePreTrainedModel):
    config_class = GraniteMoeSharedConfig
    _no_split_modules = ["GraniteMoeSharedDecoderLayer"]


class GraniteMoeSharedModel(GraniteMoeModel):
    def __init__(self, config: GraniteMoeSharedConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [GraniteMoeSharedDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )


class GraniteMoeSharedForCausalLM(GraniteMoeForCausalLM):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: GraniteMoeSharedConfig):
        super().__init__(config)
        self.model = GraniteMoeSharedModel(config)
        # Initialize weights and apply final processing
        self.post_init()


__all__ = ["GraniteMoeSharedForCausalLM", "GraniteMoeSharedModel", "GraniteMoeSharedPreTrainedModel"]