# transformers/models/granitemoeshared/modular_granitemoeshared.py

from typing import Optional, Tuple

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...utils import logging
from ..granitemoe.modeling_granitemoe import (
    GraniteMoeDecoderLayer,
    GraniteMoeForCausalLM,
    GraniteMoeModel,
    GraniteMoePreTrainedModel,
)
from .configuration_granitemoeshared import GraniteMoeSharedConfig


logger = logging.get_logger(__name__)


class GraniteMoeSharedMLP(nn.Module):
    """
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
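
    Example:

        A minimal illustrative sketch, not part of the original module; it assumes `config` is a
        `GraniteMoeSharedConfig` with a non-zero `shared_intermediate_size`. The layer projects
        `hidden_size` to `2 * shared_intermediate_size`, gates one half with the other, and
        projects back to `hidden_size`, so the output shape matches the input.

        ```python
        import torch

        mlp = GraniteMoeSharedMLP(config)
        hidden_states = torch.randn(2, 5, config.hidden_size)
        output = mlp(hidden_states)  # shape: (2, 5, config.hidden_size)
        ```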
    """

    def __init__(self, config: GraniteMoeSharedConfig):
        super(GraniteMoeSharedMLP, self).__init__()

        self.input_size = config.hidden_size
        self.hidden_size = config.shared_intermediate_size
        self.activation = ACT2FN[config.hidden_act]
        self.input_linear = nn.Linear(self.input_size, self.hidden_size * 2, bias=False)
        self.output_linear = nn.Linear(self.hidden_size, self.input_size, bias=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.input_linear(hidden_states)
        chunked_hidden_states = hidden_states.chunk(2, dim=-1)
        hidden_states = self.activation(chunked_hidden_states[0]) * chunked_hidden_states[1]
        hidden_states = self.output_linear(hidden_states)
        return hidden_states


class GraniteMoeSharedDecoderLayer(GraniteMoeDecoderLayer):
    def __init__(self, config: GraniteMoeSharedConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        self.shared_mlp = None if config.shared_intermediate_size == 0 else GraniteMoeSharedMLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        output_router_logits: Optional[bool] = False,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
                into the model.
        """
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states * self.residual_multiplier

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        moe_hidden_states, router_logits = self.block_sparse_moe(hidden_states)

        if self.shared_mlp is None:
            hidden_states = moe_hidden_states
        else:
            hidden_states = moe_hidden_states + self.shared_mlp(hidden_states)

        del moe_hidden_states

        hidden_states = residual + hidden_states * self.residual_multiplier

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        if output_router_logits:
            outputs += (router_logits,)

        return outputs
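
# Note (summary of the layer above, not executable documentation): compared to the parent
# GraniteMoeDecoderLayer, the feed-forward step adds an always-active shared expert. For a
# post-attention hidden state `h`, the MoE output becomes `block_sparse_moe(h)[0] + shared_mlp(h)`
# when `shared_intermediate_size > 0` (just the routed-expert output otherwise), and is scaled by
# `residual_multiplier` before being added back to the residual stream.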


class GraniteMoeSharedPreTrainedModel(GraniteMoePreTrainedModel):
    config_class = GraniteMoeSharedConfig
    _no_split_modules = ["GraniteMoeSharedDecoderLayer"]


class GraniteMoeSharedModel(GraniteMoeModel):
    def __init__(self, config: GraniteMoeSharedConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [GraniteMoeSharedDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )


class GraniteMoeSharedForCausalLM(GraniteMoeForCausalLM):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: GraniteMoeSharedConfig):
        super().__init__(config)
        self.model = GraniteMoeSharedModel(config)
        # Initialize weights and apply final processing
        self.post_init()


__all__ = ["GraniteMoeSharedForCausalLM", "GraniteMoeSharedModel", "GraniteMoeSharedPreTrainedModel"]
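
# A minimal end-to-end sketch (illustrative only, not part of the library module): build a tiny
# randomly initialised model from a GraniteMoeSharedConfig and run one forward pass. The keyword
# arguments below are assumed GraniteMoe-style config fields with made-up toy values; check
# `GraniteMoeSharedConfig` for the exact names and defaults.
#
#     import torch
#
#     config = GraniteMoeSharedConfig(
#         vocab_size=128,
#         hidden_size=64,
#         intermediate_size=128,
#         shared_intermediate_size=128,  # 0 disables the shared expert (shared_mlp is None)
#         num_hidden_layers=2,
#         num_attention_heads=4,
#         num_key_value_heads=4,
#         num_local_experts=4,
#         num_experts_per_tok=2,
#     )
#     model = GraniteMoeSharedForCausalLM(config)
#     input_ids = torch.randint(0, config.vocab_size, (1, 8))
#     logits = model(input_ids=input_ids).logits  # shape: (1, 8, config.vocab_size)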