from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, MoeCausalLMOutputWithPast, MoeModelOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...utils import auto_docstring, is_torch_flex_attn_available, logging
from .configuration_granitemoeshared import GraniteMoeSharedConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)
class GraniteMoeSharedMLP(nn.Module):
    """
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
    """

    def __init__(self, config: GraniteMoeSharedConfig):
        super(GraniteMoeSharedMLP, self).__init__()

        self.input_size = config.hidden_size
        self.hidden_size = config.shared_intermediate_size
        self.activation = ACT2FN[config.hidden_act]
        self.input_linear = nn.Linear(self.input_size, self.hidden_size * 2, bias=False)
        self.output_linear = nn.Linear(self.hidden_size, self.input_size, bias=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.input_linear(hidden_states)
        chunked_hidden_states = hidden_states.chunk(2, dim=-1)
        hidden_states = self.activation(chunked_hidden_states[0]) * chunked_hidden_states[1]
        hidden_states = self.output_linear(hidden_states)
        return hidden_states

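# Illustrative sketch (not part of the original transformers module): the shared-expert MLP above is a
# gated feed-forward block -- `input_linear` produces twice the intermediate width, the two halves are
# combined as act(h1) * h2, and `output_linear` projects back. The config values below are arbitrary.
def _shared_mlp_demo():
    config = GraniteMoeSharedConfig(hidden_size=32, shared_intermediate_size=64, hidden_act="silu")
    mlp = GraniteMoeSharedMLP(config)
    hidden_states = torch.randn(2, 5, config.hidden_size)
    out = mlp(hidden_states)
    assert out.shape == hidden_states.shape  # the block is shape-preserving
    return out
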
class GraniteMoeSharedRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        GraniteMoeSharedRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"

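# Illustrative sketch (not part of the original module): RMSNorm rescales each feature vector by its
# root-mean-square (no mean subtraction, unlike LayerNorm) and then applies a learned per-channel weight.
def _rmsnorm_demo():
    norm = GraniteMoeSharedRMSNorm(hidden_size=8, eps=1e-6)
    x = torch.randn(2, 3, 8)
    y = norm(x)
    # with the default weight of ones, each normalized vector has (approximately) unit root-mean-square
    rms = y.pow(2).mean(-1).sqrt()
    assert torch.allclose(rms, torch.ones_like(rms), atol=1e-3)
    return y
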
class GraniteMoeSharedParallelExperts(nn.Module):
    def __init__(self, num_experts: int, input_size: int, output_size: int) -> None:
        """
        Initialize the GraniteMoeSharedParallelExperts module.
        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
        many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
        [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
        [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
        used in vllm.

        Args:
            num_experts (int):
                Number of experts.
            input_size (int):
                Size of the input.
            output_size (int):
                Size of the output.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.empty(num_experts, output_size, input_size))
        self.num_experts = num_experts
        self.input_size = input_size
        self.output_size = output_size

    def forward(self, inputs, expert_size):
        """
        Forward pass of the GraniteMoeSharedParallelExperts module.

        Args:
            inputs (Tensor):
                Input tensor.
            expert_size:
                Expert size information.

        Returns:
            Tensor: Output tensor.
        """
        input_list = inputs.split(expert_size, dim=0)
        output_list = []
        for i in range(self.num_experts):
            output_list.append(F.linear(input_list[i], self.weight[i]))
        results = torch.cat(output_list, dim=0)
        return results

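# Illustrative sketch (not part of the original module): how the parallel-expert weight layout
# [num_experts, output_size, input_size] is consumed. Tokens are assumed to be pre-sorted by expert,
# and `expert_size` gives how many rows belong to each expert. Sizes and values are made up.
def _parallel_experts_demo():
    num_experts, input_size, output_size = 4, 8, 16
    experts = GraniteMoeSharedParallelExperts(num_experts, input_size, output_size)
    nn.init.normal_(experts.weight, std=0.02)  # the real module leaves initialization to _init_weights

    # 10 routed token slots grouped by expert id as [3, 2, 4, 1]
    expert_size = [3, 2, 4, 1]
    tokens = torch.randn(sum(expert_size), input_size)

    out = experts(tokens, expert_size)
    assert out.shape == (sum(expert_size), output_size)
    return out
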
class GraniteMoeSharedTopKGating(nn.Module):
    def __init__(self, input_size: int, num_experts: int, top_k: int):
        """
        Initialize the top-k gating mechanism.
        Args:
            input_size (`int`):
                Size of the input.
            num_experts (`int`):
                Number of experts.
            top_k (`int`):
                Number of top experts to select.
        """
        super().__init__()

        self.num_experts = num_experts
        self.input_size = input_size
        self.top_k = top_k

        self.layer = nn.Linear(input_size, num_experts, bias=False)

    def forward(self, hidden_states):
        # compute the top-k routing decision
        logits = self.layer(hidden_states).float()  # [num_tokens, num_experts]
        top_k_logits, top_k_indices = logits.topk(self.top_k, dim=1)  # [num_tokens, top_k]
        top_k_gates = torch.softmax(top_k_logits, dim=1).type_as(hidden_states)  # [num_tokens, top_k]

        # compute the number of inputs given to each expert
        zeros = torch.zeros(
            [top_k_gates.size(0), self.num_experts], dtype=top_k_gates.dtype, device=top_k_gates.device
        )  # [num_tokens, num_experts]
        gates = zeros.scatter(1, top_k_indices, 1)  # [num_tokens, num_experts]
        expert_size = gates.long().sum(0)  # [num_experts,]
        expert_size = expert_size.tolist()

        # sort and group input tokens according to expert assignment
        top_k_experts = top_k_indices.flatten()  # [num_tokens * top_k]
        _, index_sorted_experts = top_k_experts.sort(0)  # [num_tokens * top_k]
        batch_index = index_sorted_experts.div(self.top_k, rounding_mode="trunc")  # [num_tokens * top_k]

        # gather the gate values for the grouped input tokens
        top_k_gates = top_k_gates.flatten()  # [num_tokens * top_k]
        batch_gates = top_k_gates[index_sorted_experts]  # [num_tokens * top_k]

        return index_sorted_experts, batch_index, batch_gates, expert_size, logits

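# Illustrative sketch (not part of the original module): what the gating outputs look like for a tiny
# batch. All sizes are arbitrary; the point is that `expert_size` sums to num_tokens * top_k and that
# `batch_index` maps every routed slot back to its source token.
def _top_k_gating_demo():
    torch.manual_seed(0)
    gate = GraniteMoeSharedTopKGating(input_size=8, num_experts=4, top_k=2)
    tokens = torch.randn(5, 8)  # 5 flattened tokens
    index_sorted_experts, batch_index, batch_gates, expert_size, logits = gate(tokens)
    assert sum(expert_size) == 5 * 2      # every token is routed to top_k experts
    assert batch_index.shape == (5 * 2,)  # source-token index for each routed slot
    assert logits.shape == (5, 4)         # raw router logits, later consumed by the aux loss
    return expert_size
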
class GraniteMoeSharedMoE(nn.Module):
    """
    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

    Args:
        config:
            Configuration object with model hyperparameters.
    """

    def __init__(self, config: GraniteMoeSharedConfig):
        super(GraniteMoeSharedMoE, self).__init__()

        self.input_size = config.hidden_size
        self.hidden_size = config.intermediate_size
        self.activation = ACT2FN[config.hidden_act]
        self.input_linear = GraniteMoeSharedParallelExperts(
            config.num_local_experts, self.input_size, self.hidden_size * 2
        )
        self.output_linear = GraniteMoeSharedParallelExperts(
            config.num_local_experts, self.hidden_size, self.input_size
        )

        self.router = GraniteMoeSharedTopKGating(
            input_size=self.input_size,
            num_experts=config.num_local_experts,
            top_k=config.num_experts_per_tok,
        )

    def forward(self, layer_input):
        """
        Forward pass of the mixture of experts layer.

        Args:
            layer_input (Tensor):
                Input tensor.

        Returns:
            Tensor:
                Output tensor.
            Tensor:
                Router logits.
        """
        bsz, length, emb_size = layer_input.size()
        layer_input = layer_input.reshape(-1, emb_size)
        _, batch_index, batch_gates, expert_size, router_logits = self.router(layer_input)

        expert_inputs = layer_input[batch_index]
        hidden_states = self.input_linear(expert_inputs, expert_size)
        chunked_hidden_states = hidden_states.chunk(2, dim=-1)
        hidden_states = self.activation(chunked_hidden_states[0]) * chunked_hidden_states[1]
        expert_outputs = self.output_linear(hidden_states, expert_size)

        expert_outputs = expert_outputs * batch_gates[:, None]

        zeros = torch.zeros(
            (bsz * length, self.input_size), dtype=expert_outputs.dtype, device=expert_outputs.device
        )
        layer_output = zeros.index_add(0, batch_index, expert_outputs)
        layer_output = layer_output.view(bsz, length, self.input_size)
        return layer_output, router_logits


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

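# Illustrative sketch (not part of the original module): running the sparse MoE block above on a dummy
# batch. The config values are small arbitrary numbers; we assume the standard GraniteMoeSharedConfig
# constructor accepts these fields (it provides defaults for everything else).
def _moe_block_demo():
    config = GraniteMoeSharedConfig(
        hidden_size=32,
        intermediate_size=64,
        num_local_experts=4,
        num_experts_per_tok=2,
        hidden_act="silu",
    )
    moe = GraniteMoeSharedMoE(config)
    hidden_states = torch.randn(2, 6, config.hidden_size)  # (batch, seq_len, hidden)
    layer_output, router_logits = moe(hidden_states)
    assert layer_output.shape == hidden_states.shape
    assert router_logits.shape == (2 * 6, config.num_local_experts)
    return layer_output
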

def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

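# Illustrative sketch (not part of the original module): applying RoPE to query/key tensors in the
# conventional [batch, heads, seq, head_dim] layout (hence the default unsqueeze_dim=1). The cos/sin
# tables are built by hand here; in the model they come from GraniteMoeSharedRotaryEmbedding below.
def _rope_demo():
    batch, heads, seq_len, head_dim = 1, 2, 4, 8
    q = torch.randn(batch, heads, seq_len, head_dim)
    k = torch.randn(batch, heads, seq_len, head_dim)

    inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))  # (head_dim / 2,)
    positions = torch.arange(seq_len).float()
    freqs = torch.outer(positions, inv_freq)                  # (seq_len, head_dim / 2)
    emb = torch.cat((freqs, freqs), dim=-1)                   # (seq_len, head_dim)
    cos, sin = emb.cos()[None, :, :], emb.sin()[None, :, :]   # (1, seq_len, head_dim)

    q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin)
    assert q_embed.shape == q.shape and k_embed.shape == k.shape
    return q_embed, k_embed
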
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class GraniteMoeSharedAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: GraniteMoeSharedConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.is_causal = True
        self.scaling = config.attention_multiplier

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings if position_embeddings is not None else (None, None)
        if position_embeddings is not None:
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, q_len, -1)
        attn_output = self.o_proj(attn_output)

        return attn_output, attn_weights, past_key_value


class GraniteMoeSharedDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: GraniteMoeSharedConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = GraniteMoeSharedAttention(config=config, layer_idx=layer_idx)
        self.block_sparse_moe = GraniteMoeSharedMoE(config)
        self.input_layernorm = GraniteMoeSharedRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = GraniteMoeSharedRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.residual_multiplier = config.residual_multiplier
        self.shared_mlp = None if config.shared_intermediate_size == 0 else GraniteMoeSharedMLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        output_router_logits: Optional[bool] = False,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states * self.residual_multiplier

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        moe_hidden_states, router_logits = self.block_sparse_moe(hidden_states)

        if self.shared_mlp is None:
            hidden_states = moe_hidden_states
        else:
            hidden_states = moe_hidden_states + self.shared_mlp(hidden_states)

        hidden_states = residual + hidden_states * self.residual_multiplier

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        if output_router_logits:
            outputs += (router_logits,)

        return outputs


@auto_docstring
class GraniteMoeSharedPreTrainedModel(PreTrainedModel):
    config_class = GraniteMoeSharedConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["GraniteMoeSharedDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = False

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, GraniteMoeSharedRMSNorm):
            module.weight.data.fill_(1.0)
        elif isinstance(module, GraniteMoeSharedParallelExperts):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)


class GraniteMoeSharedRotaryEmbedding(nn.Module):
    def __init__(self, config: GraniteMoeSharedConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


@auto_docstring
class GraniteMoeSharedModel(GraniteMoeSharedPreTrainedModel):
    def __init__(self, config: GraniteMoeSharedConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [GraniteMoeSharedDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = GraniteMoeSharedRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.gradient_checkpointing = False

        self.embedding_multiplier = config.embedding_multiplier
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta

        # rope
        self.position_embedding_type = config.position_embedding_type
        self.rotary_emb = GraniteMoeSharedRotaryEmbedding(config) if self.position_embedding_type == "rope" else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, MoeModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        inputs_embeds = inputs_embeds * self.embedding_multiplier

        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):  # kept for BC (non `Cache` `past_key_values` inputs)
            return_legacy_cache = True
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            logger.warning_once(
                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be "
                "removed in v4.43. Please use an appropriate `Cache` class "
                "(https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
            )

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = None
        if self.rotary_emb is not None:
            position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_router_logits = () if output_router_logits else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                output_router_logits=output_router_logits,
                position_embeddings=position_embeddings,
            )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

            if output_router_logits:
                all_router_logits += (layer_outputs[-1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
                if v is not None
            )
        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            router_logits=all_router_logits,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument,
        # in order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will
        # fail to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask

def load_balancing_loss_func(
    gate_logits: Union[torch.Tensor, Tuple[torch.Tensor], None],
    num_experts: Optional[int] = None,
    top_k=2,
    attention_mask: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, int]:
    r"""
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    """
    if gate_logits is None or not isinstance(gate_logits, tuple):
        return 0

    if isinstance(gate_logits, tuple):
        compute_device = gate_logits[0].device
        concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)

    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)

    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)

    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)

    if attention_mask is None:
        # Compute the percentage of tokens routed to each expert
        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.mean(routing_weights, dim=0)
    else:
        batch_size, sequence_length = attention_mask.shape
        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)

        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
        expert_attention_mask = (
            attention_mask[None, :, :, None, None]
            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
            .reshape(-1, top_k, num_experts)
            .to(compute_device)
        )

        # Compute the percentage of tokens routed to each expert
        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
            expert_attention_mask, dim=0
        )

        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
        router_per_expert_attention_mask = (
            attention_mask[None, :, :, None]
            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
            .reshape(-1, num_experts)
            .to(compute_device)
        )

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
            router_per_expert_attention_mask, dim=0
        )

    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
    return overall_loss * num_experts


class GraniteMoeSharedForCausalLM(GraniteMoeSharedPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: GraniteMoeSharedConfig):
        super().__init__(config)
        self.model = GraniteMoeSharedModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.router_aux_loss_coef = config.router_aux_loss_coef
        self.num_experts = config.num_local_experts
        self.num_experts_per_tok = config.num_experts_per_tok
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, GraniteMoeSharedForCausalLM

        >>> model = GraniteMoeSharedForCausalLM.from_pretrained("ibm/PowerMoE-3b")
        >>> tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            output_router_logits=output_router_logits,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        logits = logits / self.config.logits_scaling

        loss = None
        if labels is not None:
            # Upcast to float to avoid potential precision issues when computing the loss
            logits = logits.float()
            loss = self.loss_function(logits, labels, vocab_size=self.config.vocab_size, **kwargs)

        aux_loss = None
        if output_router_logits:
            aux_loss = load_balancing_loss_func(
                outputs.router_logits if return_dict else outputs[-1],
                self.num_experts,
                self.num_experts_per_tok,
                attention_mask,
            )
            if labels is not None:
                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device

        if not return_dict:
            output = (logits,) + outputs[1:]
            if output_router_logits:
                output = (aux_loss,) + output
            return (loss,) + output if loss is not None else output

        return MoeCausalLMOutputWithPast(
            loss=loss,
            aux_loss=aux_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            router_logits=outputs.router_logits,
        )

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


__all__ = ["GraniteMoeSharedForCausalLM", "GraniteMoeSharedModel", "GraniteMoeSharedPreTrainedModel"]