o
    ZhF                    @   s`  d dl mZmZmZmZmZ d dlZd dlm  m	Z
 d dlmZ d dlm  m  mZ d dlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$m%Z%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, e* rd dl-m.Z. d dl/m0Z0m1Z1 ndZ.e) rd dl2m3Z3m4Z4 nd\Z4Z3e& rd dl5m6Z6 ddl7m8Z8 e'9e:Z;dd Z<dVddZ=dej>de?dej>fddZ@	 dWd!ejAd"ej>d#ej>d$ej>d%eej> d&eBd'eBfd(d)ZCG d*d+ d+ejAZDG d,d- d-ejEZEd.ej>d/e?fd0d1ZFd2d3 ZGd4d5 ZHeIe.e3e4fZJd6d7 ZKG d8d9 d9ejAZLG d:d; d;ejjAZMG d<d= d=ejAZNG d>d? d?ejAZOG d@dA dAejAZPG dBdC dCejAZQG dDdE dEejAZRG dFdG dGeZSe$G dHdI dIe"ZTG dJdK dKejAZUe$G dLdM dMeTZV		N	dXdOeej>eej> df dPee? d%eej> deej>e?f fdQdRZWG dSdT dTeTeZXg dUZYdS )Y    )CallableListOptionalTupleUnionN)nn)ACT2FN   )Cache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutputWithPastMoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)auto_docstringcan_return_tupleis_torch_flex_attn_availablelogging)is_causal_conv1d_availableis_mamba_2_ssm_available   )GraniteMoeHybridConfig)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combined)causal_conv1d_fncausal_conv1d_updateNN)	BlockMask)make_flex_block_causal_maskc                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..N   dim)shapetorchcat)xx1Zx2 r.   m/var/www/auris/lib/python3.10/site-packages/transformers/models/granitemoehybrid/modeling_granitemoehybrid.pyrotate_half@   s   r0   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer0   )qkcossinposition_idsZunsqueeze_dimZq_embedZk_embedr.   r.   r/   apply_rotary_pos_embG   s
   

r7   hidden_statesn_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r)   expandreshape)r8   r9   batchnum_key_value_headsslenhead_dimr.   r.   r/   	repeat_kvb   s
   0rA           modulequerykeyvalueattention_maskscalingdropoutc                 K   s   t || j}t || j}	t||dd| }
|d ur3|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr&   r	   r%   )r(   dtype)ptrainingr   )rA   num_key_value_groupsr*   matmul	transposer)   r   
functionalsoftmaxfloat32torK   rI   rM   
contiguous)rC   rD   rE   rF   rG   rH   rI   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputr.   r.   r/   eager_attention_forwardn   s   
&r\   c                       s   e Zd ZdZdedef fddZ						ddejd	e	ej d
e	ej
 de	e dede	ej
 de	eejejf  deeje	ej e	eej  f fddZ  ZS )GraniteMoeHybridAttentionz=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                    s   t    || _|| _|d u rtd| jj d |j| _|j	| _	|j
| _| j	| j | _|j| _| j| j | _d| _|j| _| j| j | j	krUtd| j	 d| j dtj| j	| j| j |jd| _tj| j	| j| j |jd| _tj| j	| j| j |jd| _tj| j	| j	|jd| _d S )NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).bias)super__init__r^   r_   loggerwarning_once	__class____name__attention_dropouthidden_sizenum_attention_heads	num_headsr@   r>   rN   Z	is_causalZattention_multiplierrH   
ValueErrorr   LinearZattention_biasq_projk_projv_projo_projselfr^   r_   rf   r.   r/   rc      s2   

z"GraniteMoeHybridAttention.__init__NFr8   rG   r6   past_key_value	use_cachecache_positionposition_embeddingsr:   c                 K   sj  |  \}	}
}| |}| |}| |}||	|
| j| jdd}||	|
| j| jdd}||	|
| j| jdd}|d urF|nd\}}|d urWt	||||\}}|d url|||d}|
||| j|\}}t}| jjdkr| jjdkr|ddrtd	 nt| jj }|| ||||f| jsd
n| j| jd|\}}||	|
d}| |}|||fS )Nr   r&   r"   )r5   r4   rw   eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.rB   )rI   rH   r%   )sizern   ro   rp   viewrk   r@   rP   r>   r7   updater_   r\   r^   _attn_implementationgetrd   re   r   rM   rh   rH   rq   )rs   r8   rG   r6   ru   rv   rw   rx   rV   bszZq_len_Zquery_statesrW   rX   r4   r5   Zcache_kwargsZattention_interfacer[   rY   r.   r.   r/   forward   sF   





z!GraniteMoeHybridAttention.forward)NNNFNN)rg   
__module____qualname____doc__r   intrc   r*   Tensorr   
LongTensorr
   boolr   r   __classcell__r.   r.   rt   r/   r]      s4    #
r]   c                       s.   e Zd ZdZejdfdef fddZ  ZS ) HybridMambaAttentionDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    Nr^   c                    sB  t  | | |j| _d| _|j}|j}g | _g | _g | _t	|j
D ]^}| j| dkr\|  jtj |j|j d|j |  ||dg7  _|  jtj |j|j||dg7  _q$|  jtjg g  dg7  _|  jtjg g  dg7  _| j| q$ fddt	|j
D | _ fddt	|j
D | _d S )	NFmambar&   devicerK   r   c                        g | ]}t jg g  d qS r   r*   tensor.0r   
batch_sizer   r.   r/   
<listcomp>       z=HybridMambaAttentionDynamicCache.__init__.<locals>.<listcomp>c                    r   r   r   r   r   r.   r/   r     r   )rb   rc   layers_block_typehas_previous_statemamba_d_convmamba_d_stateconv_states
ssm_statesZtransformer_layersrangenum_hidden_layersr*   zerosmamba_expandri   mamba_n_groupsmamba_n_headsmamba_d_headr   appendZ	key_cacheZvalue_cache)rs   r^   r   rK   r   conv_kernel_sizessm_state_sizeirt   r   r/   rc      sD   	
   z)HybridMambaAttentionDynamicCache.__init__)	rg   r   r   r   r*   Zfloat16r   rc   r   r.   r.   rt   r/   r      s    "r   input_tensorpad_sizec                 C   sH   t | jdkrddddd|ddfnddd|ddf}tjjj| |dddS )z
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
       r   Zconstant)moderF   )lenr)   r*   r   rQ   pad)r   r   Z	pad_shaper.   r.   r/   pad_tensor_by_size!  s   2r   c                 C   sX   t | |} t| jdkr| | jd d|| jd S | | jd d|| jd | jd S )z
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    r	   r   r%   r&   )r   r   r)   r<   )r   r   
chunk_sizer.   r.   r/   reshape_into_chunks,  s   
r   c                 C   s   |  d}| d jg |   |R  } tjtj||| jtjddd}| | d} tj| dd}tjtj||| jtjddd}|| tj	 }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    r%   .Nr   Zdiagonalr   rJ   r'   )
r|   r;   r*   Ztrilonesr   r   masked_fillcumsuminf)r   r   maskZtensor_segsumr.   r.   r/   segment_sum@  s   
  r   c                 C   sN   |dur%|j d dkr%|j d dkr%| j}| |dddddf  |} | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )r)   rK   rT   )r8   rG   rK   r.   r.   r/   apply_mask_to_padding_statesW  s   $ r   c                       s   e Zd ZdZdedef fddZ				ddejde	e
 d	e	ej d
e	ej de	ej f
ddZ			dde	e
 d	e	ej d
e	ej fddZ				dde	e
 d	e	ej d
e	ej de	ej fddZ  ZS )GraniteMoeHybridMambaLayeruO  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    The are a few differences between this and Mamba2Mixer:
    - The variable use_precomputed_states is slightly different due to the HybridCache structure
    - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
    - Some extra variables that our layer doesn't need have been removed
    - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
    r^   r_   c                    s  t    |j| _|j| _|j| _|j| _t	|j
| j | _|| _|j| _|j| _t|j | _|j| _|j| _|j| _|j| _|j| _dtdf| _d| _d| _ | jd| j | j  | _!t"j#| j!| j!|j| j| j!| jd d| _$| j| j! | j }t"j%| j|| jd| _&t"'t()| j| _*t(+d| jd }t"'t(,|| _-d	| j-_.t/| j| jd
| _0t"'t()| j| _1d	| j1_.t"j%| j| j| jd| _2t3st45d d S t45d d S )NrB   r   gMbP?g?r&   r   )Zin_channelsZout_channelsra   Zkernel_sizegroupspaddingr`   Tepsa  The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzOThe fast path for GraniteMoeHybrid will be used when running the model on a GPU)6rb   rc   r   rk   ri   r   r   r   r   r   r   intermediate_sizer_   Zmamba_conv_biasuse_conv_bias
hidden_act
activationr   actZmamba_proj_biasZuse_biasrms_norm_epsZlayer_norm_epsilonr   n_groupsr   r@   Zmamba_chunk_sizer   floattime_step_limitZtime_step_minZtime_step_maxconv_dimr   Conv1dconv1drm   in_proj	Parameterr*   r   dt_biasarangelogA_logZ_no_weight_decayGraniteMoeHybridRMSNormGatednormDout_projis_fast_path_availablerd   re   )rs   r^   r_   Zprojection_sizeArt   r.   r/   rc   q  s\   

	z#GraniteMoeHybridMambaLayer.__init__Nr8   cache_paramsrw   rG   seq_idxc                 C   s  t ||}| |}|j\}}}	| j| j }
|d uoD|joD|dkoD|j| j jd |j| j jd   ko8|kn  oD|d uoD|d dk}|r)|	dj
| j| j| jgdd\}}}t||j| j | jj	d| jj| j}tj
|| j|
|
gdd\}}}t| j  }|d d d df d d d d d f d| j| jjtjd}|d d d d d f dd| j}| jd d d df d| j}| jd d d df d| j}||| j|jd | j }||| j|jd | j }||| j| j}t|j| j ||||||d |dd
}||| j| j }| ||}|  |d d d df }|S t| j  }| j!d	td
fkr>i nd| j!i}| j"r||d u r|t#|| jj	d| jj| j|f| j| j$|| j| jj| jj%| j j| j j| j| jddd|}|S |j
| j| j| jgdd\}}}|d ur|&dd}t'j()|| j*|jd  df}|j| j +| | jdvr| ,| |&dddd |f &dd}nt-|&dd| jj	d| jj| j|d&dd}t ||}tj
|| j|
|
gdd\}}}t.|||d| j|||||| jd|||| jdf| j$| jd |d| jdd|\}}|d ur:|d ur:|j| j +| |||d}| ||}|  |}|S )Nr   r   r%   r'   .rK   T)zr   dt_softplusrB   r   Zdt_limitF)r   r   r   r   Zrmsnorm_weightZrmsnorm_epsZoutproj_weightZoutproj_biasZheaddimZngroupsZnorm_before_gatereturn_final_statesr&   )siluZswish)r,   weightra   r   r   )r   r   r   r   r   r   r   )/r   r   r)   r   r   r   r   r_   r   squeezesplitr   r   rk   r!   r   r   ra   r   r*   expr   r   r;   r@   rT   rS   r   r   r}   r   r   r   r   rM   r   r   variance_epsilonrP   r   rQ   r   r   copy_r   r    r   )rs   r8   r   rw   rG   r   projected_statesr   seq_lenr   Zgroups_time_state_sizeuse_precomputed_statesgatehidden_states_B_CdtBCr   r   r   Zhidden_states_reshapedoutZdt_limit_kwargshidden_states_B_C_transposedr   scan_output	ssm_stater.   r.   r/   cuda_kernels_forward  s  
	




<"
^"V
$




z/GraniteMoeHybridMambaLayer.cuda_kernels_forwardc           3   
      s  |j \}}}|j}t||}|}	|	jjjjgdd\}
}}|d uoQ|joQ|dkoQ|j	j
 j d |jj
 j d   koE|kn  oQ|d uoQ|d dk}|r|j	j
 jddd|j	j
< |d d dd d f |j	j
 j|j	j
 d d d d df< |j	j
 jjjjd}tj|jjd dd}jr|jj }|}n8|d ur|dd}tj|j|j d  df}|j	j
 | |dddd |f dd}t||}tj|jjj jj gdd\}}}tj !  }|r[|jj
 j}|d d dd d f d d d df }|dd"||j d j#}j$d	 "j$j d j#}tjj%|||j }t&|j'd j'd }|d
 "jj#jjtj(d}t|d	 | j|d}|)|jddd d d f }|"|jjj |j d * }|)|d|j d }|d	 |dd d d f  }|)|dj#}||d	  j|d}|jj
 |jj
 | |  |)|jddd d d f }|"|jjj |j d * }|)|d|j d }|jj
 j|j|jd}|+|j j#j}|+|j jd}t,||}|+|jj#}j-d	 "j-j d j#}|||  |j}|)|dd d d df }ntj%|j$ }t&|j'd j'd }|)||dj#! }|)||dj! }|)||dj! }|j.jj djd}|j.jj djd}j/|j/  j/  j-d	 t0|  }||d	  }||j| } fdd||||fD \}}}}|1dddd}tj2|dd}tt3|} |d d d d d d d d d d d f |d d d d d d d d d d d f  }!|!jdd}"|"d	 | 1dddddd	  }#|#jdd}$|$d	 |d d d d d f  jdd}%t|d d d d d d dd f | }&||&1ddddd	  }'|'dd d d f |d	  jdd}(|r|jj
 d d d df j|(jd})nt4|(d d d df })tj5|)|(gdd}(tt3tj|d d d d d d df d}*|*dd}*|*d
 |(d d d d d df  jdd}+|+d d d df |+d d df }(},t|}-|dd d d f |(d d d d d df  }.|-1dddd}/|.d|/d	  }0|%|0 }|)|djj#}|| } dkrB|d d d |d d d d f }|)||d}|,d ur_|d ur_|jj
 |, d|_6||
}17|1|}2|2S )Nr%   r'   r   r   )Zshiftsdimsr   r&   .r   ).NNr   r   )r(   output_sizec                    s   g | ]	}t | jqS r.   )r   r   )r   tr   rs   r.   r/   r     s    z<GraniteMoeHybridMambaLayer.torch_forward.<locals>.<listcomp>r	   r   rJ   )r   r   T)8r)   rK   r   r   r   r   r   rk   r   r   r_   r   ZrollrT   r   r   r   r*   sumr   r   ra   r   rP   r   rQ   r   r   r   r   r   r   r   r   r;   r@   r   Zsoftplusclampr   rS   r<   rU   r}   Zbmmr   Zrepeat_interleaver   r   Zpermuter   r   Z
zeros_liker+   r   r   )3rs   Zinput_statesr   rw   rG   r   r   r   rK   r   r   r   r   r   r   r   r8   r   r   r   Zcache_devicer   ZdAZdBZdBxr   Zssm_states_reshapedZ
C_reshapedyr   Z
D_residualZA_cumsumLZG_intermediateGZM_intermediateMZY_diagZdecay_statesZB_decayZstatesZprevious_statesZdecay_chunkZ
new_statesr   Zstate_decay_outZC_times_statesZstate_decay_out_permutedZY_offr   Zcontextualized_statesr.   r   r/   torch_forward^  s   


@,
$"$$$P&*"&0(&
*
 z(GraniteMoeHybridMambaLayer.torch_forwardc                 K   s   t rd| jjjjv r| |||||S |d urtd|j}|d ur@|jd dkr@|jd dkr@||d d d d d f  	|}| 
||||S )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r   r   )r   r   r   r   typer   NotImplementedErrorrK   r)   rT   r   )rs   r8   r   rw   rG   r   rV   rK   r.   r.   r/   r   .  s   	$ z"GraniteMoeHybridMambaLayer.forward)NNNN)NNN)rg   r   r   r   r   r   rc   r*   r   r   r   r   Z	IntTensorr   r   r   r   r.   r.   rt   r/   r   c  sV    F
 .
 Tr   c                       s(   e Zd Zd fdd	ZdddZ  ZS )	r   ư>c                    s&   t    tt|| _|| _d S Nrb   rc   r   r   r*   r   r   r   rs   ri   r   rt   r.   r/   rc   F  s   

z%GraniteMoeHybridRMSNormGated.__init__Nc                 C   sj   |j }|tj}|d ur|tj|tj }|djddd}|t	|| j
  }| j|| S Nr&   r%   T)Zkeepdim)rK   rT   r*   rS   r   rQ   r   powmeanrsqrtr   r   )rs   r8   r   input_dtypevariancer.   r.   r/   r   K  s   z$GraniteMoeHybridRMSNormGated.forwardr  r  )rg   r   r   rc   r   r   r.   r.   rt   r/   r   E  s    r   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	GraniteMoeHybridMLPz~
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
    r^   c                    s^   t t|   |j| _|j| _t|j | _t	j
| j| jd dd| _t	j
| j| jdd| _d S )Nr&   Fr`   )rb   r  rc   ri   
input_sizeZshared_intermediate_sizer   r   r   r   rm   input_linearoutput_linearrs   r^   rt   r.   r/   rc   `  s   zGraniteMoeHybridMLP.__init__r8   r:   c                 C   s<   |  |}|jddd}| |d |d  }| |}|S )Nr&   r%   r'   r   r   )r  chunkr   r  )rs   r8   chunked_hidden_statesr.   r.   r/   r   i  s
   

zGraniteMoeHybridMLP.forward)
rg   r   r   r   r   rc   r*   r   r   r   r.   r.   rt   r/   r  W  s    	r  c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	GraniteMoeHybridRMSNormr  c                    s&   t    tt|| _|| _dS )zF
        GraniteMoeHybridRMSNorm is equivalent to T5LayerNorm
        Nr  r  rt   r.   r/   rc   r  s   

z GraniteMoeHybridRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S r  )	rK   rT   r*   rS   r  r	  r
  r   r   )rs   r8   r  r  r.   r.   r/   r   z  s
   zGraniteMoeHybridRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler   r)   r   rs   r.   r.   r/   
extra_repr  s   z"GraniteMoeHybridRMSNorm.extra_reprr  )rg   r   r   rc   r   r  r   r.   r.   rt   r/   r  q  s    r  c                       s6   e Zd Zdedededdf fddZdd	 Z  ZS )
GraniteMoeHybridParallelExpertsnum_expertsr  r   r:   Nc                    s6   t    tt|||| _|| _|| _|| _	dS )a  
        Initialize the GraniteMoeHybridParallelExperts module.
        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
        many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
        [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
        [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
        used in vllm.

        Args:
            num_experts (int):
                Number of experts.
            input_size (int):
                Size of the input.
            output_size (int):
                Size of the output.
        N)
rb   rc   r   r   r*   emptyr   r  r  r   )rs   r  r  r   rt   r.   r/   rc     s
   

z(GraniteMoeHybridParallelExperts.__init__c                 C   sP   |j |dd}g }t| jD ]}|t|| | j|  qtj|dd}|S )a  
        Forward pass of the GraniteMoeHybridParallelExperts module.

        Args:
            inputs (Tensor):
                Input tensor.
            expert_size:
                Expert size information.

        Returns:
            Tensor: Output tensor.
        r   r'   )	r   r   r  r   FZlinearr   r*   r+   )rs   Zinputsexpert_sizeZ
input_listZoutput_listr   resultsr.   r.   r/   r     s   z'GraniteMoeHybridParallelExperts.forwardrg   r   r   r   rc   r   r   r.   r.   rt   r/   r    s    r  c                       s2   e Zd Zdededef fddZdd Z  ZS )GraniteMoeHybridTopKGatingr  r  top_kc                    s2   t    || _|| _|| _tj||dd| _dS )a  
        Initialize the top-k gating mechanism.
        Args:
            input_size (`int`):
                Size of the input.
            num_experts (`int`):
                Number of experts.
            top_k (`int`):
                Number of top experts to select.
        Fr`   N)rb   rc   r  r  r!  r   rm   layer)rs   r  r  r!  rt   r.   r/   rc     s
   
z#GraniteMoeHybridTopKGating.__init__c                 C   s   |  | }|j| jdd\}}tj|dd|}tj|d| j	g|j
|jd}|d|d}| d}| }| }	|	d\}
}|j| jdd}| }|| }|||||fS )Nr   r'   r   rK   r   trunc)Zrounding_mode)r"  r   topkr!  r*   rR   Ztype_asr   r|   r  rK   r   Zscatterlongr   tolistflattensortdiv)rs   r8   logitsZtop_k_logitsZtop_k_indicesZtop_k_gatesr   Zgatesr  Ztop_k_expertsr   Zindex_sorted_expertsbatch_indexbatch_gatesr.   r.   r/   r     s   z"GraniteMoeHybridTopKGating.forwardr  r.   r.   rt   r/   r     s    r   c                       s.   e Zd ZdZdef fddZdd Z  ZS )GraniteMoeHybridMoEz
    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

    Args:
        config:
            Configuration object with model hyperparameters.
    r^   c                    sp   t t|   |j| _|j| _t|j | _t	|j
| j| jd | _t	|j
| j| j| _t| j|j
|jd| _d S )Nr&   )r  r  r!  )rb   r.  rc   ri   r  r   r   r   r   r  num_local_expertsr  r  r   num_experts_per_tokrouterr  rt   r.   r/   rc     s   zGraniteMoeHybridMoE.__init__c                 C   s   |  \}}}|d|}| |\}}}}}	|| }
| |
|}|jddd}| |d |d  }| ||}||dddf  }tj|| | j	f|j
|jd}|d||}|||| j	}||	fS )a  
        Forward pass of the mixture of experts layer.

        Args:
            layer_input (Tensor):
                Input tensor.

        Returns:
            Tensor:
                Output tensor.
            Tensor:
                Router logits.
        r%   r&   r'   r   r   Nr#  )r|   r<   r1  r  r  r   r  r*   r   r  rK   r   Z	index_addr}   )rs   Zlayer_inputr   lengthZemb_sizer   r,  r-  r  router_logitsZexpert_inputsr8   r  Zexpert_outputsr   Zlayer_outputr.   r.   r/   r     s   zGraniteMoeHybridMoE.forward)rg   r   r   r   r   rc   r   r   r.   r.   rt   r/   r.    s    r.  c                       s   e Zd Zdedef fddZ							ddejdeej d	ee	 d
ee
 dee
 deej dee
 deeejejf  deejeeejejf  f fddZ  ZS )GraniteMoeHybridDecoderLayerr^   r_   c                    s   t    |j| _d | _t|| _t|j|jd| _t|j|jd| _	|j
| _
t|| _d | _|j| dkr=t||| _nt||| _|j| | _d S )Nr   r   )rb   rc   ri   	self_attnr.  block_sparse_moer  r   input_layernormpost_attention_layernormresidual_multiplierr  
shared_mlpr   r   r   r]   
layer_typerr   rt   r.   r/   rc     s   


z%GraniteMoeHybridDecoderLayer.__init__NFr8   rG   ru   r{   rv   rw   output_router_logitsrx   r:   c	              
   K   s   |}
|  |}| jdur| j||||d}d}n| jd|||||||d|	\}}}|
|| j  }|}
| |}| |\}}|| | }|
|| j  }|f}|rX||f7 }|r_||f7 }|rf||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        N)r8   rw   r   rG   )r8   rG   ru   r{   rv   rw   rx   r.   )r7  r   r5  r9  r8  r6  r:  )rs   r8   rG   ru   r{   rv   rw   r<  rx   rV   ZresidualZself_attn_weightsr   Zmoe_hidden_statesr3  outputsr.   r.   r/   r   2  sF   %





z$GraniteMoeHybridDecoderLayer.forward)NNFFNFN)rg   r   r   r   r   rc   r*   r   r   r
   r   r   r   FloatTensorr   r   r.   r.   rt   r/   r4    s8    	r4  c                   @   sD   e Zd ZeZdZdZdgZdgZdZ	dZ
dZdZdZdZdd ZdS )	GraniteMoeHybridPreTrainedModelmodelTr4  past_key_valuesFc                 C   sZ  t |tjr|jjjd| jjd |jd ur|jj	  n=t |tj
r=|jjjd| jjd |jd ur<|jj|j 	  nt |trJ|jjd nt |trZ|jjjd| jjd t |tjrz|jjjd| jjd |jd urx|jj	  d S d S t |tr|jjd ttd|jd |j_|jjd d S t |tr|jjd d S d S )NrB   )r	  Zstdg      ?r   )
isinstancer   rm   r   dataZnormal_r^   Zinitializer_rangera   Zzero_	Embeddingpadding_idxr  Zfill_r  r   r   r   r*   r   r   rk   r   r   r   )rs   rC   r.   r.   r/   _init_weights  s4   






z-GraniteMoeHybridPreTrainedModel._init_weightsN)rg   r   r   r   Zconfig_classZbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_skip_keys_device_placementZ_supports_flash_attn_2Z_supports_sdpaZ_supports_cache_classZ_supports_quantized_cacheZ_supports_static_cacheZ_is_statefulrF  r.   r.   r.   r/   r?    s    r?  c                       s8   e Zd Zddef fddZe edd Z  Z	S )GraniteMoeHybridRotaryEmbeddingNr^   c                    s   t    t|dr|jd ur|jd|jd| _nd| _|j| _|j| _|| _	t
| j | _| | j	|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typer  defaultinv_freqF)
persistent)rb   rc   hasattrrH  r   rI  max_position_embeddingsZmax_seq_len_cachedZoriginal_max_seq_lenr^   r   Zrope_init_fnattention_scalingZregister_bufferrK  Zoriginal_inv_freq)rs   r^   r   rK  rt   r.   r/   rc     s   
z(GraniteMoeHybridRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	j
|dd+ | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 smw   Y  |j|jd
|	j|jd
fS )Nr   r%   r   ZmpscpuF)device_typeenabledr&   r'   r   )rK  r   r;   r)   rT   r   rB  r  strr*   ZautocastrP   r+   r4   rO  r5   rK   )
rs   r,   r6   Zinv_freq_expandedZposition_ids_expandedrQ  ZfreqsZembr4   r5   r.   r.   r/   r     s   0&z'GraniteMoeHybridRotaryEmbedding.forwardr  )
rg   r   r   r   rc   r*   Zno_gradr   r   r   r.   r.   rt   r/   rG    s
    rG  c                       s8  e Zd Zdef fddZdd Zdd Zee											d$d	e	j
d
ee	j dee	j
 deeeee	j f  dee	j dee dee dee dee dee dee	j
 deeef fddZ	d%d
ee	jdf de	jde	jdedef
ddZed
e	jdedede	jde	jdefd d!Zd"d# Z  ZS )&GraniteMoeHybridModelr^   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _d| _ j| _ j| _ j| _| j| j | _ j| _ j| _ j| _| jdkr]t nd | _|   d S )Nc                    s   g | ]}t  |qS r.   )r4  )r   r_   r^   r.   r/   r         z2GraniteMoeHybridModel.__init__.<locals>.<listcomp>r   FZrope)rb   rc   Zpad_token_idrE  
vocab_sizer   rD  ri   embed_tokensZ
ModuleListr   r   layersr  r   r   gradient_checkpointingembedding_multiplierrj   rk   r@   rN  Z
rope_thetaZposition_embedding_typerG  
rotary_emb	post_initr  rt   rU  r/   rc     s$   zGraniteMoeHybridModel.__init__c                 C      | j S r  rX  r  r.   r.   r/   get_input_embeddings     z*GraniteMoeHybridModel.get_input_embeddingsc                 C   
   || _ d S r  r_  rs   rF   r.   r.   r/   set_input_embeddings     
z*GraniteMoeHybridModel.set_input_embeddingsN	input_idsrG   r6   rA  inputs_embedsrv   r{   output_hidden_statesr<  return_dictrw   r:   c                 C   s@  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|
d ur$|
n| j j}
|d u |d uA r4td| jrC| jrC|rCt	d d}|d u rL| 
|}|| j }|r\|d u r\t	d |d u rx|d urh| nd}tj|||jd  |jd}|d u r|d}| |||||}| ||}|}d }| jd ur| ||}|rdnd }|rdnd }|	rdnd }d }| jD ]L}|jd	kr|n|}|r||f7 }||||||||	|d
}|d }|r||rdnd }|r|d d ur||d f7 }|	r|d d ur||d f7 }q| |}|r||f7 }|r|nd }t|||||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzGraniteMoeHybrid requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. Because one was not provided, no cache will be returned.r   r   r   r.   r   )rG   ru   r{   rv   rw   r<  rx   r&   r%   )Zlast_hidden_staterA  r8   
attentionsr3  )r^   r{   rh  rv   use_return_dictrl   rZ  rM   rd   re   rX  r[  get_seq_lengthr*   r   r)   r   r1   _update_causal_mask_update_mamba_maskr\  rY  r;  r   r   )rs   rf  rG   r6   rA  rg  rv   r{   rh  r<  ri  rw   past_seen_tokensrZ   
mamba_maskr8   rx   Zall_hidden_statesZall_self_attnsZall_router_logitsZnext_decoder_cacheZdecoder_layerZ
layer_maskZlayer_outputsZ
next_cacher.   r.   r/   r     s   








zGraniteMoeHybridModel.forwardFr#   r   c                 C   s:  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S )NZflash_attention_2rB   Zflex_attentionr   Frz   )rg  Zpast_key_values_lengthZis_trainingr   r%   )sequence_lengthtarget_lengthrK   rw   r   )r   ZxpuZnpu)r^   r   anyrB  r*   r   r$   rl  Zis_compileabler   Z_ignore_causal_mask_sdparM   rK   r)   Zget_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   r  finfominZ_unmask_unattended)rs   rG   r   rw   rA  r{   ro  Zusing_compilable_cacherK   rq  rr  rZ   	min_dtyper.   r.   r/   rm  j  sT   




z)GraniteMoeHybridModel._update_causal_maskrq  rr  rK   r   c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr   )Z
fill_valuerK   r   r   r   r   r%   r   )r(   r*   ru  rv  fullr   Ztriur   r<   r;   cloner)   rT   r   )rG   rq  rr  rK   rw   r   rV   rZ   rw  Zmask_lengthZpadding_maskr.   r.   r/   rt    s,    $
6  zKGraniteMoeHybridModel._prepare_4d_causal_attention_mask_with_cache_positionc                 C   s.   |}|d dks|durt |dkrd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr   )r*   all)rs   rG   rw   rp  r.   r.   r/   rn    s   "z(GraniteMoeHybridModel._update_mamba_mask)NNNNNNNNNNN)F)rg   r   r   r   rc   r`  rd  r   r   r*   r   r   r   r   r
   r   r>  r   r   r   r   rm  staticmethodr   rK   rt  rn  r   r.   r.   rt   r/   rT    s    	

|
D6rT  r&   gate_logitsr  c                    s  | du s	t | tsdS t | tr#| d j tj fdd| D dd}tjjj|dd}tj||dd\}}tjj	||}|du rStj
| dd}	tj
|dd}
ng|j\}}|jd ||  }|dddddddf |||||fd|| }tj| | ddtj|dd }	|ddddddf ||||fd| }tj|| ddtj|dd }
t|	|
d }|| S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   c                    s   g | ]}|  qS r.   )rT   )r   Z
layer_gateZcompute_devicer.   r/   r     rV  z,load_balancing_loss_func.<locals>.<listcomp>r'   r%   )rB  r  r   r*   r+   r   rQ   rR   r%  Zone_hotr	  r   r)   r;   r<   rT   r   r1   )r|  r  r!  rG   Zconcatenated_gate_logitsZrouting_weightsr   Zselected_expertsZexpert_maskZtokens_per_expertZrouter_prob_per_expertr   rq  r   Zexpert_attention_maskZ router_per_expert_attention_maskZoverall_lossr.   r}  r/   load_balancing_loss_func  s>   



r~  c                        sF  e Zd ZdgZdef fddZdd Zdd Zd	d
 Zdd Z	dd Z
dd Ze													d*deej deej deej deeeeej f  deej deej dee dee dee dee dee deej deeejf d eeef fd!d"Zed#d$ Z						%d+d&d'Zd efd(d)Z  ZS ),GraniteMoeHybridForCausalLMzlm_head.weightr^   c                    sX   t  | t|| _|j| _tj|j|jdd| _|j	| _	|j
| _|j| _|   d S )NFr`   )rb   rc   rT  r@  rW  r   rm   ri   lm_headrouter_aux_loss_coefr/  r  r0  r]  r  rt   r.   r/   rc   F  s   
z$GraniteMoeHybridForCausalLM.__init__c                 C   s   | j jS r  r@  rX  r  r.   r.   r/   r`  S  s   z0GraniteMoeHybridForCausalLM.get_input_embeddingsc                 C   s   || j _d S r  r  rc  r.   r.   r/   rd  V  s   z0GraniteMoeHybridForCausalLM.set_input_embeddingsc                 C   r^  r  r  r  r.   r.   r/   get_output_embeddingsY  ra  z1GraniteMoeHybridForCausalLM.get_output_embeddingsc                 C   rb  r  r  )rs   Znew_embeddingsr.   r.   r/   set_output_embeddings\  re  z1GraniteMoeHybridForCausalLM.set_output_embeddingsc                 C   rb  r  r@  )rs   decoderr.   r.   r/   set_decoder_  re  z'GraniteMoeHybridForCausalLM.set_decoderc                 C   r^  r  r  r  r.   r.   r/   get_decoderb  ra  z'GraniteMoeHybridForCausalLM.get_decoderNr   rf  rG   r6   rA  rg  labelsrv   r{   rh  r<  ri  rw   logits_to_keepr:   c                 K   s  |dur|n| j j}|
dur|
n| j j}
|	dur|	n| j j}	|dur$|n| j j}| j||||||||	|
||d}|d }t|trGt| dn|}| 	|dd|ddf }|| j j
 }d}|duru| }| j||fd| j ji|}d}|
rt|r|jn|d | j| j|}|dur|| j||j 7 }|s|f|dd  }|
r|f| }|dur|f| S |S t||||j|j|j|jdS )ax  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, GraniteMoeHybridForCausalLM

        >>> model = GraniteMoeHybridForCausalLM.from_pretrained("ibm/PowerMoE-3b")
        >>> tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)rf  rG   r6   rA  rg  rv   r{   rh  r<  ri  rw   r   rW  r%   r   )lossaux_lossr+  rA  r8   rj  r3  )r^   r{   r<  rh  rk  r@  rB  r   slicer  Zlogits_scalingr   Zloss_functionrW  r~  r3  r  r0  r  rT   r   r   rA  r8   rj  )rs   rf  rG   r6   rA  rg  r  rv   r{   rh  r<  ri  rw   r  rV   r=  r8   Zslice_indicesr+  r  r  outputr.   r.   r/   r   e  st   (
z#GraniteMoeHybridForCausalLM.forwardc                    s.   d}| D ]}|t  fdd|D f7 }q|S )Nr.   c                 3   s$    | ]}| d  |jV  qdS )r   N)Zindex_selectrT   r   )r   Z
past_statebeam_idxr.   r/   	<genexpr>  s   " z=GraniteMoeHybridForCausalLM._reorder_cache.<locals>.<genexpr>)r  )rA  r  Zreordered_pastZ
layer_pastr.   r  r/   _reorder_cache  s   z*GraniteMoeHybridForCausalLM._reorder_cacheTc                 K   s  |d u }	|	s5|d us|d |j d kr"|d d |j d  d f }n!|j d |j d kr4|d d |f }nt| j|j d | j| jd}|d url|d u rl| dd }||dkd |	sl|d d |j d  d f }|d urw|	rwd|i}
nd| i}
|
	|||||d |
S )Nr%   r   r   r   rg  rf  )r6   rA  rv   rG   rw   )
r)   r   r^   rK   r   r&  r   Zmasked_fill_rU   r~   )rs   rf  rA  rG   rg  rw   r6   rv   rV   Zempty_past_kvZmodel_inputsr.   r.   r/   prepare_inputs_for_generation  s8   
	z9GraniteMoeHybridForCausalLM.prepare_inputs_for_generationc                 C   s   dS )aG  
        Function overwritten as this class uses its own `HybridMambaAttentionDynamicCache`
        and do not need to initialize the Cache in advance in order to save memory
        (because no back and forth `to_legacy_cache` and `from_legacy_cache` will be performed
        for `HybridMambaAttentionDynamicCache`).
        Fr.   r  r.   r.   r/   _supports_default_dynamic_cache  s   z;GraniteMoeHybridForCausalLM._supports_default_dynamic_cache)NNNNNNNNNNNNr   )NNNNNT)rg   r   r   Z_tied_weights_keysr   rc   r`  rd  r  r  r  r  r   r   r*   r   r   r   r
   r   r>  r   r   r   r   r   r{  r  r  r  r   r.   r.   rt   r/   r  C  s|    	

l

9r  )r  rT  r?  )Nr   )rB   )Nr&   N)Ztypingr   r   r   r   r   r*   Ztorch.nn.functionalr   rQ   r  Z(transformers.models.jamba.modeling_jambamodelsZjambaZmodeling_jambaZtransformers.activationsr   Zcache_utilsr
   Z
generationr   Zmodeling_attn_mask_utilsr   Zmodeling_layersr   Zmodeling_outputsr   r   r   Zmodeling_rope_utilsr   r   Zmodeling_utilsr   r   utilsr   r   r   r   Zutils.import_utilsr   r   Zconfiguration_granitemoehybridr   Z+mamba_ssm.ops.triton.selective_state_updater   Z!mamba_ssm.ops.triton.ssd_combinedr   r   Zcausal_conv1dr    r!   Z!torch.nn.attention.flex_attentionr#   Zintegrations.flex_attentionr$   Z
get_loggerrg   rd   r0   r7   r   r   rA   Moduler   r\   r]   r   r   r   r   rz  r   r   r   r   r  r  r  r   r.  r4  r?  rG  rT  r~  r  __all__r.   r.   r.   r/   <module>   s   


]9   e-0<i'"  #
R \