from typing import Callable, Optional, Tuple, Union

import numpy as np
import torch
import torch.nn as nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
    AttentionMaskConverter,
    _prepare_4d_attention_mask,
    _prepare_4d_attention_mask_for_sdpa,
)
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPast,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
)
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
from .configuration_moonshine import MoonshineConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class MoonshineEncoderMLP(nn.Module):
    def __init__(self, config, hidden_act):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class MoonshineDecoderMLP(nn.Module):
    def __init__(self, config, hidden_act):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size * 2)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states, gate = hidden_states.chunk(2, dim=-1)
        hidden_states = self.activation_fn(gate) * hidden_states
        hidden_states = self.fc2(hidden_states)
        return hidden_states


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., 0::2]
    x2 = x[..., 1::2]
    return torch.stack((-x2, x1), dim=-1).flatten(-2)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    # Interleave the angles pairwise instead of splitting the head in half
    cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)
    sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)

    # Keep half or full tensor for later concatenation
    rotary_dim = cos.shape[-1]
    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]

    # Apply rotary embeddings on the first half or full tensor
    q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
    k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)

    # Concatenate back to full shape
    q_embed = torch.cat([q_embed, q_pass], dim=-1)
    k_embed = torch.cat([k_embed, k_pass], dim=-1)
    return q_embed, k_embed
 fddZ										dd
ej	de
eej	ej	f  de
ej	 de
e de
ej de
ej	 dee deej	e
ej	 e
eej	  f fddZ  ZS )MoonshineAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr)   	layer_idx	is_causalnum_attention_headsrL   c                    s  t    |||d || _|| _t|d|j|j | _|j|j	 | _
| jd | _|j| _|| _tj|j|j| j |jd| _tj|j|j	| j |jd| _tj|j|j	| j |jd| _tj|j| j |jdd| _| jjd ur| jj}|| j| d |  }|| j | _d S d| _d S )N)ry   rL   rN   g      ࿩biasFr   r   )r'   r(   updater)   rw   getattrr-   ry   rN   rL   r\   rV   attention_dropoutrx   r+   r,   Zattention_biasq_projk_projv_projo_projZpad_head_dim_to_multiple_ofhead_dim_padding)r1   r)   rw   rx   ry   rL   Ztarget_multipleZtarget_head_dimr2   r4   r5   r(      s0   

zMoonshineAttention.__init__Nr6   position_embeddingsrU   past_key_valuecache_positionkey_value_statesrb   r7   c                 K   s  |j d d \}}	| |||	| jj| jdd}
|d u}|d ur9|j| j	}|r6d|j| j	< |j
}n|j}|d ur?|n|}|rT|rT|rT|j| j	 }|j| j	 }n7| ||d| jj| jdd}| ||d| jj| jdd}|r|d ur|||| j	d|i\}}|s|\}}t|
|||\}
}|d ur|||d}|||| j	|\}}t}| jjdkr| jjdkr|d	d
rtd nt| jj }| jr|d u r|	dkrdnd
}| jdkrtjj|
d| jf}
tjj|d| jf}tjj|d| jf}|| |
|||f| jsdn| j| j|d|\}}| jdkr/|dd | j f }| ||	d! }| "|}||fS )NrC   r   rB   Tr   )rs   rr   r   eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r   rP   )rW   rV   rx   .)#rH   r   viewr)   rL   rN   r^   
is_updatedgetrw   cross_attention_cacheself_attention_cacheZ	key_cacheZvalue_cacher   r   r|   ru   rh   _attn_implementationloggerwarning_oncer   rx   r   r=   r+   r_   padr[   r~   rV   rJ   ra   r   )r1   r6   r   rU   r   r   r   rb   ZbszZq_lenZquery_statesZis_cross_attentionr   Zcurrent_statesrc   rd   rr   rs   Zcache_kwargsZattention_interfacerx   rg   re   r4   r4   r5   r8      s   
"
	

zMoonshineAttention.forward)NNNNN)r:   r;   r<   __doc__r    intboolr(   r=   r>   r   r   r   
LongTensorr   r   r8   r?   r4   r4   r2   r5   rv      sF    (	rv   c                       s8   e Zd Zddef fddZe edd Z  Z	S )MoonshineRotaryEmbeddingNr)   c                    s   t    t|dr|jd ur|jd|jd| _nd| _|j| _|j| _|| _	t
| j | _| | j	|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)r'   r(   hasattrr   r   r   Zmax_position_embeddingsZmax_seq_len_cachedZoriginal_max_seq_lenr)   r   Zrope_init_fnattention_scalingZregister_bufferr   Zoriginal_inv_freq)r1   r)   devicer   r2   r4   r5   r(   9  s   
z!MoonshineRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	j
|dd+ | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 smw   Y  |j|jd
|	j|jd
fS )Nr   rC   r   ZmpscpuF)device_typeenabledrB   rD   rY   )r   floatrI   rH   r`   r   
isinstancer   strr=   Zautocastr^   ro   rr   r   rs   rY   )
r1   rk   rt   Zinv_freq_expandedZposition_ids_expandedr   ZfreqsZembrr   rs   r4   r4   r5   r8   J  s   0&z MoonshineRotaryEmbedding.forwardr%   )
r:   r;   r<   r    r(   r=   Zno_gradr   r8   r?   r4   r4   r2   r5   r   8  s
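

# Illustrative sketch, not part of the upstream module: a minimal grouped-query attention call through
# `eager_attention_forward`, using a stub module that only provides the attributes the function reads
# (`num_key_value_groups` and `training`). Shapes are assumptions chosen for the example.
def _example_grouped_query_attention():  # pragma: no cover
    class _Stub(nn.Module):
        def __init__(self, num_key_value_groups):
            super().__init__()
            self.num_key_value_groups = num_key_value_groups

    batch, q_heads, kv_heads, seq, head_dim = 1, 8, 2, 6, 16
    query = torch.randn(batch, q_heads, seq, head_dim)
    key = torch.randn(batch, kv_heads, seq, head_dim)
    value = torch.randn(batch, kv_heads, seq, head_dim)
    # repeat_kv broadcasts the 2 key/value heads so that all 8 query heads attend over them
    attn_output, attn_weights = eager_attention_forward(
        _Stub(q_heads // kv_heads), query, key, value, attention_mask=None, scaling=head_dim**-0.5
    )
    assert attn_output.shape == (batch, seq, q_heads, head_dim)
    assert attn_weights.shape == (batch, q_heads, seq, seq)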


class MoonshineEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: MoonshineConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = MoonshineAttention(
            config=config,
            layer_idx=layer_idx,
            is_causal=False,
            num_attention_heads=config.encoder_num_attention_heads,
            num_key_value_heads=config.encoder_num_key_value_heads,
        )

        self.mlp = MoonshineEncoderMLP(config, config.encoder_hidden_act)
        self.input_layernorm = nn.LayerNorm(config.hidden_size, bias=False)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


class MoonshineDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: MoonshineConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = MoonshineAttention(
            config=config,
            layer_idx=layer_idx,
            is_causal=True,
            num_attention_heads=config.decoder_num_attention_heads,
            num_key_value_heads=config.decoder_num_key_value_heads,
        )
        self.encoder_attn = MoonshineAttention(
            config=config,
            layer_idx=layer_idx,
            is_causal=False,
            num_attention_heads=config.decoder_num_attention_heads,
            num_key_value_heads=config.decoder_num_key_value_heads,
        )

        self.mlp = MoonshineDecoderMLP(config, config.decoder_hidden_act)
        self.input_layernorm = nn.LayerNorm(config.hidden_size, bias=False)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, bias=False)
        self.final_layernorm = nn.LayerNorm(config.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        encoder_position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        encoder_position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Cross-Attention Block
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states
            hidden_states = self.post_attention_layernorm(hidden_states)
            hidden_states, cross_attn_weights = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )
            hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.final_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        return outputs


@auto_docstring
class MoonshinePreTrainedModel(PreTrainedModel):
    config_class = MoonshineConfig
    base_model_prefix = "model"
    main_input_name = "input_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MoonshineEncoderLayer", "MoonshineDecoderLayer"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_static_cache = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (nn.GroupNorm, nn.LayerNorm)):
            module.weight.data.fill_(1.0)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
        """
        Computes the output length of the convolutional layers
        """
        output_conv1_length = int((input_lengths - 127) / 64 + 1)
        output_conv2_length = int((output_conv1_length - 7) / 3 + 1)
        output_conv3_length = int((output_conv2_length - 3) / 2 + 1)

        return output_conv3_length


class MoonshineEncoder(MoonshinePreTrainedModel):
    """
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    """

    main_input_name = "input_values"

    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.conv1 = nn.Conv1d(1, embed_dim, kernel_size=127, stride=64, bias=False)
        self.conv2 = nn.Conv1d(embed_dim, 2 * embed_dim, kernel_size=7, stride=3)
        self.conv3 = nn.Conv1d(2 * embed_dim, embed_dim, kernel_size=3, stride=2)
        self.groupnorm = nn.GroupNorm(num_groups=1, num_channels=embed_dim, eps=1e-5)
        self.rotary_emb = MoonshineRotaryEmbedding(config=config)
        self.layers = nn.ModuleList(
            [MoonshineEncoderLayer(config, idx) for idx in range(config.encoder_num_hidden_layers)]
        )
        self.layer_norm = nn.LayerNorm(embed_dim, bias=False)

        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.conv1

    def set_input_embeddings(self, value: nn.Module):
        self.conv1 = value

    @can_return_tuple
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        r"""
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
                tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
                more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzYou must specify input_values.r   r   rB   rC     .flash_attention_2rP   r   r   r4   )rU   rt   r   r   last_hidden_stater6   
attentions)r)   r   r   
ValueErrorrn   r+   r_   tanhr   r   Zgelur   r   Zpermuter   rH   r   anyr   rY   r   r=   aranger   r   r   r   r   )r1   r   rU   r   r   r   r6   mask_lendownsample_stridert   r   all_hidden_statesall_self_attnsZencoder_layerlayer_outputsr4   r4   r5   r8   7  sb   



	

zMoonshineEncoder.forward)NNNN)r:   r;   r<   r   r   r    r(   r+   Moduler   r   r   r   r=   r   r>   r   r   r   r   r8   r?   r4   r4   r2   r5   r     s0    r   c                       s6  e Zd ZdZdef fddZdd Zdd Zee																							d#de
ej d
e
ej de
ej de
e de
ej de
e de
e de
e de
ej de
ej de
ej dee deeef fddZ	d$d
eejdf dejdejdedef
ddZed
ejdededejdejd efd!d"Z  ZS )%MoonshineDecoder	input_idsr)   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _tj jdd| _t d| _d| _|   d S )Nc                    r   r4   )r   r   r   r4   r5   r     r   z-MoonshineDecoder.__init__.<locals>.<listcomp>Frz   r   )r'   r(   pad_token_idr   
vocab_sizer+   r   r-   embed_tokensr   r   Zdecoder_num_hidden_layersr   r   normr   r   r   r   r1   r)   r2   r   r5   r(     s   zMoonshineDecoder.__init__c                 C   r   r%   r  r   r4   r4   r5   r     r   z%MoonshineDecoder.get_input_embeddingsc                 C   r   r%   r  r   r4   r4   r5   r     r   z%MoonshineDecoder.set_input_embeddingsNrU   rt   past_key_valuesinputs_embedsr   r   r   r   r   r   r   r7   c                 K   sv  |dur|n| j j}|dur|n| j j}|dur|n| j j}|du |duA r*td| jr9| jr9|r9td d}|du rB| 	|}|rS|du rSt
 }t
 }t||}|	du ro|dur_| nd}tj|||jd  |jd}	|du rx|	d}| |||	||}|}| ||}|rdnd}|rdnd}|r|
durdnd}|dur|
jd	 }d
}|ddd|f dd|f }| j jdkr|dk r|nd}n| j jdkr|st||j|jd	 }n
t||j|jd	 }| jD ]5}|r||f7 }||f|||
|||||	|d	|}|d }|r||d f7 }|
dur||d f7 }q| |}|r-||f7 }t||r4|nd|||dS )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            self_attention_cache = DynamicCache()
            cross_attention_cache = DynamicCache()
            past_key_values = EncoderDecoderCache(self_attention_cache, cross_attention_cache)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None

        # cross-attention mask downsampling
        if encoder_attention_mask is not None:
            mask_len = encoder_hidden_states.shape[-2]
            downsample_stride = 64 * 3 * 2  # conv strides
            encoder_attention_mask = encoder_attention_mask[..., ::downsample_stride][..., :mask_len]
            if self.config._attn_implementation == "flash_attention_2":
                encoder_attention_mask = encoder_attention_mask if (encoder_attention_mask == 0.0).any() else None
            elif self.config._attn_implementation == "sdpa" and not output_attentions:
                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    encoder_attention_mask, inputs_embeds.dtype, inputs_embeds.shape[-2]
                )
            else:
                encoder_attention_mask = _prepare_4d_attention_mask(
                    encoder_attention_mask, inputs_embeds.dtype, inputs_embeds.shape[-2]
                )

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **flash_attn_kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        # For SDPA, when possible, we rely on its `is_causal` argument instead of its `attn_mask` argument,
        # in order to dispatch on Flash Attention 2. This feature is not compatible with static cache.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided 2D `attention_mask` is given, generate a causal 4D mask here.
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows, as required by the SDPA memory-efficient attention path.
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


def _compute_mask_indices(
    shape: Tuple[int, int],
    mask_prob: float,
    mask_length: int,
    attention_mask: Optional[torch.LongTensor] = None,
    min_masks: int = 0,
) -> np.ndarray:
    """
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    """
    batch_size, sequence_length = shape

    if mask_length < 1:
        raise ValueError("`mask_length` has to be bigger than 0.")

    if mask_length > sequence_length:
        raise ValueError(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
            f" and `sequence_length`: {sequence_length}`"
        )

    # epsilon is used for probabilistic rounding
    epsilon = np.random.rand(1).item()

    def compute_num_masked_span(input_length):
        """Given input length, compute how many spans should be masked"""
        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
        num_masked_span = max(num_masked_span, min_masks)

        # make sure num masked span <= sequence_length
        if num_masked_span * mask_length > sequence_length:
            num_masked_span = sequence_length // mask_length

        # make sure num_masked span is also <= input_length - (mask_length - 1)
        if input_length - (mask_length - 1) < num_masked_span:
            num_masked_span = max(input_length - (mask_length - 1), 0)

        return num_masked_span

    # compute number of masked spans in batch
    input_lengths = (
        attention_mask.detach().sum(-1).tolist()
        if attention_mask is not None
        else [sequence_length for _ in range(batch_size)]
    )

    # SpecAugment mask to fill
    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
    spec_aug_mask_idxs = []

    max_num_masked_span = compute_num_masked_span(sequence_length)

    if max_num_masked_span == 0:
        return spec_aug_mask

    for input_length in input_lengths:
        # compute num of masked spans for this input
        num_masked_span = compute_num_masked_span(input_length)

        # get random indices to mask
        spec_aug_mask_idx = np.random.choice(
            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
        )

        # pick first sampled index that will serve as a dummy index to pad vector
        # to ensure same dimension for all batches due to probabilistic rounding
        if len(spec_aug_mask_idx) == 0:
            # this case can only happen if `input_length` is strictly smaller than `sequence_length`,
            # in which case the last token has to be a padding token which we can use as a dummy mask id
            dummy_mask_idx = sequence_length - 1
        else:
            dummy_mask_idx = spec_aug_mask_idx[0]

        spec_aug_mask_idx = np.concatenate(
            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
        )
        spec_aug_mask_idxs.append(spec_aug_mask_idx)

    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)

    # expand masked indices to masked spans
    spec_aug_mask_idxs = np.broadcast_to(
        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)

    # add offset to the starting indexes so that indexes now create a span
    offsets = np.arange(mask_length)[None, None, :]
    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
        batch_size, max_num_masked_span * mask_length
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets

    # ensure that we cannot have indices larger than sequence_length
    if spec_aug_mask_idxs.max() > sequence_length - 1:
        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1

    # scatter indices to mask
    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)

    return spec_aug_mask


@auto_docstring
class MoonshineModel(MoonshinePreTrainedModel):
    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.encoder = MoonshineEncoder(config)
        self.decoder = MoonshineDecoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.decoder.embed_tokens = value

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def freeze_encoder(self):
        """
        Calling this function will disable the gradient computation for the Moonshine encoder so that its parameters will
        not be updated during training.
        """
        self.encoder._freeze_parameters()

    def _mask_input_features(
        self,
        input_features: torch.FloatTensor,
        attention_mask: Optional[torch.LongTensor] = None,
    ):
        """
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        """
        # `config.apply_spec_augment` can set masking to False
        if not getattr(self.config, "apply_spec_augment", True):
            return input_features

        batch_size, hidden_size, sequence_length = input_features.size()

        if self.config.mask_time_prob > 0 and self.training:
            # generate indices & apply SpecAugment along time axis
            mask_time_indices = _compute_mask_indices(
                (batch_size, sequence_length),
                mask_prob=self.config.mask_time_prob,
                mask_length=self.config.mask_time_length,
                attention_mask=attention_mask,
                min_masks=self.config.mask_time_min_masks,
            )
            mask_time_indices = torch.tensor(mask_time_indices, device=input_features.device, dtype=torch.bool)
            mask_time_indices = mask_time_indices[:, None].expand(-1, hidden_size, -1)
            input_features[mask_time_indices] = 0

        if self.config.mask_feature_prob > 0 and self.training:
            # generate indices & apply SpecAugment along feature axis
            mask_feature_indices = _compute_mask_indices(
                (batch_size, hidden_size),
                mask_prob=self.config.mask_feature_prob,
                mask_length=self.config.mask_feature_length,
                min_masks=self.config.mask_feature_min_masks,
            )
            mask_feature_indices = torch.tensor(mask_feature_indices, device=input_features.device, dtype=torch.bool)
            input_features[mask_feature_indices] = 0

        return input_features

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Union[EncoderDecoderCache, Tuple[torch.FloatTensor]]] = None,
        decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None,
        decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Seq2SeqModelOutput:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        decoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_values,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
        elif not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs.last_hidden_state,
            encoder_attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            position_ids=decoder_position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
        )

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")
    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids
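

# Illustrative sketch, not part of the upstream module: a worked example of `shift_tokens_right`,
# which builds decoder inputs from labels during training.
def _example_shift_tokens_right():  # pragma: no cover
    labels = torch.tensor([[5, 6, -100, -100]])
    shifted = shift_tokens_right(labels, pad_token_id=0, decoder_start_token_id=1)
    # the start token is prepended, the last label is dropped, and remaining ignored (-100)
    # positions are replaced by the pad token id
    assert shifted.tolist() == [[1, 5, 6, 0]]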


@auto_docstring(
    custom_intro="""
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    """
)
class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["proj_out.weight"]

    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.model = MoonshineModel(config)
        self.proj_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.model.get_encoder()

    def get_decoder(self):
        return self.model.get_decoder()

    def get_output_embeddings(self):
        return self.proj_out

    def set_output_embeddings(self, new_embeddings):
        self.proj_out = new_embeddings

    def get_input_embeddings(self) -> nn.Module:
        return self.model.get_input_embeddings()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Union[EncoderDecoderCache, Tuple[torch.FloatTensor]]] = None,
        decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None,
        decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
    ) -> Seq2SeqLMOutput:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        decoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
            or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
            only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```"""
        if labels is not None:
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs: Seq2SeqModelOutput = self.model(
            input_values,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            past_key_values=past_key_values,
            decoder_inputs_embeds=decoder_inputs_embeds,
            decoder_position_ids=decoder_position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
        )
        logits = self.proj_out(outputs.last_hidden_state)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)

        return Seq2SeqLMOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


__all__ = ["MoonshineModel", "MoonshinePreTrainedModel", "MoonshineForConditionalGeneration"]