from typing import Callable, Optional, Tuple, Union

import torch
import torch.nn as nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...configuration_utils import PretrainedConfig
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPast,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
)
from ...modeling_rope_utils import rope_config_validation
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, logging
from ..glm.modeling_glm import GlmAttention, GlmRotaryEmbedding, apply_rotary_pos_emb
from ..llama.modeling_llama import LlamaDecoderLayer, LlamaModel, eager_attention_forward
from ..whisper.modeling_whisper import WhisperModel, shift_tokens_right


logger = logging.get_logger(__name__)


class MoonshineConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MoonshineModel`]. It is used to instantiate a Moonshine
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Moonshine
    [UsefulSensors/moonshine-tiny](https://huggingface.co/UsefulSensors/moonshine-tiny).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32768):
            Vocabulary size of the Moonshine model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MoonshineModel`].
        hidden_size (`int`, *optional*, defaults to 288):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 1152):
            Dimension of the MLP representations.
        encoder_num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer encoder.
        decoder_num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer decoder.
        encoder_num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        encoder_num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `encoder_num_key_value_heads=encoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `encoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by mean-pooling all the original heads within that group. For more details, check out [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        decoder_num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `decoder_num_key_value_heads=decoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `decoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by mean-pooling all the original heads within that group. For more details, check out [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `decoder_num_attention_heads`.
        pad_head_dim_to_multiple_of (`int`, *optional*):
            Pad head dimension in encoder and decoder to the next multiple of this value. Necessary for using certain
            optimized attention implementations.
        encoder_hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder.
        decoder_hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        decoder_start_token_id (`int`, *optional*, defaults to 1):
            Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids`
            are provided to the `generate` function. It is used to guide the model's generation process depending on
            the task.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
            and expect the model to work on a longer `max_position_embeddings`, we recommend updating this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to the value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        partial_rotary_factor (`float`, *optional*, defaults to 0.9):
            Percentage of the query and keys which will have rotary embedding.
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether the model is used as an encoder/decoder or not.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        bos_token_id (`int`, *optional*, defaults to 1):
            Denotes the beginning-of-sequence token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            Denotes the end-of-sequence token id.

    Example:

    ```python
    >>> from transformers import MoonshineModel, MoonshineConfig

    >>> # Initializing a Moonshine style configuration
    >>> configuration = MoonshineConfig().from_pretrained("UsefulSensors/moonshine-tiny")

    >>> # Initializing a model from the configuration
    >>> model = MoonshineModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "moonshine"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_key_value_heads": "encoder_num_key_value_heads",
        "num_attention_heads": "encoder_num_attention_heads",
        "num_hidden_layers": "encoder_num_hidden_layers",
    }

    def __init__(
        self,
        vocab_size=32768,
        hidden_size=288,
        intermediate_size=1152,
        encoder_num_hidden_layers=6,
        decoder_num_hidden_layers=6,
        encoder_num_attention_heads=8,
        decoder_num_attention_heads=8,
        encoder_num_key_value_heads=None,
        decoder_num_key_value_heads=None,
        pad_head_dim_to_multiple_of=None,
        encoder_hidden_act="gelu",
        decoder_hidden_act="silu",
        max_position_embeddings=512,
        initializer_range=0.02,
        decoder_start_token_id=1,
        use_cache=True,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=0.9,
        is_encoder_decoder=True,
        attention_bias=False,
        attention_dropout=0.0,
        bos_token_id=1,
        eos_token_id=2,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.encoder_num_hidden_layers = encoder_num_hidden_layers
        self.decoder_num_hidden_layers = decoder_num_hidden_layers
        self.encoder_num_attention_heads = encoder_num_attention_heads
        self.decoder_num_attention_heads = decoder_num_attention_heads

        if encoder_num_key_value_heads is None:
            encoder_num_key_value_heads = encoder_num_attention_heads
        self.encoder_num_key_value_heads = encoder_num_key_value_heads

        if decoder_num_key_value_heads is None:
            decoder_num_key_value_heads = decoder_num_attention_heads
        self.decoder_num_key_value_heads = decoder_num_key_value_heads

        self.pad_head_dim_to_multiple_of = pad_head_dim_to_multiple_of
        self.encoder_hidden_act = encoder_hidden_act
        self.decoder_hidden_act = decoder_hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.decoder_start_token_id = decoder_start_token_id
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.partial_rotary_factor = partial_rotary_factor
        self.is_encoder_decoder = is_encoder_decoder
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        # Validate the correctness of the rotary position embedding parameters
        rope_config_validation(self)

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            **kwargs,
        )


class MoonshineEncoderMLP(nn.Module):
    def __init__(self, config, hidden_act):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class MoonshineDecoderMLP(nn.Module):
    def __init__(self, config, hidden_act):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size * 2)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states, gate = hidden_states.chunk(2, dim=-1)
        hidden_states = self.activation_fn(gate) * hidden_states
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class MoonshineAttention(GlmAttention):
    def __init__(
        self,
        config: MoonshineConfig,
        layer_idx: int,
        is_causal: bool,
        num_attention_heads: int,
        num_key_value_heads: int,
    ):
        config.update({"num_attention_heads": num_attention_heads, "num_key_value_heads": num_key_value_heads})
        super().__init__(config, layer_idx)
        self.is_causal = is_causal
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)

        # Pad the head dimension to the next multiple of `pad_head_dim_to_multiple_of`, as required by
        # some optimized attention implementations.
        if self.config.pad_head_dim_to_multiple_of is not None:
            target_multiple = self.config.pad_head_dim_to_multiple_of
            target_head_dim = target_multiple * ((self.head_dim + target_multiple - 1) // target_multiple)
            self.head_dim_padding = target_head_dim - self.head_dim
        else:
            self.head_dim_padding = 0

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        key_value_states: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len = hidden_states.shape[:-1]

        query_states = (
            self.q_proj(hidden_states).view(bsz, q_len, self.config.num_attention_heads, self.head_dim).transpose(1, 2)
        )

        is_cross_attention = key_value_states is not None
        if past_key_value is not None:
            is_updated = past_key_value.is_updated.get(self.layer_idx)
            if is_cross_attention:
                # after the first generated id, we can subsequently re-use all key/value states from the cache
                past_key_value.is_updated[self.layer_idx] = True
                past_key_value = past_key_value.cross_attention_cache
            else:
                past_key_value = past_key_value.self_attention_cache

        # use key_value_states if cross attention
        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # re-use cached cross-attention keys/values
            key_states = past_key_value.key_cache[self.layer_idx]
            value_states = past_key_value.value_cache[self.layer_idx]
        else:
            key_states = (
                self.k_proj(current_states).view(bsz, -1, self.config.num_key_value_heads, self.head_dim).transpose(1, 2)
            )
            value_states = (
                self.v_proj(current_states).view(bsz, -1, self.config.num_key_value_heads, self.head_dim).transpose(1, 2)
            )
            if is_cross_attention and past_key_value is not None:
                key_states, value_states = past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )

        if not is_cross_attention:
            cos, sin = position_embeddings
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
            if past_key_value is not None:
                cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
                key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False

        if self.head_dim_padding > 0:
            query_states = torch.nn.functional.pad(query_states, (0, self.head_dim_padding))
            key_states = torch.nn.functional.pad(key_states, (0, self.head_dim_padding))
            value_states = torch.nn.functional.pad(value_states, (0, self.head_dim_padding))

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            is_causal=is_causal,
            **kwargs,
        )

        if self.head_dim_padding > 0:
            attn_output = attn_output[..., : -self.head_dim_padding]

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class MoonshineRotaryEmbedding(GlmRotaryEmbedding):
    pass


class MoonshineEncoderLayer(LlamaDecoderLayer):
    def __init__(self, config: MoonshineConfig, layer_idx: int):
        super().__init__(config, layer_idx)

        self.self_attn = MoonshineAttention(
            config=config,
            layer_idx=layer_idx,
            is_causal=False,
            num_attention_heads=config.encoder_num_attention_heads,
            num_key_value_heads=config.encoder_num_key_value_heads,
        )
        self.mlp = MoonshineEncoderMLP(config, config.encoder_hidden_act)
        self.input_layernorm = nn.LayerNorm(config.hidden_size, bias=False)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, bias=False)


class MoonshineDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: MoonshineConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = MoonshineAttention(
            config=config,
            layer_idx=layer_idx,
            is_causal=True,
            num_attention_heads=config.decoder_num_attention_heads,
            num_key_value_heads=config.decoder_num_key_value_heads,
        )
        self.encoder_attn = MoonshineAttention(
            config=config,
            layer_idx=layer_idx,
            is_causal=False,
            num_attention_heads=config.decoder_num_attention_heads,
            num_key_value_heads=config.decoder_num_key_value_heads,
        )

        self.mlp = MoonshineDecoderMLP(config, config.decoder_hidden_act)
        self.input_layernorm = nn.LayerNorm(config.hidden_size, bias=False)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, bias=False)
        self.final_layernorm = nn.LayerNorm(config.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        encoder_position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        encoder_position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Cross attention
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states
            hidden_states = self.post_attention_layernorm(hidden_states)
            hidden_states, cross_attn_weights = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )
            hidden_states = residual + hidden_states

        # Fully connected
        residual = hidden_states
        hidden_states = self.final_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        return outputs


@auto_docstring
class MoonshinePreTrainedModel(PreTrainedModel):
    config_class = MoonshineConfig
    base_model_prefix = "model"
    main_input_name = "input_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MoonshineEncoderLayer", "MoonshineDecoderLayer"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_static_cache = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (nn.GroupNorm, nn.LayerNorm)):
            module.weight.data.fill_(1.0)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
        """
        Computes the output length of the convolutional layers
        """
        output_conv1_length = int((input_lengths - 127) / 64 + 1)
        output_conv2_length = int((output_conv1_length - 7) / 3 + 1)
        output_conv3_length = int((output_conv2_length - 3) / 2 + 1)
        return output_conv3_length


class MoonshineEncoder(MoonshinePreTrainedModel):
    """
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    """

    main_input_name = "input_values"

    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.conv1 = nn.Conv1d(1, embed_dim, kernel_size=127, stride=64, bias=False)
        self.conv2 = nn.Conv1d(embed_dim, 2 * embed_dim, kernel_size=7, stride=3)
        self.conv3 = nn.Conv1d(2 * embed_dim, embed_dim, kernel_size=3, stride=2)
        self.groupnorm = nn.GroupNorm(num_groups=1, num_channels=embed_dim, eps=1e-5)
        self.rotary_emb = MoonshineRotaryEmbedding(config=config)
        self.layers = nn.ModuleList(
            [MoonshineEncoderLayer(config, idx) for idx in range(config.encoder_num_hidden_layers)]
        )
        self.layer_norm = nn.LayerNorm(embed_dim, bias=False)

        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.conv1

    def set_input_embeddings(self, value: nn.Module):
        self.conv1 = value

    @can_return_tuple
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutput:
        r"""
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
                tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
                more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        if input_values is None:
            raise ValueError("You must specify input_values.")

        # downsample the raw waveform with the convolutional frontend
        input_values = input_values.unsqueeze(1)
        hidden_states = nn.functional.tanh(self.conv1(input_values))
        hidden_states = self.groupnorm(hidden_states)
        hidden_states = nn.functional.gelu(self.conv2(hidden_states))
        hidden_states = nn.functional.gelu(self.conv3(hidden_states))
        hidden_states = hidden_states.permute(0, 2, 1)

        # downsample the attention mask to match the feature frames
        if attention_mask is not None:
            mask_len = self._get_feat_extract_output_lengths(attention_mask.shape[-1])
            downsample_stride = 64 * 3 * 2  # combined stride of conv1, conv2 and conv3
            attention_mask = attention_mask[..., ::downsample_stride][..., :mask_len]
            if self.config._attn_implementation == "flash_attention_2":
                attention_mask = attention_mask if (attention_mask == 0.0).any() else None
            elif self.config._attn_implementation == "sdpa" and not output_attentions:
                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, hidden_states.dtype)
            else:
                attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        position_ids = torch.arange(0, hidden_states.shape[1], device=hidden_states.device).unsqueeze(0)
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for encoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                output_attentions=output_attentions,
                position_embeddings=position_embeddings,
                **flash_attn_kwargs,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.layer_norm(hidden_states)
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


class MoonshineDecoder(LlamaModel):
    main_input_name = "input_ids"

    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.norm = nn.LayerNorm(config.hidden_size, bias=False)
        self.layers = nn.ModuleList(
            [MoonshineDecoderLayer(config, layer_idx) for layer_idx in range(config.decoder_num_hidden_layers)]
        )

    @can_return_tuple
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            self_attention_cache = DynamicCache()
            cross_attention_cache = DynamicCache()
            past_key_values = EncoderDecoderCache(self_attention_cache, cross_attention_cache)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if output_attentions and encoder_hidden_states is not None else None

        # downsample the encoder attention mask to match the audio feature frames
        if encoder_attention_mask is not None:
            mask_len = encoder_hidden_states.shape[-2]
            downsample_stride = 64 * 3 * 2  # combined stride of the encoder convolutions
            encoder_attention_mask = encoder_attention_mask[..., ::downsample_stride][..., :mask_len]
            if self.config._attn_implementation == "flash_attention_2":
                encoder_attention_mask = encoder_attention_mask if (encoder_attention_mask == 0.0).any() else None
            elif self.config._attn_implementation == "sdpa" and not output_attentions:
                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=inputs_embeds.shape[-2]
                )
            else:
                encoder_attention_mask = _prepare_4d_attention_mask(
                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=inputs_embeds.shape[-2]
                )

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **flash_attn_kwargs,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)
                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        hidden_states = self.norm(hidden_states)
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


class MoonshineModel(WhisperModel):
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Union[EncoderDecoderCache, Tuple[torch.FloatTensor]]] = None,
        decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None,
        decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Seq2SeqModelOutput:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        decoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        N)rz   r   r   r   r6   r   r   )r   rz   r   r   r'   r   r   rJ   r   r   r|   )r   r'   decoder_hidden_statesdecoder_attentionsr   encoder_last_hidden_stater   encoder_attentions)r`   r   r   rJ   encoderr   r   lendecoderr   r   r'   rg   r   r   )rR   r   rz   r   r   r   r'   r   r   rJ   r   r   r|   Zdecoder_outputsr>   r>   rV   ri   ;  sP   P
zMoonshineModel.forward)NNNNNNNNNNNN)rW   rX   rY   r   r   r   rk   r   r   r   r   r
   r   r   ri   r>   r>   r>   rV   r   :  sT    	
r   zj
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    """
)
class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["proj_out.weight"]

    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.model = MoonshineModel(config)
        self.proj_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.model.get_encoder()

    def get_decoder(self):
        return self.model.get_decoder()

    def get_output_embeddings(self):
        return self.proj_out

    def set_output_embeddings(self, new_embeddings):
        self.proj_out = new_embeddings

    def get_input_embeddings(self) -> nn.Module:
        return self.model.get_input_embeddings()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Union[EncoderDecoderCache, Tuple[torch.FloatTensor]]] = None,
        decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None,
        decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
    ) -> Seq2SeqLMOutput:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        decoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
            or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
            only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```"""
        if labels is not None:
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs: Seq2SeqModelOutput = self.model(
            input_values,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            decoder_inputs_embeds=decoder_inputs_embeds,
            decoder_position_ids=decoder_position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
        )
        logits = self.proj_out(outputs.last_hidden_state)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)

        return Seq2SeqLMOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


__all__ = [
    "MoonshineConfig",
    "MoonshinePreTrainedModel",
    "MoonshineModel",
    "MoonshineForConditionalGeneration",
]