
    fThr                    <   S SK JrJrJrJr  S SKrS SKrS SKJ	r	  SSK
Jr  SSKJrJrJr  SSKJr  SSKJrJrJr  SSKJr  SS	KJr  SS
KJrJrJrJrJr  SSK J!r!J"r"  SSK#J$r$J%r%  SSK&J'r'  SSK(J)r)J*r*J+r+J,r,  SSK-J.r.  \+" 5       (       a  S SK/J0r0  SSK1J2r2  \,Rf                  " \45      r5 " S S\	Rl                  5      r7 " S S\	Rl                  5      r8S\Rr                  S\:S\Rr                  4S jr; SCS\	Rl                  S\Rr                  S\Rr                  S\Rr                  S\\Rr                     S \<S!\<4S" jjr=S# r>SDS$ jr? " S% S&\	Rl                  5      r@ " S' S(\	Rl                  5      rA " S) S*\5      rB " S+ S,\5      rC\) " S- S.\%5      5       rD " S/ S0\D5      rE\) " S1 S2\D5      5       rF  SES3\\:\:4   S4\<S5\:S\\R                     S6\:S\R                  4S7 jjrI\) " S8 S9\D5      5       rJS:\Rr                  S;\:S<\:4S= jrK\)" S>S?9 " S@ SA\D\5      5       rL/ SBQrMg)F    )CallableOptionalTupleUnionN   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torch_flex_attn_availablelogging   )MoonshineConfig)	BlockMask)make_flex_block_causal_maskc                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MoonshineEncoderMLP<   c                 
  > [         TU ]  5         Xl        [        U   U l        [
        R                  " UR                  UR                  5      U l	        [
        R                  " UR                  UR                  5      U l
        g Nsuper__init__configr   activation_fnnnLinearhidden_sizeintermediate_sizefc1fc2selfr,   
hidden_act	__class__s      h/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/moonshine/modeling_moonshine.pyr+   MoonshineEncoderMLP.__init__=   s\    #J/99V//1I1IJ99V55v7I7IJ    hidden_statesreturnc                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r(   )r2   r-   r3   )r5   r;   s     r8   forwardMoonshineEncoderMLP.forwardD   s4    /**=9/r:   r-   r,   r2   r3   
__name__
__module____qualname____firstlineno__r+   torchTensorr>   __static_attributes____classcell__r7   s   @r8   r%   r%   <   s)    KU\\ ell  r:   r%   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MoonshineDecoderMLPK   c                   > [         TU ]  5         Xl        [        U   U l        [
        R                  " UR                  UR                  S-  5      U l	        [
        R                  " UR                  UR                  5      U l
        g )N   r)   r4   s      r8   r+   MoonshineDecoderMLP.__init__L   sa    #J/99V//1I1IA1MN99V55v7I7IJr:   r;   r<   c                     U R                  U5      nUR                  SSS9u  pU R                  U5      U-  nU R                  U5      nU$ )NrO   dim)r2   chunkr-   r3   )r5   r;   gates      r8   r>   MoonshineDecoderMLP.forwardS   sQ    /+11!1<**40=@/r:   r@   rA   rJ   s   @r8   rL   rL   K   s)    KU\\ ell  r:   rL   r;   n_repr<   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r    N)shapeexpandreshape)r;   rX   batchnum_key_value_headsslenhead_dims         r8   	repeat_kvra   [   s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr:   modulequerykeyvalueattention_maskscalingdropoutc                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )NrO   r   rR   )rT   dtype)ptrainingr    )ra   num_key_value_groupsrF   matmul	transposerZ   r.   
functionalsoftmaxfloat32tork   rh   rm   
contiguous)rb   rc   rd   re   rf   rg   rh   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r8   eager_attention_forwardr|   g   s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r:   c                 x    U SSSS24   nU SSSS24   n[         R                  " U* U4SS9R                  S5      $ )	z*Rotates half the hidden dims of the input..r   NrO   r    rR   rS   rj   )rF   stackflatten)xx1x2s      r8   rotate_halfr      sJ    	
319B	
319B;;Ryb)11"55r:   c                    UR                  U5      nUR                  U5      nUSSUR                  S   S-  24   R                  SSS9nUSSUR                  S   S-  24   R                  SSS9nUR                  S   nU SSU24   U SUS24   pUSSU24   USUS24   pXr-  [        U5      U-  -   nX-  [        U	5      U-  -   n[        R
                  " X/SS9n[        R
                  " X/SS9nX4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
.NrR   rO   rS   )	unsqueezerZ   repeat_interleaver   rF   cat)qkcossinposition_idsunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r8   apply_rotary_pos_embr      s6   ( --
&C
--
&C c'SYYr]a'''
(
:
:1"
:
EC
c'SYYr]a'''
(
:
:1"
:
EC 2Jc;J;&'3
+;)<6c;J;&'3
+;)<6 {{51C78G{{51C78G ii)r2Gii)r2Gr:   c                   |  ^  \ rS rSrSrS\S\S\S\S\4
U 4S jjr     SS	\	R                  S
\\\	R                  \	R                  4      S\\	R                     S\\   S\\	R                     S\\	R                     S\\   S\\	R                  \\	R                     \\\	R                        4   4S jjrSrU =r$ )MoonshineAttention   z=Multi-headed attention from 'Attention Is All You Need' paperr,   	layer_idx	is_causalnum_attention_headsr^   c                   > [         TU ]  5         UR                  XES.5        Xl        X l        [        USUR                  UR                  -  5      U l        UR                  UR                  -  U l
        U R                  S-  U l        UR                  U l        X0l        [        R                  " UR                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  U R                  -  UR                  SS9U l        U R                  R*                  bA  U R                  R*                  nX`R                  U-   S-
  U-  -  nXpR                  -
  U l        g SU l        g )N)r   r^   r`   g      ࿩biasFr    r   )r*   r+   updater,   r   getattrr0   r   r`   r^   rn   rg   attention_dropoutr   r.   r/   attention_biasq_projk_projv_projo_projpad_head_dim_to_multiple_ofhead_dim_padding)	r5   r,   r   r   r   r^   target_multipletarget_head_dimr7   s	           r8   r+   MoonshineAttention.__init__   s    	.Ano"
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9"ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JFL^L^ejk ;;22>"kkEEO---/2QTU2UZi1ijO$3mm$CD!$%D!r:   r;   position_embeddingsrf   past_key_valuecache_positionkey_value_statesrv   r<   c                    UR                   S S u  pU R                  U5      R                  XU R                  R                  U R
                  5      R                  SS5      n
US LnUb^  UR                  R                  U R                  5      nU(       a&  SUR                  U R                  '   UR                  nOUR                  nUb  UOUnU(       aA  U(       a:  W(       a3  UR                  U R                     nUR                  U R                     nOU R                  U5      R                  USU R                  R                  U R
                  5      R                  SS5      nU R                  U5      R                  USU R                  R                  U R
                  5      R                  SS5      nU(       a$  Ub!  UR!                  XU R                  SU05      u  pU(       d<  Uu  nn[#        XUU5      u  pUb%  UUUS.nUR!                  XU R                  U5      u  p[$        nU R                  R&                  S:w  ad  U R                  R&                  S:X  a-  UR                  S	S
5      (       a  [(        R+                  S5        O[,        U R                  R&                     nU R.                  (       a  Uc  U	S:  a  SOS
nU R0                  S:  a  [2        R4                  R6                  R9                  U
SU R0                  45      n
[2        R4                  R6                  R9                  USU R0                  45      n[2        R4                  R6                  R9                  USU R0                  45      nU" U U
UUU4U R:                  (       d  SOU R<                  U R>                  US.UD6u  nnU R0                  S:  a  USS U R0                  * 24   nURA                  XS5      RC                  5       nU RE                  U5      nUU4$ )NrR   r    rO   Tr   )r   r   r   eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r           )rh   rg   r   .)#rZ   r   viewr,   r^   r`   rp   
is_updatedgetr   cross_attention_cacheself_attention_cache	key_cachevalue_cacher   r   r   r   r|   _attn_implementationloggerwarning_oncer   r   r   rF   r.   rq   padrm   r   rg   r\   ru   r   )r5   r;   r   rf   r   r   r   rv   bszq_lenquery_statesis_cross_attentionr   current_statesrw   rx   r   r   cache_kwargsattention_interfacer   r{   ry   s                          r8   r>   MoonshineAttention.forward   s    #(("-
 KK&++C8W8WY]YfYfgqqrsuvw 	 .T9%'2266t~~FJ!<@))$..9!/!E!E!/!D!D .>-I)}.Z'11$..AJ)55dnnEL N+c2t{{>>N1a  N+c2t{{>>N1a 
 "n&@+9+@+@dnn?OQ_>`,(
 "*HC';LVY[^'_$L)'*3.Y+9+@+@dnnl,(
 )@;;++w6{{//69fjjI\^c>d>d##L
 '>dkk>^>^&_# NN~/E%RS)DY^	  1$ 88..22<!TEZEZA[\L,,00aAVAV=WXJ 88..22<!TEZEZA[\L$7
%
  $}}C$2H2HLL
%
 
%
!\   1$%c+Cd.C.C-C+C&CDK!))#b9DDFkk+.L((r:   )r   r,   r`   r   r   r   r   rn   r   r   rg   r   )NNNNN)rB   rC   rD   rE   __doc__r!   intboolr+   rF   rG   r   r   r	   
LongTensorr   r   r>   rH   rI   rJ   s   @r8   r   r      s   G#&#& #& 	#&
 !#& !#&P LP15*.5937[)||[) &eELL%,,,F&GH[) !.	[)
 ![) !!1!12[) #5<<0[) -.[) 
u||Xell3XeELL>Q5RR	S[) [)r:   r   c                   l   ^  \ rS rSrSS\4U 4S jjjr\R                  " 5       \S 5       5       r	Sr
U =r$ )MoonshineRotaryEmbeddingi8  r,   c                   > [         TU ]  5         [        US5      (       aH  UR                  b;  UR                  R	                  SUR                  R	                  S5      5      U l        OSU l        UR                  U l        UR                  U l        Xl	        [        U R
                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                  U l        g )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)r*   r+   hasattrr   r   r   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr,   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r5   r,   devicer   r7   s       r8   r+   !MoonshineRotaryEmbedding.__init__9  s    6>**v/B/B/N#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r:   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   rR   r    mpscpuF)device_typeenabledrO   rS   rk   )r   floatr[   rZ   rt   r   
isinstancer   strrF   autocastrp   r   r   r   r   rk   )
r5   r   r   inv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r8   r>    MoonshineRotaryEmbedding.forwardJ  sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)r   r,   r   r   r   r   r   r(   )rB   rC   rD   rE   r!   r+   rF   no_gradr   r>   rH   rI   rJ   s   @r8   r   r   8  s6    / / /" ]]_<  <r:   r   c                     ^  \ rS rSrS\S\4U 4S jjr       SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\	\R                     S\	\\R                  \R                  4      S\\   S\\R                   \	\\R                   \R                   4      4   4S jjrSrU =r$ )MoonshineEncoderLayeriZ  r,   r   c                 T  > [         TU ]  5         UR                  U l        [        UUSUR                  UR
                  S9U l        [        XR                  5      U l	        [        R                  " UR                  SS9U l        [        R                  " UR                  SS9U l        g )NFr,   r   r   r   r^   r   )r*   r+   r0   r   encoder_num_attention_headsencoder_num_key_value_heads	self_attnr%   encoder_hidden_actmlpr.   	LayerNorminput_layernormpost_attention_layernormr5   r,   r   r7   s      r8   r+   MoonshineEncoderLayer.__init__[  s    !--+ & B B & B B
 'v/H/HI!||F,>,>UK(*V5G5Ge(T%r:   r;   rf   r   r   r   	use_cacher   r   rv   r<   c	                     Un
U R                  U5      nU R                  " SUUUUUUUUS.U	D6u  pX-   nUn
U R                  U5      nU R                  U5      nX-   nU4nU(       a  X4-  nU$ )Nr;   rf   r   r   r   r   r   r    )r   r   r   r   )r5   r;   rf   r   r   r   r   r   r   rv   residualself_attn_weightsoutputss                r8   r>   MoonshineEncoderLayer.forwardk  s     !,,]; ,0>> 
,
')%)/) 3
,
 
,
( !0 !55mD/ 0 "++Gr:   )r0   r   r   r   r   )NNNFFNN)rB   rC   rD   rE   r!   r   r+   rF   rG   r   r   r	   r   r   r   r   FloatTensorr>   rH   rI   rJ   s   @r8   r   r   Z  s   U U3 U& 2637*.,1$)59KO'||' !.' u//0	'
 !' $D>' D>' !!1!12' &eELL%,,,F&GH' -.' 
u  (51B1BEDUDU1U+V"WW	X' 'r:   r   c                      ^  \ rS rSrSS\S\\   4U 4S jjjr           SS\R                  S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\   S\\   S\\   S\\R                     S\\\R                  \R                  4      S\\\R                  \R                  4      S\\R                  \\\R                  \R                  4      4   4S jjrSrU =r$ )MoonshineDecoderLayeri  r,   r   c                   > [         TU ]  5         UR                  U l        [        UUSUR                  UR
                  S9U l        [        UUSUR                  UR
                  S9U l        [        XR                  5      U l
        [        R                  " UR                  SS9U l        [        R                  " UR                  SS9U l        [        R                  " UR                  SS9U l        g )NTr   Fr   )r*   r+   r0   r   decoder_num_attention_headsdecoder_num_key_value_headsr   encoder_attnrL   decoder_hidden_actr   r.   r   r   r   final_layernormr   s      r8   r+   MoonshineDecoderLayer.__init__  s    !--+ & B B & B B
 / & B B & B B
 'v/H/HI!||F,>,>UK(*V5G5Ge(T%!||F,>,>UKr:   r;   rf   encoder_hidden_statesencoder_attention_maskr   encoder_position_idsr   r   r   r   r   encoder_position_embeddingsr<   c                 F   UnU R                  U5      nU R                  " SUUUUUU	U
US.UD6u  pX-   nS nUb.  UnU R                  U5      nU R                  UUUUUU	S9u  nnX-   nUnU R	                  U5      nU R                  U5      nX-   nU4nU(       a  UUU4-  nU$ )Nr   )r;   r   rf   r   r   r   r   )r   r   r   r  r  r   )r5   r;   rf   r  r  r   r  r   r   r   r   r   r  rv   r  r  cross_attn_weightsr  s                     r8   r>   MoonshineDecoderLayer.forward  s     !,,]; ,0>> 
,
')%)/) 3
,
 
,
( !0 " ,$H 99-HM040A0A+!65-"3# 1B 1-M- %4M !,,];/ 0 ")+=>>Gr:   )r  r  r0   r   r   r   r   r(   )NNNNNNFFNNN)rB   rC   rD   rE   r!   r   r   r+   rF   rG   r   r	   r   r   r  r>   rH   rI   rJ   s   @r8   r  r    su   L L8C= L L6 268<9=37;?*.,1$)59KOSW<||< !.<  (5	<
 !) 6< u//0< 'u'7'78< !< $D>< D>< !!1!12< &eELL%,,,F&GH< &.eELL%,,4N.O%P< 
u  (51B1BEDUDU1U+V"WW	X< <r:   r  c                   d    \ rS rSr\rSrSrSrSS/r	Sr
SrSrSrS rS\R                   4S	 jrS
rg)MoonshinePreTrainedModeli  modelinput_valuesTr   r  c                 P   U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  [        R                  45      (       aX  UR                  R                  R                  S5        UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR                  R                  R                  SUS9  UR                  b2  UR                  R                  UR                     R                  5         g g g )Nr   )meanstdg      ?)r,   initializer_ranger   r.   r/   Conv1dweightdatanormal_r   zero_	GroupNormr   fill_	Embeddingpadding_idx)r5   rb   r  s      r8   _init_weights&MoonshinePreTrainedModel._init_weights  s)   kk++fryy"))455MM&&CS&9{{&  &&( 'r|| <==MM$$S){{&  &&( '--MM&&CS&9!!-""6#5#56<<> . .r:   input_lengthsc                 ~    [        US-
  S-  S-   5      n[        US-
  S-  S-   5      n[        US-
  S-  S-   5      nU$ )z8
Computes the output length of the convolutional layers
   @   r       r   rO   )r   )r5   r)  output_conv1_lengthoutput_conv2_lengthoutput_conv3_lengths        r8    _get_feat_extract_output_lengths9MoonshinePreTrainedModel._get_feat_extract_output_lengths  sZ     "=3#6""<q"@A!#6#:a"?!"CD!#6#:a"?!"CD""r:   r   N)rB   rC   rD   rE   r!   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_flash_attn_2_supports_sdpa_supports_cache_class_supports_static_cacher'  rF   r   r1  rH   r   r:   r8   r  r    sR    "L$O&*#02IJ!N !?#e>N>N #r:   r  c                      ^  \ rS rSrSrSrS\4U 4S jjrS\R                  4S jr
S\R                  4S	 jr\    SS\\R                     S
\\R                      S\\   S\\   S\\   S\4S jj5       rSrU =r$ )MoonshineEncoderi  z
Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

Args:
    config: MoonshineConfig
r  r,   c           	      L  > [         TU ]  U5        Xl        UR                  n[        R
                  " SUSSSS9U l        [        R
                  " USU-  SSS	9U l        [        R
                  " SU-  USSS	9U l        [        R                  " SUS
S9U l
        [        US9U l        [        R                  " [        UR                  5       Vs/ s H  n[!        X5      PM     sn5      U l        [        R$                  " USS9U l        SU l        U R+                  5         g s  snf )Nr    r+  r,  F)kernel_sizestrider   rO   r-  r   )r?  r@  gh㈵>)
num_groupsnum_channelsepsr,   r   )r*   r+   r,   r0   r.   r  conv1conv2conv3r#  	groupnormr   
rotary_emb
ModuleListrangeencoder_num_hidden_layersr   layersr   
layer_normgradient_checkpointing	post_init)r5   r,   	embed_dimidxr7   s       r8   r+   MoonshineEncoder.__init__  s     &&	YYq)ReT
YYy!i-QqQ
YYq9}iQqQ
PTU2&Amm;@AaAa;bc;bC"6/;bc
 ,,yu=&+# ds   D!r<   c                     U R                   $ r(   rE  r5   s    r8   get_input_embeddings%MoonshineEncoder.get_input_embeddings1  s    zzr:   re   c                     Xl         g r(   rU  r5   re   s     r8   set_input_embeddings%MoonshineEncoder.set_input_embeddings4  s    
r:   rf   r   output_hidden_statesflash_attn_kwargsc           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUc  [        S5      eUR	                  S5      n[
        R                  R                  U R                  U5      5      nU R                  U5      n[
        R                  R                  U R                  U5      5      n[
        R                  R                  U R                  U5      5      nUR                  SSS5      nUb  U R                  UR                  S   5      nSnUSSSU24   SSU24   nU R                   R                   S	:X  a  US
:H  R#                  5       (       a  UOSnOLU R                   R                   S:X  a  U(       d  [%        X&R&                  5      nO[)        X&R&                  5      n[*        R,                  " SUR                  S   UR.                  S9R	                  S5      n	U R1                  Xi5      n
U(       a  SOSnU(       a  SOSnU R2                   H3  nU(       a  X4-  nU" U4UU	UU
S.UD6nUS   nU(       d  M+  XS   4-  nM5     U R5                  U5      nU(       a  X4-  n[7        UUUS9$ )a\  
Args:
    input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
        Float values of the raw speech waveform. Raw speech waveform can be
        obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
        `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
        `input_values`, the [`AutoFeatureExtractor`] should be used for padding
        and conversion into a tensor of type `torch.FloatTensor`.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.
        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
        tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
        more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NzYou must specify input_values.r    r   rO   rR     .flash_attention_2r   r   r   r   )rf   r   r   r   last_hidden_stater;   
attentions)r,   r   r]  
ValueErrorr   r.   rq   tanhrE  rH  gelurF  rG  permuter1  rZ   r   anyr   rk   r   rF   aranger   rI  rM  rN  r   )r5   r  rf   r   r]  r^  r;   mask_lendownsample_strider   r   all_hidden_statesall_self_attnsencoder_layerlayer_outputss                  r8   r>   MoonshineEncoder.forward7  s   > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 =>> $--a0**4::l+CD}5**4::m+DE**4::m+DE%--aA6 %<<^=Q=QRT=UVH *+C1D3D1D,DEc9H9nUN{{//3FF4Bc4I3N3N3P3PVZ 11V;DU!D^UhUh!i "<NL_L_!`||A}':':1'=mFZFZ[eefgh #oomJ #7BD0d![[M#!%55!)-)"3$7 $M *!,M  #3"55! )$ 6  !11&++%
 	
r:   )	r,   rE  rF  rG  rO  rH  rN  rM  rI  )NNNN)rB   rC   rD   rE   r   r5  r!   r+   r.   ModulerW  r[  r   r   rF   r  rG   r   r   r   r   r>   rH   rI   rJ   s   @r8   r=  r=    s     %O (bii "))   5915,0/3c
u001c
 !.c
 $D>	c

 'tnc
 $$89c
 
!c
 c
r:   r=  c                   \  ^  \ rS rSrSrS\4U 4S jjrS rS r\	\
           SS\\R                     S\\R                     S\\R                     S	\\   S
\\R                      S\\   S\\   S\\   S\\R                     S\\R                      S\\R                     S\\   S\\\4   4S jj5       5       r SS\\R                  S4   S\R                  S\R                  S	\S\4
S jjr\S\R                  S\S\S\R6                  S\R                  S\4S j5       rSrU =r$ )MoonshineDecoderi  	input_idsr,   c           	      
  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [
        R                  " UR                  SS9U l        [!        US9U l        SU l        U R'                  5         g s  snf )NFr   rD  )r*   r+   pad_token_idr&  
vocab_sizer.   r%  r0   embed_tokensrJ  rK  decoder_num_hidden_layersr  rM  r   normr   rI  rO  rP  )r5   r,   rR  r7   s      r8   r+   MoonshineDecoder.__init__  s     !.. ++LL):):F<N<NPTP`P`amm;@AaAa;bc;bC"6/;bc
 LL!3!3%@	2&A&+# 	 ds   D c                     U R                   $ r(   rz  rV  s    r8   rW  %MoonshineDecoder.get_input_embeddings  s       r:   c                     Xl         g r(   r  rZ  s     r8   r[  %MoonshineDecoder.set_input_embeddings  s    !r:   rf   r   past_key_valuesinputs_embedsr   r   r]  r   r  r  r^  r<   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUSL USL-  (       a  [	        S5      eU R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nU(       a"  Uc  [        5       n[        5       n[        X5      nU	cD  Ub  UR                  5       OSn[        R                  " XUR                  S   -   UR                   S9n	Uc  U	R#                  S5      nU R%                  X%XU5      nUnU R'                  UU5      nU(       a  SOSnU(       a  SOSnU(       a  U
b  SOSnUb  U
R                  S	   nS
nUSSSU24   SSU24   nU R                   R(                  S:X  a  US:H  R+                  5       (       a  UOSnOjU R                   R(                  S:X  a,  U(       d%  [-        UUR.                  UR                  S	   5      nO$[1        UUR.                  UR                  S	   5      nU R2                   HH  nU(       a  UU4-  nU" U4UUU
UUUUU	US.	UD6nUS   nU(       d  M1  UUS   4-  nU
c  M?  UUS   4-  nMJ     U R5                  U5      nU(       a  UU4-  n[7        UU(       a  UOSUUUS9$ )a\  
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
    of the decoder.
encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.
    [What are attention masks?](../glossary#attention-mask)
Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r    rb  r   rj   r`  .ra  r   r   )	rf   r  r  r   r   r   r   r   r   rO   )rd  r  r;   re  cross_attentions)r,   r   r]  r   rf  rO  rm   r   r   rz  r
   r   get_seq_lengthrF   rk  rZ   r   r   _update_causal_maskrI  r   rj  r   rk   r   rM  r|  r   )r5   rv  rf   r   r  r  r   r   r]  r   r  r  r^  r   r   past_seen_tokensrz   r;   r   rn  ro  all_cross_attentionsrl  rm  decoder_layerrq  s                             r8   r>   MoonshineDecoder.forward  s;   6 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M0#/> $0N!12F^O!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L..>L]
 & #oom\J #7BD0d&7<Q<]rdh "-,2226H *%;CATCTAT<T%UVY[d\d[dVd%e"{{//3FFDZ^aDaCfCfChCh)?nr& 11V;DU)L*M,?,?ATATUWAX*&
 *D*M,?,?ATATUWAX*& "[[M#!m%55!)*'=&;)."3#-$7 $M *!,M  =#3"55(4(]1-=,??(1 )4 		-0  -!118+/8Od+%1
 	
r:   r"   input_tensorc           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ )Nra  r   flex_attentionr   Fr   )r  past_key_values_lengthis_trainingr    rR   )sequence_lengthtarget_lengthrk   r   
batch_size)cudaxpunpu)r,   r   rj  r   rF   rG   r#   r  is_compileabler   _ignore_causal_mask_sdparm   rk   rZ   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   r   finfomin_unmask_unattended)r5   rf   r  r   r  r   r  using_compilable_cacherk   r  r  rz   	min_dtypes                r8   r  $MoonshineDecoder._update_causal_mask=  s    ;;++/BB)~/D.I.I.K.K%%;;++/??.%,,77!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCK[Kr:   r  r  rk   r  c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
N   )
fill_valuerk   r   r    )diagonalrb  rR   r   )rT   rF   r  r  fullr   triurk  r\   r[   clonerZ   rt   masked_fill)rf   r  r  rk   r   r  rv   rz   r  mask_lengthpadding_masks              r8   r  FMoonshineDecoder._prepare_4d_causal_attention_mask_with_cache_position  s}   < %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r:   )rz  rO  rM  r|  r&  rI  ry  )NNNNNNNNNNN)F)rB   rC   rD   rE   r5  r!   r+   rW  r[  r   r   r   rF   r   rG   r	   r  r   r   r   r   r   r   r>   r  staticmethodr   rk   r  rH   rI   rJ   s   @r8   ru  ru    s   !O  !"  151537+/59$(,0/359=A9=A
E,,-A
 !.A
 u//0	A

 "%A
   1 12A
 D>A
 $D>A
 'tnA
 !!1!12A
  ((9(9:A
 !) 6A
 $$89A
 
u--	.A
  A
R #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r:   ru  rZ   	mask_probr  	min_masksc           	        ^^^^^ U u  nmTS:  a  [        S5      eTT:  a  [        ST ST S35      e[        R                  R                  S5      R	                  5       mUUUUU4S jnUb-  UR                  5       R                  S5      R                  5       O[        U5       Vs/ s H  nTPM     snn[        R                  " UT4[        S	9n	/ n
U" T5      nUS
:X  a  U	$ U H  nU" U5      n[        R                  R                  [        R                  " UTS-
  -
  5      USS9n[        U5      S
:X  a  TS-
  nOUS
   n[        R                  " U[        R                  " X-
  [        R                   S	9U-  /5      nU
R#                  U5        M     [        R$                  " U
5      n
[        R&                  " U
SS2SS2S4   X[T45      n
U
R)                  X[T-  5      n
[        R                  " T5      SSSS24   n[        R&                  " UX[T45      R)                  X[T-  5      nU
U-   n
U
R+                  5       TS-
  :  a  TS-
  XTS-
  :  '   [        R,                  " XSS5        U	$ s  snf )a*  
Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
CPU as part of the preprocessing during training.

Args:
    shape: The shape for which to compute masks. This should be of a tuple of size 2 where
           the first element is the batch size and the second element is the length of the axis to span.
    mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                independently generated mask spans of length `mask_length` is computed by
                `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                actual percentage will be smaller.
    mask_length: size of the mask
    min_masks: minimum number of masked spans
    attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                    each batch dimension.
r    z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    > [        TU -  T-  T-   5      n[        UT5      nUT-  T:  a  TT-  nU TS-
  -
  U:  a  [        U TS-
  -
  S5      nU$ )z;Given input length, compute how many spans should be maskedr    r   )r   max)input_lengthnum_masked_spanepsilonr  r  r  r  s     r8   compute_num_masked_span6_compute_mask_indices.<locals>.compute_num_masked_span  so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr:   NrR   r   r   F)replace)rf  nprandomranditemdetachsumtolistrK  zerosr   choicerk  lenconcatenateonesint32appendarraybroadcast_tor\   r  put_along_axis)rZ   r  r  rf   r  r  r  _r)  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr  r  spec_aug_mask_idxdummy_mask_idxoffsetsr  r  s    `` `            @@r8   _compute_mask_indicesr    s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89'8!o'89  HHj/:$GM1/Ba%1,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;(MUWU]U] ^ao op
 	!!"34/ &2 "45 1a:&+(V ,33JVa@ab ii$T4]3Goog
'UV^^+5G ,g5 /A"55GVYZGZ!0CCD mB?w :s   (I0c                   &  ^  \ rS rSrS\4U 4S jjrS rS rS rS r	S r
 SS	\R                  S
\\R                     4S jjr\\            SS\\R                     S
\\R                     S\\R                     S\\R                     S\\\\R                           S\\\\\R                     4      S\\\R                        S\\\R                        S\\   S\\   S\\   S\\R                     S\4S jj5       5       rSrU =r$ )MoonshineModeli0  r,   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r(   )r*   r+   r=  encoderru  decoderrP  r5   r,   r7   s     r8   r+   MoonshineModel.__init__2  s2     '/'/r:   c                 .    U R                   R                  $ r(   r  rz  rV  s    r8   rW  #MoonshineModel.get_input_embeddings:  s    ||(((r:   c                 $    XR                   l        g r(   r  rZ  s     r8   r[  #MoonshineModel.set_input_embeddings=  s    $)!r:   c                     U R                   $ r(   )r  rV  s    r8   get_encoderMoonshineModel.get_encoder@      ||r:   c                     U R                   $ r(   )r  rV  s    r8   get_decoderMoonshineModel.get_decoderC  r  r:   c                 8    U R                   R                  5         g)z
Calling this function will disable the gradient computation for the Moonshine encoder so that its parameters will
not be updated during training.
N)r  _freeze_parametersrV  s    r8   freeze_encoderMoonshineModel.freeze_encoderF  s    
 	'')r:   input_featuresrf   c                 2   [        U R                  SS5      (       d  U$ UR                  5       u  p4nU R                  R                  S:  a  U R                  (       a  [        X54U R                  R                  U R                  R                  UU R                  R                  S9n[        R                  " XaR                  [        R                  S9nUSS2S4   R                  SUS5      nSX'   U R                  R                  S:  a  U R                  (       az  [        X44U R                  R                  U R                  R                  U R                  R                  S9n[        R                  " XqR                  [        R                  S9nSX'   U$ )	z
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://arxiv.org/abs/1904.08779).
apply_spec_augmentTr   )r  r  rf   r  )r   rk   NrR   )r  r  r  )r   r,   sizemask_time_probrm   r  mask_time_lengthmask_time_min_masksrF   tensorr   r   r[   mask_feature_probmask_feature_lengthmask_feature_min_masks)r5   r  rf   r  r0   r  mask_time_indicesmask_feature_indicess           r8   _mask_input_features#MoonshineModel._mask_input_featuresM  sN    t{{$8$??!! 4B3F3F3H0
;;%%)dmm 5-++44 KK88-++99! !&->G\G\didndn o 1!T' : A A"kSU V01N-;;((1,#8)++77 KK;;++<<	$  $)<<0DMbMbjojtjt#u 34N0r:   r  decoder_input_idsdecoder_attention_maskencoder_outputsr  decoder_inputs_embedsdecoder_position_idsr   r   r]  r   r<   c                 l   U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	Uc  U R	                  UUU
US9nOK[        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nU R                  UUUUR                  UUUU	U
UUS9n[        UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  S9$ )	a  
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
    Float values of the raw speech waveform. Raw speech waveform can be
    obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
    `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
    `input_values`, the [`AutoFeatureExtractor`] should be used for padding
    and conversion into a tensor of type `torch.FloatTensor`.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
    it.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
decoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.

    [What are attention masks?](../glossary#attention-mask)

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
    information on the default strategy.

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.n_positions - 1]`.

    [What are position IDs?](../glossary#position-ids)

Example:

```python
>>> import torch
>>> from transformers import AutoFeatureExtractor, MoonshineModel
>>> from datasets import load_dataset

>>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
>>> input_values = inputs.input_values
>>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
>>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
>>> list(last_hidden_state.shape)
[1, 2, 288]
```
N)rf   r   r]  r   r    rO   rc  )rv  rf   r  r  r  r  r   r   r   r]  r   )rd  r  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentions)r,   r   r]  r   r  r   r   r  r  rd  r   r  r;   re  r  )r5   r  rf   r  r  r  r  r  r  r   r   r]  r   decoder_outputss                 r8   r>   MoonshineModel.forwardx  s^   ` 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	"/3||-"3%9	 0< 0O O_==-"1!"4474H14Loa0RV14_1E1I?1-tO FJ\\'1#1"1"C"C+/-/!5) FR F
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r:   )r  r  r(   )NNNNNNNNNNNN)rB   rC   rD   rE   r!   r+   rW  r[  r  r  r  rF   r  r   r   r  r   r   r   r   r   r   r   r>   rH   rI   rJ   s   @r8   r  r  0  s    )** 6:)))) !!1!12)V  59598<=AEIZ^DHBF$(,0/359{
u001{
 !!1!12{
 $E$4$45	{

 !))9)9 :{
 "%e.?.?(@"AB{
 "%(;U5CTCT=U(U"VW{
  (e.?.?(@A{
 'uU-=-='>?{
 D>{
 $D>{
 'tn{
 !!1!12{
 
{
  {
r:   r  rv  rx  decoder_start_token_idc                     U R                  U R                  5      nU SS2SS24   R                  5       USS2SS24'   X#SS2S4'   Uc  [        S5      eUR	                  US:H  U5        U$ )z)
Shift input ids one token to the right.
NrR   r    r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosrZ   r  rf  masked_fill_)rv  rx  r  shifted_input_idss       r8   shift_tokens_rightr
    sz     "++IOO<(CRC0668ae4adLMM""#4#<lKr:   zj
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    )custom_introc                   "  ^  \ rS rSrS/rS\4U 4S jjrS rS rS r	S r
S	\R                  4S
 jr\\             SS\\R$                     S\\R&                     S\\R&                     S\\R&                     S\\\\R$                           S\\\\\R$                     4      S\\\R$                        S\\\R&                        S\\   S\\   S\\   S\\R&                     S\\R&                     S	\4S jj5       5       rSrU =r$ )!MoonshineForConditionalGenerationi  zproj_out.weightr,   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g )NFr   )
r*   r+   r  r  r.   r/   r0   ry  proj_outrP  r  s     r8   r+   *MoonshineForConditionalGeneration.__init__  sH     #F+
		&"4"4f6G6GeT 	r:   c                 6    U R                   R                  5       $ r(   )r  r  rV  s    r8   r  -MoonshineForConditionalGeneration.get_encoder      zz%%''r:   c                 6    U R                   R                  5       $ r(   )r  r  rV  s    r8   r  -MoonshineForConditionalGeneration.get_decoder  r  r:   c                     U R                   $ r(   r  rV  s    r8   get_output_embeddings7MoonshineForConditionalGeneration.get_output_embeddings  s    }}r:   c                     Xl         g r(   r  )r5   new_embeddingss     r8   set_output_embeddings7MoonshineForConditionalGeneration.set_output_embeddings!  s    &r:   r<   c                 6    U R                   R                  5       $ r(   )r  rW  rV  s    r8   rW  6MoonshineForConditionalGeneration.get_input_embeddings$  s    zz..00r:   r  rf   r  r  r  r  r  r  r   r   r]  r   labelsc                    Ub:  Uc7  Uc4  [        XR                  R                  U R                  R                  5      nU R	                  UUUUUUUUU	U
UUS9nU R                  UR                  5      nSnUb$  U R                  XU R                  R                  S9n[        UUUR                  UR                  UR                  UR                  UR                  UR                  UR                   S9	$ )aw  
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
    Float values of the raw speech waveform. Raw speech waveform can be
    obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
    `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
    `input_values`, the [`AutoFeatureExtractor`] should be used for padding
    and conversion into a tensor of type `torch.FloatTensor`.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
    it.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
decoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.

    [What are attention masks?](../glossary#attention-mask)

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
    information on the default strategy.

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.n_positions - 1]`.

    [What are position IDs?](../glossary#position-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
    or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
    only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> import torch
>>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
>>> from datasets import load_dataset

>>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
>>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

>>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
>>> input_values = inputs.input_values

>>> generated_ids = model.generate(input_values, max_new_tokens=100)

>>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> transcription
'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
```N)rf   r  r  r  r  r  r  r   r   r]  r   )logitsr   ry  )	lossr"  r  r  r   r  r  r  r  )r
  r,   rx  r  r  r  rd  loss_functionry  r   r  r  r   r  r  r  r  )r5   r  rf   r  r  r  r  r  r  r   r   r]  r   r   r  r"  r#  s                    r8   r>   )MoonshineForConditionalGeneration.forward'  s   r  (-B-J$6KK44dkk6X6X%! '+jj)/+#9+"7!5/!5) '1 '
 w889%%Vt{{OeOe%fD#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r:   )r  r  )NNNNNNNNNNNNN)rB   rC   rD   rE   _tied_weights_keysr!   r+   r  r  r  r  r.   rs  rW  r   r   r   rF   r  r   r   r   r   r   r   r>   rH   rI   rJ   s   @r8   r  r    s    ,, (('1bii 1  59598<=AEIZ^DHBF$(,0/359-1{
u001{
 !!1!12{
 $E$4$45	{

 !))9)9 :{
 "%e.?.?(@"AB{
 "%(;U5CTCT=U(U"VW{
  (e.?.?(@A{
 'uU-=-='>?{
 D>{
 $D>{
 'tn{
 !!1!12{
 ))*{
 
{
  {
r:   r  )r  r  r  )r   )Nr    )Nr   )Ntypingr   r   r   r   numpyr  rF   torch.nnr.   activationsr   cache_utilsr	   r
   r   
generationr   modeling_attn_mask_utilsr   r   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   configuration_moonshiner!   !torch.nn.attention.flex_attentionr"   integrations.flex_attentionr#   
get_loggerrB   r   rs  r%   rL   rG   r   ra   r   r|   r   r   r   r   r   r  r  r=  ru  r   ndarrayr  r  r
  r  __all__r   r:   r8   <module>r;     s  * 4 3    ! C C ) 
 C 9  L F & \ \ 4  !!;J 
		H	%")) "))  	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % %46'TC) C)L<ryy <D86 8vU6 Up "# "# "#JH
/ H
V W/ W W| 26tc?tt t U--.	t
 t ZZtn D
- D
 D
N%,, c [^   
W
(@/ W

W
t ^r:   