
    fThB                       S r SSKrSSKJrJrJrJr  SSKrSSK	r	SSK
r	SSK	Jr  SSKJrJrJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJrJrJrJrJ r   SSK!J"r"  SSK#J$r$J%r%  SSK&J'r'J(r(  \%RR                  " \*5      r+Sr,S\	RZ                  S\.S\.4S jr/ S{S\	RZ                  S\.S\\	RZ                     4S jjr0  S|S\\.\.4   S\1S\.S\\	Rd                     S\.S\Rf                  4S jjr4 " S S \Rj                  5      r6 " S! S"\Rj                  5      r7 " S# S$\Rj                  5      r8 " S% S&\Rj                  5      r9 " S' S(\Rj                  5      r: " S) S*\Rj                  5      r; " S+ S,\	R                  Rj                  5      r< " S- S.\Rj                  5      r= " S/ S0\Rj                  5      r> " S1 S2\Rj                  5      r? " S3 S4\Rj                  5      r@ " S5 S6\Rj                  5      rA " S7 S8\Rj                  5      rB " S9 S:\Rj                  5      rC " S; S<\Rj                  5      rD " S= S>\Rj                  5      rE " S? S@\Rj                  5      rF " SA SB\Rj                  5      rG " SC SD\Rj                  5      rH " SE SF\Rj                  5      rI " SG SH\Rj                  5      rJ\$ " SI SJ\"5      5       rK " SK SL\K5      rL " SM SN\K5      rM " SO SP\K5      rN " SQ SR\K5      rO " SS ST\K5      rP " SU SV\K5      rQ " SW SX\K5      rR " SY SZ\K5      rS " S[ S\\Rj                  5      rT " S] S^\Rj                  5      rU\$" S_S`9 " Sa Sb\K5      5       rV\$" ScS`9 " Sd Se\K\5      5       rW        S}Sf\KS\	R                  Sg\\	R                     S\\	Rd                     Sh\1Si\1Sj\1Sk\\Rj                     Sl\YSm\YS\\	R                  \\	R                  \	R                  4   4   4Sn jjrZ\$" SoS`9 " Sp Sq\K5      5       r[\$" SrS`9 " Ss St\K5      5       r\ " Su Sv\Rj                  5      r]\$" SwS`9 " Sx Sy\"5      5       r^/ SzQr_g)~zPyTorch SpeechT5 model.    N)ListOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossL1Loss   )ACT2FN)GenerationMixin)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSeq2SeqSpectrogramOutput)PreTrainedModel)auto_docstringlogging   )SpeechT5ConfigSpeechT5HifiGanConfig	input_idspad_token_iddecoder_start_token_idc                     U R                  U R                  5      nU SS2SS24   R                  5       USS2SS24'   X#SS2S4'   Uc  [        S5      eUR	                  US:H  U5        U$ )z)
Shift input ids one token to the right.
Nr   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r   r   r   shifted_input_idss       f/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/speecht5/modeling_speecht5.pyshift_tokens_rightr)   4   sz     "++IOO<(CRC0668ae4adLMM""#4#<lK    input_valuesreduction_factorattention_maskc                     US:  a!  U SS2US-
  SU24   n Ub  USS2US-
  SU24   nU R                  U R                  5      nU SS2SS24   R                  5       USS2SS24'   UR                  US:H  S5        X24$ )zo
Shift input spectrograms one timestep to the right. Also applies the reduction factor to the sequence length.
r   Nr!         Y        )r"   r#   r$   r&   )r+   r,   r-   shifted_input_valuess       r(   shift_spectrograms_rightr2   D   s     !#A'7!';'O?O'O$OP%+A/?!/C/WGW/W,WXN'11,2D2DE".q#2#v"6"<"<">AB %%&:f&DcJ//r*   r#   	mask_probmask_length	min_masksreturnc           	        ^^^^^ U u  nmTS:  a  [        S5      eTT:  a  [        ST ST S35      e[        R                  R                  S5      R	                  5       mUUUUU4S jnUb-  UR                  5       R                  S5      R                  5       O[        U5       Vs/ s H  nTPM     snn[        R                  " UT4[        S	9n	/ n
U" T5      nUS
:X  a  U	$ U H  nU" U5      n[        R                  R                  [        R                  " UTS-
  -
  5      USS9n[        U5      S
:X  a  TS-
  nOUS
   n[        R                  " U[        R                  " X-
  [        R                   S	9U-  /5      nU
R#                  U5        M     [        R$                  " U
5      n
[        R&                  " U
SS2SS2S4   X[T45      n
U
R)                  X[T-  5      n
[        R                  " T5      SSSS24   n[        R&                  " UX[T45      R)                  X[T-  5      nU
U-   n
U
R+                  5       TS-
  :  a  TS-
  XTS-
  :  '   [        R,                  " XSS5        U	$ s  snf )a*  
Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
CPU as part of the preprocessing during training.

Args:
    shape: The shape for which to compute masks. This should be of a tuple of size 2 where
           the first element is the batch size and the second element is the length of the axis to span.
    mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                independently generated mask spans of length `mask_length` is computed by
                `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                actual percentage will be smaller.
    mask_length: size of the mask
    min_masks: minimum number of masked spans
    attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                    each batch dimension.
r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    > [        TU -  T-  T-   5      n[        UT5      nUT-  T:  a  TT-  nU TS-
  -
  U:  a  [        U TS-
  -
  S5      nU$ )z;Given input length, compute how many spans should be maskedr   r   )intmax)input_lengthnum_masked_spanepsilonr4   r3   r5   sequence_lengths     r(   compute_num_masked_span6_compute_mask_indices.<locals>.compute_num_masked_span   so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr*   Nr!   dtyper   F)replace)r%   nprandomranditemdetachsumtolistrangezerosboolchoicearangelenconcatenateonesint32appendarraybroadcast_toreshaper;   put_along_axis)r#   r3   r4   r-   r5   
batch_sizer@   _input_lengthsspec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr<   r=   spec_aug_mask_idxdummy_mask_idxoffsetsr>   r?   s    `` `            @@r(   _compute_mask_indicesrc   Z   s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89'8!o'89  HHj/:$GM1/Ba%1,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;(MUWU]U] ^ao op
 	!!"34/ &2 "45 1a:&+(V ,33JVa@ab ii$T4]3Goog
'UV^^+5G ,g5 /A"55GVYZGZ!0CCD mB?w :s   (I0c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SpeechT5NoLayerNormConvLayer   c                 b  > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [        UR                     U l        g )Nr   r   kernel_sizestridebias)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconvr   feat_extract_activation
activationselfconfiglayer_id	__class__s      r(   rm   %SpeechT5NoLayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@r*   c                 J    U R                  U5      nU R                  U5      nU$ N)ru   rw   ry   hidden_statess     r(   forward$SpeechT5NoLayerNormConvLayer.forward   s$    		-06r*   )rw   ru   ro   rp   r   __name__
__module____qualname____firstlineno__rm   r   __static_attributes____classcell__r|   s   @r(   re   re      s    A r*   re   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SpeechT5LayerNormConvLayer   c                   > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [
        R                  " U R                  SS9U l        [        UR                     U l        g )Nr   r   rh   T)elementwise_affine)rl   rm   rn   ro   rp   r   rq   rr   rs   rt   ru   	LayerNorm
layer_normr   rv   rw   rx   s      r(   rm   #SpeechT5LayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@r*   c                     U R                  U5      nUR                  SS5      nU R                  U5      nUR                  SS5      nU R                  U5      nU$ )Nr!   )ru   	transposer   rw   r   s     r(   r   "SpeechT5LayerNormConvLayer.forward   sV    		-0%//B76%//B76r*   rw   ru   ro   r   rp   r   r   r   s   @r(   r   r      s    A r*   r   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SpeechT5GroupNormConvLayeri  c                   > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [        UR                     U l        [
        R                  " U R                  U R                  SS9U l        g )Nr   r   rh   T)
num_groupsnum_channelsaffine)rl   rm   rn   ro   rp   r   rq   rr   rs   rt   ru   r   rv   rw   	GroupNormr   rx   s      r(   rm   #SpeechT5GroupNormConvLayer.__init__  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@,,$2C2CRVRcRclpqr*   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )ru   r   rw   r   s     r(   r   "SpeechT5GroupNormConvLayer.forward  s2    		-066r*   r   r   r   r   s   @r(   r   r     s    r  r*   r   c            	         ^  \ rS rSrSrSS\S\S\\   4U 4S jjjrSS\S\S\\   4S jjr\	SS\S\S\\   4S	 jj5       r
\R                  " 5       SS
\R                  S\4S jj5       r SS
\R                  S\S\\   4S jjrSrU =r$ )%SpeechT5SinusoidalPositionalEmbeddingi  zDThis module produces sinusoidal positional embeddings of any length.num_positionsembedding_dimpadding_idxc                    > [         TU ]  5         SU l        X l        X0l        U R                  XR                  -   X#5        g N   )rl   rm   offsetr   r   make_weights)ry   r   r   r   r|   s       r(   rm   .SpeechT5SinusoidalPositionalEmbedding.__init__   s8    *&-++5}Rr*   num_embeddingsc                     U R                  XU5      n[        U S5      (       a8  UR                  U R                  R                  U R                  R
                  S9nU R                  SUSS9  g )NweightsrC   deviceF
persistent)get_embeddinghasattrtor   rC   r   register_buffer)ry   r   r   r   emb_weightss        r(   r   2SpeechT5SinusoidalPositionalEmbedding.make_weights'  s\    ((T4##%..t||/A/A$,,J]J].^KYFr*   c                    US-  n[         R                  " S5      US-
  -  n[        R                  " [        R                  " U[        R
                  S9R                  5       U* -  5      n[        R                  " U [        R
                  S9R                  5       R                  S5      UR                  S5      -  n[        R                  " [        R                  " U5      [        R                  " U5      /SS9R                  U S5      nUS-  S:X  a,  [        R                  " U[        R                  " U S5      /SS9nUb  SXBSS24'   UR                  [        R                  " 5       5      $ )	z
Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
description in Section 3.5 of "Attention Is All You Need".
r   i'  r   rB   r   dimr!   N)mathlogtorchexprP   int64float	unsqueezecatsincosviewrM   r   get_default_dtype)r   r   r   half_dimembs        r(   r   3SpeechT5SinusoidalPositionalEmbedding.get_embedding/  s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00r*   r   past_key_values_lengthc                    UR                  5       u  p4U R                  XR                  U5      R                  UR                  5      nU R                  S-   U-   nX`R
                  R                  S5      :  a3  U R                  X`R                  -   U R                  U R                  5        U R
                  R                  SUR                  S5      5      R                  X4S5      R                  5       $ )Nr   r   r!   )size"create_position_ids_from_input_idsr   r   r   r   r   r   r   index_selectr   rI   )ry   r   r   bszseq_lenposition_idsmax_poss          r(   r   -SpeechT5SinusoidalPositionalEmbedding.forwardA  s     ~~'>>yJZJZ\rsvv

 ""Q&0\\&&q))g3T5G5GIYIYZ||((L,=,=b,ABGGVXY``bbr*   c                     UR                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:
Returns: torch.Tensor
r   r   )ner:   r   cumsumtype_aslong)ry   r   r   r   maskincremental_indicess         r(   r   HSpeechT5SinusoidalPositionalEmbedding.create_position_ids_from_input_idsP  sW     ||K(,,.$||Da8@@FI__cgg"'')K77r*   )r   r   r   r   r   )r   r   r   r   __doc__r:   r   rm   r   staticmethodr   r   no_gradTensorr   r   r   r   r   s   @r(   r   r     s    NSc S# SHUXM S SG3 Gs GQYZ]Q^ G 1c 1# 1HUXM 1 1" ]]_c cs c c bc88478QYZ]Q^8 8r*   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SpeechT5PositionalConvEmbeddingib  c                   > [         TU ]  5         [        R                  " UR                  UR                  UR
                  UR
                  S-  UR                  S9U l        [        R                  R                  n[        [        R                  R                  S5      (       a$  [        R                  R                  R                  n[        5       (       Ga%  SS KnUR                  R                  U R                  R                   SS9   U" U R                  SSS9U l        S S S 5        [        U R                  S5      (       aU  U R                  R                  R                   R"                  nU R                  R                  R                   R$                  nO,U R                  R&                  nU R                  R(                  nUR                  R+                  X5        UR                  R+                  X5        OU" U R                  SSS9U l        [-        UR
                  5      U l        [0        UR2                     U l        g ! , (       d  f       GN,= f)	Nr   )ri   paddinggroupsweight_normr   )modifier_rankweight)namer   parametrizations)rl   rm   r   rq   hidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsru   utilsr   r   r   r   	deepspeedzeroGatheredParametersr   	original0	original1weight_gweight_vregister_external_parameterSpeechT5SamePadLayerr   r   rv   rw   )ry   rz   r   r   r   r   r|   s         r(   rm   (SpeechT5PositionalConvEmbedding.__init__c  s   II6622a777
	 hh**288,,m<<((33??K%''224993C3CST2U'		aH	 Vtyy"4559955<<FF9955<<FF99--99--NN66tFNN66tF#DIIH!DDI+F,J,JK !?!?@ VUs   I
Ic                     UR                  SS5      nU R                  U5      nU R                  U5      nU R                  U5      nUR                  SS5      nU$ Nr   r   )r   ru   r   rw   r   s     r(   r   'SpeechT5PositionalConvEmbedding.forward  sV    %//15		-0]36%//15r*   )rw   ru   r   r   r   s   @r(   r   r   b  s    AB r*   r   c                   6   ^  \ rS rSrSrSU 4S jjrS rSrU =r$ ) SpeechT5ScaledPositionalEncodingi  uK   
Scaled positional encoding, see §3.2 in https://arxiv.org/abs/1809.08895
c           	        > [         R                  " X25      n[         R                  " SU5      R                  S5      n[         R                  " [         R                  " SUS[         R
                  S9R                  5       [        R                  " S5      U-  * -  5      n[         R                  " UR                  5       U-  5      US S 2SS S24'   [         R                  " UR                  5       U-  5      US S 2SS S24'   UR                  S5      n[        TU ]1  5         U R                  SUSS9  [        R                  " US	9U l        X l        [         R                  R%                  [         R&                  " S
5      5      U l        g )Nr   r   r   rB   g     @peFr   p      ?)r   rM   rP   r   r   r   r   r   r   r   r   rl   rm   r   r   Dropoutdropoutr   	Parametertensoralpha)ry   r  r   max_lenr  positiondiv_termr|   s          r(   rm   )SpeechT5ScaledPositionalEncoding.__init__  s"   [[&<<7+55a899ell1c1EKKHNNPUYU]U]^eUfilUlSmmoii 08 ;<1add7ii 08 ;<1add7\\!_T2%8zzG,XX''S(9:
r*   c                     XR                   U R                  S S 2S UR                  S5      24   -  -   nU R                  U5      nU$ )Nr   )r
  r  r   r  )ry   r   s     r(   r   (SpeechT5ScaledPositionalEncoding.forward  s@    JJMchhqkM)9!:::ll3
r*   )r
  r   r  )i  )	r   r   r   r   r   rm   r   r   r   r   s   @r(   r   r     s    ; r*   r   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )"SpeechT5RelativePositionalEncodingi  c                    > [         TU ]  5         Xl        X l        [        R
                  R                  SU-  U5      U l        g r   )rl   rm   r   
max_lengthr   r   	Embeddingpe_k)ry   r   r  r|   s      r(   rm   +SpeechT5RelativePositionalEncoding.__init__  s4    $HH&&q:~s;	r*   c                 t   UR                   S   n[        R                  " SU5      R                  UR                  [        R
                  S9nUS S 2S 4   US S S 24   -
  nU R                  * X3U R                  * :  '   U R                  S-
  X3U R                  :  '   X0R                  -   nU R                  U5      $ )Nr   r   r   rC   )r#   r   rP   r   r   r   r  r  )ry   r   r   pos_seqs       r(   r   *SpeechT5RelativePositionalEncoding.forward  s    %%a(,,q'*--]5I5IQVQ[Q[-\!T'"WT1W%55/3.>4??**+.2oo.A4??*+OO+yy!!r*   )r   r  r  )i  r   r   s   @r(   r  r    s    <	" 	"r*   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )r   i  c                 R   > [         TU ]  5         US-  S:X  a  SU l        g SU l        g )Nr   r   r   )rl   rm   num_pad_remove)ry   r   r|   s     r(   rm   SpeechT5SamePadLayer.__init__  s)    #:Q#>!#Car*   c                 X    U R                   S:  a  US S 2S S 2S U R                   * 24   nU$ Nr   r  r   s     r(   r   SpeechT5SamePadLayer.forward  s6    ")!Q0F43F3F2F0F*FGMr*   r"  r   r   s   @r(   r   r     s    K r*   r   c                   8   ^  \ rS rSrSrU 4S jrS rS rSrU =r	$ )SpeechT5FeatureEncoderi  z.Construct the features from raw audio waveformc           	        > [         TU ]  5         UR                  S:X  a@  [        USS9/[	        UR
                  S-
  5       Vs/ s H  n[        XS-   S9PM     sn-   nOVUR                  S:X  a-  [	        UR
                  5       Vs/ s H  n[        XS9PM     nnO[        SUR                   S35      e[        R                  " U5      U l        SU l        S	U l        g s  snf s  snf )
Ngroupr   )r{   r   layerz`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)rl   rm   feat_extract_normr   rL   num_feat_extract_layersre   r   r%   r   
ModuleListconv_layersgradient_checkpointing_requires_grad)ry   rz   ir,  r|   s       r(   rm   SpeechT5FeatureEncoder.__init__  s    ##w.5fqIJNSTZTrTruvTvNwNNw,V!eDNwN K %%0HMfNlNlHmHm1*6>Hm  K 01I1I0JJst  ==5&+#"Ns   C C%c                 N    U R                  5        H
  nSUl        M     SU l        g NF)
parametersrequires_gradr.  )ry   params     r(   _freeze_parameters)SpeechT5FeatureEncoder._freeze_parameters  s#    __&E"'E '#r*   c                 B   US S 2S 4   nU R                   (       a  U R                  (       a  SUl        U R                   H\  nU R                   (       a@  U R                  (       a/  U R                  (       a  U R                  UR                  U5      nMT  U" U5      nM^     U$ NT)r.  trainingr4  r,  r-  _gradient_checkpointing_func__call__)ry   r+   r   
conv_layers       r(   r   SpeechT5FeatureEncoder.forward  s    $QW- 4==*.M'**J""t'B'Bt}} $ A A''!!
 !+= 9 + r*   )r.  r,  r-  )
r   r   r   r   r   rm   r6  r   r   r   r   s   @r(   r%  r%    s    8#&$
 r*   r%  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SpeechT5FeatureProjectioni  c                 4  > [         TU ]  5         [        R                  " UR                  S   UR
                  S9U l        [        R                  " UR                  S   UR                  5      U l	        [        R                  " UR                  5      U l        g )Nr!   eps)rl   rm   r   r   rn   layer_norm_epsr   Linearr   
projectionr  feat_proj_dropoutr  ry   rz   r|   s     r(   rm   "SpeechT5FeatureProjection.__init__  sf    ,,vr':@U@UV))FOOB$79K9KLzz&":":;r*   c                 n    U R                  U5      nU R                  U5      nU R                  U5      nX4$ r   )r   rF  r  )ry   r   norm_hidden_statess      r(   r   !SpeechT5FeatureProjection.forward  s7    !__];(:;]300r*   )r  r   rF  r   r   s   @r(   r@  r@    s    <1 1r*   r@  c                   L  ^  \ rS rSrU 4S jrS r  SS\R                  S\\R                     S\\R                     4S jjrS\S\R                  4S	 jrS
\\R                  \4   4S jr  SS\R                  S\\R                     S\\R                     4S jjrSrU =r$ )SpeechT5SpeechEncoderPreneti  c                   > [         TU ]  5         Xl        [        U5      U l        [        U5      U l        UR                  S:  d  UR                  S:  aG  [        R                  " [        R                  " UR                  5      R                  5       5      U l        [!        U5      U l        [%        UR&                  UR(                  -   S-   UR                  UR(                  5      U l        g )Nr0   r   )rl   rm   rz   r%  feature_encoderr@  feature_projectionmask_time_probmask_feature_probr   r  r   r   r   uniform_masked_spec_embedr   pos_conv_embedr   max_speech_positionsr   pos_sinusoidal_embedrH  s     r(   rm   $SpeechT5SpeechEncoderPrenet.__init__  s    5f=";F"C   3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"=fE$I''&*=*==A%
!r*   c                 8    U R                   R                  5         g r   )rP  r6  ry   s    r(   freeze_feature_encoder2SpeechT5SpeechEncoderPrenet.freeze_feature_encoder  s    //1r*   r+   r-   mask_time_indicesc                    U R                  U5      nUR                  SS5      nUb  U R                  UR                  S   U5      nU R	                  U5      u  pTU R                  XSUS9nU R                  U5      nXV-   nUb   UR                  S5      R                  5       nO;[        R                  " UR                  S S [        R                  UR                  S9nU R                  U5      nXX-   nXR4$ )Nr   r   )r^  r-   r   )rP  r   "_get_feature_vector_attention_maskr#   rQ  _mask_hidden_statesrV  r   r   r   rM   r   rX  )	ry   r+   r-   r^  extract_featuresr   positional_conv_embeddingpadding_mask positional_sinusoidal_embeddingss	            r(   r   #SpeechT5SpeechEncoderPrenet.forward  s     //=+55a;%!DD &&q)N
 +/*A*ABR*S'00~ 1 
 %)$7$7$F!%A%),,Q/446L ;;}':':2A'>ejjYfYmYmnL+/+D+D\+R(%H,,r*   feature_vector_lengthc                    UR                  SS9S S 2S4   nU R                  U5      R                  [        R                  5      nUR
                  S   n[        R                  " XQ4UR                  UR                  S9nSU[        R                  " UR
                  S   UR                  S9US-
  4'   UR                  S/5      R                  S5      R                  S/5      R                  5       nU$ )Nr!   r   r   r   r   r   )r    _get_feat_extract_output_lengthsr   r   r   r#   rM   rC   r   rP   fliprN   )ry   rg  r-   non_padded_lengthsoutput_lengthsrZ   s         r(   r`  >SpeechT5SpeechEncoderPrenet._get_feature_vector_attention_mask=  s     ,22r2:1b5A>>?QRUUV[V`V`a#))!,
/~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr*   r\   c                     S n[        U R                  R                  U R                  R                  5       H  u  p4U" XU5      nM     U$ )z8
Computes the output length of the convolutional layers
c                 8    [         R                  " X-
  USS9S-   $ )Nfloor)rounding_moder   )r   div)r<   ri   rj   s      r(   _conv_out_lengthVSpeechT5SpeechEncoderPrenet._get_feat_extract_output_lengths.<locals>._conv_out_lengthR  s      99\7wWZ[[[r*   )ziprz   rr   rs   )ry   r\   rt  ri   rj   s        r(   rj  <SpeechT5SpeechEncoderPrenet._get_feat_extract_output_lengthsM  sG    
	\
 $'t{{'>'>@W@W#XK,]PM $Y r*   r   c                    [        U R                  SS5      (       d  U$ UR                  5       u  pEnUb(  U R                  R	                  UR
                  5      X'   OU R                  R                  S:  a  U R                  (       a  [        XE4U R                  R                  U R                  R                  UU R                  R                  S9n[        R                  " X!R                  [        R                  S9nU R                  R	                  UR
                  5      X'   U R                  R                  S:  a  U R                  (       a  [        XF4U R                  R                  U R                  R                   U R                  R"                  S9n[        R                  " XqR                  [        R                  S9nUSS2S4   R%                  SUS5      nSX'   U$ )	z
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://arxiv.org/abs/1904.08779).
apply_spec_augmentTNr   )r3   r4   r-   r5   r  )r3   r4   r5   r!   )getattrrz   r   rU  r   rC   rR  r:  rc   mask_time_lengthmask_time_min_masksr   r	  r   rN   rS  mask_feature_lengthmask_feature_min_masksexpand)ry   r   r^  r-   rZ   r?   r   mask_feature_indicess           r(   ra  /SpeechT5SpeechEncoderPrenet._mask_hidden_states]  s    t{{$8$??   4A3E3E3G0
[(/3/E/E/H/HI\I\/]M,[[''!+ 5-++44 KK88-++99! !&->G[G[chcmcm n/3/E/E/H/HI\I\/]M,;;((1,#8)++77 KK;;++<<	$  $)<<0DMaMainisis#t #74#@#G#GO]_#` 23M/r*   )rz   rP  rQ  rU  rV  rX  NN)r   r   r   r   rm   r\  r   r   r   
LongTensorFloatTensorr   r:   r`  r   rj  ra  r   r   r   s   @r(   rN  rN    s    
"2 6:9=	 -ll - !!1!12 - $E$5$56	 -F ]b]m]m  eEDTDTVYDY>Z & :>59	,((, $E$5$56, !!1!12	, ,r*   rN  c                   t   ^  \ rS rSrU 4S jrS r SS\R                  S\\R                     4S jjr	Sr
U =r$ )	SpeechT5SpeechDecoderPreneti  c           	      f  > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H@  n[        R                  " US:X  a  UR                  OUR                  UR                  5      PMB     sn5      U l
        [        R                  " UR                  UR                  5      U l        [        UR                  UR                  UR                  5      U l        [        R                  " UR"                  UR                  -   UR                  5      U l        g s  snf r!  )rl   rm   rz   r   r+  rL   speech_decoder_prenet_layersrE  num_mel_binsspeech_decoder_prenet_unitslayersr   final_layerr   positional_dropoutrW  encode_positionsspeaker_embedding_dimspeaker_embeds_layerry   rz   r/  r|   s      r(   rm   $SpeechT5SpeechDecoderPrenet.__init__  s    mm vBBC
 DA	 		+,6F''v7Y7Y66 D
 99V%G%GI[I[\ @%%''!

 %'IIf.J.JVM_M_._agasas$t!s   AD.c                     [         R                  " US   US9nUR                  S5      R                  UR	                  S5      SS5      n[         R
                  " US:H  US5      S-  SU-
  -  $ )Nr   r  r   )r   	bernoullir   repeatr   where)ry   inputs_embedsr  r   	all_maskss        r(   _consistent_dropout/SpeechT5SpeechDecoderPrenet._consistent_dropout  sd    }Q/15NN1%,,]-?-?-BAqI	{{9>=!<q@AEJJr*   r+   speaker_embeddingsc                 6   UnU R                    HM  n[        R                  R                  U" U5      5      nU R	                  X0R
                  R                  5      nMO     U R                  U5      nU R                  U5      nUb  [        R                  R                  U5      nUR                  S5      R                  SUR                  S5      S5      n[        R                  " X2/SS9n[        R                  R                  U R                  U5      5      nU$ )Nr   r!   r   )r  r   
functionalrelur  rz   speech_decoder_prenet_dropoutr  r  	normalizer   r  r   r   r   r  )ry   r+   r  r  r(  s        r(   r   #SpeechT5SpeechDecoderPrenet.forward  s     %[[EMM..u]/CDM 44]KKDmDmnM ! ((7--m<)!#!8!89K!L!3!=!=a!@!G!GML^L^_`Lace!f!II}&IrRMMM..t/H/H/WXMr*   )rz   r  r  r  r  r   )r   r   r   r   rm   r  r   r   r   r   r   r   r   s   @r(   r  r    s=    u,K 6:ll %U\\2 r*   r  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SpeechT5BatchNormConvLayeri  c           	        > [         TU ]  5         US:X  a  UR                  nOUR                  nX!R                  S-
  :X  a  UR                  nOUR                  n[
        R                  " UUUR                  SUR                  S-
  S-  SS9U l        [
        R                  " U5      U l
        X!R                  S-
  :  a  [
        R                  " 5       U l        OS U l        [
        R                  " UR                  5      U l        g )Nr   r   r   F)ri   rj   r   rk   )rl   rm   r  speech_decoder_postnet_unitsspeech_decoder_postnet_layersr   rq   speech_decoder_postnet_kernelru   BatchNorm1d
batch_normTanhrw   r  speech_decoder_postnet_dropoutr  )ry   rz   r{   ro   rp   r|   s        r(   rm   #SpeechT5BatchNormConvLayer.__init__  s    q= --K ==K;;a??!..L!>>LII<<99A=!C
	 ..6::Q>> ggiDO"DOzz&"G"GHr*   c                     U R                  U5      nU R                  U5      nU R                  b  U R                  U5      nU R                  U5      nU$ r   )ru   r  rw   r  r   s     r(   r   "SpeechT5BatchNormConvLayer.forward  sJ    		-06??& OOM:M]3r*   )rw   r  ru   r  r   r   r   s   @r(   r  r    s    I< r*   r  c                   l   ^  \ rS rSrU 4S jrS\R                  4S jrS\R                  4S jrSr	U =r
$ )SpeechT5SpeechDecoderPostneti  c           	        > [         TU ]  5         Xl        [        R                  " UR
                  UR                  UR                  -  5      U l        [        R                  " UR
                  UR                  5      U l	        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        g s  snf r   )rl   rm   rz   r   rE  r   r  r,   feat_outprob_outr+  rL   r  r  r  r  s      r(   rm   %SpeechT5SpeechDecoderPostnet.__init__  s    		&"4"4f6I6IFLcLc6cd		&"4"4f6M6MNmm<A&BfBf<gh<gq'2<gh
hs   *Cr   c                    U R                  U5      R                  UR                  S5      SU R                  R                  5      nU R                  U5      nU R                  U5      R                  UR                  S5      S5      nX#U4$ )Nr   r!   )r  r   r   rz   r  postnetr  )ry   r   outputs_before_postnetoutputs_after_postnetlogitss        r(   r   $SpeechT5SpeechDecoderPostnet.forward  s{    !%}!=!B!B=CUCUVWCXZ\^b^i^i^v^v!w $-C D}-22=3E3Ea3H"M%fDDr*   c                     UR                  SS5      nU R                   H  nU" U5      nM     XR                  SS5      -   $ r   )r   r  )ry   r   layer_outputr(  s       r(   r  $SpeechT5SpeechDecoderPostnet.postnet  sB    $..q!4[[E .L !55a;;;r*   )rz   r  r  r  )r   r   r   r   rm   r   r   r   r  r   r   r   s   @r(   r  r    s/    	
EU\\ E<U\\ < <r*   r  c                   V   ^  \ rS rSrU 4S jrS rS rS\R                  4S jr	Sr
U =r$ )SpeechT5TextEncoderPreneti  c                   > [         TU ]  5         Xl        [        R                  " UR
                  UR                  UR                  5      U l        [        UR                  UR                  UR                  5      U l        g r   )rl   rm   rz   r   r  
vocab_sizer   r   embed_tokensr   r  max_text_positionsr  rH  s     r(   rm   "SpeechT5TextEncoderPrenet.__init__  sc    LL):):F<N<NPVPcPcd @%%%%!
r*   c                     U R                   $ r   r  r[  s    r(   get_input_embeddings.SpeechT5TextEncoderPrenet.get_input_embeddings         r*   c                     Xl         g r   r  ry   values     r(   set_input_embeddings.SpeechT5TextEncoderPrenet.set_input_embeddings      !r*   r   c                 J    U R                  U5      nU R                  U5      nU$ r   )r  r  )ry   r   r  s      r(   r   !SpeechT5TextEncoderPrenet.forward  s(    )))4--m<r*   )rz   r  r  )r   r   r   r   rm   r  r  r   r   r   r   r   r   s   @r(   r  r    s'    
!"  r*   r  c            	          ^  \ rS rSrU 4S jrS rS r  S
S\R                  S\	\R                     S\	\\R                        4S jjrS	rU =r$ )SpeechT5TextDecoderPreneti  c                   > [         TU ]  5         Xl        [        R                  " UR
                  5      U l        UR                  (       a   [        R                  " UR                  5      OSU l        [        R                  " UR                  UR                  UR                  5      U l        [!        UR"                  UR                  -   S-   UR                  UR                  5      U l        g )Nr  r   )rl   rm   rz   r   r  r  r  scale_embeddingr   sqrtr   embed_scaler  r  r   r  r   r  embed_positionsrH  s     r(   rm   "SpeechT5TextDecoderPrenet.__init__  s    zz&";";<<B<R<R499V%7%78X[LL):):F<N<NPVPcPcdD%%(;(;;a? 
r*   c                     U R                   $ r   r  r[  s    r(   r  .SpeechT5TextDecoderPrenet.get_input_embeddings'  r  r*   c                     Xl         g r   r  r  s     r(   r  .SpeechT5TextDecoderPrenet.set_input_embeddings*  r  r*   r   r-   past_key_valuesc                 ,   Ub&  UR                  5       nUR                  SUS   5      nO[        S5      eUb  US   S   R                  S   OSnU R	                  X5      nU R                  U5      U R                  -  nXv-  nU R                  U5      nXr4$ )Nr!   z'You have to specify `decoder_input_ids`r   r   )r   r   r%   r#   r  r  r  r  )ry   r   r-   r  input_shaper   	positionsr  s           r(   r   !SpeechT5TextDecoderPrenet.forward-  s      #..*K!r;r?;IFGGCRC^!3A!6!<!<Q!?de((K	)))4t7G7GG"]3,,r*   )rz   r  r  r  r  r  )r   r   r   r   rm   r  r  r   r   r   r  r   r  r   r   r   r   s   @r(   r  r    s_    
!" 6:=A	-<<- !!1!12- "$u'8'8"9:	- -r*   r  c                   V   ^  \ rS rSrU 4S jrS\R                  4S jrS rS r	Sr
U =r$ )SpeechT5TextDecoderPostnetiC  c                    > [         TU ]  5         Xl        [        R                  " UR
                  UR                  SS9U l        g )NFrk   )rl   rm   rz   r   rE  r   r  lm_headrH  s     r(   rm   #SpeechT5TextDecoderPostnet.__init__D  s3    yy!3!3V5F5FUSr*   r   c                 $    U R                  U5      $ r   r  r   s     r(   r   "SpeechT5TextDecoderPostnet.forwardI  s    ||M**r*   c                     U R                   $ r   r  r[  s    r(   get_output_embeddings0SpeechT5TextDecoderPostnet.get_output_embeddingsL      ||r*   c                     Xl         g r   r  ry   new_embeddingss     r(   set_output_embeddings0SpeechT5TextDecoderPostnet.set_output_embeddingsO  s    %r*   )rz   r  )r   r   r   r   rm   r   r   r   r  r  r   r   r   s   @r(   r  r  C  s(    T
+U\\ +& &r*   r  c                     ^  \ rS rSrSr   SS\S\S\S\S\4
U 4S jjjrS	\	R                  S
\S\4S jr      SS\	R                  S\\	R                     S\\\	R                        S\\	R                     S\\	R                     S\\	R                     S\S\\	R                  \\	R                     \\\	R                        4   4S jjrSrU =r$ )SpeechT5AttentioniS  z
Multi-headed attention from 'Attention Is All You Need' paper with relative position bias (see
https://aclanthology.org/N18-2074.pdf)
	embed_dim	num_headsr  
is_decoderrk   c                   > [         TU ]  5         Xl        X l        X0l        X-  U l        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r  )rl   rm   r  r  r  head_dimr%   scalingr  r   rE  k_projv_projq_projout_proj)ry   r  r  r  r  rk   r|   s         r(   rm   SpeechT5Attention.__init__Y  s     	""!.MMI%$..8MdnnM]$YKr3  }}d*$ii	4@ii	4@ii	4@		)TBr*   r	  r   r   c                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ r   )r   r  r  r   
contiguous)ry   r	  r   r   s       r(   _shapeSpeechT5Attention._shapet  s5    {{3GQQRSUVWbbddr*   r   key_value_statespast_key_valuer-   layer_head_maskposition_biasoutput_attentionsr6   c                 D   USLnUR                  5       u  pnU R                  U5      U R                  -  nU(       a  Ub  US   nUS   nGOU(       aE  U R                  U R	                  U5      SU	5      nU R                  U R                  U5      SU	5      nOUby  U R                  U R	                  U5      SU	5      nU R                  U R                  U5      SU	5      n[        R                  " US   U/SS9n[        R                  " US   U/SS9nODU R                  U R	                  U5      SU	5      nU R                  U R                  U5      SU	5      nU R                  (       a  X4nXR                  -  SU R                  4nU R                  XU	5      R                  " U6 nUR                  " U6 nUR                  " U6 nUR                  S5      n[        R                  " XR                  SS5      5      nUR                  5       XR                  -  U
U4:w  a.  [        SXR                  -  U
U4 SUR                  5        35      eUb  UR                  5       R                  XR                  -  SU R                  5      R                  SS5      n[        R                   " UUR                  S	S5      5      nUR                  SS5      R                  XR                  -  UR                  S5      UR                  S5      5      nUU-  nUbz  UR                  5       U	SU
U4:w  a#  [        S
U	SU
U4 SUR                  5        35      eUR                  XR                  U
U5      U-   nUR                  XR                  -  U
U5      n["        R$                  R'                  USS9nUb  UR                  5       U R                  4:w  a*  [        SU R                  4 SUR                  5        35      eUR                  SSSS5      UR                  XR                  U
U5      -  nUR                  XR                  -  U
U5      nU(       a=  UR                  XR                  U
U5      nUR                  XR                  -  U
U5      nOSn["        R$                  R)                  UU R(                  U R*                  S9n[        R                  " UU5      nUR                  5       XR                  -  XR                  4:w  a5  [        SXR                  XR                  4 SUR                  5        35      eUR                  XR                  XR                  5      nUR                  SS5      nUR-                  XU R.                  5      nU R1                  U5      nUUU4$ )z#Input shape: Batch x Time x ChannelNr   r   r!   r   r   z$Attention weights should be of size z	, but is r   z!Attention mask should be of size z/Head mask for a single layer should be of size )r  r:  z `attn_output` should be of size )r   r  r  r  r  r  r   r   r  r  r  r   bmmr   r%   r  matmulr   r  softmaxr  r:  rX   r  r   )ry   r   r  r  r-   r  r	  r
  is_cross_attentionr   tgt_lenr[   query_states
key_statesvalue_states
proj_shapesrc_lenattn_weights	reshape_qrel_pos_biasattn_weights_reshaped
attn_probsattn_outputs                          r(   r   SpeechT5Attention.forwardw  sP    .T9',,.a {{=1DLL@."<'*J)!,LT[[1A%BBLJ;;t{{3C'Db#NL'T[[%?SIJ;;t{{='A2sKLN1$5z#BJJ 99nQ&7%FANL T[[%?SIJ;;t{{='A2sKL?? )7NNN*B>
{{<#>CCZP__j1
#((*5//!$yy/C/CAq/IJ3#7'"JJ6nn8LgW^7_6` a %%'(*  $$//166s^^7KRQUQ^Q^_iijkmnoI <<	=3J3J2r3RSL'11!Q7<<nn$m&8&8&;]=O=OPQ=RL L(L%""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S..'7SVddL',,S>>-A7GTL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVdVdfmov?wwL',,S>>-A7GTL
 %1$5$5c>>7T[$\!055cNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CR_R_3`2a b$$&') 
 "&&sNNG]]S!++Aq1 "))#GmmK01>AAr*   )
r  r  r  r  r  r  r   r  r  r  )r0   FT)NNNNNF)r   r   r   r   r   r:   r   rN   rm   r   r   r  r   r   r   r   r   r   s   @r(   r  r  S  sS     CC C 	C
 C C C6eU\\ eC ec e 488<152604"'yB||yB #5<<0yB !u||!45	yB
 !.yB "%,,/yB  -yB  yB 
u||Xell3XeELL>Q5RR	SyB yBr*   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SpeechT5FeedForwardi  c                   > [         TU ]  5         [        R                  " UR                  5      U l        [        R                  " UR                  U5      U l        [        UR                  [        5      (       a  [        UR                     U l        OUR                  U l        [        R                  " X!R                  5      U l        [        R                  " UR                  5      U l        g r   )rl   rm   r   r  activation_dropoutintermediate_dropoutrE  r   intermediate_dense
isinstance
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropout)ry   rz   intermediate_sizer|   s      r(   rm   SpeechT5FeedForward.__init__  s    $&JJv/H/H$I!"$))F,>,>@Q"Rf''--'-f.?.?'@D$'-'8'8D$II&79K9KL jj)>)>?r*   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ r   )r"  r&  r!  r'  r)  r   s     r(   r   SpeechT5FeedForward.forward  sX    //>00?11-@))-8++M:r*   )r&  r"  r!  r'  r)  r   r   s   @r(   r  r    s    @ r*   r  c                      ^  \ rS rSrS\4U 4S jjr    SS\R                  S\\R                     S\\R                     S\\R                     S\	4
S	 jjr
S
rU =r$ )SpeechT5EncoderLayeri  rz   c                   > [         TU ]  5         [        UR                  UR                  UR
                  SS9U l        [        R                  " UR                  5      U l
        [        R                  " UR                  UR                  S9U l        [        XR                  5      U l        [        R                  " UR                  UR                  S9U l        g )NFr  r  r  r  rB  )rl   rm   r  r   encoder_attention_headsattention_dropout	attentionr   r  r(  r  r   rD  r   r  encoder_ffn_dimfeed_forwardfinal_layer_normrH  s     r(   rm   SpeechT5EncoderLayer.__init__  s    *((44,,	
 zz&"7"78,,v'9'9v?T?TU/8N8NO "V-?-?VEZEZ [r*   r   r-   r  r	  r
  c                     UnU R                  UUUUUS9u  pnU R                  U5      nXa-   nU R                  U5      nXR                  U5      -   nU R	                  U5      nU4n	U(       a  X4-  n	U	$ )a  
Args:
    hidden_states (`torch.FloatTensor`):
        input to the layer of shape `(batch, seq_len, hidden_size)`
    attention_mask (`torch.FloatTensor`):
        attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
        large negative values.
    layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
        `(config.encoder_attention_heads,)`.
    position_bias (`torch.FloatTensor`):
        relative position embeddings of size `(seq_len, seq_len, hidden_size // encoder_attention_heads)`
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r   r-   r  r	  r
  )r4  r  r   r6  r7  )
ry   r   r-   r  r	  r
  residualr  r[   outputss
             r(   r   SpeechT5EncoderLayer.forward  s    . !)-')+'/ *8 *
&Q ]3 06%(9(9-(HH--m< "&Gr*   )r4  r  r6  r7  r   )NNNF)r   r   r   r   r   rm   r   r   r   rN   r   r   r   r   s   @r(   r/  r/    sx    \~ \  262604"',||, !., "%,,/	,
  -,  , ,r*   r/  c                   4  ^  \ rS rSrS\4U 4S jjr        SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	\R                        S\\
   S\\
   4S jjrSrU =r$ )SpeechT5DecoderLayeriH  rz   c                 p  > [         TU ]  5         [        UR                  UR                  UR
                  SS9U l        [        R                  " UR                  5      U l
        [        R                  " UR                  UR                  S9U l        [        UR                  UR                  UR
                  SS9U l        [        R                  " UR                  UR                  S9U l        [!        XR"                  5      U l        [        R                  " UR                  UR                  S9U l        g )NTr1  rB  )r  r  )rl   rm   r  r   decoder_attention_headsr3  	self_attnr   r  r(  r  r   rD  self_attn_layer_normencoder_attnencoder_attn_layer_normr  decoder_ffn_dimr6  r7  rH  s     r(   rm   SpeechT5DecoderLayer.__init__I  s    *((44,,	
 zz&"7"78$&LL1C1CI^I^$_!-**,,	
 (*||F4F4FFLaLa'b$/8N8NO "V-?-?VEZEZ [r*   r   r-   encoder_hidden_statesencoder_attention_maskr  cross_attn_layer_head_maskr  r
  	use_cachec
           	         Un
Ub  USS OSnU R                  UUUUUS9u  pnU R                  U5      nX-   nU R                  U5      nSnSnUbM  Un
Ub  USS OSnU R                  UUUUUUS9u  pnU R                  U5      nX-   nU R	                  U5      nX-   nXR                  U5      -   nU R                  U5      nU4nU(       a  UX4-  nU	(       a  UU4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    encoder_hidden_states (`torch.FloatTensor`):
        cross attention input to the layer of shape `(batch, seq_len, hidden_size)`
    encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
        `(encoder_attention_heads,)`.
    cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
        size `(decoder_attention_heads,)`.
    past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
Nr   )r   r  r-   r  r
  r   )r   r  r-   r  r  r
  )rA  r  rB  rC  rD  r6  r7  )ry   r   r-   rG  rH  r  rI  r  r
  rJ  r:  self_attn_past_key_valueself_attn_weightspresent_key_valuecross_attn_present_key_valuecross_attn_weightscross_attn_past_key_valuer;  s                     r(   r   SpeechT5DecoderLayer.forward_  s[   < ! :H9S>"1#5Y] >Bnn'3)+/ ?M ?
;*; ]3 011-@ (,$! ,$H @N?Yrs(;_c%NRN_N_+!65 :8"3 O` OKM/K !LL7M$4M 88GM !2 P &(9(9-(HH--m< ")>>G)++Gr*   )r  rC  rD  r6  r7  rA  rB  )NNNNNNFT)r   r   r   r   r   rm   r   r   r   r   rN   r   r   r   r   s   @r(   r>  r>  H  s    \~ \2 268<9=26=A8<,1$(R||R !.R  (5	R
 !) 6R "%,,/R %-U\\$:R !u||!45R $D>R D>R Rr*   r>  c                   *    \ rS rSr\rSrSrSrS r	Sr
g)SpeechT5PreTrainedModeli  speecht5r+   Tc           
         [        U[        5      (       a  [        R                  R	                  UR
                  R                  SS[        R                  " SUR
                  R                  S   UR
                  R                  -  -  5      -  S9  [        R                  R                  UR
                  R                  S5        g[        U[        5      (       a  [        R                  " SUR                  R                  -  5      n[        R                  R!                  UR                  R                  U* US9  [        R                  R!                  UR                  R                  U* US9  g[        U[        R"                  5      (       ak  UR                  R$                  R	                  SU R&                  R(                  S9  UR                  b%  UR                  R$                  R+                  5         gg[        U[        R,                  [        R.                  45      (       aJ  UR                  R$                  R+                  5         UR                  R$                  R1                  S5        g[        U[        R2                  5      (       a  [        R                  R5                  UR                  5        UR                  bh  [        R                  " UR6                  UR                  UR                  S   -  -  5      n[        R                  R!                  UR                  U* US9  gg[        U[        R8                  5      (       ax  UR                  R$                  R	                  SU R&                  R(                  S9  UR:                  b2  UR                  R$                  UR:                     R+                  5         ggg)	zInitialize the weightsr   r   r   meanstd)abr0   Nr  )r#  r   r   initnormal_ru   r   r   r  ri   in_channels	constant_rk   r@  rF  in_featuresrT  rE  datarz   initializer_rangezero_r   r   fill_rq   kaiming_normal_r   r  r   )ry   moduleks      r(   _init_weights%SpeechT5PreTrainedModel._init_weights  s   f=>>GGOO""		!v{{'>'>q'AFKKD[D['["\]]  
 GGfkk..2 9::		!f//;;;<AGGV..55!qAGGV..33rQ?		**MM&&CT[[5R5R&S{{&  &&( 'r|| <==KK""$MM$$S)		**GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8 ' --MM&&CT[[5R5R&S!!-""6#5#56<<> . .r*    N)r   r   r   r   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingrh  r   rj  r*   r(   rT  rT    s    !L"$O&*#?r*   rT  c                      ^  \ rS rSrSrS\4U 4S jjr     SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\\\4   4S jjrSrU =r$ )SpeechT5Encoderi  zm
Transformer encoder consisting of *config.encoder_layers* layers. Each layer is a [`SpeechT5EncoderLayer`].
rz   c                   > [         TU ]  U5        [        R                  " UR                  UR
                  S9U l        [        R                  " UR                  5      U l	        UR                  U l        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        [#        UR                  UR$                  -  UR&                  5      U l        SU l        U R-                  5         g s  snf )NrB  F)rl   rm   r   r   r   rD  r   r  r(  r  encoder_layerdrop	layerdropr+  rL   encoder_layersr/  r  r  r2  encoder_max_relative_positionr  r-  	post_initry   rz   r[   r|   s      r(   rm   SpeechT5Encoder.__init__  s     ,,v'9'9v?T?TUzz&"7"7811mm5QWQfQfKg$hKga%9&%AKg$hiA&"@"@@&BfBf 
 ',# 	 %is   Dr   r-   	head_maskr
  output_hidden_statesreturn_dictr6   c           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  [	        X!R
                  5      nU R                  U5      nU R                  U5      nU R                  U5      n[        5       =(       d    [        U 5      nU(       a  SOSn	U(       a  SOSn
Ub`  UR                  5       S   [        U R                  5      :w  a6  [        S[        U R                  5       SUR                  5       S    S35      e[        U R                  5       H  u  pU(       a  X4-   n	SnU R                   (       a$  ["        R$                  " / 5      nXR&                  :  nU(       a  U(       a`  U R(                  (       a9  U R                   (       a(  U R+                  UR,                  UUUb  X;   OSUU5      nOU" UUUUb  X;   OSUS9nUS   nU(       a  S	nU(       d  M  U
WS
   4-   n
M     U(       a  X4-   n	U(       d  [/        S XU
4 5       5      $ [1        UU	U
S9$ )aA  
Args:
    hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
        Features extracted from the speech or text input by the encoder prenet.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
        `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nrj  r   z&The head_mask should be specified for  layers, but it is for .F)r-   r	  r  r
  r  r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   rj  .0vs     r(   	<genexpr>*SpeechT5Encoder.forward.<locals>.<genexpr>Z  s     m$[q$[s   	last_hidden_stater   
attentions)rz   r
  rz  use_return_dictr   rC   r   r  r  r   r   r   rQ   r  r%   	enumerater:  r   rG   rs  r-  r;  r<  tupler   )ry   r   r-   ry  r
  rz  r{  r	  synced_gpusall_hidden_statesall_self_attentionsidxencoder_layerskip_the_layerdropout_probabilitylayer_outputss                   r(   r   SpeechT5Encoder.forward  sI   H 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] %7H[H[\N6]3,,];02R6LT6R"6BD$5b4  ~~"c$++&66 <S=M<N O!(+,A/ 
 #,DKK"8C#$58H$H! #N}}&+jjn#!4~~!E![..4==$($E$E%..%&+4+@d%)%M %2%'5&3;D;PVZ*;%M !.a 0 ,  &9]1=M<O&O#G #9J   14D Dm]GZ$[mmm++*
 	
r*   )r  r  r-  r   rs  r  NNNNNr   r   r   r   r   r   rm   r   r  r   r   rN   r   r   r   r   r   r   r   s   @r(   rp  rp    s    ~ ( 26,0,0/3&*p
((p
 !.p
 ELL)	p

 $D>p
 'tnp
 d^p
 
uo%	&p
 p
r*   rp  c                      ^  \ rS rSrSrS\4U 4S jjr     SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\\\4   4S jjrSrU =r$ )SpeechT5EncoderWithSpeechPrenetic  z
Wrapper around SpeechT5Encoder that applies SpeechT5SpeechEncoderPrenet to convert the audio waveform data to
hidden features.
rz   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )rl   rm   rN  prenetrp  wrapped_encoderrv  rH  s     r(   rm   (SpeechT5EncoderWithSpeechPrenet.__init__i  5     1&9.v6 	r*   r+   r-   ry  r
  rz  r{  r6   c           	      T    U R                  X5      u  prU R                  UUUUUUS9nU$ N)r   r-   ry  r
  rz  r{  r  r  	ry   r+   r-   ry  r
  rz  r{  r   r;  s	            r(   r   'SpeechT5EncoderWithSpeechPrenet.forwardq  sC     )-L(Q%&&')/!5# ' 
 r*   r  r  r  r   s   @r(   r  r  c  s    
~  26,0,0/3&*'' !. ELL)	
 $D> 'tn d^ 
uo%	& r*   r  c                      ^  \ rS rSrSrS\4U 4S jjrS rS r     SS\	R                  S\\	R                     S	\\	R                     S
\\   S\\   S\\   S\\\4   4S jjrSrU =r$ )SpeechT5EncoderWithTextPreneti  zt
Wrapper around SpeechT5Encoder that applies SpeechT5TextEncoderPrenet to convert the input_ids to hidden features.
rz   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )rl   rm   r  r  rp  r  rv  rH  s     r(   rm   &SpeechT5EncoderWithTextPrenet.__init__  5     /7.v6 	r*   c                 6    U R                   R                  5       $ r   r  r  r[  s    r(   r  2SpeechT5EncoderWithTextPrenet.get_input_embeddings      {{//11r*   c                 :    U R                   R                  U5        g r   r  r  r  s     r(   r  2SpeechT5EncoderWithTextPrenet.set_input_embeddings      ((/r*   r+   r-   ry  r
  rz  r{  r6   c           	      P    U R                  U5      nU R                  UUUUUUS9nU$ r  r  r  s	            r(   r   %SpeechT5EncoderWithTextPrenet.forward  s@     L1&&')/!5# ' 
 r*   r  r  )r   r   r   r   r   r   rm   r  r  r   r  r   r   rN   r   r   r   r   r   r   r   s   @r(   r  r    s    ~ 20 26,0,0/3&*'' !. ELL)	
 $D> 'tn d^ 
uo%	& r*   r  c                      ^  \ rS rSrSrS\4U 4S jjr     SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\\\4   4S jjrSrU =r$ )SpeechT5EncoderWithoutPreneti  
This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
[`SpeechT5Model`].
rz   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r   )rl   rm   rp  r  rv  rH  s     r(   rm   %SpeechT5EncoderWithoutPrenet.__init__  )     .v6 	r*   r+   r-   ry  r
  rz  r{  r6   c           	      *    U R                  UUUUUUS9$ r  r  )ry   r+   r-   ry  r
  rz  r{  s          r(   r   $SpeechT5EncoderWithoutPrenet.forward  s.     ##&)/!5# $ 
 	
r*   r  r  r  r   s   @r(   r  r    s    
~  26,0,0/3&*
''
 !.
 ELL)	

 $D>
 'tn
 d^
 
uo%	&
 
r*   r  c                   f  ^  \ rS rSrSrS\4U 4S jjr           SS\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\\R                        S\\   S\\   S\\   S\\   S\\\4   4S jjrSrU =r$ )SpeechT5Decoderi  zl
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SpeechT5DecoderLayer`]
rz   c                   > [         TU ]  U5        UR                  U l        [        R
                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l	        SU l
        U R                  5         g s  snf r2  )rl   rm   decoder_layerdroprs  r   r+  rL   decoder_layersr>  r  r-  rv  rw  s      r(   rm   SpeechT5Decoder.__init__  sj     11mm5QWQfQfKg$hKga%9&%AKg$hi&+# 	 %is   	Br   r-   rG  rH  ry  cross_attn_head_maskr  rJ  r
  rz  r{  r6   c                    U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUR                  5       SS nUb  US   S   R                  S   OSn[        X,X5      nUb  Ub  [        XAR                  US   S9n[        5       =(       d    [        U 5      nU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnU
(       a  SOSnU	(       a  SOSnU	(       a  Ub  SOSnU(       a  SOSn[!        XV/S	S
/5       Hn  u  nnUc  M  UR                  5       S   [#        U R$                  5      :w  d  M7  ['        SU S[#        U R$                  5       SUR                  5       S    S35      e   [)        U R$                  5       GH  u  nnU
(       a  X4-   nSnU R                  (       a%  [*        R,                  " / 5      nUU R.                  :  nU(       a	  U(       d  M[  Ub  UU   OSnU R                  (       aF  U R                  (       a5  U R1                  UR2                  UUUUUb  UU   OSUb  UU   OSSU	U5
      nOU" UUUUUb  UU   OSUb  UU   OSUU	US9	nUS   nU(       a  UUU	(       a  SOS   4-  nU	(       d  GM  UUS   4-   nUc  GM  UUS   4-   nGM     U
(       a  X4-   nU(       a  UOSnU(       d  [5        S UUUUU4 5       5      $ [7        UUUUUS9$ )a  
Args:
    hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
        Features extracted from the speech or text input by the decoder prenet.
    attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
        of the decoder.
    encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
        Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
        selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
        cross-attention on hidden heads. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
        shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
        shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
        cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

        If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
        that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
        all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nr!   r   r   )r  zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Frj  ry  r  zThe `z` should be specified for r}  r~  )r-   rG  rH  r  rI  r  r
  rJ  r   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   rj  r  s     r(   r  *SpeechT5Decoder.forward.<locals>.<genexpr>  s      rA rs   	)r  r  r   r  cross_attentions)rz   r
  rz  rJ  r  r   r#   r   r   rC   r   r   r-  r:  loggerwarning_oncerv  rQ   r  r%   r  r   rG   rs  r;  r<  r  r   )ry   r   r-   rG  rH  ry  r  r  rJ  r
  rz  r{  r  r   r  r  r  all_cross_attentionsnext_decoder_cache	attn_mask	mask_namer  decoder_layerr  r  r  r  
next_caches                               r(   r   SpeechT5Decoder.forward  s   R 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]#((*3B/CRC^!3A!6!<!<Q!?de:

 !,1G1S%?&(;(;[QS_&" 12R6LT6R&&4==##p "	 #7BD$5b4&7<Q<]rdh#,R$ %((IKYoKp$q Iy$>>#A&3t{{+;<$	{*DSEUDV W%NN,Q/03  %r #,DKK"8C#$58H$H! #N}}&+jjn#!4t~~!Ek5D5P_S1VZN**t}} $ A A!**!")*&/&;IcN1E1Q(-W[%! !.!#1*?+A7@7LYs^RV5I5U,S1[_#1&7'! *!,M"}:KQQR'S&UU"  &9]1=M<O&O#(4+?=QRCSBU+U(e #9h   14D D+4'$
 '5FH[]qr   9+&+*1
 	
r*   )r-  rs  r  NNNNNNNNNNNr   r   r   r   r   r   rm   r   r   r  r  r   r   rN   r   r   r   r   r   r   r   s   @r(   r  r    s.   	~ 	 6:59=A=A,07;=A$(,0/3&*|
 1 12|
 !!1!12|
  ((9(9:	|

 !))9)9 :|
 ELL)|
 'u||4|
 "$u'8'8"9:|
 D>|
 $D>|
 'tn|
 d^|
 
u??	@|
 |
r*   r  c                     ^  \ rS rSrSrS\4U 4S jjr            SS\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\\R                        S\\   S\\   S\\   S\\   S\\\4   4S jjrSrU =r$ )SpeechT5DecoderWithSpeechPreneti  z|
Wrapper around SpeechT5Decoder that applies SpeechT5SpeechDecoderPrenet to convert log-mel filterbanks to hidden
features.
rz   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )rl   rm   r  r  r  wrapped_decoderrv  rH  s     r(   rm   (SpeechT5DecoderWithSpeechPrenet.__init__  r  r*   r+   r-   rG  rH  r  ry  r  r  rJ  r
  rz  r{  r6   c                 Z    U R                  X5      nU R                  UUUUUUUU	U
UUS9nU$ N)r   r-   rG  rH  ry  r  r  rJ  r
  rz  r{  r  r  )ry   r+   r-   rG  rH  r  ry  r  r  rJ  r
  rz  r{  decoder_hidden_statesr;  s                  r(   r   'SpeechT5DecoderWithSpeechPrenet.forward  sP     !%L M&&/)"7#9!5+/!5# ' 
 r*   r  )NNNNNNNNNNNNr  r   s   @r(   r  r    s6   
~  5959=A=A59,07;=A$(,0/3&*u001 !!1!12  ((9(9:	
 !))9)9 : %U\\2 ELL) 'u||4 "$u'8'8"9: D> $D> 'tn d^ 
u??	@ r*   r  c                   r  ^  \ rS rSrSrS\4U 4S jjrS rS r           SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\\
R                        S\	\   S\	\   S\	\   S\	\   S\\\4   4S jjrSrU =r$ )SpeechT5DecoderWithTextPreneti  zs
Wrapper around SpeechT5Decoder that applies SpeechT5TextDecoderPrenet to convert input tokens to hidden features.
rz   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )rl   rm   r  r  r  r  rv  rH  s     r(   rm   &SpeechT5DecoderWithTextPrenet.__init__  r  r*   c                 6    U R                   R                  5       $ r   r  r[  s    r(   r  2SpeechT5DecoderWithTextPrenet.get_input_embeddings  r  r*   c                 :    U R                   R                  U5        g r   r  r  s     r(   r  2SpeechT5DecoderWithTextPrenet.set_input_embeddings  r  r*   r+   r-   rG  rH  ry  r  r  rJ  r
  rz  r{  r6   c                 `    U R                  XU5      u  pU R                  UUUUUUUUU	U
US9nU$ r  r  )ry   r+   r-   rG  rH  ry  r  r  rJ  r
  rz  r{  r  r;  s                 r(   r   %SpeechT5DecoderWithTextPrenet.forward  sV     15LZi0j-&&/)"7#9!5+/!5# ' 
 r*   r  r  )r   r   r   r   r   r   rm   r  r  r   r   r  r  r   r   rN   r   r   r   r   r   r   r   s   @r(   r  r    s*   ~ 20
 5959=A=A,07;=A$(,0/3&*u001 !!1!12  ((9(9:	
 !))9)9 : ELL) 'u||4 "$u'8'8"9: D> $D> 'tn d^ 
u??	@ r*   r  c                   f  ^  \ rS rSrSrS\4U 4S jjr           SS\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\\R                        S\\   S\\   S\\   S\\   S\\\4   4S jjrSrU =r$ )SpeechT5DecoderWithoutPreneti  r  rz   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r   )rl   rm   r  r  rv  rH  s     r(   rm   %SpeechT5DecoderWithoutPrenet.__init__  r  r*   r+   r-   rG  rH  ry  r  r  rJ  r
  rz  r{  r6   c                 8    U R                  UUUUUUUUU	U
US9nU$ r  r  )ry   r+   r-   rG  rH  ry  r  r  rJ  r
  rz  r{  r;  s                r(   r   $SpeechT5DecoderWithoutPrenet.forward  sA     &&&)"7#9!5+/!5# ' 
 r*   r  r  r  r   s   @r(   r  r    s    
~  5959=A=A,07;=A$(,0/3&*u001 !!1!12  ((9(9:	
 !))9)9 : ELL) 'u||4 "$u'8'8"9: D> $D> 'tn d^ 
u??	@ r*   r  c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\R                  S\R                  4S	 jrS
 r\S 5       rSrU =r$ )$SpeechT5GuidedMultiheadAttentionLossi0  z
Guided attention loss from the paper [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional
Networks with Guided Attention](https://arxiv.org/abs/1710.08969), adapted for multi-head attention.
rz   c                 f   > [         TU ]  5         UR                  U l        UR                  U l        g r   )rl   rm   guided_attention_loss_sigmasigmaguided_attention_loss_scalescalerH  s     r(   rm   -SpeechT5GuidedMultiheadAttentionLoss.__init__6  s(    77
77
r*   r  input_masksoutput_masksr6   c                 D   U R                  X#UR                  5      nUR                  S5      UR                  S5      -  nUR                  UR                  5      R                  S5      nXA-  n[        R
                  " UR                  U5      5      nU R                  U-  $ )a  
Compute the attention loss.

Args:
    attentions (`torch.FloatTensor` of shape `(batch_size, layers * heads, output_sequence_length, input_sequence_length)`):
        Batch of multi-head attention weights
    input_masks (`torch.BoolTensor` of shape `(batch_size, input_sequence_length)`):
        Input attention mask as booleans.
    output_masks (`torch.BoolTensor` of shape `(batch_size, output_sequence_length)`):
        Target attention mask as booleans.

Returns:
    `torch.Tensor` with the loss value
r!   r   r   )_make_guided_attention_masksr   r   r   r   rX  masked_selectr  )ry   r  r  r  guided_attn_masksmaskslosseslosss           r(   r   ,SpeechT5GuidedMultiheadAttentionLoss.forward;  s    " !==kYcYjYjk&&r*[-B-B2-FF**+55a8"/zz&..u56zzD  r*   c                 j   UR                  S5      nUR                  S5      n[        R                  " [        U5      UR                  S   UR                  S   4US9n[        [        XE5      5       H.  u  nu  pU R                  XU R                  U5      XgS U	2S U24'   M0     UR                  S5      $ )Nr!   r   ri  )
rJ   r   rM   rQ   r#   r  rv  _make_guided_attention_maskr  r   )
ry   r  r  r   r\   rm  r  r  ilenolens
             r(   r  ASpeechT5GuidedMultiheadAttentionLoss._make_guided_attention_masksT  s    #+%))"-!KK[)9<;M;Ma;PR]RcRcdeRf(gpvw!*3}+M!NC$373S3STX`d`j`jlr3s5D5%4%/0 "O !**1--r*   c                    [         R                  " [         R                  " XS9[         R                  " XS9SS9u  pEUR                  5       U-  nUR                  5       U -  nS[         R                  " XE-
  S-  * SUS-  -  -  5      -
  $ )Nri  xy)indexingr  r   )r   meshgridrP   r   r   )r<   output_lengthr  r   grid_ygrid_xs         r(   r  @SpeechT5GuidedMultiheadAttentionLoss._make_guided_attention_mask_  sz    LL5LL6

 -/,.UYY&/a!78ANKLLLr*   )r  r  )r   r   r   r   r   r   rm   r   r  
BoolTensorr   r   r  r   r  r   r   r   s   @r(   r  r  0  sj    
8~ 8
!++!:?:J:J!Z_ZjZj!	!2	. M Mr*   r  c                      ^  \ rS rSrSrS\4U 4S jjr SS\R                  S\R                  S\R                  S\R                  S	\R                  S
\
\R                     S\R                  4S jjrSrU =r$ )SpeechT5SpectrogramLossik  z3
Loss computation used by SpeechT5ForTextToSpeech.
rz   c                 .  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        [        5       U l        [        [        R                  " S5      S9U l
        U R                  (       a  [        U5      U l        g g )Ng      @)
pos_weight)rl   rm   use_guided_attention_lossguided_attention_loss_num_headsr,   r
   l1_criterionr   r   r	  bce_criterionr  attn_criterionrH  s     r(   rm    SpeechT5SpectrogramLoss.__init__p  ss    )/)I)I&/5/U/U, & 7 7"H.%,,s:KL))"Fv"ND *r*   r-   r  r  r  labelsr  r6   c           	      V   US:g  nUR                  U5      nUR                  U5      nUR                  U5      nU R                  X55      U R                  X%5      -   nUS S 2S S 2S4   n	[        R                  " U	) S-  [        R                  " U	R                  S5      S5      R                  U	R                  5      /SS9n
U
S S 2SS 24   R                  U	5      n
UR                  U	5      nU R                  XJ5      nX-   nU R                  (       a  [        R                  " U Vs/ s H  oS S 2S U R                  24   PM     snSS9nUS:H  nUS S 2S S 2S4   nU R                  S:  a#  US S 2U R                  S-
  S U R                  24   nU R                  XU5      nUU-  nU$ s  snf )Nr/   r   r  r   r   )r  r  r   r   rS   r   r   r   r  r  r  r,   r  )ry   r-   r  r  r  r  r  rd  l1_lossr  stop_labelsbce_lossr  xattnr  r  	attn_losss                     r(   r   SpeechT5SpectrogramLoss.forward|  s    ' %%l3!7!E!El!S 5 C CL Q ##$9BTEVEVWmEvv Q1W%ii%#uzz%**Q-/K/N/Nu||/\ ]cde!!QR%(66u=%%e, %%f: ! ))99TdeTdq#IT%I%I#I IJTdeklmD(A-K'1a0L$$q(+At/D/Dq/H/aDLaLa/a,ab++D|LIID fs   %F&)r  r  r  r  r,   r  r   )r   r   r   r   r   r   rm   r   r  r  r   r   r   r   r   r   s   @r(   r  r  k  s    
O~ 
O& 9=)(() !& 1 1)  %00	)
 !!) !!) #5#4#45) 
) )r*   r  zv
    The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets.
    custom_introc            "       Z  ^  \ rS rSr  SS\S\\R                     S\\R                     4U 4S jjjrS r	S r
S rS	 rS
 r\              SS\\R                      S\\R"                     S\\R                      S\\R"                     S\\R$                     S\\R$                     S\\R                      S\\\\R$                           S\\\\R$                           S\\   S\\R$                     S\\   S\\   S\\   S\\\R$                     \4   4S jj5       rSrU =r$ )SpeechT5Modeli  rz   encoderdecoderc                    > [         TU ]  U5        Xl        Uc  [        U5      OUU l        Uc  [        U5      OUU l        U R                  5         g)z
encoder (`PreTrainedModel`, *optional*):
    The encoder model to use.
decoder (`PreTrainedModel`, *optional*):
    The decoder model to use.
N)rl   rm   rz   r  r"  r  r#  rv  )ry   rz   r"  r#  r|   s       r(   rm   SpeechT5Model.__init__  sK     	 ?F3F;T[?F3F;T[ 	r*   c                     [        U R                  [        5      (       a  U R                  R                  5       $ [        U R                  [
        5      (       a  U R                  R                  5       $ [        er   )r#  r"  r  r  r#  r  NotImplementedErrorr[  s    r(   r  "SpeechT5Model.get_input_embeddings  sR    dll$ABB<<4466dll$ABB<<4466!!r*   c                     [        U R                  [        5      (       a  U R                  R                  U5        [        U R                  [
        5      (       a  U R                  R                  U5        g g r   )r#  r"  r  r  r#  r  r  s     r(   r  "SpeechT5Model.set_input_embeddings  sP    dll$ABBLL--e4dll$ABBLL--e4 Cr*   c                     U R                   $ r   )r"  r[  s    r(   get_encoderSpeechT5Model.get_encoder  r  r*   c                     U R                   $ r   )r#  r[  s    r(   get_decoderSpeechT5Model.get_decoder  r  r*   c                     [        U R                  [        5      (       a%  U R                  R                  R	                  5         ggz
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
N)r#  r"  r  r  r\  r[  s    r(   r\  $SpeechT5Model.freeze_feature_encoder  s2    
 dll$CDDLL668 Er*   r+   r-   decoder_input_valuesdecoder_attention_maskry  decoder_head_maskr  encoder_outputsr  rJ  r  r
  rz  r{  r6   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUc  U R                  UUUUUUS9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUbV  [        U R
                  [        5      (       a7  U R
                  R                  R                  US   R                  S   U5      nOUn[        U R                  [        5      (       a  SU0nO0 nU R                  " S
UUUS   UUUU	U
UUUS.UD6nU(       d  UU-   $ [        UR                   UR"                  UR$                  UR&                  UR(                  UR                   UR$                  UR&                  S	9$ )a)  
input_values (`torch.Tensor` of shape `(batch_size, sequence_length)`):
    Depending on which encoder is being used, the `input_values` are either: float values of the input raw
    speech waveform, or indices of input sequence tokens in the vocabulary, or hidden states.
decoder_input_values (`torch.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Depending on which decoder is being used, the `decoder_input_values` are either: float values of log-mel
    filterbank features extracted from the raw speech waveform, or indices of decoder input sequence tokens in
    the vocabulary, or hidden states.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
    also be used by default.

    If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
    Tensor containing the speaker embeddings.
N)r+   r-   ry  r
  rz  r{  r   r   r   r  r  )r+   r-   rG  rH  ry  r  r  rJ  r
  rz  r{  )r  r  r  decoder_attentionsr  encoder_last_hidden_staterG  encoder_attentionsrj  )rz   r
  rz  rJ  r  r"  r#  r   rQ   r  r  r`  r#   r#  r  r   r  r  r   r  r  )ry   r+   r-   r4  r5  ry  r6  r  r7  r  rJ  r  r
  rz  r{  rH  decoder_argsdecoder_outputss                     r(   r   SpeechT5Model.forward  s   R 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ""ll)-#"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO %*T\\Cb*c*c%)\\%8%8%[%["((+^&" &4"dll$CDD02DELL,, 
-1"1!"4#9'!5+/!5#
 
 "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r*   )rz   r#  r"  r  NNNNNNNNNNNNNN)r   r   r   r   r   r   r   Modulerm   r  r  r,  r/  r\  r   r   r   r  r  r   rN   r   r   r   r   r   r   s   @r(   r!  r!    s    (,'+	 "))$ "))$	 ("59  04597;=A159=7;EIEI$(:>,0/3&*i
u||,i
 !!1!12i
 'u||4	i

 !))9)9 :i
 E--.i
 $E$5$56i
 'u||4i
 "%e.?.?(@"ABi
 "%e.?.?(@"ABi
 D>i
 %U%6%67i
 $D>i
 'tni
 d^i
  
uU&&');;	<!i
 i
r*   r!  zB
    SpeechT5 Model with a speech encoder and a text decoder.
    c            "         ^  \ rS rSrS/rS\4U 4S jjrS rS rS r	S r
S	 r\              SS
\\R                     S\\R                      S\\R                      S\\R                      S\\R                     S\\R                     S\\R"                     S\\\\R                           S\\\\R                           S\\   S\\   S\\   S\\   S\\R                      S\\\4   4S jj5       r\S 5       rSrU =r$ )SpeechT5ForSpeechToTextiJ  z#text_decoder_postnet.lm_head.weightrz   c                    > [         TU ]  U5        UR                  c  [        SU R                   S35      e[        U5      n[        U5      n[        XU5      U l        [        U5      U l
        U R                  5         g )NYou are trying to instantiate a    with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SpeechT5ForSpeechToText.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)rl   rm   r  r%   r|   r  r  r!  rU  r  text_decoder_postnetrv  )ry   rz   speech_encodertext_decoderr|   s       r(   rm    SpeechT5ForSpeechToText.__init__R  s}     $00@ A/ /  9@4V<%flK$>v$F! 	r*   c                 6    U R                   R                  5       $ r   rU  r,  r[  s    r(   r,  #SpeechT5ForSpeechToText.get_encoderf      }}((**r*   c                 6    U R                   R                  5       $ r   rU  r/  r[  s    r(   r/  #SpeechT5ForSpeechToText.get_decoderi  rL  r*   c                 T    U R                  5       R                  R                  5         gr2  r,  r  r\  r[  s    r(   r\  .SpeechT5ForSpeechToText.freeze_feature_encoderl      
 	!!88:r*   c                 6    U R                   R                  5       $ r   )rE  r  r[  s    r(   r  -SpeechT5ForSpeechToText.get_output_embeddingss  s    ((>>@@r*   c                 :    U R                   R                  U5        g r   )rE  r  r  s     r(   r  -SpeechT5ForSpeechToText.set_output_embeddingsv  s    !!77Gr*   r+   r-   decoder_input_idsr5  ry  r6  r  r7  r  rJ  r
  rz  r{  r  r6   c                    Ub  UOU R                   R                  nUb7  Uc4  [        XR                   R                  U R                   R                  5      nU R                  UUUUUUUUU	U
UUSS9nU R                  US   5      nSnUbF  [        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  UR                  UR                   UR"                  S9	$ )a%  
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
    Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
    into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (*pip install
    soundfile*). To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding
    and conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    SpeechT5 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
    also be used by default.

    If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
    or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
    only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

    Label indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

Example:

```python
>>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToText
>>> from datasets import load_dataset

>>> dataset = load_dataset(
...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True
... )  # doctest: +IGNORE_RESULT
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

>>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
>>> model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr")

>>> # audio file is decoded on the fly
>>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
>>> predicted_ids = model.generate(**inputs, max_length=100)

>>> # transcribe speech
>>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
>>> transcription[0]
'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'
```

```python
>>> inputs["labels"] = processor(text_target=dataset[0]["text"], return_tensors="pt").input_ids

>>> # compute loss
>>> loss = model(**inputs).loss
>>> round(loss.item(), 2)
19.68
```
NT)r+   r-   r4  r5  ry  r6  r  r7  r  rJ  r
  rz  r{  r   r!   r   )	r  r  r  r  r9  r  r:  rG  r;  )rz   r  r)   r   r   rU  rE  r	   r   r  r   r  r  r9  r  r:  rG  r;  )ry   r+   r-   rX  r5  ry  r6  r  r7  r  rJ  r
  rz  r{  r  r;  r  r  loss_fctoutputs                       r(   r   SpeechT5ForSpeechToText.forwardy  sX   r &1%<k$++B]B] ($6KK44dkk6X6X%! --%)!2#9/!5++/!5   
  **71:6')HFKKDKK,B,BCV[[QS_UDY,F)-)9TGf$EvE#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r*   c                 P   ^ SnU  H  nU[        U4S jU 5       5      4-  nM     U$ )Nrj  c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7f)r   N)r   r   r   )r  
past_statebeam_idxs     r(   r  9SpeechT5ForSpeechToText._reorder_cache.<locals>.<genexpr>	  s1     ncmU_--aZ=N=N1OPPcms   7:)r  )r  r`  reordered_past
layer_pasts    `  r(   _reorder_cache&SpeechT5ForSpeechToText._reorder_cache	  s8    )Jncmnn N * r*   )rU  rE  r?  )r   r   r   r   _tied_weights_keysr   rm   r,  r/  r\  r  r  r   r   r   r  r  r   r   rN   r   r   r   r   rd  r   r   r   s   @r(   rB  rB  J  s    @@~ (++;AH  59598<=A159=7;EIEI$(,0/3&*-1E
u001E
 !!1!12E
 $E$4$45	E

 !))9)9 :E
 E--.E
 $E$5$56E
 'u||4E
 "%e.?.?(@"ABE
 "%e.?.?(@"ABE
 D>E
 $D>E
 'tnE
 d^E
 ))*E
  
uo%	&!E
 E
N  r*   rB  modelr  	thresholdminlenratiomaxlenratiovocoderoutput_cross_attentionsreturn_output_lengthsc
                    Uc  [        S5      eUc*  SXR                  R                  :H  R                  5       -
  n
OUn
UR	                  S5      nU R
                  R                  UU
SS9nUR                  n[        U R
                  R                  [        5      (       a@  U R
                  R                  R                  R                  US   R                  S   U
5      n
[        UR	                  S5      U-  U R                  R                  -  5      n[        UR	                  S5      U-  U R                  R                  -  5      nUR                  USU R                  R                  5      n/ n/ nS nSn0 n US-  nU R
                  R                   R                  UU5      nU R
                  R                   R#                  US S 2SS 24   S UU
USUSS9nU(       a.  UR%                  [&        R(                  " UR*                  SS95        UR                  R-                  S5      nUR.                  nU R0                  R3                  U5      nUR5                  XR                  R                  U R                  R                  5      nUR%                  U5        US S 2SS S 24   R5                  USU R                  R                  5      n[&        R(                  " UU4SS9n[&        R6                  " U R0                  R9                  U5      5      nUU:  a  GM  UU:  a@  [&        R:                  " USS9U:  n[&        R<                  " U5      S   R?                  5       nO[A        [C        U5      5      nU Vs/ s H  nUU;  d  M  UPM     nn[C        U5      S:  ad  [&        RD                  " U5      nURG                  SS5      RI                  SS	5      nU R0                  RK                  U5      nU H  n UU    UU '   M     [C        U5      U:  a  OGM  [A        [C        U5      5       Vs/ s H  nUU   PM
     nnU	(       d  US:X  a  US   O1[&        RL                  RN                  RP                  RS                  USS
9nUb	  U" U5      n!OUn!U(       a_  [&        R(                  " US	S9nUS:  a@  UR4                  " U[        UR	                  S5      U-  5      /UR	                  5       SS  Q76 nU!U4n!U!$ / n"[A        U5       H&  nU"R%                  UU   R	                  S5      5        M(     Uc7  [&        RL                  RN                  RP                  RS                  USS
9nUU"4n!Oy/ n#[&        RL                  RN                  RP                  RS                  USS
9nU" U5      n#U" Vs/ s H,  n[        U#R	                  S5      [U        U"5      -  5      U-  PM.     n$nU#U$4n!U(       a\  [&        R(                  " US	S9nUR4                  " U[        UR	                  S5      U-  5      /UR	                  5       SS  Q76 n/ U!QUP7n!U!$ s  snf s  snf s  snf )Na  `speaker_embeddings` must be specified. For example, you can use a speaker embeddings by following
                    the code snippet provided in this link:
                    https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors
                    r   r   T)r+   r-   r{  r!   )r   r-   rG  rH  r  rJ  r
  r{  r   r   )batch_first)+r%   rz   r   r:   r   rU  r"  r  r#  r  r  r`  r#   r,   r"   r  r#  r  rU   r   r   r  squeezer  speech_decoder_postnetr  r   sigmoidr  rJ   r  rK   rL   rQ   stackr   flattenr  r   r   rnnpad_sequencer;   )%rg  r+   r  r-   rh  ri  rj  rk  rl  rm  rH  r   encoder_outr:  maxlenminlenoutput_sequencespectrogramr  r  r  result_spectrogramr  decoder_outlast_decoder_outputspectrumnew_spectrogramprobmeet_thresholdsmeet_indexesr/  spectrograms
meet_indexr;  spectrogram_lengths	waveformswaveform_lengthss%                                        r(   _generate_speechr  	  s    !
 	
 !"lll6O6O&O%T%T%V!V!/


A
C..((!- ) K !, = = %..((*IJJ!&!7!7!>!>!a!aN  #%;"
 *//2[@5<<C`C``aF*//2[@5<<C`C``aF 099#q%,,B[B[\OKO
C
q !& 6 6 = =oOa bnn,,<</237";#9+5 = 	
 ###EIIk.J.JPQ$RS);;CCAF%55 //889LM==ll&C&CU\\E^E^_8$ #1b!8,11#q%,,:S:ST))_o$FAN}}U99BBCVWX< V|"'))Db"9Y"F${{?;A>EEG$SY/'3S|!q@R7RA|LS< 1$${{;7+55a;CCAqI$;;CCLQ".J5A*5M&z2 #/%&#-i j 49=O9P3QR3Qa&q)3QLR ),l1ouxx~~7I7I7V7VWcqu7V7vk*G!G"$yy)9qAQw#3#8#8-2215;<$?O?T?T?VWYWZ?[$   01G* N% !sA&&|A';';A'>? ? 88>>--::<UY:ZL#%89GI 88>>--::<UY:ZL-I_rs_rZ[INN1$5<O8P$P QTU U_rs "23G"$yy)9qA/44S)..q1C78 ;K;P;P;RSUSV;W  32!12GNW T S4  ts   7
YY)Y%3YzB
    SpeechT5 Model with a text encoder and a speech decoder.
    c            &         ^  \ rS rSrSrS\4U 4S jjr\S\4S j5       r	S r
S r\                S"S\\R                     S	\\R                     S
\\R                      S\\R                     S\\R                      S\\R                      S\\R"                     S\\\\R                            S\\\\R                            S\\   S\\   S\\   S\\   S\\R                      S\\R                      S\\R"                     S\\\4   4"S jj5       r\R,                  " 5               S#S\R                  S	\\R                     S\\R                      S\S\S\S\\R2                     S\S\S\\R                   \\R                   \R                   4   4   4S jj5       r\R,                  " 5               S#S\R                  S\\R                      S	\\R                     S\S\S\S\\R2                     S\S\S\\R                   \\R                   \R                   4   4   4S  jj5       rS!rU =r$ )$SpeechT5ForTextToSpeechi	  r   rz   c                    > [         TU ]  U5        UR                  c  [        SU R                   S35      e[        U5      n[        U5      n[        XU5      U l        [        U5      U l
        U R                  5         g )NrD  a    with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SpeechT5ForTextToSpeech.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)rl   rm   r  r%   r|   r  r  r!  rU  r  rr  rv  )ry   rz   text_encoderspeech_decoderr|   s       r(   rm    SpeechT5ForTextToSpeech.__init__	  s}     $00@ A/ /  5V<8@%fNK&B6&J# 	r*   r6   c                     gr9  rj  )clss    r(   can_generate$SpeechT5ForTextToSpeech.can_generate	  s    
 r*   c                 6    U R                   R                  5       $ r   rJ  r[  s    r(   r,  #SpeechT5ForTextToSpeech.get_encoder	  rL  r*   c                 6    U R                   R                  5       $ r   rN  r[  s    r(   r/  #SpeechT5ForTextToSpeech.get_decoder	  rL  r*   r-   r4  r5  ry  r6  r  r7  r  rJ  r
  rz  r{  r  r  r  c                 p   Ub  UOU R                   R                  nUbB  Uc"  [        XR                   R                  U5      u  p4U R                   R                  (       a  SnU R                  UUUUUUUUU	U
UUUSS9nU R                  US   5      u  nnnSnUb,  [        U R                   5      nU" UUUUUUR                  5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  UR                  UR                  UR                  S9	$ )a@  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
    [`~PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
    Float values of input mel spectrogram.

    SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
    also be used by default.

    If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
    Tensor containing the speaker embeddings.
labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
    Float values of target mel spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
    computation. Spectrograms can be obtained using [`SpeechT5Processor`]. See [`SpeechT5Processor.__call__`]
    for details.
stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Binary tensor indicating the position of the stop token in the sequence.

Example:

```python
>>> from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, set_seed
>>> import torch

>>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
>>> model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
>>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

>>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
>>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

>>> set_seed(555)  # make deterministic

>>> # generate speech
>>> speech = model.generate(inputs["input_ids"], speaker_embeddings=speaker_embeddings, vocoder=vocoder)
>>> speech.shape
torch.Size([15872])
```
NTr+   r-   r4  r5  ry  r6  r  r7  r  rJ  r  r
  rz  r{  r   r   	r  r|  r  r  r9  r  r:  rG  r;  )rz   r  r2   r,   r  rU  rr  r  r  r   r  r  r9  r:  rG  r;  )ry   r   r-   r4  r5  ry  r6  r  r7  r  rJ  r
  rz  r{  r  r  r  r;  r  r  r  r  	criterionr[  s                           r(   r   SpeechT5ForTextToSpeech.forward	  sx   X &1%<k$++B]B]#+?WKK88:P@<$ {{44$(!--")!5#9/!5++1/!5   
" AE@[@[\cde\f@g= 5v/<I&%((D +-;F)-)9TGf$EvE'-#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r*   rh  ri  rj  rk  rl  rm  c
                     UbY  UR                  S5      nUR                  S5      U:w  a3  UR                  S5      S:X  a  UR                  US5      nO[        S5      e[        U UUUUUUUUU	5
      $ )a  
Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
speech waveform using a vocoder.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary.

        Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
        [`~PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Attention mask from the tokenizer, required for batched inference to signal to the model where to
        ignore padded tokens from the input_ids.
    speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
        Tensor containing the speaker embeddings.
    threshold (`float`, *optional*, defaults to 0.5):
        The generated sequence ends when the predicted stop token probability exceeds this value.
    minlenratio (`float`, *optional*, defaults to 0.0):
        Used to calculate the minimum required length for the output sequence.
    maxlenratio (`float`, *optional*, defaults to 20.0):
        Used to calculate the maximum allowed length for the output sequence.
    vocoder (`nn.Module`, *optional*):
        The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
        spectrogram.
    output_cross_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of the decoder's cross-attention layers.
    return_output_lengths (`bool`, *optional*, defaults to `False`):
        Whether or not to return the concrete spectrogram/waveform lengths.

Returns:
    `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
    - when `return_output_lengths` is False
        - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
        - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(num_frames,)` -- The predicted speech waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
    - when `return_output_lengths` is True
        - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
        are padded to the maximum length.
        - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `List[Int]` -- A list of
        all the concrete lengths for each spectrogram.
        - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
        - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `List[Int]` -- A list of all
        the concrete lengths for each waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
r   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch_size.r   r  r%   r  )ry   r   r-   r  rh  ri  rj  rk  rl  rm  kwargsrZ   s               r(   generate SpeechT5ForTextToSpeech.generateH
  s    J )"*J!&&q)Z7%**1-2);)B)B:q)Q&$o   #!
 	
r*   c
                     UbY  UR                  S5      n
UR                  S5      U
:w  a3  UR                  S5      S:X  a  UR                  U
S5      nO[        S5      e[        U UUUUUUUUU	5
      $ )aW  
Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
speech waveform using a vocoder.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary.

        Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
        [`~PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
        Tensor containing the speaker embeddings.
    attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
        `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    threshold (`float`, *optional*, defaults to 0.5):
        The generated sequence ends when the predicted stop token probability exceeds this value.
    minlenratio (`float`, *optional*, defaults to 0.0):
        Used to calculate the minimum required length for the output sequence.
    maxlenratio (`float`, *optional*, defaults to 20.0):
        Used to calculate the maximum allowed length for the output sequence.
    vocoder (`nn.Module`, *optional*, defaults to `None`):
        The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
        spectrogram.
    output_cross_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of the decoder's cross-attention layers.
    return_output_lengths (`bool`, *optional*, defaults to `False`):
        Whether or not to return the concrete spectrogram/waveform lengths.

Returns:
    `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
    - when `return_output_lengths` is False
        - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
        - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(num_frames,)` -- The predicted speech waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
    - when `return_output_lengths` is True
        - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
        are padded to the maximum length.
        - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `List[Int]` -- A list of
        all the concrete lengths for each spectrogram.
        - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
        - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `List[Int]` -- A list of all
        the concrete lengths for each waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
r   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch size.r  )ry   r   r  r-   rh  ri  rj  rk  rl  rm  rZ   s              r(   generate_speech'SpeechT5ForTextToSpeech.generate_speech
  s    R )"*J!&&q)Z7%**1-2);)B)B:q)Q&$o   #!
 	
r*   rr  rU  NNNNNNNNNNNNNNNNNNg      ?r0   g      4@NFF)r   r   r   r   rm  r   rm   classmethodrN   r  r,  r/  r   r   r   r  r  r   r   r   r   r   r   r   r   r@  r  r  r   r   r   s   @r(   r  r  	  s    "O~ ( T  ++  1559<@=A159=7;EIEI$(,0/3&*:>.2.2#B
E,,-B
 !!1!12B
 'u'8'89	B

 !))9)9 :B
 E--.B
 $E$5$56B
 'u||4B
 "%e.?.?(@"ABB
 "%e.?.?(@"ABB
 D>B
 $D>B
 'tnB
 d^B
 %U%6%67B
  **+!B
" ell+#B
$ 
u..	/%B
 B
H ]]_ 6::> !'+(-&+Y
##Y
 !!1!12Y
 %U%6%67	Y

 Y
 Y
 Y
 "))$Y
 "&Y
  $Y
 
u  %(9(95;L;L(L"MM	NY
 Y
v ]]_ ;?59 !'+(-&+]
##]
 %U%6%67]
 !!1!12	]

 ]
 ]
 ]
 "))$]
 "&]
  $]
 
u  %(9(95;L;L(L"MM	N]
 ]
r*   r  zD
    SpeechT5 Model with a speech encoder and a speech decoder.
    c            &         ^  \ rS rSrS\4U 4S jjrS rS rS r\	                S!S\
\R                     S\
\R                     S	\
\R                     S
\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\\\R                           S\
\\\R                           S\
\   S\
\   S\
\   S\
\   S\
\R                     S\
\R                     S\
\R                     S\\\4   4"S jj5       r\R(                  " 5               S"S\R                  S\
\R                     S\
\R                     S\S\S\S\
\R.                     S\S\S\R                  4S jj5       rS rU =r$ )#SpeechT5ForSpeechToSpeechi  rz   c                    > [         TU ]  U5        [        U5      n[        U5      n[	        XU5      U l        [        U5      U l        U R                  5         g r   )	rl   rm   r  r  r!  rU  r  rr  rv  )ry   rz   rF  r  r|   s       r(   rm   "SpeechT5ForSpeechToSpeech.__init__  sK     8@8@%fnM&B6&J# 	r*   c                 6    U R                   R                  5       $ r   rJ  r[  s    r(   r,  %SpeechT5ForSpeechToSpeech.get_encoder  rL  r*   c                 6    U R                   R                  5       $ r   rN  r[  s    r(   r/  %SpeechT5ForSpeechToSpeech.get_decoder  rL  r*   c                 T    U R                  5       R                  R                  5         gr2  rQ  r[  s    r(   r\  0SpeechT5ForSpeechToSpeech.freeze_feature_encoder  rS  r*   r+   r-   r4  r5  ry  r6  r  r7  r  rJ  r
  rz  r{  r  r  r  r6   c                    Ub  UOU R                   R                  nUb%  Uc"  [        XR                   R                  U5      u  p4U R	                  UUUUUUUUU	U
UUUSS9nU R                  US   5      u  nnnSnU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  UR                  UR                  UR                  S9	$ )a   
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
    Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
    into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (*pip install
    soundfile*). To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding
    and conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
    Float values of input mel spectrogram.

    SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
    also be used by default.

    If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
    Tensor containing the speaker embeddings.
labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
    Float values of target mel spectrogram. Spectrograms can be obtained using [`SpeechT5Processor`]. See
    [`SpeechT5Processor.__call__`] for details.
stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Binary tensor indicating the position of the stop token in the sequence.

Example:

```python
>>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, set_seed
>>> from datasets import load_dataset
>>> import torch

>>> dataset = load_dataset(
...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True
... )  # doctest: +IGNORE_RESULT
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

>>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
>>> model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
>>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

>>> # audio file is decoded on the fly
>>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

>>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

>>> set_seed(555)  # make deterministic

>>> # generate speech
>>> speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)
>>> speech.shape
torch.Size([77824])
```
NTr  r   r   r  )rz   r  r2   r,   rU  rr  r   r  r  r9  r  r:  rG  r;  )ry   r+   r-   r4  r5  ry  r6  r  r7  r  rJ  r
  rz  r{  r  r  r  r;  r[   r|  r  r  r[  s                          r(   r   !SpeechT5ForSpeechToSpeech.forward$  s%   d &1%<k$++B]B]#+?WKK88:P@<$ --%)!5#9/!5++1/!5   
" "&!<!<WQZ!H;!^gabk1F)-)9TGf$EvE'##33")"?"?&99$55&-&G&G")"?"?&99

 
	
r*   rh  ri  rj  rk  rl  rm  c
                 n    Uc  [         R                  " SUR                  S9n[        U UUUUUUUUU	5
      $ )a   
Converts a raw speech waveform into a sequence of mel spectrograms, which are subsequently turned back into a
speech waveform using a vocoder.

Args:
    input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        Float values of input raw speech waveform.

        Values can be obtained by loading a *.flac* or *.wav* audio file into an array of type `List[float]` or
        a `numpy.ndarray`, *e.g.* via the soundfile library (*pip install soundfile*). To prepare the array
        into `input_values`, the [`SpeechT5Processor`] should be used for padding and conversion into a tensor
        of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
    speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
        Tensor containing the speaker embeddings.
    attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
        `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    threshold (`float`, *optional*, defaults to 0.5):
        The generated sequence ends when the predicted stop token probability exceeds this value.
    minlenratio (`float`, *optional*, defaults to 0.0):
        Used to calculate the minimum required length for the output sequence.
    maxlenratio (`float`, *optional*, defaults to 20.0):
        Used to calculate the maximum allowed length for the output sequence.
    vocoder (`nn.Module`, *optional*, defaults to `None`):
        The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
        spectrogram.
    output_cross_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of the decoder's cross-attention layers.
    return_output_lengths (`bool`, *optional*, defaults to `False`):
        Whether or not to return the concrete spectrogram/waveform lengths.

Returns:
    `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
    - when `return_output_lengths` is False
        - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
        - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(num_frames,)` -- The predicted speech waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
    - when `return_output_lengths` is True
        - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
        are padded to the maximum length.
        - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `List[Int]` -- A list of
        all the concrete lengths for each spectrogram.
        - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
        - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `List[Int]` -- A list of all
        the concrete lengths for each waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
)r   i   ri  )r   rM   r   r  )
ry   r+   r  r-   rh  ri  rj  rk  rl  rm  s
             r(   r  )SpeechT5ForSpeechToSpeech.generate_speech  sM    R %!&Xl>Q>Q!R#!
 	
r*   r  r  r  )r   r   r   r   r   rm   r,  r/  r\  r   r   r   r  r  r   r   rN   r   r   r   r   r   r   r@  r  r   r   r   s   @r(   r  r    s   
~ 
++;  5959<@=A159=7;EIEI$(,0/3&*:>.2.2#|
u001|
 !!1!12|
 'u'8'89	|

 !))9)9 :|
 E--.|
 $E$5$56|
 'u||4|
 "%e.?.?(@"AB|
 "%e.?.?(@"AB|
 D>|
 $D>|
 'tn|
 d^|
 %U%6%67|
  **+!|
" ell+#|
$ 
u..	/%|
 |
| ]]_ ;?59 !'+(-&+V
''V
 %U%6%67V
 !!1!12	V

 V
 V
 V
 "))$V
 "&V
  $V
 
		V
 V
r*   r  c                   H   ^  \ rS rSrSU 4S jjrS	S jrS rS rS rSr	U =r
$ )
HifiGanResidualBlocki  c                   > [         TU ]  5         X@l        [        R                  " [        [        U5      5       Vs/ s H0  n[        R                  " UUUSX5   U R                  X#U   5      S9PM2     sn5      U l	        [        R                  " [        [        U5      5       Vs/ s H,  n[        R                  " UUUSSU R                  US5      S9PM.     sn5      U l
        g s  snf s  snf )Nr   )rj   dilationr   )rl   rm   leaky_relu_sloper   r+  rL   rQ   rq   get_paddingconvs1convs2)ry   channelsri   r  r  r/  r[   r|   s          r(   rm   HifiGanResidualBlock.__init__  s     0mm s8}-
 .A 		%[ ,,[1+F .

 mm s8}-
 .A 		 ,,[!< .



s   7C%%3C*c                     X-  U-
  S-  $ r   rj  )ry   ri   r  s      r(   r   HifiGanResidualBlock.get_padding  s    &1a77r*   c                 >   [         R                  R                  n[        [         R                  R                  S5      (       a$  [         R                  R                  R                  nU R
                   H  nU" U5        M     U R                   H  nU" U5        M     g Nr   )r   r   r   r   r   r  r  ry   r   r(  s      r(   apply_weight_norm&HifiGanResidualBlock.apply_weight_norm   si    hh**288,,m<<((33??K[[E ![[E !r*   c                     U R                    H"  n[        R                  R                  U5        M$     U R                   H"  n[        R                  R                  U5        M$     g r   )r  r   r   remove_weight_normr  ry   r(  s     r(   r  'HifiGanResidualBlock.remove_weight_norm*  sB    [[EHH''. ![[EHH''. !r*   c                 (   [        U R                  U R                  5       Hm  u  p#Un[        R                  R                  XR                  5      nU" U5      n[        R                  R                  XR                  5      nU" U5      nX-   nMo     U$ r   )rv  r  r  r   r  
leaky_relur  )ry   r   conv1conv2r:  s        r(   r   HifiGanResidualBlock.forward0  sz    T[[9LE$HMM44]DYDYZM!-0MMM44]DYDYZM!-0M)4M : r*   )r  r  r  )r   )r   r      g?)r   )r   r   r   r   rm   r  r  r  r   r   r   r   s   @r(   r  r    s!    
>8/ r*   r  z
    HiFi-GAN vocoder.
    c                      ^  \ rS rSr\rSrS\4U 4S jjrS rS r	S r
\" SS	9S\R                  S
\R                  4S j5       rSrU =r$ )SpeechT5HifiGani;  r|  rz   c                   > [         TU ]  U5        [        UR                  5      U l        [        UR
                  5      U l        [        R                  " UR                  UR                  SSSS9U l        [        R                  " 5       U l        [        [        UR
                  UR                   5      5       Ha  u  nu  p4U R                  R#                  [        R$                  " UR                  SU-  -  UR                  SUS-   -  -  UUXC-
  S-  S95        Mc     [        R                  " 5       U l        [)        [        U R                  5      5       Hp  nUR                  SUS-   -  -  n[        UR                  UR*                  5       H4  u  pFU R&                  R#                  [-        XTXaR.                  5      5        M6     Mr     [        R                  " WSSSSS9U l        U R3                  S[4        R6                  " UR                  5      5        U R3                  S[4        R8                  " UR                  5      5        U R;                  5         g )N   r   r   )ri   rj   r   r   rX  r  )rl   rm   rQ   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   rq   model_in_dimupsample_initial_channelconv_prer+  	upsamplerr  rv  upsample_kernel_sizesrU   ConvTranspose1d	resblocksrL   resblock_dilation_sizesr  r  	conv_postr   r   rM   rS   rv  )ry   rz   r/  upsample_rateri   r  r  r|   s          r(   rm   SpeechT5HifiGan.__init__D  s    v;;< !6!67		++
 /8V=R=RTZTpTp9q/r+A+NN!!""331=33a!eE +((8Q> 0s s4>>*+A661Q<HH),V-I-I6KiKi)j%%%&:8RZ\s\s&tu *k ,
 8QAaQRSVU[[1D1D%EFWejj1D1D&EF 	r*   c                 8   [        U[        R                  [        R                  45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         ggg)zInitialize the weights.r0   rW  N)r#  r   rE  rq   r   ra  r]  rz   rb  rk   rc  )ry   rf  s     r(   rh  SpeechT5HifiGan._init_weightsj  sm    fryy"))455MM&&CT[[5R5R&S{{&  &&( ' 6r*   c                    [         R                  R                  n[        [         R                  R                  S5      (       a$  [         R                  R                  R                  nU" U R
                  5        U R                   H  nU" U5        M     U R                   H  nUR                  5         M     U" U R                  5        g r  )
r   r   r   r   r   r  r  r  r  r  r  s      r(   r  !SpeechT5HifiGan.apply_weight_normq  s    hh**288,,m<<((33??KDMM"^^E $^^E##% $DNN#r*   c                 R   [         R                  R                  U R                  5        U R                   H"  n[         R                  R                  U5        M$     U R
                   H  nUR                  5         M     [         R                  R                  U R                  5        g r   )r   r   r  r  r  r  r  r  s     r(   r  "SpeechT5HifiGan.remove_weight_norm}  sh    
##DMM2^^EHH''. $^^E$$& $
##DNN3r*   a  
        Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
        of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
        waveform.
        r  r6   c                    U R                   R                  (       a  XR                  -
  U R                  -  nUR	                  5       S:H  nU(       d  UR                  S5      nUR                  SS5      nU R                  U5      n[        U R                  5       H  n[        R                  R                  X0R                   R                  5      nU R                  U   " U5      nU R                  X@R                   -     " U5      n[        SU R                   5       H)  nXPR                  X@R                   -  U-      " U5      -  nM+     XPR                   -  nM     [        R                  R                  U5      nU R#                  U5      n[$        R&                  " U5      nU(       d2  UR)                  S5      R                  SS5      R+                  S5      nU$ UR)                  S5      nU$ )a  
spectrogram (`torch.FloatTensor`):
    Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
    config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.

Returns:
    `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
    shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
r   r   r   r   r!   )rz   normalize_beforerX  r  r   r   r   r  rL   r  r   r  r  r  r  r  r  r  r   tanhrq  r   )ry   r|  
is_batchedr   r/  	res_statejwaveforms           r(   r   SpeechT5HifiGan.forward  s   " ;;''&2djj@K __&!+
%//2K#--a3m4t))*AMM44]KKD`D`aM NN1-m<Mq+;+;';<]KI1d../^^A0@0@,@1,DEmTT	 0%(8(88M + 00?}5

=1$,,Q/99!Q?DDRHH
  %,,Q/Hr*   )r  r  r  r  r  r  )r   r   r   r   r   rk  rm  rm   rh  r  r  r   r   r  r   r   r   r   s   @r(   r  r  ;  sd     )L#O$4 $L)
$4 (5#4#4 (9J9J ((r*   r  )rB  r  r  r!  rT  r  )r   Nr!  r  )`r   r   typingr   r   r   r   numpyrE   r   torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   
generationr   integrations.deepspeedr   integrations.fsdpr   modeling_attn_mask_utilsr   r   modeling_outputsr   r   r   r   r   modeling_utilsr   r   r   r   configuration_speecht5r   r   
get_loggerr   r  _HIDDEN_STATES_START_POSITIONr   r:   r)   r2   r   r  ndarrayrc   r@  re   r   r   r   r   r   r  r   r%  r@  rN  r  r  r  r  r  r  r  r  r/  r>  rT  rp  r  r  r  r  r  r  r  r  r  r!  rB  r  rN   r  r  r  r  r  __all__rj  r*   r(   <module>r     s     / /     @ @ ! ) @ 7 e  . , I 
		H	% !" %,, c [^ " ei0,,0250KSTYT`T`Ka04 26tc?tt t U--.	t
 t ZZtp299 , 8 2A8BII A8J*bii *Zryy 0" "(299 +RYY +^1		 1D")) DN1")) 1h% %P<299 <2		 .(-		 (-V& & ]B		 ]B@")) 0:299 :zi299 iX "?o "? "?JF
- F
R"&= "J'$; 'T
#: 
@L
- L
^-&= -`1$; 1h(#: (V8M299 8Mv:bii :z 
Z
+ Z

Z
z 
y5 y
y~ 7;15#'$)"'L"L##L !!2!23L U--.	L
 L L L bii L "L  L 5eE$5$5u7H7H$HIIJL^ 
c
5 c

c
L 
p
 7 p

p
f;299 ;| 
to t
tnr*   