
    fTh                     z   S r SSKrSSKJr  SSKJrJrJrJr  SSK	r
SSKrSSKrSSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJrJr  SSKJr  \R@                  " \!5      r"\ " S S\5      5       r#\ " S S\5      5       r$\RJ                  RL                  S 5       r'     S=S jr(S r) " S S\R                  RT                  5      r+ " S S\RT                  5      r, " S S\RT                  5      r- " S S\RT                  5      r. " S S \RT                  5      r/ " S! S"\RT                  5      r0 " S# S$\RT                  5      r1 " S% S&\RT                  5      r2 " S' S(\RT                  5      r3 " S) S*\RT                  5      r4 " S+ S,\RT                  5      r5 " S- S.\RT                  5      r6 " S/ S0\RT                  5      r7 " S1 S2\RT                  5      r8 " S3 S4\RT                  5      r9 " S5 S6\RT                  5      r:\ " S7 S8\5      5       r;\" S9S:9 " S; S<\;5      5       r<S<S8/r=g)>zPyTorch VITS model.    N)	dataclass)AnyOptionalTupleUnion)nn   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask)BaseModelOutputModelOutput)PreTrainedModel)auto_docstringlogging   )
VitsConfigc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   S	rg)
VitsModelOutput'   a  
Describes the outputs for the VITS model, with potential hidden states and attentions.

Args:
    waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        The final audio waveform predicted by the model.
    sequence_lengths  (`torch.FloatTensor` of shape `(batch_size,)`):
        The length in samples of each element in the `waveform` batch.
    spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
        The log-mel spectrogram predicted at the output of the flow model. This spectrogram is passed to the Hi-Fi
        GAN decoder model to obtain the final audio waveform.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nwaveformsequence_lengthsspectrogramhidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations__r   r   r   r   r   __static_attributes__r       ^/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/vits/modeling_vits.pyr   r   '   s    0 -1Hhu(()048hu00186:K% 1 123:8<M8E%"3"345<59Ju00129r'   r   c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   S	rg)
VitsTextEncoderOutputH   a  
Describes the outputs for the VITS text encoder model, with potential hidden states and attentions.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    prior_means (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        The predicted mean values of the prior distribution for the latent text variables.
    prior_log_variances (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        The predicted log-variance values of the prior distribution for the latent text variables.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nlast_hidden_stateprior_meansprior_log_variancesr   r   r   )r   r   r    r!   r"   r,   r   r#   r$   r%   r-   r.   r   r   r   r&   r   r'   r(   r*   r*   H   s~    . 6:x 1 129/3K%++,37;%"3"34;8<M8E%"3"345<59Ju00129r'   r*   c                     X-   n[         R                  " US S 2S U2S S 24   5      n[         R                  " US S 2US 2S S 24   5      nXE-  nU$ N)r#   tanhsigmoid)input_ainput_bnum_channelsin_actt_acts_actactss          r(   fused_add_tanh_sigmoid_multiplyr:   h   sP    FJJva,123EMM&LM1!456E=DKr'   c	                    X* :  X:*  -  n	U	) n
[         R                  " U 5      n[         R                  " U 5      n[        R                  " [        R                  " SU-
  5      S-
  5      n[
        R                  R                  USS9nXS'   XS'   X
   X'   SX'   [        X	   XSS24   X)SS24   X9SS24   UUUUUS9	u  X'   X'   X4$ )	ap	  
This transformation represents a monotonically increasing piecewise rational quadratic function. Outside of the
`tail_bound`, the transform behaves as an identity function.

Args:
    inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
        Second half of the hidden-states input to the Vits convolutional flow module.
    unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
        First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
        layer in the convolutional flow module
    unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
        Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
        layer in the convolutional flow module
    unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
        Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
        layer in the convolutional flow module
    reverse (`bool`, *optional*, defaults to `False`):
        Whether the model is being run in reverse mode.
    tail_bound (`float`, *optional* defaults to 5):
        Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
        transform behaves as an identity function.
    min_bin_width (`float`, *optional*, defaults to 1e-3):
        Minimum bin value across the width dimension for the piecewise rational quadratic function.
    min_bin_height (`float`, *optional*, defaults to 1e-3):
        Minimum bin value across the height dimension for the piecewise rational quadratic function.
    min_derivative (`float`, *optional*, defaults to 1e-3):
        Minimum bin value across the derivatives for the piecewise rational quadratic function.
Returns:
    outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
        Hidden-states as transformed by the piecewise rational quadratic function with the `tail_bound` limits
        applied.
    log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
        Logarithm of the absolute value of the determinants corresponding to the `outputs` with the `tail_bound`
        limits applied.
r   )r   r   )pad.r   .        N)	inputsunnormalized_widthsunnormalized_heightsunnormalized_derivativesreverse
tail_boundmin_bin_widthmin_bin_heightmin_derivative)	r#   
zeros_likenplogexpr   
functionalr<   _rational_quadratic_spline)rA   rB   rC   rD   rE   rF   rG   rH   rI   inside_interval_maskoutside_interval_maskoutputslog_abs_detconstants                 r(   (_unconstrained_rational_quadratic_splinerU   q   s    \ #k1f6JK11v&G""6*KvvbffQ/0145H!}}001Iv0V'/V$(0W%%+%BG"),K&Ga+/a0GH12IJ!9PQ:Q!R#%%
HDG!;#D r'   c	                    Un	U* n
[         R                  " U 5      U
:  d  [         R                  " U 5      U	:  a  [        S5      eUR                  S   nXk-  S:  a  [        SU SU 35      eX{-  S:  a  [        SU SU 35      e[
        R                  R                  USS9nUSXk-  -
  U-  -   n[         R                  " USS9n[
        R                  R                  US	S
SS9nX-
  U-  U
-   nXS'   XS'   USSS24   USSS24   -
  nU[
        R                  R                  U5      -   n[
        R                  R                  USS9nUSX{-  -
  U-  -   n[         R                  " USS9n[
        R                  R                  US	S
SS9nX-
  U-  U
-   nU
US'   U	US'   USSS24   USSS24   -
  nU(       a  UOUnUS==   S-  ss'   [         R                  " U S   U:  SS9S-
  nUS   nUR                  SU5      S   nUR                  SU5      S   nUR                  SU5      S   nX-  nUR                  SU5      S   nUR                  SU5      S   nUSSS24   R                  SU5      S   nUR                  SU5      S   nUU-   SU-  -
  nU(       d  U U-
  U-  nUSU-
  -  nUUUR                  S5      -  UU-  -   -  nUUU-  -   nUUU-  -   n UR                  S5      UUR                  S5      -  SU-  U-  -   USU-
  R                  S5      -  -   -  n![         R                  " U!5      S[         R                  " U5      -  -
  n"U U"4$ U U-
  n#U#U-  n$UUU-
  -  U$-   n%UU-  U$-
  n&U* U#-  n'U&R                  S5      SU%-  U'-  -
  n(U(S:  R                  5       (       d  [!        SU( 35      eSU'-  U&* [         R"                  " U(5      -
  -  n)U)U-  U-   n U)SU)-
  -  nUUU-  -   nUR                  S5      UU)R                  S5      -  SU-  U-  -   USU)-
  R                  S5      -  -   -  n![         R                  " U!5      S[         R                  " U5      -  -
  n"U U"* 4$ )a  
This transformation represents a monotonically increasing piecewise rational quadratic function. Unlike the
function `_unconstrained_rational_quadratic_spline`, the function behaves the same across the `tail_bound`.

Args:
    inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
        Second half of the hidden-states input to the Vits convolutional flow module.
    unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
        First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
        layer in the convolutional flow module
    unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
        Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
        layer in the convolutional flow module
    unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
        Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
        layer in the convolutional flow module
    reverse (`bool`):
        Whether the model is being run in reverse mode.
    tail_bound (`float`):
        Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
        transform behaves as an identity function.
    min_bin_width (`float`):
        Minimum bin value across the width dimension for the piecewise rational quadratic function.
    min_bin_height (`float`):
        Minimum bin value across the height dimension for the piecewise rational quadratic function.
    min_derivative (`float`):
        Minimum bin value across the derivatives for the piecewise rational quadratic function.
Returns:
    outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
        Hidden-states as transformed by the piecewise rational quadratic function.
    log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
        Logarithm of the absolute value of the determinants corresponding to the `outputs`.
z-Input to a transform is not within its domainr?         ?zMinimal bin width z" too large for the number of bins zMinimal bin height dimr   )r   r   rT   r@   )r<   modevaluer=   r>   .Ngư>).N      r   zinvalid discriminant )r#   minmax
ValueErrorshaper   rN   softmaxcumsumr<   softplussumgatherpowrL   allRuntimeErrorsqrt)*rA   rB   rC   rD   rE   rF   rG   rH   rI   upper_boundlower_boundnum_binswidths	cumwidthsderivativesheights
cumheightsbin_locationsbin_idxinput_cumwidthsinput_bin_widthsinput_cumheightsdeltainput_deltainput_derivativesinput_derivatives_plus_oneinput_heightsintermediate1thetatheta_one_minus_theta	numeratordenominatorrR   derivative_numeratorrS   intermediate2intermediate3abcdiscriminantroots*                                             r(   rO   rO      sC   X K+Kyy;&%))F*;k*IHII"((,H#%-m_<^_g^hijj 3&.~.>>`ai`jkll]]""#6B"?Fa-"::fDDFV,I!!)jPS!TI*i7+EI#f$gsABw)C"H"55F 2==#9#9:R#SSKmm##$8b#AGN$= =HHGg2.J"":6
RU"VJ+z9KGJ$Jv%Jwab!JsCRCx$88G")JyM'd"iiy)]:CaGGi G&&r73F;O}}R1&9!((W5f=E,,r7+F3K#**2w7?!,S!"W!5!<!<R!I&!QNN2w/7M%(BBQ_TM/)-== %U 3![599Q<%?BSVkBk%kl	!M4I$II"Y%<<*q1&15+o 5561u9//!"445 

 ii 45EIIk<R8RR## !11%5[+<<=M--=L=(uuQx!a%!)+!&&((!6|nEFFA1"uzz,778))O; $D 1!M4I$II*q1&!4+o 5561t8.."334 

 ii 45EIIk<R8RR$$r'   c                   D   ^  \ rS rSrS\S\4U 4S jjrSS jrS rSr	U =r
$ )	VitsWaveNetiC  config
num_layersc           	        > [         TU ]  5         UR                  U l        X l        [        R
                  R                  5       U l        [        R
                  R                  5       U l        [
        R                  " UR                  5      U l        [        [
        R                  R                  S5      (       a%  [
        R                  R                  R                  nO[
        R                  R                  nUR                   S:w  aG  [        R
                  R#                  UR                   SUR                  -  U-  S5      nU" USS9U l        ['        U5       H  nUR(                  U-  nUR*                  U-  U-
  S-  n[        R
                  R#                  UR                  SUR                  -  UR*                  UUS9nU" USS9nU R                  R-                  U5        XRS-
  :  a  SUR                  -  n	OUR                  n	[        R
                  R#                  UR                  U	S5      n
U" U
SS9n
U R                  R-                  U
5        M     g )Nweight_normr   r\   r   weight)name)in_channelsout_channelskernel_sizedilationpadding)super__init__hidden_sizer   r#   r   
ModuleList	in_layersres_skip_layersDropoutwavenet_dropoutdropouthasattrutilsparametrizationsr   speaker_embedding_sizeConv1d
cond_layerrangewavenet_dilation_ratewavenet_kernel_sizeappend)selfr   r   r   r   ir   r   in_layerres_skip_channelsres_skip_layer	__class__s              r(   r   VitsWaveNet.__init__D  s   !--$,,.$xx224zz&"8"89288,,m<<((33??K((..K((A-)F)FFL^L^H^akHkmnoJ)*8DDOz"A33Q6H11H<xGAMGxx"..!3!33"66! ' H #8(;HNN!!(+ >!$%(:(:$:!$*$6$6!"XX__V-?-?ARTUVN(hGN  ''7+ #r'   c                    [         R                  " U5      n[         R                  " U R                  /5      nUb  U R	                  U5      n[        U R                  5       H  nU R                  U   " U5      nUb0  US-  U R                  -  nUS S 2XSU R                  -  -   2S S 24   n	O[         R                  " U5      n	[        XyUS   5      n
U R                  U
5      n
U R                  U   " U
5      nX`R                  S-
  :  a;  US S 2S U R                  2S S 24   nX-   U-  nXKS S 2U R                  S 2S S 24   -   nM  XK-   nM     XB-  $ )Nr\   r   r   )r#   rJ   	IntTensorr   r   r   r   r   r:   r   r   )r   rA   padding_maskglobal_conditioningrR   num_channels_tensorr   r   cond_offsetglobal_statesr9   res_skip_actsres_actss                r(   forwardVitsWaveNet.forwardm  sV   ""6*#oot/?/?.@A*"&//2E"Ft'A NN1-f5M".!ed&6&66 3A{STW[WgWgSgEg7gij4j k % 0 0 ?2=QdefQghD<<%D 003D9M??Q&&(,>d.>.>,>)AB +|;!!T5E5E5G2J$KK!1% (( %%r'   c                 z   U R                   S:w  a3  [        R                  R                  R	                  U R
                  5        U R                   H,  n[        R                  R                  R	                  U5        M.     U R                   H,  n[        R                  R                  R	                  U5        M.     g )Nr   )r   r#   r   r   remove_weight_normr   r   r   r   layers     r(   r   VitsWaveNet.remove_weight_norm  st    &&!+HHNN--doo>^^EHHNN--e4 $))EHHNN--e4 *r'   )r   r   r   r   r   r   r0   )r   r   r    r!   r   intr   r   r   r&   __classcell__r   s   @r(   r   r   C  s&    '8z '8s '8R&:5 5r'   r   c                   :   ^  \ rS rSrS\4U 4S jjrSS jrSrU =r$ )VitsPosteriorEncoderi  r   c                 >  > [         TU ]  5         UR                  U l        [        R
                  " UR                  UR                  S5      U l        [        XR                  S9U l        [        R
                  " UR                  U R                  S-  S5      U l        g )Nr   r   r\   )r   r   	flow_sizer   r   r   spectrogram_binsr   conv_prer   $posterior_encoder_num_wavenet_layerswavenet	conv_projr   r   r   s     r(   r   VitsPosteriorEncoder.__init__  ss    ",,		&"9"96;M;MqQ"66a6ab6#5#5t7H7H17LaPr'   c                 &   U R                  U5      U-  nU R                  XU5      nU R                  U5      U-  n[        R                  " X@R
                  SS9u  pVU[        R                  " U5      [        R                  " U5      -  -   U-  nXuU4$ )Nr   rX   )r   r   r   r#   splitr   
randn_likerM   )r   rA   r   r   statsmean
log_stddevsampleds           r(   r   VitsPosteriorEncoder.forward  s    v&5f4GHv&5 ;;u.?.?QG%**40599Z3HHHLXj((r'   )r   r   r   r   r0   	r   r   r    r!   r   r   r   r&   r   r   s   @r(   r   r     s    Qz Q) )r'   r   c                   H   ^  \ rS rSrSU 4S jjrS	S jrS rS rS rSr	U =r
$ )
HifiGanResidualBlocki  c                   > [         TU ]  5         X@l        [        R                  " [        [        U5      5       Vs/ s H0  n[        R                  " UUUSX5   U R                  X#U   5      S9PM2     sn5      U l	        [        R                  " [        [        U5      5       Vs/ s H,  n[        R                  " UUUSSU R                  US5      S9PM.     sn5      U l
        g s  snf s  snf )Nr   )strider   r   )r   r   leaky_relu_sloper   r   r   lenr   get_paddingconvs1convs2)r   channelsr   r   r   r   _r   s          r(   r   HifiGanResidualBlock.__init__  s     0mm s8}-
 .A 		%[ ,,[1+F .

 mm s8}-
 .A 		 ,,[!< .



s   7C%%3C*c                     X-  U-
  S-  $ )Nr\   r   )r   r   r   s      r(   r    HifiGanResidualBlock.get_padding  s    &1a77r'   c                 >   [         R                  R                  n[        [         R                  R                  S5      (       a$  [         R                  R                  R                  nU R
                   H  nU" U5        M     U R                   H  nU" U5        M     g Nr   )r   r   r   r   r   r   r   r   r   r   s      r(   apply_weight_norm&HifiGanResidualBlock.apply_weight_norm  si    hh**288,,m<<((33??K[[E ![[E !r'   c                     U R                    H"  n[        R                  R                  U5        M$     U R                   H"  n[        R                  R                  U5        M$     g r0   )r   r   r   r   r   r   s     r(   r   'HifiGanResidualBlock.remove_weight_norm  sB    [[EHH''. ![[EHH''. !r'   c                 (   [        U R                  U R                  5       Hm  u  p#Un[        R                  R                  XR                  5      nU" U5      n[        R                  R                  XR                  5      nU" U5      nX-   nMo     U$ r0   )zipr   r   r   rN   
leaky_relur   )r   r   conv1conv2residuals        r(   r   HifiGanResidualBlock.forward  sz    T[[9LE$HMM44]DYDYZM!-0MMM44]DYDYZM!-0M)4M : r'   )r   r   r   )r	   )r   r	      g?r   )r   r   r    r!   r   r   r   r   r   r&   r   r   s   @r(   r   r     s!    
>8/ r'   r   c                      ^  \ rS rSrS\4U 4S jjrS rS r SS\R                  S\
\R                     S\R                  4S	 jjrS
rU =r$ )VitsHifiGani  r   c                 `  > [         TU ]  5         Xl        [        UR                  5      U l        [        UR                  5      U l        [        R                  " UR                  UR                  SSSS9U l        [        R                  " 5       U l        [        [!        UR                  UR"                  5      5       Ha  u  nu  p4U R                  R%                  [        R&                  " UR                  SU-  -  UR                  SUS-   -  -  UUXC-
  S-  S95        Mc     [        R                  " 5       U l        [+        [        U R                  5      5       Hp  nUR                  SUS-   -  -  n[!        UR                  UR,                  5       H4  u  pFU R(                  R%                  [/        XTXaR0                  5      5        M6     Mr     [        R                  " WSSSSSS9U l        UR4                  S:w  a2  [        R                  " UR4                  UR                  S5      U l        g g )	N   r   r	   )r   r   r   r\   F)r   r   r   biasr   )r   r   r   r   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   r   r   upsample_initial_channelr   r   	upsampler	enumerater   upsample_kernel_sizesr   ConvTranspose1d	resblocksr   resblock_dilation_sizesr   r   	conv_postr   cond)r   r   r   upsample_rater   r   r   r   s          r(   r   VitsHifiGan.__init__  s   v;;< !6!67		++
 /8V=R=RTZTpTp9q/r+A+NN!!""331=33a!eE +((8Q> 0s s4>>*+A661Q<HH),V-I-I6KiKi)j%%%&:8RZ\s\s&tu *k ,
 8QAaQRY^_((A-		&"?"?A`A`bcdDI .r'   c                 N   [         R                  R                  n[        [         R                  R                  S5      (       a$  [         R                  R                  R                  nU R
                   H  nU" U5        M     U R                   H  nUR                  5         M     g r   )r   r   r   r   r   r  r  r   r   s      r(   r   VitsHifiGan.apply_weight_norm	  sm    hh**288,,m<<((33??K^^E $^^E##% $r'   c                     U R                    H"  n[        R                  R                  U5        M$     U R                   H  nUR                  5         M     g r0   )r  r   r   r   r  r   s     r(   r   VitsHifiGan.remove_weight_norm  s<    ^^EHH''. $^^E$$& $r'   r   r   returnc                    U R                  U5      nUb  X0R                  U5      -   n[        U R                  5       H  n[        R
                  R                  X0R                  R                  5      nU R                  U   " U5      nU R                  X@R                  -     " U5      n[        SU R                  5       H)  nXPR                  X@R                  -  U-      " U5      -  nM+     XPR                  -  nM     [        R
                  R                  U5      nU R                  U5      n[        R                  " U5      nU$ )a  
Converts a spectrogram into a speech waveform.

Args:
    spectrogram (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`):
        Tensor containing the spectrograms.
    global_conditioning (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_size, 1)`, *optional*):
        Tensor containing speaker embeddings, for multispeaker models.

Returns:
    `torch.FloatTensor`: Tensor of shape shape `(batch_size, 1, num_frames)` containing the speech waveform.
r   )r   r  r   r   r   rN   r   r   r   r  r  r   r  r#   r1   )r   r   r   r   r   	res_statejr   s           r(   r   VitsHifiGan.forward  s    k2*)II6I,JJMt))*AMM44]KKD`D`aM NN1-m<Mq+;+;';<]KI1d../^^A0@0@,@1,DEmTT	 0%(8(88M + 00?}5::m,r'   )r  r   r  r   r   r   r  r  r0   )r   r   r    r!   r   r   r   r   r#   r$   r   r   r&   r   r   s   @r(   r   r     s\    "ez "eH&' bf  ,, CKEL]L]C^ 			   r'   r   c                   :   ^  \ rS rSrS\4U 4S jjrSS jrSrU =r$ )VitsResidualCouplingLayeri<  r   c                 >  > [         TU ]  5         UR                  S-  U l        [        R
                  " U R                  UR                  S5      U l        [        XR                  S9U l
        [        R
                  " UR                  U R                  S5      U l        g )Nr\   r   r   )r   r   r   half_channelsr   r   r   r   r    prior_encoder_num_wavenet_layersr   r  r   s     r(   r   "VitsResidualCouplingLayer.__init__=  ss    #--2		$"4"4f6H6H!L"66]6]^6#5#5t7I7I1Mr'   c                    [         R                  " XR                  /S-  SS9u  pVU R                  U5      U-  nU R	                  XrU5      nU R                  U5      U-  n[         R                  " U5      n	U(       dP  X[         R                  " U	5      -  U-  -   n[         R                  " XV/SS9n
[         R                  " U	SS/5      nX4$ Xh-
  [         R                  " U	* 5      -  U-  n[         R                  " XV/SS9n
U
S 4$ )Nr\   r   rX   )
r#   r   r  r   r   r  rJ   rM   catre   )r   rA   r   r   rE   
first_halfsecond_halfr   r   r   rR   log_determinants               r(   r   !VitsResidualCouplingLayer.forwardE  s    "'++f7I7I6JQ6NTU"V
j1L@]BUV~~m,|;%%d+
uyy/D!D|!SSKii 9qAG#ii
QF;O++&-J;1GG,VKii 9qAGD= r'   )r  r   r  r   NFr   r   s   @r(   r  r  <  s    Nz N! !r'   r  c                   :   ^  \ rS rSrS\4U 4S jjrSS jrSrU =r$ )VitsResidualCouplingBlockiW  r   c                    > [         TU ]  5         [        R                  " 5       U l        [        UR                  5       H'  nU R                  R                  [        U5      5        M)     g r0   )	r   r   r   r   flowsr   prior_encoder_num_flowsr   r  r   r   r   r   s      r(   r   "VitsResidualCouplingBlock.__init__X  sH    ]]_
v556AJJ7?@ 7r'   c                     U(       d8  U R                    H&  nU" XU5      u  p[        R                  " US/5      nM(     U$ [        U R                   5       H%  n[        R                  " US/5      nU" XUSS9u  pM'     U$ )Nr   TrE   )r$  r#   flipreversed)r   rA   r   r   rE   flowr   s          r(   r   !VitsResidualCouplingBlock.forward^  s}    

 7JK	FQC0 #  !,FQC0 7JTXY	 - r'   )r$  r   r   r   s   @r(   r"  r"  W  s    Az A	 	r'   r"  c                   >   ^  \ rS rSrSS\4U 4S jjjrSS jrSrU =r$ )VitsDilatedDepthSeparableConvij  r   c                 N  > [         TU ]  5         UR                  nUR                  nUR                  U l        [        R                  " U5      U l        [        R                  " 5       U l
        [        R                  " 5       U l        [        R                  " 5       U l        [        R                  " 5       U l        [        U R
                  5       H  nX5-  nX6-  U-
  S-  nU R                  R                  [        R                   " UUUUUUS95        U R                  R                  [        R                   " XDS5      5        U R                  R                  [        R"                  " U5      5        U R                  R                  [        R"                  " U5      5        M     g )Nr\   )r   r   r   groupsr   r   r   )r   r   duration_predictor_kernel_sizer   depth_separable_num_layersr   r   r   r   r   convs_dilatedconvs_pointwisenorms_1norms_2r   r   r   	LayerNorm)	r   r   dropout_rater   r   r   r   r   r   s	           r(   r   &VitsDilatedDepthSeparableConv.__init__k  s,   ;;%% ;;zz,/]]_!}}}}}}t'A"~H"-8Q>G%%		 (!) +#%#	   ''		(a(HILLX 67LLX 67 (r'   c                 "   Ub  X-   n[        U R                  5       H  nU R                  U   " X-  5      nU R                  U   " UR	                  SS5      5      R	                  SS5      n[
        R                  R                  U5      nU R                  U   " U5      nU R                  U   " UR	                  SS5      5      R	                  SS5      n[
        R                  R                  U5      nU R                  U5      nX-   nM     X-  $ Nr   r?   )r   r   r4  r6  	transposer   rN   gelur5  r7  r   )r   rA   r   r   r   r   s         r(   r   %VitsDilatedDepthSeparableConv.forward  s    *1Ft'A ..q1&2GHM LLOM,C,CAr,JKUUVWY[\MMM..}=M 003MBM LLOM,C,CAr,JKUUVWY[\MMM..}=M LL7M+F ( $$r'   )r4  r5  r   r6  r7  r   )r@   r0   r   r   s   @r(   r/  r/  j  s    8z 8 88% %r'   r/  c                   :   ^  \ rS rSrS\4U 4S jjrSS jrSrU =r$ )VitsConvFlowi  r   c                   > [         TU ]  5         UR                  U l        UR                  S-  U l        UR                  U l        UR                  U l	        [        R                  " U R
                  U R                  S5      U l        [        U5      U l        [        R                  " U R                  U R
                  U R                  S-  S-
  -  S5      U l        g )Nr\   r   r	   )r   r   r   filter_channelsdepth_separable_channelsr  duration_predictor_flow_binsrm   duration_predictor_tail_boundrF   r   r   r   r/  conv_ddsr   r   s     r(   r   VitsConvFlow.__init__  s    %11#<<A;; >>		$"4"4d6J6JAN5f=4#7#79K9Kt}}_`O`cdOd9eghir'   c           	         [         R                  " XR                  /S-  SS9u  pVU R                  U5      nU R	                  XrU5      nU R                  U5      U-  nUR                  u  pn
UR                  XSU
5      R                  SSSS5      nUSS U R                  24   [        R                  " U R                  5      -  nUSU R                  SU R                  -  24   [        R                  " U R                  5      -  nUSSU R                  -  S 24   n[        UUUUUU R                  S9u  pn[         R                  " XV/SS9U-  nU(       d  [         R                   " X-  SS/5      nUU4$ US 4$ )	Nr\   r   rX   r?   r   r	   .)rE   rF   )r#   r   r  r   rG  r   ra   reshapepermuterm   mathrj   rC  rU   rF   r  re   )r   rA   r   r   rE   r  r  r   
batch_sizer   lengthrB   rC   rD   rS   rR   r  s                    r(   r   VitsConvFlow.forward  s}   "'++f7I7I6JQ6NTU"V
j1mCVW}5D'1'7'7$
f%--jBOWWXY[\^_abc+C4==,@ADIIdNbNbDcc,S$--!dmmBS2S-STW[W`W`aeauauWvv#0a$--6G6I1I#J #K $$
  ))Z51=L#ii(BQFKOO++D= r'   )rG  r   r   rC  r  rm   rF   r   r   r   s   @r(   rA  rA    s    	jz 	j! !r'   rA  c                   :   ^  \ rS rSrS\4U 4S jjrSS jrSrU =r$ )VitsElementwiseAffinei  r   c                 ,  > [         TU ]  5         UR                  U l        [        R
                  " [        R                  " U R                  S5      5      U l        [        R
                  " [        R                  " U R                  S5      5      U l	        g Nr   )
r   r   rD  r   r   	Parameterr#   zeros	translate	log_scaler   s     r(   r   VitsElementwiseAffine.__init__  sY    77ekk$--&CDekk$--&CDr'   c                 8   U(       d]  U R                   [        R                  " U R                  5      U-  -   nXR-  n[        R                  " U R                  U-  SS/5      nXV4$ XR                   -
  [        R                  " U R                  * 5      -  U-  nUS 4$ Nr   r\   )rV  r#   rM   rW  re   )r   rA   r   r   rE   rR   r  s          r(   r   VitsElementwiseAffine.forward  s    nnuyy'@6'IIG,G#ii(E1vNO++.%))T^^O2LL|[GD= r'   )r   rW  rV  r   r   r   s   @r(   rQ  rQ    s    Ez E! !r'   rQ  c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )VitsStochasticDurationPredictori  c                   > [         TU ]  5         UR                  nUR                  n[        R
                  " X3S5      U l        [        R
                  " X3S5      U l        [        UUR                  S9U l
        US:w  a  [        R
                  " X#S5      U l        [        R                  " 5       U l        U R                  R                  [        U5      5        [!        UR"                  5       H'  nU R                  R                  [%        U5      5        M)     [        R
                  " SUS5      U l        [        R
                  " X3S5      U l        [        UUR                  S9U l        [        R                  " 5       U l        U R,                  R                  [        U5      5        [!        UR"                  5       H'  nU R,                  R                  [%        U5      5        M)     g )Nr   )r9  r   )r   r   r   r   r   r   r   r   r/  duration_predictor_dropoutrG  r  r   r$  r   rQ  r   duration_predictor_num_flowsrA  post_conv_prepost_conv_projpost_conv_dds
post_flows)r   r   	embed_dimrC  r   r   s        r(   r   (VitsStochasticDurationPredictor.__init__  sb   11	 ,,		/AF?QG5::

 >		)a@DI]]_


/78v::;AJJl623 <  YYq/1= ii!L:::

 --/4V<=v::;AOO""<#78 <r'   c                    [         R                  " U5      nU R                  U5      nUb)  [         R                  " U5      nXR                  U5      -   nU R	                  X5      nU R                  U5      U-  nU(       Gd  U R                  U5      nU R                  Xr5      nU R                  U5      U-  n[         R                  " UR                  S5      SUR                  S5      5      R                  UR                  UR                  S9U-  nSn	Un
U R                   H*  nU" XX-   S9u  p[         R                  " U
S/5      n
X-  n	M,     [         R                   " U
SS/SS9u  pU	[         R"                  " [$        R&                  R)                  U5      [$        R&                  R)                  U* 5      -   U-  SS/5      -  n	[         R"                  " S[*        R,                  " S[*        R.                  -  5      US-  -   -  U-  SS/5      U	-
  nU[         R0                  " U5      -
  U-  n[         R,                  " [         R2                  " US5      5      U-  n[         R"                  " U* SS/5      n[         R4                  " X/SS9nU R6                   H*  nU" UX!S9u  nn[         R                  " US/5      nUU-  nM,     [         R"                  " S	[*        R,                  " S[*        R.                  -  5      US-  -   -  U-  SS/5      U-
  nUU-   $ [9        [;        U R6                  5      5      nUS S
 US   /-   n[         R                  " UR                  S5      SUR                  S5      5      R                  UR                  UR                  S9U-  nU H&  n[         R                  " US/5      nU" UX!SS9u  nnM(     [         R                   " USS/SS9u  nnU$ )Nr   r\   )devicedtype)r   r   rX         gh㈵>g      ?r?   T)r   rE   )r#   detachr   r  rG  r   ra  rc  rb  randnsizetorh  ri  rd  r*  r   re   r   rN   
logsigmoidrL  rL   pir2   	clamp_minr  r$  listr+  )r   rA   r   r   	durationsrE   noise_scaler   random_posteriorlog_determinant_posterior_sumlatents_posteriorr,  r  r  r  logqlog_determinant_sumlatentsnllr$  r   log_durations                         r(   r   'VitsStochasticDurationPredictor.forward  s   f%v&*"',,/B"Cii(;<<Fv4',6 ..y9M ..}KM //>MM INN1-q)..2CDGGv}}djdpdpGq  -.) 059%I_62! %*JJ/@1#$F!-@- ( ',kk2CaVQR&S#J)UYY))*58P8PR\Q\8]]ammpqstou. ) 		$$((1tww;"7;KQ;N"OPS__bcefagh/0 
 $emmJ&??<OJ5??:t#DETJ"'))ZK!Q"@ii 9qAG

+/+b(**Wqc2#6# #
 ))C488AK#8GQJ#GH<WZ[]^Y_`cvvC:$**-.E#2J%),E FKKNAv{{1~>AA^d^j^jAk  **Wqc2!'<]ab
  $kk'Aq6qAOL!r'   )	r  rG  r   r   r$  rc  ra  rb  rd  )NNFrW   r   r   r    r!   r   r   r&   r   r   s   @r(   r]  r]    s    9@@  @ r'   r]  c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )VitsDurationPredictori9  c                 p  > [         TU ]  5         UR                  nUR                  n[        R
                  " UR                  5      U l        [        R                  " UR                  X2US-  S9U l
        [        R                  " X1R                  S9U l        [        R                  " X3X"S-  S9U l        [        R                  " X1R                  S9U l        [        R                  " USS5      U l        UR"                  S:w  a2  [        R                  " UR"                  UR                  S5      U l        g g )Nr\   )r   epsr   r   )r   r   r2  "duration_predictor_filter_channelsr   r   r_  r   r   r   conv_1r8  layer_norm_epsnorm_1conv_2norm_2projr   r  )r   r   r   rC  r   s       r(   r   VitsDurationPredictor.__init__:  s    ;; CCzz&"C"CDii 2 2OZeijZjkll?8M8MNii+fgWghll?8M8MNIIoq!4	((A-		&"?"?ASASUVWDI .r'   c                 `   [         R                  " U5      nUb)  [         R                  " U5      nXR                  U5      -   nU R                  X-  5      n[         R                  " U5      nU R                  UR                  SS5      5      R                  SS5      nU R                  U5      nU R                  X-  5      n[         R                  " U5      nU R                  UR                  SS5      5      R                  SS5      nU R                  U5      nU R                  X-  5      nX-  $ r<  )r#   rl  r  r  relur  r=  r   r  r  r  )r   rA   r   r   s       r(   r   VitsDurationPredictor.forwardI  s    f%*"',,/B"Cii(;<<FV23F#V--a45??2Ff%V23F#V--a45??2Ff%601$$r'   )r  r  r  r   r  r  r  r0   r  r   s   @r(   r  r  9  s    X% %r'   r  c                   8  ^  \ rS rSrSrS\4U 4S jjrS\R                  S\	S\	4S jr
    SS	\R                  S
\\R                     S\\R                     S\\R                     S\S\\R                  \\R                     4   4S jjrS rS rS rSrU =r$ )VitsAttentioni^  z?Multi-headed attention with relative positional representation.r   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR                  U l        U R                  U R
                  -  U l	        U R                  S-  U l
        U R                  U R
                  -  U R                  :w  a&  [        SU R                   SU R
                   S35      e[        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        U R                  (       a  [        R&                  " [(        R*                  " SU R                  S-  S-   U R                  5      U R                  -  5      U l        [        R&                  " [(        R*                  " SU R                  S-  S-   U R                  5      U R                  -  5      U l        g g )Nrj  zIhidden_size must be divisible by num_attention_heads (got `hidden_size`: z and `num_attention_heads`: z).)r   r   r\   )r   r   r   re  num_attention_heads	num_headsattention_dropoutr   window_sizehead_dimscalingr`   r   Linearuse_biask_projv_projq_projout_projrT  r#   rm  	emb_rel_k	emb_rel_vr   s     r(   r   VitsAttention.__init__a  s   ++33//!--$..8}}d*MMDNN*t~~=[\`\j\j[k.t~~.>bB 
 iiV__UiiV__UiiV__U		$..$..vW\\%++a9I9IA9MPQ9QSWS`S`*adhdpdp*pqDN\\%++a9I9IA9MPQ9QSWS`S`*adhdpdp*pqDN r'   tensorseq_lenbszc                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ rZ  )viewr  r  r=  
contiguous)r   r  r  r  s       r(   _shapeVitsAttention._shapez  s5    {{3GQQRSUVWbbddr'   r   key_value_statesattention_masklayer_head_maskoutput_attentionsr  c                 0	   UR                  5       u  pgnU R                  U5      U R                  -  n	U R                  U R	                  U5      SU5      n
U R                  U R                  U5      SU5      nX`R                  -  SU R                  4nU R                  XU5      R                  " U6 n	U
R                  " U6 n
UR                  " U6 nU
R                  S5      n[        R                  " XR                  SS5      5      nUR                  5       X`R                  -  X}4:w  a-  [        SX`R                  -  X}4 SUR                  5        35      eU R                  bX  U R                  U R                  U5      n[        R                   " XR                  SS5      5      nU R#                  U5      nUU-  nUbv  UR                  5       USX}4:w  a"  [        SUSX}4 SUR                  5        35      eUR                  X`R                  X}5      U-   nUR                  X`R                  -  X}5      n[$        R&                  R)                  USS	9nUb  UR                  5       U R                  4:w  a*  [        S
U R                  4 SUR                  5        35      eUR                  SSSS5      UR                  X`R                  X}5      -  nUR                  X`R                  -  X}5      nU(       a;  UR                  X`R                  X}5      nUR                  X`R                  -  X}5      nOSn[$        R&                  R+                  XR*                  U R,                  S9n[        R                  " UU5      nUR                  5       X`R                  -  XpR                  4:w  a5  [        SX`R                  XpR                  4 SUR                  5        35      eU R                  bI  U R                  U R.                  U5      nU R1                  U5      n[        R                   " UU5      nUU-  nUR                  X`R                  XpR                  5      nUR                  SS5      nUR3                  XgU R4                  5      nU R7                  U5      nUU4$ )z#Input shape: Batch x Time x Channelr?   r   r\   z$Attention weights should be of size z	, but is Nrk  z!Attention mask should be of size rX   z/Head mask for a single layer should be of size )ptrainingz `attn_output` should be of size )rn  r  r  r  r  r  r  r  r  r#   bmmr=  r`   r  _get_relative_embeddingsr  matmul'_relative_position_to_absolute_positionr   rN   rb   r   r  r  '_absolute_position_to_relative_positionrJ  re  r  )r   r   r  r  r  r  r  tgt_lenr   query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightskey_relative_embeddingsrelative_logitsrel_pos_biasattn_weights_reshaped
attn_probsattn_outputvalue_relative_embeddingsrelative_weightss                          r(   r   VitsAttention.forward}  sR    (,,.a {{=1DLL@ [[]!;RE
{{4;;}#=r3GNN*B>
{{<#>CCZP__j1
#((*5//!$yy/C/CAq/IJ3#7"JJ6nn8Lg7_6` a %%'(* 
 '&*&C&CDNNT[&\##ll<9Z9Z[]_a9bcOGGXLL(L%""$a(BB 7a8R7SS\]k]p]p]r\st  (,,S..'SVddL',,S>>-A7TL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVdVdfm?wwL',,S>>-A7TL
 %1$5$5c>>7$\!055cNN6JG]L$(!]]**<<<RVR_R_*`
ii
L9#"6!OO2CR_R_3`2a b$$&') 
 '(,(E(EdnnV](^%#KKJW <<(8:STL<'K!&&sNNG]]S!++Aq1 "))#GmmK0111r'   c           	          [        X R                  S-   -
  S5      nUS:  a%  [        R                  R	                  USSX3SS/5      n[        U R                  S-   U-
  S5      nUSU-  -   S-
  nUS S 2XE24   $ )Nr   r   r\   )r_   r  r   rN   r<   )r   relative_embeddingsrN  
pad_lengthslice_start_positionslice_end_positions         r(   r  &VitsAttention._get_relative_embeddings  s    #3#3a#78!<
>"$--"3"34G!QPZhiklIm"n"D$4$4q$8F#BAF1AJ>B"1&:&M#MNNr'   c                 H   UR                  5       u  p#n[        R                  R                  U/ SQ5      nUR	                  X#S-  U-  /5      n[        R                  R                  USUS-
  SS/5      nUR	                  X#S-   SU-  S-
  /5      nUS S 2S U2US-
  S 24   nU$ )N)r   r   r   r   r   r   r\   r   r   rn  r   rN   r<   r  r   xbatch_headsrN  r   x_flatx_finals          r(   r  5VitsAttention._relative_position_to_absolute_position  s    !"Q MMa!34 qj6&9:;""6Avz1a+@A ++{QJF
QGH!WfWfqjl23r'   c           	      >   UR                  5       u  p#n[        R                  R                  USUS-
  SSSS/5      nUR	                  X#SU-  S-
  -  /5      n[        R                  R                  XSSSS/5      nUR	                  X#SU-  /5      S S 2S S 2SS 24   nU$ )Nr   r   r\   r  r  s          r(   r  5VitsAttention._absolute_position_to_relative_position  s    !"Q MMa!VaZAq!!<=F
Q&?@A ""6Aq!+<=++{AJ?@AqrJr'   )r   r  r  re  r  r  r  r  r  r  r  r  )NNNF)r   r   r    r!   r"   r   r   r#   Tensorr   r  r   boolr   r   r  r  r  r&   r   r   s   @r(   r  r  ^  s    Irz r2eU\\ eC ec e 481526"'`2||`2 #5<<0`2 !.	`2
 "%,,/`2  `2 
u||Xell33	4`2DO
 
r'   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )VitsFeedForwardi  c                 t  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  5      U l        [        R                  " UR
                  UR                  UR                  5      U l        [        R                  " UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        OUR                  U l        UR                  S:  a.  UR                  S-
  S-  nUR                  S-  nX#SSSS/U l        g S U l        g )Nr   r\   r   )r   r   r   r   r   ffn_dimffn_kernel_sizer  r  r   activation_dropoutr   
isinstance
hidden_actstrr
   act_fnr   )r   r   pad_left	pad_rightr   s       r(   r   VitsFeedForward.__init__  s    ii 2 2FNNFDZDZ[ii0B0BFDZDZ[zz&";";<f''-- !2!23DK ++DK!!A%..2q8H..!3I$Aq!<DLDLr'   c                    UR                  SSS5      nUR                  SSS5      nX-  nU R                  b)  [        R                  R	                  XR                  5      nU R                  U5      nU R                  U5      nU R                  U5      nX-  nU R                  b)  [        R                  R	                  XR                  5      nU R                  U5      nX-  nUR                  SSS5      nU$ )Nr   r\   r   )	rK  r   r   rN   r<   r  r  r   r  )r   r   r   s      r(   r   VitsFeedForward.forward  s    %--aA6#++Aq!4%4<<#MM--m\\JMM2M2]3%4<<#MM--m\\JMM2%4%--aA6r'   )r  r  r  r   r   r  r   s   @r(   r  r    s     $ r'   r  c            	          ^  \ rS rSrS\4U 4S jjr  S
S\R                  S\R                  S\	\R                     S\
4S jjrS	rU =r$ )VitsEncoderLayeri.  r   c                 d  > [         TU ]  5         [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        U5      U l        [        R                  " UR                  UR                  S9U l        g )Nr  )r   r   r  	attentionr   r   hidden_dropoutr   r8  r   r  
layer_normr  feed_forwardfinal_layer_normr   s     r(   r   VitsEncoderLayer.__init__/  sz    &v.zz&"7"78,,v'9'9v?T?TU+F3 "V-?-?VEZEZ [r'   r   r   r  r  c                    UnU R                  UUUS9u  pU R                  U5      nU R                  XQ-   5      nUnU R                  X5      nU R                  U5      nU R	                  XQ-   5      nU4nU(       a  Xv4-  nU$ )N)r   r  r  )r  r   r  r  r  )r   r   r   r  r  r   r  rR   s           r(   r   VitsEncoderLayer.forward7  s     !&*nn')/ '5 '
# ]3(@A ))-F]3--h.FG "&Gr'   )r  r   r  r  r  r   )r   r   r    r!   r   r   r#   r  r$   r   r  r   r&   r   r   s   @r(   r  r  .  s\    \z \ 26"'|| '' !.	
   r'   r  c                      ^  \ rS rSrS\4U 4S jjr    SS\R                  S\R                  S\\R                     S\\
   S\\
   S	\\
   S
\\\4   4S jjrSrU =r$ )VitsEncoderiU  r   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        UR                  U l
        g s  snf r   )r   r   r   r   r   r   num_hidden_layersr  layersgradient_checkpointing	layerdropr&  s      r(   r   VitsEncoder.__init__V  s`    mmuVMeMeGf$gGf!%5f%=Gf$gh&+#)) %hs   A7r   r   r  r  output_hidden_statesreturn_dictr  c                    U(       a  SOS nU(       a  SOS nUb  [        X1R                  5      nX-  n[        5       =(       d    [        U 5      n	U R                   H  n
U(       a  Xq4-   n[
        R                  R                  SS5      nU R                  =(       a    XR                  :  nU(       a  U	(       aP  U R                  (       a1  U R                  (       a   U R                  U
R                  UUUU5      nO	U
" UUUUS9nUS   nU(       a  SnU(       d  M  UWS   4-   nM     X-  nU(       a  Xq4-   nU(       d  [        S XU4 5       5      $ [        UUUS9$ )Nr   r   r   )r  r   r  )NNc              3   .   #    U  H  oc  M  Uv   M     g 7fr0   r   ).0vs     r(   	<genexpr>&VitsEncoder.forward.<locals>.<genexpr>  s     m$[q$[s   	)r,   r   r   )r   ri  r   r   r  rK   randomuniformr  r  r  _gradient_checkpointing_func__call__tupler   )r   r   r   r  r  r  r  all_hidden_statesall_self_attentionssynced_gpusencoder_layerdropout_probabilityskip_the_layerlayer_outputss                 r(   r   VitsEncoder.forward]  s_    #7BD$5b4 %7H[H[\N%402R6LT6R![[M#$58H$H! #%))"3"3Aq"9!]]U0Cnn0TN![..4==$($E$E%..%$&)%M %2%'5%1*;	%M !.a 0 ,  &9]1=M<O&O#? )B &4 14D Dm]GZ$[mmm++*
 	
r'   )r   r  r  r  )NNNN)r   r   r    r!   r   r   r#   r$   r   r  r  r   r   r   r   r&   r   r   s   @r(   r  r  U  s    *z * 26,0/3&*B
((B
 ''B
 !.	B

 $D>B
 'tnB
 d^B
 
uo%	&B
 B
r'   r  c                      ^  \ rS rSrSrS\4U 4S jjrS rS r    SS\	R                  S\	R                  S	\\	R                     S
\\   S\\   S\\   S\\\	R                     \4   4S jjrSrU =r$ )VitsTextEncoderi  zk
Transformer encoder that uses relative positional representation instead of absolute positional encoding.
r   c                 (  > [         TU ]  5         Xl        [        R                  " UR
                  UR                  UR                  5      U l        [        U5      U l
        [        R                  " UR                  UR                  S-  SS9U l        g )Nr\   r   )r   )r   r   r   r   	Embedding
vocab_sizer   pad_token_idembed_tokensr  encoderr   r   projectr   s     r(   r   VitsTextEncoder.__init__  sm    LL):):F<N<NPVPcPcd"6*yy!3!3V5E5E5IWXYr'   c                     U R                   $ r0   r  r   s    r(   get_input_embeddings$VitsTextEncoder.get_input_embeddings         r'   c                     Xl         g r0   r  )r   r[   s     r(   set_input_embeddings$VitsTextEncoder.set_input_embeddings  s    !r'   	input_idsr   r  r  r  r  r  c           	         U R                  U5      [        R                  " U R                  R                  5      -  nU R                  UUUUUUS9nU(       d  US   OUR                  n	U R                  U	R                  SS5      5      R                  SS5      U-  n
[        R                  " XR                  R                  SS9u  pU(       d  XU4USS  -   nU$ [        U	UUUR                  UR                  S9$ )N)r   r   r  r  r  r  r   r   r\   rX   )r,   r-   r.   r   r   )r  rL  rj   r   r   r  r,   r  r=  r#   r   r   r*   r   r   )r   r"  r   r  r  r  r  r   encoder_outputsr,   r   r-   r.   rR   s                 r(   r   VitsTextEncoder.forward  s     )))4tyyAXAX7YY,,'%)/!5# ' 
 7BOA.GhGh.88A>?II!QOR^^+0;;ukk>S>SYZ+[((7JKo^_^`NaaGN$/# 3)77&11
 	
r'   )r   r  r  r  )NNNT)r   r   r    r!   r"   r   r   r  r   r#   r  r$   r   r  r   r   r*   r   r&   r   r   s   @r(   r  r    s    Zz Z!" 26,0/3&*#
<<#
 ''#
 !.	#

 $D>#
 'tn#
 d^#
 
uU\\"$99	:#
 #
r'   r  c                   *    \ rS rSr\rSrSrSrS r	Sr
g)VitsPreTrainedModeli  vitsr"  Tc                    [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        g[        U[        R                  5      (       a  [        R                  R                  UR                  5        UR                  bh  [        R                   " UR"                  UR$                  UR&                  S   -  -  5      n[        R                  R)                  UR                  U* US9  gg[        U[        R*                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR,                  b2  UR                  R                  UR,                     R                  5         ggg)zInitialize the weightsr@   )r   stdNrW   r   )r   r   )r  r   r  r   datanormal_r   initializer_ranger   zero_r8  fill_r   initkaiming_normal_rL  rj   r1  r   r   uniform_r  padding_idx)r   moduleks      r(   _init_weights!VitsPreTrainedModel._init_weights  s   fbii((MM&&CT[[5R5R&S{{&  &&( '--KK""$MM$$S)		**GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8 ' --MM&&CT[[5R5R&S!!-""6#5#56<<> . .r'   r   N)r   r   r    r!   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingr6  r&   r   r'   r(   r'  r'    s    L!O&*#?r'   r'  z@
    The complete VITS model, for text-to-speech synthesis.
    )custom_introc                      ^  \ rS rSrS\4U 4S jjrS r\       SS\\	R                     S\\	R                     S\\   S\\   S	\\   S
\\   S\\	R                     S\\\   \4   4S jj5       rSrU =r$ )	VitsModeli  r   c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        U5      U l        UR                  (       a  [        U5      U l        O[        U5      U l        UR                  S:  a0  [        R                  " UR                  UR                   5      U l        [%        U5      U l        UR(                  U l        UR*                  U l        UR,                  U l        U R/                  5         g rS  )r   r   r   r  text_encoderr"  r,  r   decoder"use_stochastic_duration_predictionr]  duration_predictorr  num_speakersr   r  r   embed_speakerr   posterior_encoderspeaking_rateru  noise_scale_duration	post_initr   s     r(   r   VitsModel.__init__  s     +F3-f5	"6*44&Ef&MD#&;F&CD#"!#f.A.A6C`C`!aD "6f!= $11!--$*$?$?! 	r'   c                     U R                   $ r0   )r@  r  s    r(   get_encoderVitsModel.get_encoder  r  r'   r"  r  
speaker_idr  r  r  labelsr  c           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  [	        S5      eU R
                  R                  R                  R                  nUb!  UR                  S5      R                  U5      n	O4[        R                  " U5      R                  S5      R                  U5      n	U R                   R                  S:  a  Ub  SUs=::  a  U R                   R                  :  d(  O  [        SU R                   R                  S-
   S35      e[        U[         5      (       a  [        R"                  " SX0R$                  S	9nU R'                  U5      R                  S5      n
OSn
U R                  UU	UUUUS
9nU(       d  US   OUR(                  nUR+                  SS5      nU	R+                  SS5      n	U(       d  US   OUR,                  nU(       d  US   OUR.                  nU R                   R0                  (       a  U R3                  UU	U
SU R4                  S9nOU R3                  XU
5      nSU R6                  -  n[        R8                  " [        R:                  " U5      U	-  U-  5      n[        R<                  " [        R>                  " USS/5      S5      RA                  5       n[        RB                  " URE                  5       UR                  UR$                  S9nUR                  S5      UR                  S5      :  nUR                  S5      R                  U	R                  5      n[        R                  " U	S5      [        R                  " US5      -  nURF                  u  nnnn[        RH                  " US5      RK                  UU-  S5      n[        RB                  " UUR                  UR$                  S9nUR                  S5      U:  nUR                  UR                  5      RK                  UUU5      nU[L        RN                  RQ                  U/ SQ5      SS2SS24   -
  nUR                  S5      R+                  SS5      U-  n[        RR                  " URU                  S5      U5      R+                  SS5      n[        RR                  " URU                  S5      U5      R+                  SS5      nU[        RV                  " U5      [        R:                  " U5      -  U RX                  -  -   nU R[                  UUU
SS9nUU-  n U R]                  U U
5      n!U!RU                  S5      n!U[^        R`                  " U R                   Rb                  5      -  n"U(       d  U!U"U 4USS -   n#U#$ [e        U!U"U URf                  URh                  S9$ )a'  
speaker_id (`int`, *optional*):
    Which speaker embedding to use. Only used for multispeaker models.
labels (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`, *optional*):
    Float values of target spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
    computation.

Example:

```python
>>> from transformers import VitsTokenizer, VitsModel, set_seed
>>> import torch

>>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
>>> model = VitsModel.from_pretrained("facebook/mms-tts-eng")

>>> inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt")

>>> set_seed(555)  # make deterministic

>>> with torch.no_grad():
...     outputs = model(inputs["input_ids"])
>>> outputs.waveform.shape
torch.Size([1, 45824])
```
Nz&Training of VITS is not supported yet.r?   r   r   z Set `speaker_id` in the range 0-.r   )rn  
fill_valuerh  )r"  r   r  r  r  r  r\   T)rE   ru  rW   )ri  rh  )r   r   r   r   r   r   r	   r)  )r   r   r   r   r   )5r   r  r  use_return_dictNotImplementedErrorr@  r  r   ri  	unsqueezero  r#   	ones_likerD  r`   r  r   fullrh  rE  r,   r=  r-   r.   rB  rC  rH  rG  ceilrM   rr  re   longaranger_   ra   rc   r  r   rN   r<   r  squeezer   ru  r,  rA  rK   prodr   r   r   r   )$r   r"  r  rN  r  r  r  rO  
mask_dtypeinput_padding_maskspeaker_embeddingstext_encoder_outputr   r-   r.   r}  length_scaledurationpredicted_lengthsindicesoutput_padding_mask	attn_maskrM  r   output_lengthinput_lengthcum_durationvalid_indicespadded_indicesattnprior_latentsr{  r   r   r   rR   s$                                       r(   r   VitsModel.forward  s;   J 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]%&NOO&&33::@@
%!/!9!9"!=!@!@!L!&!;!E!Eb!I!L!LZ!X;;##a'J,B
=T[[%=%== #CDKKD\D\_`D`Caab!cdd*c**"ZZTjQ\Q\]
!%!3!3J!?!I!I"!M!%"//+)/!5# 0 
 7B+A.GZGlGl%//15/99!Q?4?)!,EXEdEd<G1!4M`MtMt;;9922"" 55 3 L  22=VhiLT///::eii58JJ\YZ!OOEIIhA,GKPPR ,,0446>O>U>U^o^v^vw%//25F5P5PQR5SS1;;A>AABTBZBZ[ OO$6:U__M`bd=ee	5>__2
A}l||Hb166zL7PRST,,}HNN8??[))!,|;%((9>>z<Yfg&):):=J\)]^_adbdad^d)ee''*44Q:YF ll4<<?K@JJ1aP#ll4<<?<OPZZ[\^_`#e&6&6{&CeiiPcFd&dgkgwgw&ww))M+>@R\`)a 33<<-?@##A&,rwwt{{7Q7Q/RR!1;?BUVWVXBYYGN-#-;;*55
 	
r'   )
r   rA  rC  rE  r,  ru  rH  rF  rG  r@  )NNNNNNN)r   r   r    r!   r   r   rL  r   r   r#   r  r   r  r$   r   r   r   r   r   r&   r   r   s   @r(   r>  r>    s    z 4!  -115$(,0/3&*.2~
ELL)~
 !.~
 SM	~

 $D>~
 'tn~
 d^~
 **+~
 
uSz?*	+~
 ~
r'   r>  )Fg      @MbP?ro  ro  )>r"   rL  dataclassesr   typingr   r   r   r   numpyrK   r#   torch.utils.checkpointr   activationsr
   integrations.deepspeedr   integrations.fsdpr   modeling_attn_mask_utilsr   modeling_outputsr   r   modeling_utilsr   r   r   r   configuration_vitsr   
get_loggerr   loggerr   r*   jitscriptr:   rU   rO   Moduler   r   r   r   r  r"  r/  rA  rQ  r]  r  r  r  r  r  r  r'  r>  __all__r   r'   r(   <module>r     s,     ! . .     ! @ 7 B < - , * 
		H	% :k : :@ :K : :>   G TE%PM5%((// M5`)299 )&;299 ;|U")) Up!		 !6		 &+%BII +%\(!299 (!V!BII !$a bii a H"%BII "%JcBII cL'bii 'T$ryy $NJ
")) J
Z5
bii 5
p ?/ ? ?4 
]
# ]

]
@ -
.r'   