o
    ZŽhòù  ã                   @   sv  d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z
ddlZddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZ e  e!¡Z"eG dd„ deƒƒZ#eG dd„ deƒƒZ$ej%j&dd„ ƒZ'					dCdd„Z(dd„ Z)G dd„ dejj*ƒZ+G dd „ d ej*ƒZ,G d!d"„ d"ej*ƒZ-G d#d$„ d$ej*ƒZ.G d%d&„ d&ej*ƒZ/G d'd(„ d(ej*ƒZ0G d)d*„ d*ej*ƒZ1G d+d,„ d,ej*ƒZ2G d-d.„ d.ej*ƒZ3G d/d0„ d0ej*ƒZ4G d1d2„ d2ej*ƒZ5G d3d4„ d4ej*ƒZ6G d5d6„ d6ej*ƒZ7G d7d8„ d8ej*ƒZ8G d9d:„ d:ej*ƒZ9G d;d<„ d<ej*ƒZ:eG d=d>„ d>eƒƒZ;ed?d@G dAdB„ dBe;ƒƒZ<dBd>gZ=dS )DzPyTorch VITS model.é    N)Ú	dataclass)ÚAnyÚOptionalÚTupleÚUnion)Únné   )ÚACT2FN)Úis_deepspeed_zero3_enabled)Úis_fsdp_managed_module)Ú_prepare_4d_attention_mask)ÚBaseModelOutputÚModelOutput)ÚPreTrainedModel)Úauto_docstringÚloggingé   )Ú
VitsConfigc                   @   sx   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dZeeej  ed< dS )ÚVitsModelOutputaC  
    Describes the outputs for the VITS model, with potential hidden states and attentions.

    Args:
        waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            The final audio waveform predicted by the model.
        sequence_lengths  (`torch.FloatTensor` of shape `(batch_size,)`):
            The length in samples of each element in the `waveform` batch.
        spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
            The log-mel spectrogram predicted at the output of the flow model. This spectrogram is passed to the Hi-Fi
            GAN decoder model to obtain the final audio waveform.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    NÚwaveformÚsequence_lengthsÚspectrogramÚhidden_statesÚ
attentions)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   ÚtorchÚFloatTensorÚ__annotations__r   r   r   r   r   © r!   r!   úU/var/www/auris/lib/python3.10/site-packages/transformers/models/vits/modeling_vits.pyr   '   s   
 r   c                   @   st   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeeej  ed< dZeeej  ed< dS )ÚVitsTextEncoderOutputaa  
    Describes the outputs for the VITS text encoder model, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        prior_means (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            The predicted mean values of the prior distribution for the latent text variables.
        prior_log_variances (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            The predicted log-variance values of the prior distribution for the latent text variables.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    NÚlast_hidden_stateÚprior_meansÚprior_log_variancesr   r   )r   r   r   r   r$   r   r   r   r    r%   r&   r   r   r   r!   r!   r!   r"   r#   H   s   
 r#   c                 C   sT   | | }t  |d d …d |…d d …f ¡}t  |d d …|d …d d …f ¡}|| }|S ©N)r   ÚtanhÚsigmoid)Zinput_aZinput_bZnum_channelsZin_actZt_actZs_actÚactsr!   r!   r"   Úfused_add_tanh_sigmoid_multiplyh   s
     r+   Fç      @çü©ñÒMbP?c	                 C   sÎ   | | k| |k@ }	|	 }
t  | ¡}t  | ¡}t t d| ¡d ¡}tjj|dd}||d< ||d< | |
 ||
< d||
< t| |	 ||	dd…f ||	dd…f ||	dd…f |||||d	\||	< ||	< ||fS )	aô	  
    This transformation represents a monotonically increasing piecewise rational quadratic function. Outside of the
    `tail_bound`, the transform behaves as an identity function.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`, *optional*, defaults to `False`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`, *optional* defaults to 5):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Hidden-states as transformed by the piecewise rational quadratic function with the `tail_bound` limits
            applied.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Logarithm of the absolute value of the determinants corresponding to the `outputs` with the `tail_bound`
            limits applied.
    r   )r   r   )Úpad©.r   ©.éÿÿÿÿç        N)	ÚinputsÚunnormalized_widthsÚunnormalized_heightsÚunnormalized_derivativesÚreverseÚ
tail_boundÚmin_bin_widthÚmin_bin_heightÚmin_derivative)	r   Ú
zeros_likeÚnpÚlogÚexpr   Ú
functionalr.   Ú_rational_quadratic_spline)r3   r4   r5   r6   r7   r8   r9   r:   r;   Zinside_interval_maskZoutside_interval_maskÚoutputsÚlog_abs_detÚconstantr!   r!   r"   Ú(_unconstrained_rational_quadratic_splineq   s,   .

÷rE   c	           *      C   s  |}	| }
t  | ¡|
k st  | ¡|	krtdƒ‚|jd }|| dkr,td|› d|› ƒ‚|| dkr<td|› d|› ƒ‚tjj|dd}|d||  |  }t j|dd}tjj	|d	d
dd}|	|
 | |
 }|
|d< |	|d< |ddd…f |ddd…f  }|tj 
|¡ }tjj|dd}|d||  |  }t j|dd}tjj	|d	d
dd}|	|
 | |
 }|
|d< |	|d< |ddd…f |ddd…f  }|rÎ|n|}|d  d7  < t j| d |kddd }|d }| d|¡d }| d|¡d }| d|¡d }|| }| d|¡d }| d|¡d }|ddd…f  d|¡d }| d|¡d }|| d|  }|s†| | | }|d|  }||| d¡ ||   }|||  }|||  } | d¡|| d¡ d| |  |d|  d¡   }!t  |!¡dt  |¡  }"| |"fS | | }#|#| }$|||  |$ }%|| |$ }&| |# }'|& d¡d|% |'  }(|(dk ¡ sºtd|(› ƒ‚d|' |& t  |(¡  })|)| | } |)d|)  }|||  }| d¡||) d¡ d| |  |d|)  d¡   }!t  |!¡dt  |¡  }"| |" fS )a(	  
    This transformation represents a monotonically increasing piecewise rational quadratic function. Unlike the
    function `_unconstrained_rational_quadratic_spline`, the function behaves the same across the `tail_bound`.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Hidden-states as transformed by the piecewise rational quadratic function.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Logarithm of the absolute value of the determinants corresponding to the `outputs`.
    z-Input to a transform is not within its domainr1   ç      ð?zMinimal bin width z" too large for the number of bins zMinimal bin height ©Údimr   )r   r   rD   r2   )r.   ÚmodeÚvaluer/   r0   .Ngíµ ÷Æ°>).Né   é   r   zinvalid discriminant )r   ÚminÚmaxÚ
ValueErrorÚshaper   r@   ÚsoftmaxÚcumsumr.   ZsoftplusÚsumÚgatherÚpowr>   ÚallÚRuntimeErrorÚsqrt)*r3   r4   r5   r6   r7   r8   r9   r:   r;   Úupper_boundÚlower_boundÚnum_binsÚwidthsZ	cumwidthsZderivativesZheightsZ
cumheightsZbin_locationsZbin_idxZinput_cumwidthsZinput_bin_widthsZinput_cumheightsÚdeltaZinput_deltaZinput_derivativesZinput_derivatives_plus_oneZinput_heightsZintermediate1ÚthetaZtheta_one_minus_thetaÚ	numeratorÚdenominatorrB   Zderivative_numeratorrC   Zintermediate2Zintermediate3ÚaÚbÚcZdiscriminantÚrootr!   r!   r"   rA   »   s–   ,
  
ÿþÿ

ÿþÿ
rA   c                       s8   e Zd Zdedef‡ fdd„Zd
dd„Zdd	„ Z‡  ZS )ÚVitsWaveNetÚconfigÚ
num_layersc                    sB  t ƒ  ¡  |j| _|| _tj ¡ | _tj ¡ | _t 	|j
¡| _ttjjdƒr,tjjj}ntjj}|jdkrJtj |jd|j | d¡}||dd| _t|ƒD ]P}|j| }|j| | d }tjj|jd|j |j||d}||dd}| j |¡ ||d k r†d|j }	n|j}	tj |j|	d¡}
||
dd}
| j |
¡ qNd S )NÚweight_normr   rK   r   Úweight)Úname)Úin_channelsÚout_channelsÚkernel_sizeÚdilationÚpadding)ÚsuperÚ__init__Úhidden_sizerg   r   r   Ú
ModuleListÚ	in_layersÚres_skip_layersÚDropoutZwavenet_dropoutÚdropoutÚhasattrÚutilsÚparametrizationsrh   Úspeaker_embedding_sizeÚConv1dÚ
cond_layerÚrangeZwavenet_dilation_rateZwavenet_kernel_sizeÚappend)Úselfrf   rg   rh   r}   Úirn   ro   Zin_layerZres_skip_channelsZres_skip_layer©Ú	__class__r!   r"   rq   D  s>   


ûëzVitsWaveNet.__init__Nc                 C   s  t  |¡}t  | jg¡}|d ur|  |¡}t| jƒD ]p}| j| |ƒ}|d urA|d | j }|d d …||d| j  …d d …f }	nt  |¡}	t||	|d ƒ}
|  	|
¡}
| j
| |
ƒ}|| jd k r†|d d …d | j…d d …f }|| | }||d d …| jd …d d …f  }q|| }q|| S )NrK   r   r   )r   r<   Z	IntTensorrr   r}   r~   rg   rt   r+   rw   ru   )r€   r3   Úpadding_maskÚglobal_conditioningrB   Znum_channels_tensorr   r   Zcond_offsetZglobal_statesr*   Zres_skip_actsZres_actsr!   r!   r"   Úforwardm  s&   

&

"
zVitsWaveNet.forwardc                 C   sR   | j dkrtjj | j¡ | jD ]	}tjj |¡ q| jD ]	}tjj |¡ qd S )Nr   )r{   r   r   ry   Úremove_weight_normr}   rt   ru   ©r€   Úlayerr!   r!   r"   r‡   Š  s   


ÿzVitsWaveNet.remove_weight_normr'   )	r   r   r   r   Úintrq   r†   r‡   Ú__classcell__r!   r!   r‚   r"   re   C  s    
)re   c                       s,   e Zd Zdef‡ fdd„Zddd„Z‡  ZS )ÚVitsPosteriorEncoderrf   c                    sR   t ƒ  ¡  |j| _t |j|jd¡| _t	||j
d| _t |j| jd d¡| _d S )Nr   ©rg   rK   )rp   rq   Ú	flow_sizerl   r   r|   Zspectrogram_binsrr   Úconv_prere   Z$posterior_encoder_num_wavenet_layersÚwavenetÚ	conv_proj©r€   rf   r‚   r!   r"   rq   ”  s
   
zVitsPosteriorEncoder.__init__Nc                 C   sf   |   |¡| }|  |||¡}|  |¡| }tj|| jdd\}}|t |¡t |¡  | }|||fS )Nr   rG   )r   r   r‘   r   Úsplitrl   Ú
randn_liker?   )r€   r3   r„   r…   ÚstatsÚmeanÚ
log_stddevZsampledr!   r!   r"   r†   œ  s   
zVitsPosteriorEncoder.forwardr'   ©r   r   r   r   rq   r†   r‹   r!   r!   r‚   r"   rŒ   “  ó    rŒ   c                       s@   e Zd Zd‡ fdd„	Zddd„Zd	d
„ Zdd„ Zdd„ Z‡  ZS )ÚHifiGanResidualBlockr   ©r   r   é   çš™™™™™¹?c                    sb   t ƒ  ¡  |ˆ_t ‡ ‡‡‡fdd„ttˆƒƒD ƒ¡ˆ_t ‡ ‡‡fdd„ttˆƒƒD ƒ¡ˆ_d S )Nc                    s2   g | ]}t jˆ ˆ ˆd ˆ| ˆ ˆˆ| ¡d‘qS ©r   )Ústridern   ro   ©r   r|   Úget_padding)Ú.0r   ©Úchannelsrn   rm   r€   r!   r"   Ú
<listcomp>¬  s    	øúÿz1HifiGanResidualBlock.__init__.<locals>.<listcomp>c                    s*   g | ]}t jˆ ˆ ˆd d ˆ ˆd ¡d‘qS rž   r    ©r¢   Ú_)r¤   rm   r€   r!   r"   r¥   ¹  s    	ø
úÿ)	rp   rq   Úleaky_relu_sloper   rs   r~   ÚlenÚconvs1Úconvs2)r€   r¤   rm   rn   r¨   r‚   r£   r"   rq   §  s   

	÷ÿ
	÷
ÿzHifiGanResidualBlock.__init__r   c                 C   s   || | d S )NrK   r!   )r€   rm   rn   r!   r!   r"   r¡   Æ  s   z HifiGanResidualBlock.get_paddingc                 C   sL   t jj}tt jjdƒrt jjj}| jD ]}||ƒ q| jD ]}||ƒ qd S ©Nrh   )r   ry   rh   rx   rz   rª   r«   ©r€   rh   r‰   r!   r!   r"   Úapply_weight_normÉ  ó   




ÿz&HifiGanResidualBlock.apply_weight_normc                 C   s4   | j D ]}tj |¡ q| jD ]}tj |¡ qd S r'   )rª   r   ry   r‡   r«   rˆ   r!   r!   r"   r‡   Ó  s
   

ÿz'HifiGanResidualBlock.remove_weight_normc                 C   sX   t | j| jƒD ]"\}}|}tj || j¡}||ƒ}tj || j¡}||ƒ}|| }q|S r'   )Úziprª   r«   r   r@   Ú
leaky_relur¨   )r€   r   Zconv1Zconv2Úresidualr!   r!   r"   r†   Ù  s   
zHifiGanResidualBlock.forward)r   r›   r   ©r   )	r   r   r   rq   r¡   r®   r‡   r†   r‹   r!   r!   r‚   r"   rš   ¦  s    

rš   c                       sV   e Zd Zdef‡ fdd„Zdd„ Zdd„ Z	dd	ejd
e	ej dejfdd„Z
‡  ZS )ÚVitsHifiGanrf   c              
      sF  t ƒ  ¡  || _t|jƒ| _t|jƒ| _tj	|j
|jdddd| _t ¡ | _tt|j|jƒƒD ]$\}\}}| j tj|jd|  |jd|d   |||| d d¡ q/t ¡ | _tt| jƒƒD ]#}|jd|d   }t|j|jƒD ]\}}| j t||||jƒ¡ qrq`tj	|dddddd| _|jdkr¡t 	|j|jd¡| _d S d S )	Né   r   r   )rm   rŸ   ro   rK   F)rm   rŸ   ro   Úbiasr   )rp   rq   rf   r©   Zresblock_kernel_sizesÚnum_kernelsÚupsample_ratesÚnum_upsamplesr   r|   rŽ   Zupsample_initial_channelr   rs   Ú	upsamplerÚ	enumerater°   Zupsample_kernel_sizesr   ZConvTranspose1dÚ	resblocksr~   Zresblock_dilation_sizesrš   r¨   Ú	conv_postr{   Úcond)r€   rf   r   Zupsample_raterm   r¤   rn   r‚   r!   r"   rq   å  s@   
û

ûÿ

ÿ
ÿzVitsHifiGan.__init__c                 C   sL   t jj}tt jjdƒrt jjj}| jD ]}||ƒ q| jD ]}| ¡  qd S r¬   )r   ry   rh   rx   rz   rº   r¼   r®   r­   r!   r!   r"   r®   	  r¯   zVitsHifiGan.apply_weight_normc                 C   s0   | j D ]}tj |¡ q| jD ]}| ¡  qd S r'   )rº   r   ry   r‡   r¼   rˆ   r!   r!   r"   r‡     s
   


ÿzVitsHifiGan.remove_weight_normNr   r…   Úreturnc                 C   sÀ   |   |¡}|dur||  |¡ }t| jƒD ]8}tj || jj¡}| j	| |ƒ}| j
|| j  |ƒ}td| jƒD ]}|| j
|| j |  |ƒ7 }q7|| j }qtj |¡}|  |¡}t |¡}|S )aG  
        Converts a spectrogram into a speech waveform.

        Args:
            spectrogram (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`):
                Tensor containing the spectrograms.
            global_conditioning (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_size, 1)`, *optional*):
                Tensor containing speaker embeddings, for multispeaker models.

        Returns:
            `torch.FloatTensor`: Tensor of shape shape `(batch_size, 1, num_frames)` containing the speech waveform.
        Nr   )r   r¾   r~   r¹   r   r@   r±   rf   r¨   rº   r¼   r·   r½   r   r(   )r€   r   r…   r   r   Z	res_stateÚjr   r!   r!   r"   r†     s   


zVitsHifiGan.forwardr'   )r   r   r   r   rq   r®   r‡   r   r   r   r†   r‹   r!   r!   r‚   r"   r´   ä  s    $
ÿÿÿþr´   c                       ó,   e Zd Zdef‡ fdd„Zddd„Z‡  ZS )	ÚVitsResidualCouplingLayerrf   c                    sR   t ƒ  ¡  |jd | _t | j|jd¡| _t||j	d| _
t |j| jd¡| _d S )NrK   r   r   )rp   rq   rŽ   Úhalf_channelsr   r|   rr   r   re   Z prior_encoder_num_wavenet_layersr   r½   r’   r‚   r!   r"   rq   =  s
   
z"VitsResidualCouplingLayer.__init__NFc                 C   sÆ   t j|| jgd dd\}}|  |¡| }|  |||¡}|  |¡| }t  |¡}	|sJ||t  |	¡ |  }t j||gdd}
t  	|	ddg¡}|
|fS || t  |	 ¡ | }t j||gdd}
|
d fS )NrK   r   rG   )
r   r“   rÃ   r   r   r½   r<   r?   ÚcatrS   )r€   r3   r„   r…   r7   Ú
first_halfÚsecond_halfr   r–   r—   rB   Úlog_determinantr!   r!   r"   r†   E  s   
z!VitsResidualCouplingLayer.forward©NFr˜   r!   r!   r‚   r"   rÂ   <  r™   rÂ   c                       rÁ   )	ÚVitsResidualCouplingBlockrf   c                    s8   t ƒ  ¡  t ¡ | _t|jƒD ]
}| j t|ƒ¡ qd S r'   )	rp   rq   r   rs   Úflowsr~   Zprior_encoder_num_flowsr   rÂ   )r€   rf   r§   r‚   r!   r"   rq   X  s
   

ÿz"VitsResidualCouplingBlock.__init__NFc                 C   sh   |s| j D ]}||||ƒ\}}t |dg¡}q|S t| j ƒD ]}t |dg¡}||||dd\}}q|S )Nr   T©r7   )rÊ   r   ÚflipÚreversed)r€   r3   r„   r…   r7   Úflowr§   r!   r!   r"   r†   ^  s   
ýz!VitsResidualCouplingBlock.forwardrÈ   r˜   r!   r!   r‚   r"   rÉ   W  ó    rÉ   c                       s.   e Zd Zddef‡ fdd„Zd	dd„Z‡  ZS )
ÚVitsDilatedDepthSeparableConvr2   rf   c                    sÖ   t ƒ  ¡  |j}|j}|j| _t |¡| _t 	¡ | _
t 	¡ | _t 	¡ | _t 	¡ | _t| jƒD ]:}|| }|| | d }| j
 tj||||||d¡ | j t ||d¡¡ | j t |¡¡ | j t |¡¡ q.d S )NrK   )rk   rl   rm   Úgroupsrn   ro   r   )rp   rq   Úduration_predictor_kernel_sizerr   Zdepth_separable_num_layersrg   r   rv   rw   rs   Úconvs_dilatedÚconvs_pointwiseÚnorms_1Únorms_2r~   r   r|   Ú	LayerNorm)r€   rf   Údropout_raterm   r¤   r   rn   ro   r‚   r!   r"   rq   k  s4   




úÿ
ñz&VitsDilatedDepthSeparableConv.__init__Nc                 C   s®   |d ur|| }t | jƒD ]E}| j| || ƒ}| j| | dd¡ƒ dd¡}tj |¡}| j| |ƒ}| j	| | dd¡ƒ dd¡}tj |¡}|  
|¡}|| }q|| S ©Nr   r1   )r~   rg   rÓ   rÕ   Ú	transposer   r@   ZgelurÔ   rÖ   rw   )r€   r3   r„   r…   r   r   r!   r!   r"   r†   ‡  s   

z%VitsDilatedDepthSeparableConv.forward)r2   r'   r˜   r!   r!   r‚   r"   rÐ   j  s    rÐ   c                       rÁ   )	ÚVitsConvFlowrf   c                    sr   t ƒ  ¡  |j| _|jd | _|j| _|j| _	t
 | j| jd¡| _t|ƒ| _t
 | j| j| jd d  d¡| _d S )NrK   r   r   )rp   rq   rr   Úfilter_channelsÚdepth_separable_channelsrÃ   Zduration_predictor_flow_binsr[   Zduration_predictor_tail_boundr8   r   r|   r   rÐ   Úconv_ddsr‘   r’   r‚   r!   r"   rq   ™  s   

&zVitsConvFlow.__init__NFc                 C   s  t j|| jgd dd\}}|  |¡}|  |||¡}|  |¡| }|j\}}	}
| ||	d|
¡ dddd¡}|dd | j	…f t
 | j¡ }|d| j	d| j	 …f t
 | j¡ }|dd| j	 d …f }t|||||| jd\}}t j||gdd| }|s‰t  || ddg¡}||fS |d fS )	NrK   r   rG   r1   r   r   .)r7   r8   )r   r“   rÃ   r   rÞ   r‘   rP   ÚreshapeÚpermuter[   ÚmathrX   rÜ   rE   r8   rÄ   rS   )r€   r3   r„   r…   r7   rÅ   rÆ   r   Ú
batch_sizer¤   Úlengthr4   r5   r6   rC   rB   rÇ   r!   r!   r"   r†   ¤  s,   
$
ú	zVitsConvFlow.forwardrÈ   r˜   r!   r!   r‚   r"   rÛ   ˜  s    rÛ   c                       rÁ   )	ÚVitsElementwiseAffinerf   c                    sB   t ƒ  ¡  |j| _t t | jd¡¡| _t t | jd¡¡| _	d S ©Nr   )
rp   rq   rÝ   r¤   r   Ú	Parameterr   ZzerosÚ	translateÚ	log_scaler’   r‚   r!   r"   rq   Ä  s   
zVitsElementwiseAffine.__init__NFc                 C   sd   |s | j t | j¡|  }|| }t | j| ddg¡}||fS || j  t | j ¡ | }|d fS ©Nr   rK   )rç   r   r?   rè   rS   )r€   r3   r„   r…   r7   rB   rÇ   r!   r!   r"   r†   Ê  s   zVitsElementwiseAffine.forwardrÈ   r˜   r!   r!   r‚   r"   rä   Ã  rÏ   rä   c                       s&   e Zd Z‡ fdd„Zddd„Z‡  ZS )	ÚVitsStochasticDurationPredictorc                    s  t ƒ  ¡  |j}|j}t ||d¡| _t ||d¡| _t||j	d| _
|dkr/t ||d¡| _t ¡ | _| j t|ƒ¡ t|jƒD ]
}| j t|ƒ¡ qAt d|d¡| _t ||d¡| _t||j	d| _t ¡ | _| j t|ƒ¡ t|jƒD ]
}| j t|ƒ¡ qvd S )Nr   )rØ   r   )rp   rq   r{   rr   r   r|   r   r‘   rÐ   Úduration_predictor_dropoutrÞ   r¾   rs   rÊ   r   rä   r~   Zduration_predictor_num_flowsrÛ   Úpost_conv_preÚpost_conv_projÚpost_conv_ddsÚ
post_flows)r€   rf   Ú	embed_dimrÜ   r§   r‚   r!   r"   rq   Ö  s4   
þ
þ
ÿz(VitsStochasticDurationPredictor.__init__NFrF   c                 C   s¬  t  |¡}|  |¡}|d urt  |¡}||  |¡ }|  ||¡}|  |¡| }|s	|  |¡}|  ||¡}|  |¡| }t  	| 
d¡d| 
d¡¡j|j|jd| }d}	|}
| jD ]}||
||| d\}
}t  |
dg¡}
|	|7 }	qYt j|
ddgdd\}}|	t  tj |¡tj | ¡ | ddg¡7 }	t  dt dtj ¡|d   | ddg¡|	 }|t  |¡ | }t  t  |d¡¡| }t  | ddg¡}t j||gdd}| jD ]}||||d\}}t  |dg¡}||7 }qÖt  d	t dtj ¡|d   | ddg¡| }|| S tt| jƒƒ}|d d
… |d g }t  	| 
d¡d| 
d¡¡j|j|jd| }|D ]}t  |dg¡}||||dd\}}q3t j|ddgdd\}}|S )Nr   rK   )ÚdeviceÚdtype)r…   r   rG   ç      à¿gñhãˆµøä>g      à?éþÿÿÿr1   T)r…   r7   )r   Údetachr   r¾   rÞ   r‘   rì   rî   rí   ÚrandnÚsizeÚtorñ   rò   rï   rÌ   r“   rS   r   r@   Z
logsigmoidrá   r>   Úpir)   Ú	clamp_minrÄ   rÊ   ÚlistrÍ   )r€   r3   r„   r…   Z	durationsr7   Únoise_scaler   Zrandom_posteriorZlog_determinant_posterior_sumZlatents_posteriorrÎ   rÇ   rÅ   rÆ   ZlogqZlog_determinant_sumÚlatentsZnllrÊ   r§   Úlog_durationr!   r!   r"   r†   ö  sh   



&ÿÿ


ÿ
"ÿ*ÿÿ

0&ÿÿz'VitsStochasticDurationPredictor.forward)NNFrF   ©r   r   r   rq   r†   r‹   r!   r!   r‚   r"   rê   Õ  s     rê   c                       s&   e Zd Z‡ fdd„Zddd„Z‡  ZS )ÚVitsDurationPredictorc                    s°   t ƒ  ¡  |j}|j}t |j¡| _tj|j	|||d d| _
tj||jd| _tj||||d d| _tj||jd| _t |dd¡| _|jdkrVt |j|j	d¡| _d S d S )NrK   )ro   ©Zepsr   r   )rp   rq   rÒ   Z"duration_predictor_filter_channelsr   rv   rë   rw   r|   rr   Úconv_1r×   Úlayer_norm_epsÚnorm_1Úconv_2Únorm_2Úprojr{   r¾   )r€   rf   rm   rÜ   r‚   r!   r"   rq   :  s   

ÿzVitsDurationPredictor.__init__Nc                 C   s¸   t  |¡}|d urt  |¡}||  |¡ }|  || ¡}t  |¡}|  | dd¡¡ dd¡}|  |¡}|  || ¡}t  |¡}|  	| dd¡¡ dd¡}|  |¡}|  
|| ¡}|| S rÙ   )r   rõ   r¾   r  Zrelur  rÚ   rw   r  r  r  )r€   r3   r„   r…   r!   r!   r"   r†   I  s   





zVitsDurationPredictor.forwardr'   rÿ   r!   r!   r‚   r"   r   9  s    r   c                       s¦   e Zd ZdZdef‡ fdd„Zdejdedefdd	„Z		
	
	
	ddejde
ej de
ej de
ej dedeeje
ej f fdd„Zdd„ Zdd„ Zdd„ Z‡  ZS )ÚVitsAttentionz?Multi-headed attention with relative positional representation.rf   c                    s.  t ƒ  ¡  |j| _|j| _|j| _|j| _| j| j | _	| j	d | _
| j	| j | jkr8td| j› d| j› dƒ‚tj| j| j|jd| _tj| j| j|jd| _tj| j| j|jd| _tj| j| j|jd| _| jr•t t d| jd d | j	¡| j
 ¡| _t t d| jd d | j	¡| j
 ¡| _d S d S )Nró   zIhidden_size must be divisible by num_attention_heads (got `hidden_size`: z and `num_attention_heads`: z).)r¶   r   rK   )rp   rq   rr   rð   Znum_attention_headsÚ	num_headsZattention_dropoutrw   Úwindow_sizeÚhead_dimÚscalingrO   r   ÚLinearZuse_biasÚk_projÚv_projÚq_projÚout_projræ   r   rö   Ú	emb_rel_kÚ	emb_rel_vr’   r‚   r!   r"   rq   a  s*   

ÿÿ(,þzVitsAttention.__init__ÚtensorÚseq_lenÚbszc                 C   s    |  ||| j| j¡ dd¡ ¡ S ré   )Úviewr	  r  rÚ   Ú
contiguous)r€   r  r  r  r!   r!   r"   Ú_shapez  s    zVitsAttention._shapeNFr   Úkey_value_statesÚattention_maskÚlayer_head_maskÚoutput_attentionsr¿   c                 C   s  |  ¡ \}}}|  |¡| j }	|  |  |¡d|¡}
|  |  |¡d|¡}|| j d| jf}|  |	||¡j|Ž }	|
j|Ž }
|j|Ž }|
  d¡}t	 
|	|
 dd¡¡}|  ¡ || j ||fkrmtd|| j ||f› d|  ¡ › ƒ‚| jdurŒ|  | j|¡}t	 |	| dd¡¡}|  |¡}||7 }|dur¿|  ¡ |d||fkrªtd|d||f› d|  ¡ › ƒ‚| || j||¡| }| || j ||¡}tjj|dd	}|durü|  ¡ | jfkrátd
| jf› d|  ¡ › ƒ‚| dddd¡| || j||¡ }| || j ||¡}|r| || j||¡}| || j ||¡}nd}tjj|| j| jd}t	 
||¡}|  ¡ || j || jfkrFtd|| j|| jf› d|  ¡ › ƒ‚| jdurb|  | j|¡}|  |¡}t	 ||¡}||7 }| || j|| j¡}| dd¡}| ||| j¡}|  |¡}||fS )z#Input shape: Batch x Time x Channelr1   r   rK   z$Attention weights should be of size z	, but is Nrô   z!Attention mask should be of size rG   z/Head mask for a single layer should be of size )ÚpÚtrainingz `attn_output` should be of size )r÷   r  r  r  r  r  r	  r  r  r   ZbmmrÚ   rO   r
  Ú_get_relative_embeddingsr  ÚmatmulÚ'_relative_position_to_absolute_positionr   r@   rQ   rw   r  r  Ú'_absolute_position_to_relative_positionrß   rð   r  )r€   r   r  r  r  r  r  Ztgt_lenr§   Zquery_statesZ
key_statesZvalue_statesZ
proj_shapeZsrc_lenÚattn_weightsZkey_relative_embeddingsZrelative_logitsZrel_pos_biasZattn_weights_reshapedZ
attn_probsZattn_outputZvalue_relative_embeddingsZrelative_weightsr!   r!   r"   r†   }  sx   


ÿÿ

ÿÿÿ"ÿÿ

zVitsAttention.forwardc              	   C   sn   t || jd  dƒ}|dkrtj |dd||ddg¡}t | jd | dƒ}|d|  d }|d d …||…f S )Nr   r   rK   )rN   r
  r   r@   r.   )r€   Zrelative_embeddingsrã   Z
pad_lengthZslice_start_positionZslice_end_positionr!   r!   r"   r   ß  s   z&VitsAttention._get_relative_embeddingsc                 C   sŽ   |  ¡ \}}}tj |g d¢¡}| ||d | g¡}tj |d|d ddg¡}| ||d d| d g¡}|d d …d |…|d d …f }|S )N)r   r   r   r   r   r   rK   r   r   ©r÷   r   r@   r.   r  ©r€   ÚxZbatch_headsrã   r§   Zx_flatZx_finalr!   r!   r"   r"  è  s   z5VitsAttention._relative_position_to_absolute_positionc              	   C   sŠ   |  ¡ \}}}tj |d|d ddddg¡}| ||d| d  g¡}tj ||dddg¡}| ||d| g¡d d …d d …dd …f }|S )Nr   r   rK   r%  r&  r!   r!   r"   r#  ÷  s   *z5VitsAttention._absolute_position_to_relative_position)NNNF)r   r   r   r   r   rq   r   ÚTensorrŠ   r  r   Úboolr   r†   r   r"  r#  r‹   r!   r!   r‚   r"   r  ^  s0    úþýüûú
ùb	r  c                       s$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )ÚVitsFeedForwardc                    s¨   t ƒ  ¡  t |j|j|j¡| _t |j|j|j¡| _t 	|j
¡| _t|jtƒr/t|j | _n|j| _|jdkrO|jd d }|jd }||ddddg| _d S d | _d S )Nr   rK   r   )rp   rq   r   r|   rr   Zffn_dimZffn_kernel_sizer  r  rv   Zactivation_dropoutrw   Ú
isinstanceZ
hidden_actÚstrr	   Úact_fnro   )r€   rf   Úpad_leftÚ	pad_rightr‚   r!   r"   rq     s   



zVitsFeedForward.__init__c                 C   s¢   |  ddd¡}|  ddd¡}|| }| jd urtj || j¡}|  |¡}|  |¡}|  |¡}|| }| jd ur?tj || j¡}|  |¡}|| }|  ddd¡}|S )Nr   rK   r   )	rà   ro   r   r@   r.   r  r-  rw   r  )r€   r   r„   r!   r!   r"   r†     s   





zVitsFeedForward.forwardrÿ   r!   r!   r‚   r"   r*    s    r*  c                	       sL   e Zd Zdef‡ fdd„Z		ddejdejdeej d	e	fd
d„Z
‡  ZS )ÚVitsEncoderLayerrf   c                    sX   t ƒ  ¡  t|ƒ| _t |j¡| _tj|j	|j
d| _t|ƒ| _tj|j	|j
d| _d S )Nr  )rp   rq   r  Ú	attentionr   rv   Zhidden_dropoutrw   r×   rr   r  Ú
layer_normr*  Úfeed_forwardÚfinal_layer_normr’   r‚   r!   r"   rq   /  s   


zVitsEncoderLayer.__init__NFr   r„   r  r  c                 C   sp   |}| j |||d\}}|  |¡}|  || ¡}|}|  ||¡}|  |¡}|  || ¡}|f}|r6||f7 }|S )N)r   r  r  )r1  rw   r2  r3  r4  )r€   r   r„   r  r  r²   r$  rB   r!   r!   r"   r†   7  s    
ý


zVitsEncoderLayer.forwardrÈ   )r   r   r   r   rq   r   r(  r   r   r)  r†   r‹   r!   r!   r‚   r"   r0  .  s    ûþýüûr0  c                       sp   e Zd Zdef‡ fdd„Z				ddejdejdeej dee	 d	ee	 d
ee	 de
eef fdd„Z‡  ZS )ÚVitsEncoderrf   c                    sB   t ƒ  ¡  ˆ | _t ‡ fdd„tˆ jƒD ƒ¡| _d| _ˆ j	| _	d S )Nc                    s   g | ]}t ˆ ƒ‘qS r!   )r0  r¦   ©rf   r!   r"   r¥   Y  s    z(VitsEncoder.__init__.<locals>.<listcomp>F)
rp   rq   rf   r   rs   r~   Znum_hidden_layersÚlayersÚgradient_checkpointingÚ	layerdropr’   r‚   r6  r"   rq   V  s
   
 zVitsEncoder.__init__Nr   r„   r  r  Úoutput_hidden_statesÚreturn_dictr¿   c                 C   s  |rdnd }|r
dnd }|d urt ||jƒ}|| }tƒ p t| ƒ}	| jD ]F}
|r-||f }tj dd¡}| jo;|| j	k }|r@|	r]| j
rQ| jrQ|  |
j||||¡}n|
||||d}|d }|rad}|rj||d f }q$|| }|rv||f }|s„tdd„ |||fD ƒƒS t|||dS )	Nr!   r   r   )r  r„   r  )NNc                 s   s    | ]	}|d ur|V  qd S r'   r!   )r¢   Úvr!   r!   r"   Ú	<genexpr>™  s   € z&VitsEncoder.forward.<locals>.<genexpr>)r$   r   r   )r   rò   r
   r   r7  r=   ÚrandomÚuniformr  r9  r8  Z_gradient_checkpointing_funcÚ__call__Útupler   )r€   r   r„   r  r  r:  r;  Zall_hidden_statesZall_self_attentionsZsynced_gpusZencoder_layerZdropout_probabilityZskip_the_layerZlayer_outputsr!   r!   r"   r†   ]  sT   	

ûü€
ýzVitsEncoder.forward)NNNN)r   r   r   r   rq   r   r   r   r(  r)  r   r   r   r†   r‹   r!   r!   r‚   r"   r5  U  s*    ùþýüûúù
ør5  c                       sŠ   e Zd ZdZdef‡ fdd„Zdd„ Zdd„ Z							
ddej	dej
deej	 dee dee dee deeej	 ef fdd„Z‡  ZS )ÚVitsTextEncoderzs
    Transformer encoder that uses relative positional representation instead of absolute positional encoding.
    rf   c                    sN   t ƒ  ¡  || _t |j|j|j¡| _t	|ƒ| _
tj|j|jd dd| _d S )NrK   r   )rm   )rp   rq   rf   r   Ú	EmbeddingZ
vocab_sizerr   Zpad_token_idÚembed_tokensr5  Úencoderr|   rŽ   Úprojectr’   r‚   r!   r"   rq   §  s
   

zVitsTextEncoder.__init__c                 C   ó   | j S r'   ©rD  ©r€   r!   r!   r"   Úget_input_embeddings®  ó   z$VitsTextEncoder.get_input_embeddingsc                 C   s
   || _ d S r'   rH  )r€   rJ   r!   r!   r"   Úset_input_embeddings±  s   
z$VitsTextEncoder.set_input_embeddingsNTÚ	input_idsr„   r  r  r:  r;  r¿   c                 C   sª   |   |¡t | jj¡ }| j||||||d}|s|d n|j}	|  |	 dd¡¡ dd¡| }
t	j
|
| jjdd\}}|sJ|	||f|dd …  }|S t|	|||j|jdS )N)r   r„   r  r  r:  r;  r   r   rK   rG   )r$   r%   r&   r   r   )rD  rá   rX   rf   rr   rE  r$   rF  rÚ   r   r“   rŽ   r#   r   r   )r€   rM  r„   r  r  r:  r;  r   Zencoder_outputsr$   r•   r%   r&   rB   r!   r!   r"   r†   ´  s,   	ú	ûzVitsTextEncoder.forward)NNNT)r   r   r   r   r   rq   rJ  rL  r   r(  r   r   r)  r   r   r#   r†   r‹   r!   r!   r‚   r"   rB  ¢  s0    ùþýüûúùørB  c                   @   s$   e Zd ZeZdZdZdZdd„ ZdS )ÚVitsPreTrainedModelZvitsrM  Tc                 C   s  t |tjƒr |jjjd| jjd |jdur|jj 	¡  dS dS t |tj
ƒr5|jj 	¡  |jj d¡ dS t |tjƒrdtj |j¡ |jdurbt |j|j|jd   ¡}tjj|j| |d dS dS t |tjƒr…|jjjd| jjd |jdur‡|jj|j  	¡  dS dS dS )zInitialize the weightsr2   )r–   ZstdNrF   r   )ra   rb   )r+  r   r  ri   ÚdataZnormal_rf   Zinitializer_ranger¶   Zzero_r×   Zfill_r|   ÚinitZkaiming_normal_rá   rX   rÑ   rk   rm   Zuniform_rC  Zpadding_idx)r€   ÚmoduleÚkr!   r!   r"   Ú_init_weightsá  s(   
ÿ
þ
ýz!VitsPreTrainedModel._init_weightsN)	r   r   r   r   Zconfig_classZbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingrS  r!   r!   r!   r"   rN  Ú  s    rN  z@
    The complete VITS model, for text-to-speech synthesis.
    )Zcustom_introc                       s–   e Zd Zdef‡ fdd„Zdd„ Ze							ddeej	 deej	 d	ee
 d
ee dee dee deej deee ef fdd„ƒZ‡  ZS )Ú	VitsModelrf   c                    s–   t ƒ  |¡ || _t|ƒ| _t|ƒ| _t|ƒ| _|j	r!t
|ƒ| _nt|ƒ| _|jdkr4t |j|j¡| _t|ƒ| _|j| _|j| _|j| _|  ¡  d S rå   )rp   rq   rf   rB  Útext_encoderrÉ   rÎ   r´   ÚdecoderÚ"use_stochastic_duration_predictionrê   Úduration_predictorr   Únum_speakersr   rC  r{   Úembed_speakerrŒ   Zposterior_encoderÚspeaking_raterü   Únoise_scale_durationZ	post_initr’   r‚   r!   r"   rq   û  s   





zVitsModel.__init__c                 C   rG  r'   )rU  rI  r!   r!   r"   Úget_encoder  rK  zVitsModel.get_encoderNrM  r  Ú
speaker_idr  r:  r;  Úlabelsr¿   c           $      C   sŠ  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur&tdƒ‚| jjjj}|dur9| 	d¡ 
|¡}	nt |¡ 	d¡ 
|¡}	| j jdkr~|dur~d|  krZ| j jk sgn td| j jd › dƒ‚t|tƒrutjd|| jd	}|  |¡ 	d¡}
nd}
| j||	||||d
}|s‘|d n|j}| dd¡}|	 dd¡}	|s¦|d n|j}|s¯|d n|j}| j jrÂ| j||	|
d| jd}n|  ||	|
¡}d| j }t t |¡|	 | ¡}t t |ddg¡d¡  ¡ }tj!| "¡ |j|jd}| 	d¡| 	d¡k }| 	d¡ 
|	j¡}t 	|	d¡t 	|d¡ }|j#\}}}}t $|d¡ %|| d¡}tj!||j|jd}| 	d¡|k }| 
|j¡ %|||¡}|t&j' (|g d¢¡dd…dd…f  }| 	d¡ dd¡| }t )| *d¡|¡ dd¡}t )| *d¡|¡ dd¡}|t +|¡t |¡ | j,  }| j-|||
dd}|| } |  .| |
¡}!|! *d¡}!|t/ 0| j j1¡ }"|sº|!|"| f|dd…  }#|#S t2|!|"| |j3|j4dS )a¿  
        speaker_id (`int`, *optional*):
            Which speaker embedding to use. Only used for multispeaker models.
        labels (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`, *optional*):
            Float values of target spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
            computation.

        Example:

        ```python
        >>> from transformers import VitsTokenizer, VitsModel, set_seed
        >>> import torch

        >>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
        >>> model = VitsModel.from_pretrained("facebook/mms-tts-eng")

        >>> inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt")

        >>> set_seed(555)  # make deterministic

        >>> with torch.no_grad():
        ...     outputs = model(inputs["input_ids"])
        >>> outputs.waveform.shape
        torch.Size([1, 45824])
        ```
        Nz&Training of VITS is not supported yet.r1   r   r   z Set `speaker_id` in the range 0-Ú.r³   )r÷   Z
fill_valuerñ   )rM  r„   r  r  r:  r;  rK   T)r7   rü   rF   )rò   rñ   )r   r   r   r   r   r   r   rË   )r   r   r   r   r   )5rf   r  r:  Zuse_return_dictÚNotImplementedErrorrU  rD  ri   rò   Z	unsqueezerø   r   Z	ones_likerY  rO   r+  rŠ   Úfullrñ   rZ  r$   rÚ   r%   r&   rW  rX  r\  r[  Úceilr?   rú   rS   ÚlongZarangerN   rP   rR   r  r   r@   r.   r!  Zsqueezer”   rü   rÎ   rV  r=   Úprodr¸   r   r   r   )$r€   rM  r  r^  r  r:  r;  r_  Z
mask_dtypeZinput_padding_maskZspeaker_embeddingsZtext_encoder_outputr   r%   r&   rþ   Zlength_scaleÚdurationZpredicted_lengthsÚindicesZoutput_padding_maskZ	attn_maskrâ   r§   Zoutput_lengthZinput_lengthZcum_durationZvalid_indicesZpadded_indicesZattnZprior_latentsrý   r   r   r   rB   r!   r!   r"   r†     sŽ   %ÿ
úû
&
ûzVitsModel.forward)NNNNNNN)r   r   r   r   rq   r]  r   r   r   r(  rŠ   r)  r   r   r   r   r   r†   r‹   r!   r!   r‚   r"   rT  õ  s8    øþýüûúùø	÷rT  )Fr,   r-   r-   r-   )>r   rá   Údataclassesr   Útypingr   r   r   r   Únumpyr=   r   Ztorch.utils.checkpointr   Zactivationsr	   Zintegrations.deepspeedr
   Zintegrations.fsdpr   Zmodeling_attn_mask_utilsr   Zmodeling_outputsr   r   Zmodeling_utilsr   ry   r   r   Zconfiguration_vitsr   Z
get_loggerr   Úloggerr   r#   ZjitÚscriptr+   rE   rA   ÚModulere   rŒ   rš   r´   rÂ   rÉ   rÐ   rÛ   rä   rê   r   r  r*  r0  r5  rB  rN  rT  Ú__all__r!   r!   r!   r"   Ú<module>   sp   
 

÷J 	P>X.+d% '*'M8ÿ !