"""PyTorch SpeechT5 model."""

import math
from typing import List, Optional, Tuple, Union

import numpy as np
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, L1Loss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...integrations.fsdp import is_fsdp_managed_module
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqSpectrogramOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from .configuration_speecht5 import SpeechT5Config, SpeechT5HifiGanConfig


logger = logging.get_logger(__name__)

_HIDDEN_STATES_START_POSITION = 1


def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")
    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids
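

# Illustration, not part of the original module: a minimal sketch of how
# `shift_tokens_right` builds decoder inputs from labels. Toy values only.
def _example_shift_tokens_right():
    labels = torch.tensor([[101, 7, 8, -100]])
    decoder_input_ids = shift_tokens_right(labels, pad_token_id=1, decoder_start_token_id=2)
    # The start token is prepended, every label moves one slot right, and any
    # remaining -100 label sentinel would be replaced by pad_token_id.
    assert decoder_input_ids.tolist() == [[2, 101, 7, 8]]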


def shift_spectrograms_right(
    input_values: torch.Tensor, reduction_factor: int = 1, attention_mask: Optional[torch.Tensor] = None
):
    """
    Shift input spectrograms one timestep to the right. Also applies the reduction factor to the sequence length.
    """
    # thin out frames for reduction factor
    if reduction_factor > 1:
        input_values = input_values[:, reduction_factor - 1 :: reduction_factor]
        if attention_mask is not None:
            attention_mask = attention_mask[:, reduction_factor - 1 :: reduction_factor]

    shifted_input_values = input_values.new_zeros(input_values.shape)
    shifted_input_values[:, 1:] = input_values[:, :-1].clone()

    # replace possible -100 values in labels by zeros
    shifted_input_values.masked_fill_(shifted_input_values == -100.0, 0.0)

    return shifted_input_values, attention_mask


def _compute_mask_indices(
    shape: Tuple[int, int],
    mask_prob: float,
    mask_length: int,
    attention_mask: Optional[torch.LongTensor] = None,
    min_masks: int = 0,
) -> np.ndarray:
    """
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    """
    batch_size, sequence_length = shape

    if mask_length < 1:
        raise ValueError("`mask_length` has to be bigger than 0.")

    if mask_length > sequence_length:
        raise ValueError(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
            f" and `sequence_length`: {sequence_length}`"
        )

    # epsilon is used for probabilistic rounding
    epsilon = np.random.rand(1).item()

    def compute_num_masked_span(input_length):
        """Given input length, compute how many spans should be masked"""
        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
        num_masked_span = max(num_masked_span, min_masks)

        # make sure num masked span <= sequence_length
        if num_masked_span * mask_length > sequence_length:
            num_masked_span = sequence_length // mask_length

        # make sure num_masked span is also <= input_length - (mask_length - 1)
        if input_length - (mask_length - 1) < num_masked_span:
            num_masked_span = max(input_length - (mask_length - 1), 0)

        return num_masked_span

    # compute number of masked spans in batch
    input_lengths = (
        attention_mask.detach().sum(-1).tolist()
        if attention_mask is not None
        else [sequence_length for _ in range(batch_size)]
    )

    # SpecAugment mask to fill
    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
    spec_aug_mask_idxs = []

    max_num_masked_span = compute_num_masked_span(sequence_length)

    if max_num_masked_span == 0:
        return spec_aug_mask

    for input_length in input_lengths:
        # compute num of masked spans for this input
        num_masked_span = compute_num_masked_span(input_length)

        # get random indices to mask
        spec_aug_mask_idx = np.random.choice(
            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
        )

        # pick first sampled index that will serve as a dummy index to pad vector
        # to ensure same dimension for all batches due to probabilistic rounding
        if len(spec_aug_mask_idx) == 0:
            # this case can only happen if `input_length` is strictly smaller than
            # `sequence_length` in which case the last token has to be a padding
            # token which we can use as a dummy mask id
            dummy_mask_idx = sequence_length - 1
        else:
            dummy_mask_idx = spec_aug_mask_idx[0]

        spec_aug_mask_idx = np.concatenate(
            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
        )
        spec_aug_mask_idxs.append(spec_aug_mask_idx)

    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)

    # expand masked indices to masked spans
    spec_aug_mask_idxs = np.broadcast_to(
        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)

    # add offset to the starting indexes so that indexes now create a span
    offsets = np.arange(mask_length)[None, None, :]
    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
        batch_size, max_num_masked_span * mask_length
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets

    # ensure that we cannot have indices larger than sequence_length
    if spec_aug_mask_idxs.max() > sequence_length - 1:
        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1

    # scatter indices to mask
    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)

    return spec_aug_mask
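

# Illustration, not part of the original module: `_compute_mask_indices`
# returns a boolean numpy array marking SpecAugment spans. Toy shapes only.
def _example_compute_mask_indices():
    mask = _compute_mask_indices(shape=(2, 100), mask_prob=0.05, mask_length=10)
    # mask_prob * 100 / 10 = 0.5 spans per row on average, so with probabilistic
    # rounding each row receives either 0 or 1 contiguous span of 10 positions
    assert mask.shape == (2, 100) and mask.dtype == bool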


class SpeechT5NoLayerNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states


class SpeechT5LayerNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)

        hidden_states = hidden_states.transpose(-2, -1)
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = hidden_states.transpose(-2, -1)

        hidden_states = self.activation(hidden_states)
        return hidden_states


class SpeechT5GroupNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.activation = ACT2FN[config.feat_extract_activation]

        self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states
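

# Illustration, not part of the original module: each conv layer shortens the
# time axis to floor((length - kernel) / stride) + 1. The kernel/stride
# schedule below is the wav2vec2-style default assumed for SpeechT5; with it a
# one-second 16 kHz waveform is downsampled to roughly 49 frames.
def _example_conv_output_length(length: int = 16000) -> int:
    for kernel, stride in zip((10, 3, 3, 3, 3, 2, 2), (5, 2, 2, 2, 2, 2, 2)):
        length = (length - kernel) // stride + 1
    return length  # 16000 -> 49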
z"SpeechT5GroupNormConvLayer.forwardrt   ru   r%   r%   rm   r&   r     s    r   c                	       s   e Zd ZdZddededee f fddZddededee fd	d
Zeddededee fddZ	e
 dde
jdefddZ	dde
jdedee fddZ  ZS )%SpeechT5SinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.Nnum_positionsembedding_dimpadding_idxc                    s4   t    d| _|| _|| _| || j || d S N   )r]   r^   offsetr   r   make_weights)rj   r   r   r   rm   r%   r&   r^      s
   
z.SpeechT5SinusoidalPositionalEmbedding.__init__num_embeddingsc                 C   sB   |  |||}t| dr|j| jj| jjd}| jd|dd d S )Nweightsr@   deviceF
persistent)get_embeddinghasattrtor   r@   r   register_buffer)rj   r   r   r   Zemb_weightsr%   r%   r&   r   '  s   
z2SpeechT5SinusoidalPositionalEmbedding.make_weightsc                 C   s   |d }t d|d  }ttj|tjd |  }tj| tjd d|d }tjt	|t
|gdd| d}|d dkrUtj|t| dgdd}|durad||ddf< |t S )	z
        """
        Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
        description in Section 3.5 of "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0

        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
        bsz, seq_len = input_ids.size()
        # Create the position ids from the input token ids. Any padded tokens remain padded.
        position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
            input_ids.device
        )

        # expand embeddings if needed
        max_pos = self.padding_idx + 1 + seq_len
        if max_pos > self.weights.size(0):
            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)

        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()

    def create_position_ids_from_input_ids(
        self, input_ids: torch.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
    ):
        """
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:
        Returns: torch.Tensor
        """
        mask = input_ids.ne(padding_idx).int()
        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
        return incremental_indices.long() + padding_idx


class SpeechT5PositionalConvEmbedding(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.conv = nn.Conv1d(
            config.hidden_size,
            config.hidden_size,
            kernel_size=config.num_conv_pos_embeddings,
            padding=config.num_conv_pos_embeddings // 2,
            groups=config.num_conv_pos_embedding_groups,
        )

        weight_norm = nn.utils.weight_norm
        if hasattr(nn.utils.parametrizations, "weight_norm"):
            weight_norm = nn.utils.parametrizations.weight_norm

        if is_deepspeed_zero3_enabled():
            import deepspeed

            with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
                self.conv = weight_norm(self.conv, name="weight", dim=2)
            if hasattr(self.conv, "parametrizations"):
                weight_g = self.conv.parametrizations.weight.original0
                weight_v = self.conv.parametrizations.weight.original1
            else:
                weight_g = self.conv.weight_g
                weight_v = self.conv.weight_v
            deepspeed.zero.register_external_parameter(self, weight_v)
            deepspeed.zero.register_external_parameter(self, weight_g)
        else:
            self.conv = weight_norm(self.conv, name="weight", dim=2)

        self.padding = SpeechT5SamePadLayer(config.num_conv_pos_embeddings)
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        hidden_states = hidden_states.transpose(1, 2)

        hidden_states = self.conv(hidden_states)
        hidden_states = self.padding(hidden_states)
        hidden_states = self.activation(hidden_states)

        hidden_states = hidden_states.transpose(1, 2)
        return hidden_states
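

# Illustration, not part of the original module: the table built by
# `get_embedding` above stores sines at geometrically spaced frequencies in the
# first half of each row and the matching cosines in the second half.
def _example_sinusoidal_table():
    emb = SpeechT5SinusoidalPositionalEmbedding.get_embedding(
        num_embeddings=10, embedding_dim=8, padding_idx=None
    )
    assert emb.shape == (10, 8)
    # position 0 embeds to sin(0)=0 in the first half and cos(0)=1 in the second
    assert torch.allclose(emb[0, :4], torch.zeros(4)) and torch.allclose(emb[0, 4:], torch.ones(4))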


class SpeechT5ScaledPositionalEncoding(nn.Module):
    """
    Scaled positional encoding, see §3.2 in https://arxiv.org/abs/1809.08895
    """

    def __init__(self, dropout, dim, max_len=5000):
        pe = torch.zeros(max_len, dim)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, dim, 2, dtype=torch.int64).float() * -(math.log(10000.0) / dim))
        pe[:, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 1::2] = torch.cos(position.float() * div_term)
        pe = pe.unsqueeze(0)
        super().__init__()
        self.register_buffer("pe", pe, persistent=False)
        self.dropout = nn.Dropout(p=dropout)
        self.dim = dim
        self.alpha = torch.nn.Parameter(torch.tensor(1.0))

    def forward(self, emb):
        emb = emb + self.alpha * self.pe[:, : emb.size(1)]
        emb = self.dropout(emb)
        return emb


class SpeechT5RelativePositionalEncoding(torch.nn.Module):
    def __init__(self, dim, max_length=1000):
        super().__init__()
        self.dim = dim
        self.max_length = max_length
        self.pe_k = torch.nn.Embedding(2 * max_length, dim)

    def forward(self, hidden_states):
        seq_len = hidden_states.shape[1]
        pos_seq = torch.arange(0, seq_len).to(device=hidden_states.device, dtype=torch.int64)
        pos_seq = pos_seq[:, None] - pos_seq[None, :]

        pos_seq[pos_seq < -self.max_length] = -self.max_length
        pos_seq[pos_seq >= self.max_length] = self.max_length - 1
        pos_seq = pos_seq + self.max_length

        return self.pe_k(pos_seq)


class SpeechT5SamePadLayer(nn.Module):
    def __init__(self, num_conv_pos_embeddings):
        super().__init__()
        self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0

    def forward(self, hidden_states):
        if self.num_pad_remove > 0:
            hidden_states = hidden_states[:, :, : -self.num_pad_remove]
        return hidden_states


class SpeechT5FeatureEncoder(nn.Module):
    """Construct the features from raw audio waveform"""

    def __init__(self, config):
        super().__init__()

        if config.feat_extract_norm == "group":
            conv_layers = [SpeechT5GroupNormConvLayer(config, layer_id=0)] + [
                SpeechT5NoLayerNormConvLayer(config, layer_id=i + 1)
                for i in range(config.num_feat_extract_layers - 1)
            ]
        elif config.feat_extract_norm == "layer":
            conv_layers = [
                SpeechT5LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
            ]
        else:
            raise ValueError(
                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
            )
        self.conv_layers = nn.ModuleList(conv_layers)
        self.gradient_checkpointing = False
        self._requires_grad = True

    def _freeze_parameters(self):
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def forward(self, input_values):
        hidden_states = input_values[:, None]

        # make sure hidden_states require grad for gradient_checkpointing
        if self._requires_grad and self.training:
            hidden_states.requires_grad = True

        for conv_layer in self.conv_layers:
            if self._requires_grad and self.gradient_checkpointing and self.training:
                hidden_states = self._gradient_checkpointing_func(conv_layer.__call__, hidden_states)
            else:
                hidden_states = conv_layer(hidden_states)

        return hidden_states


class SpeechT5FeatureProjection(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
        self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
        self.dropout = nn.Dropout(config.feat_proj_dropout)

    def forward(self, hidden_states):
        norm_hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.projection(norm_hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states, norm_hidden_states


class SpeechT5SpeechEncoderPrenet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.feature_encoder = SpeechT5FeatureEncoder(config)
        self.feature_projection = SpeechT5FeatureProjection(config)

        # model only needs masking vector if mask prob is > 0.0
        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
            self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())

        self.pos_conv_embed = SpeechT5PositionalConvEmbedding(config)
        self.pos_sinusoidal_embed = SpeechT5SinusoidalPositionalEmbedding(
            config.max_speech_positions + config.pad_token_id + 1,
            config.hidden_size,
            config.pad_token_id,
        )

    def freeze_feature_encoder(self):
        self.feature_encoder._freeze_parameters()

    def forward(
        self,
        input_values: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        mask_time_indices: Optional[torch.FloatTensor] = None,
    ):
        extract_features = self.feature_encoder(input_values)
        extract_features = extract_features.transpose(1, 2)

        if attention_mask is not None:
            # compute reduced attention_mask corresponding to feature vectors
            attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)

        hidden_states, extract_features = self.feature_projection(extract_features)
        hidden_states = self._mask_hidden_states(
            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
        )

        positional_conv_embedding = self.pos_conv_embed(hidden_states)
        hidden_states = hidden_states + positional_conv_embedding

        if attention_mask is not None:
            padding_mask = attention_mask.ne(1).long()
        else:
            padding_mask = torch.zeros(hidden_states.shape[:2], dtype=torch.long, device=hidden_states.device)

        positional_sinusoidal_embeddings = self.pos_sinusoidal_embed(padding_mask)
        hidden_states = hidden_states + positional_sinusoidal_embeddings

        return hidden_states, attention_mask

    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
        # Effectively attention_mask.sum(-1), but not inplace to be able to run on inference mode
        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long)
        batch_size = attention_mask.shape[0]

        attention_mask = torch.zeros(
            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
        )
        # these two operations make sure that all values before the output lengths idxs are attended to
        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
        return attention_mask

    def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
        """
        Computes the output length of the convolutional layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)

        return input_lengths

    def _mask_hidden_states(
        self,
        hidden_states: torch.FloatTensor,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    ):
        """
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        """
        # `config.apply_spec_augment` can set masking to False
        if not getattr(self.config, "apply_spec_augment", True):
            return hidden_states

        # generate indices & apply SpecAugment along time axis
        batch_size, sequence_length, hidden_size = hidden_states.size()

        if mask_time_indices is not None:
            # apply SpecAugment along time axis with given mask_time_indices
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
        elif self.config.mask_time_prob > 0 and self.training:
            mask_time_indices = _compute_mask_indices(
                (batch_size, sequence_length),
                mask_prob=self.config.mask_time_prob,
                mask_length=self.config.mask_time_length,
                attention_mask=attention_mask,
                min_masks=self.config.mask_time_min_masks,
            )
            mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)

        if self.config.mask_feature_prob > 0 and self.training:
            # generate indices & apply SpecAugment along feature axis
            mask_feature_indices = _compute_mask_indices(
                (batch_size, hidden_size),
                mask_prob=self.config.mask_feature_prob,
                mask_length=self.config.mask_feature_length,
                min_masks=self.config.mask_feature_min_masks,
            )
            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
            mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
            hidden_states[mask_feature_indices] = 0

        return hidden_states


class SpeechT5SpeechDecoderPrenet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.layers = nn.ModuleList(
            [
                nn.Linear(
                    config.num_mel_bins if i == 0 else config.speech_decoder_prenet_units,
                    config.speech_decoder_prenet_units,
                )
                for i in range(config.speech_decoder_prenet_layers)
            ]
        )

        self.final_layer = nn.Linear(config.speech_decoder_prenet_units, config.hidden_size)
        self.encode_positions = SpeechT5ScaledPositionalEncoding(
            config.positional_dropout,
            config.hidden_size,
            config.max_speech_positions,
        )
        self.speaker_embeds_layer = nn.Linear(config.speaker_embedding_dim + config.hidden_size, config.hidden_size)

    def _consistent_dropout(self, inputs_embeds, p):
        mask = torch.bernoulli(inputs_embeds[0], p=p)
        all_masks = mask.unsqueeze(0).repeat(inputs_embeds.size(0), 1, 1)
        return torch.where(all_masks == 1, inputs_embeds, 0) * 1 / (1 - p)

    def forward(
        self,
        input_values: torch.Tensor,
        speaker_embeddings: Optional[torch.Tensor] = None,
    ):
        inputs_embeds = input_values
        for layer in self.layers:
            inputs_embeds = nn.functional.relu(layer(inputs_embeds))
            inputs_embeds = self._consistent_dropout(inputs_embeds, self.config.speech_decoder_prenet_dropout)

        inputs_embeds = self.final_layer(inputs_embeds)
        inputs_embeds = self.encode_positions(inputs_embeds)

        if speaker_embeddings is not None:
            speaker_embeddings = nn.functional.normalize(speaker_embeddings)
            speaker_embeddings = speaker_embeddings.unsqueeze(1).expand(-1, inputs_embeds.size(1), -1)
            inputs_embeds = torch.cat([inputs_embeds, speaker_embeddings], dim=-1)
            inputs_embeds = nn.functional.relu(self.speaker_embeds_layer(inputs_embeds))

        return inputs_embeds


class SpeechT5BatchNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()

        if layer_id == 0:
            in_conv_dim = config.num_mel_bins
        else:
            in_conv_dim = config.speech_decoder_postnet_units

        if layer_id == config.speech_decoder_postnet_layers - 1:
            out_conv_dim = config.num_mel_bins
        else:
            out_conv_dim = config.speech_decoder_postnet_units

        self.conv = nn.Conv1d(
            in_conv_dim,
            out_conv_dim,
            kernel_size=config.speech_decoder_postnet_kernel,
            stride=1,
            padding=(config.speech_decoder_postnet_kernel - 1) // 2,
            bias=False,
        )
        self.batch_norm = nn.BatchNorm1d(out_conv_dim)

        if layer_id < config.speech_decoder_postnet_layers - 1:
            self.activation = nn.Tanh()
        else:
            self.activation = None

        self.dropout = nn.Dropout(config.speech_decoder_postnet_dropout)

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)
        hidden_states = self.batch_norm(hidden_states)
        if self.activation is not None:
            hidden_states = self.activation(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class SpeechT5SpeechDecoderPostnet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.feat_out = nn.Linear(config.hidden_size, config.num_mel_bins * config.reduction_factor)
        self.prob_out = nn.Linear(config.hidden_size, config.reduction_factor)

        self.layers = nn.ModuleList(
            [SpeechT5BatchNormConvLayer(config, i) for i in range(config.speech_decoder_postnet_layers)]
        )

    def forward(self, hidden_states: torch.Tensor):
        outputs_before_postnet = self.feat_out(hidden_states).view(hidden_states.size(0), -1, self.config.num_mel_bins)
        outputs_after_postnet = self.postnet(outputs_before_postnet)
        logits = self.prob_out(hidden_states).view(hidden_states.size(0), -1)
        return outputs_before_postnet, outputs_after_postnet, logits

    def postnet(self, hidden_states: torch.Tensor):
        layer_output = hidden_states.transpose(1, 2)
        for layer in self.layers:
            layer_output = layer(layer_output)
        return hidden_states + layer_output.transpose(1, 2)


class SpeechT5TextEncoderPrenet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
        self.encode_positions = SpeechT5ScaledPositionalEncoding(
            config.positional_dropout,
            config.hidden_size,
            config.max_text_positions,
        )

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(self, input_ids: torch.Tensor):
        inputs_embeds = self.embed_tokens(input_ids)
        inputs_embeds = self.encode_positions(inputs_embeds)
        return inputs_embeds


class SpeechT5TextDecoderPrenet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.dropout = nn.Dropout(config.positional_dropout)
        self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)

        self.embed_positions = SpeechT5SinusoidalPositionalEmbedding(
            config.max_text_positions + config.pad_token_id + 1,
            config.hidden_size,
            config.pad_token_id,
        )

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
    ):
        if input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        else:
            raise ValueError("You have to specify `decoder_input_ids`")

        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
        positions = self.embed_positions(input_ids, past_key_values_length)

        inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
        inputs_embeds += positions
        inputs_embeds = self.dropout(inputs_embeds)

        return inputs_embeds, attention_mask


class SpeechT5TextDecoderPostnet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def forward(self, hidden_states: torch.Tensor):
        return self.lm_head(hidden_states)

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings
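

# Illustration, not part of the original module: the relative positional
# encoding above clamps pairwise token distances to [-max_length, max_length-1]
# before the embedding lookup, so arbitrarily long sequences reuse the edge
# buckets. Standalone sketch of the index computation with toy sizes.
def _example_relative_positions(seq_len: int = 5, max_length: int = 3):
    pos_seq = torch.arange(seq_len)[:, None] - torch.arange(seq_len)[None, :]
    pos_seq = pos_seq.clamp(-max_length, max_length - 1) + max_length
    return pos_seq  # indices into a 2 * max_length embedding table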


class SpeechT5Attention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper with relative position bias (see
    https://aclanthology.org/N18-2074.pdf)
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        position_bias: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling
        # get key, value proj
        if is_cross_attention and past_key_value is not None:
            # reuse k,v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            past_key_value = (key_states, value_states)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # relative attention bias
        if position_bias is not None:
            reshape_q = query_states.contiguous().view(bsz * self.num_heads, -1, self.head_dim).transpose(0, 1)
            rel_pos_bias = torch.matmul(reshape_q, position_bias.transpose(-2, -1))
            rel_pos_bias = rel_pos_bias.transpose(0, 1).view(
                bsz * self.num_heads, position_bias.size(0), position_bias.size(1)
            )
            attn_weights += rel_pos_bias

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value


class SpeechT5FeedForward(nn.Module):
    def __init__(self, config, intermediate_size):
        super().__init__()
        self.intermediate_dropout = nn.Dropout(config.activation_dropout)

        self.intermediate_dense = nn.Linear(config.hidden_size, intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

        self.output_dense = nn.Linear(intermediate_size, config.hidden_size)
        self.output_dropout = nn.Dropout(config.hidden_dropout)

    def forward(self, hidden_states):
        hidden_states = self.intermediate_dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        hidden_states = self.intermediate_dropout(hidden_states)

        hidden_states = self.output_dense(hidden_states)
        hidden_states = self.output_dropout(hidden_states)
        return hidden_states


class SpeechT5EncoderLayer(nn.Module):
    def __init__(self, config: SpeechT5Config):
        super().__init__()
        self.attention = SpeechT5Attention(
            embed_dim=config.hidden_size,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=False,
        )
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.feed_forward = SpeechT5FeedForward(config, config.encoder_ffn_dim)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        position_bias: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`):
                attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
                large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(config.encoder_attention_heads,)`.
            position_bias (`torch.FloatTensor`):
                relative position embeddings of size `(seq_len, seq_len, hidden_size // encoder_attention_heads)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states
        hidden_states, attn_weights, _ = self.attention(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            position_bias=position_bias,
            output_attentions=output_attentions,
        )

        hidden_states = self.dropout(hidden_states)
        hidden_states = residual + hidden_states

        hidden_states = self.layer_norm(hidden_states)
        hidden_states = hidden_states + self.feed_forward(hidden_states)
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class SpeechT5DecoderLayer(nn.Module):
    def __init__(self, config: SpeechT5Config):
        super().__init__()
        self.self_attn = SpeechT5Attention(
            embed_dim=config.hidden_size,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
        )
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.encoder_attn = SpeechT5Attention(
            config.hidden_size,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.feed_forward = SpeechT5FeedForward(config, config.decoder_ffn_dim)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, hidden_size)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        # Self Attention
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # add present self-attn cache to positions 1,2 of present_key_value tuple
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=self_attn_past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = self.dropout(hidden_states)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Cross-Attention Block
        cross_attn_present_key_value = None
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states

            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=cross_attn_past_key_value,
                output_attentions=output_attentions,
            )
            hidden_states = self.dropout(hidden_states)
            hidden_states = residual + hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)

            # add cross-attn to positions 3,4 of present_key_value tuple
            present_key_value = present_key_value + cross_attn_present_key_value

        # Fully Connected
        hidden_states = hidden_states + self.feed_forward(hidden_states)
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


@auto_docstring
class SpeechT5PreTrainedModel(PreTrainedModel):
    config_class = SpeechT5Config
    base_model_prefix = "speecht5"
    main_input_name = "input_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, SpeechT5PositionalConvEmbedding):
            nn.init.normal_(
                module.conv.weight,
                mean=0,
                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
            )
            nn.init.constant_(module.conv.bias, 0)
        elif isinstance(module, SpeechT5FeatureProjection):
            k = math.sqrt(1 / module.projection.in_features)
            nn.init.uniform_(module.projection.weight, a=-k, b=k)
            nn.init.uniform_(module.projection.bias, a=-k, b=k)
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Conv1d):
            nn.init.kaiming_normal_(module.weight)
            if module.bias is not None:
                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                nn.init.uniform_(module.bias, a=-k, b=k)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


class SpeechT5Encoder(SpeechT5PreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* layers. Each layer is a [`SpeechT5EncoderLayer`].
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layerdrop = config.encoder_layerdrop

        self.layers = nn.ModuleList([SpeechT5EncoderLayer(config) for _ in range(config.encoder_layers)])

        self.embed_positions = SpeechT5RelativePositionalEncoding(
            config.hidden_size // config.encoder_attention_heads, config.encoder_max_relative_position
        )

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
                Features extracted from the speech or text input by the encoder prenet.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if attention_mask is not None:
            # extend attention_mask
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        position_bias = self.embed_positions(hidden_states)

        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None and head_mask.size()[0] != len(self.layers):
            raise ValueError(
                f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
                f" {head_mask.size()[0]}."
            )

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            skip_the_layer = False
            if self.training:
                dropout_probability = torch.rand([])
                skip_the_layer = dropout_probability < self.layerdrop

            if not skip_the_layer or synced_gpus:
                # under fsdp or deepspeed zero3 all gpus must run in sync
                if self.gradient_checkpointing and self.training:
                    layer_outputs = self._gradient_checkpointing_func(
                        encoder_layer.__call__,
                        hidden_states,
                        attention_mask,
                        head_mask[idx] if head_mask is not None else None,
                        position_bias,
                        output_attentions,
                    )
                else:
                    layer_outputs = encoder_layer(
                        hidden_states,
                        attention_mask=attention_mask,
                        position_bias=position_bias,
                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                        output_attentions=output_attentions,
                    )
                hidden_states = layer_outputs[0]

            if skip_the_layer:
                layer_outputs = (None, None)

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
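

# Illustration, not part of the original module: LayerDrop
# (https://arxiv.org/abs/1909.11556) as used above, sketched standalone —
# during training each layer is skipped with probability `layerdrop`.
def _example_layerdrop(layers, hidden_states, layerdrop: float = 0.1, training: bool = True):
    for layer in layers:
        if training and torch.rand(()) < layerdrop:
            continue  # skip the whole layer this step
        hidden_states = layer(hidden_states)
    return hidden_states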


class SpeechT5EncoderWithSpeechPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Encoder that applies SpeechT5SpeechEncoderPrenet to convert the audio waveform data to
    hidden features.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5SpeechEncoderPrenet(config)
        self.wrapped_encoder = SpeechT5Encoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        hidden_states, attention_mask = self.prenet(input_values, attention_mask)

        outputs = self.wrapped_encoder(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return outputs


class SpeechT5EncoderWithTextPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Encoder that applies SpeechT5TextEncoderPrenet to convert the input_ids to hidden features.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5TextEncoderPrenet(config)
        self.wrapped_encoder = SpeechT5Encoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.prenet.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.prenet.set_input_embeddings(value)

    def forward(
        self,
        input_values: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        hidden_states = self.prenet(input_values)

        outputs = self.wrapped_encoder(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return outputs


class SpeechT5EncoderWithoutPrenet(SpeechT5PreTrainedModel):
    """
    This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
    [`SpeechT5Model`].
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.wrapped_encoder = SpeechT5Encoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        return self.wrapped_encoder(
            hidden_states=input_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class SpeechT5Decoder(SpeechT5PreTrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SpeechT5DecoderLayer`]
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.layerdrop = config.decoder_layerdrop

        self.layers = nn.ModuleList([SpeechT5DecoderLayer(config) for _ in range(config.decoder_layers)])

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
                Features extracted from the speech or text input by the decoder prenet.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
                cross-attention on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        input_shape = hidden_states.size()[:-1]

        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        attention_mask = _prepare_4d_causal_attention_mask(
            attention_mask, input_shape, hidden_states, past_key_values_length
        )

        # expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _prepare_4d_attention_mask(
                encoder_attention_mask, hidden_states.dtype, tgt_len=input_shape[-1]
            )

        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
        next_decoder_cache = () if use_cache else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None:
                if attn_mask.size()[0] != len(self.layers):
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                        f" {attn_mask.size()[0]}."
                    )

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            skip_the_layer = False
            if self.training:
                dropout_probability = torch.rand([])
                skip_the_layer = dropout_probability < self.layerdrop
            if skip_the_layer and not synced_gpus:
                continue

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    head_mask[idx] if head_mask is not None else None,
                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
                    None,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    cross_attn_layer_head_mask=(
                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
                    ),
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )
            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attentions, all_cross_attentions]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
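

# Illustration, not part of the original module: the decoder combines a causal
# triangle with the padding mask. Conceptually, in the additive-mask
# convention used above, position i may attend only to positions <= i:
def _example_causal_mask(seq_len: int = 4):
    causal = torch.full((seq_len, seq_len), float("-inf")).triu(1)
    # padding positions would contribute a further -inf via the expanded
    # attention_mask before softmax
    return causal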


class SpeechT5DecoderWithSpeechPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Decoder that applies SpeechT5SpeechDecoderPrenet to convert log-mel filterbanks to hidden
    features.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5SpeechDecoderPrenet(config)
        self.wrapped_decoder = SpeechT5Decoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        speaker_embeddings: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        decoder_hidden_states = self.prenet(input_values, speaker_embeddings)

        outputs = self.wrapped_decoder(
            hidden_states=decoder_hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return outputs


class SpeechT5DecoderWithTextPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Decoder that applies SpeechT5TextDecoderPrenet to convert input tokens to hidden features.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5TextDecoderPrenet(config)
        self.wrapped_decoder = SpeechT5Decoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.prenet.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.prenet.set_input_embeddings(value)

    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        decoder_hidden_states, attention_mask = self.prenet(input_values, attention_mask, past_key_values)

        outputs = self.wrapped_decoder(
            hidden_states=decoder_hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return outputs


class SpeechT5DecoderWithoutPrenet(SpeechT5PreTrainedModel):
    """
    This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
    [`SpeechT5Model`].
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.wrapped_decoder = SpeechT5Decoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        return self.wrapped_decoder(
            hidden_states=input_values,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class SpeechT5GuidedMultiheadAttentionLoss(nn.Module):
    """
    Guided attention loss from the paper [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional
    Networks with Guided Attention](https://arxiv.org/abs/1710.08969), adapted for multi-head attention.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__()
        self.sigma = config.guided_attention_loss_sigma
        self.scale = config.guided_attention_loss_scale

    def forward(
        self, attentions: torch.FloatTensor, input_masks: torch.BoolTensor, output_masks: torch.BoolTensor
    ) -> torch.Tensor:
        """
        Compute the attention loss.

        Args:
            attentions (`torch.FloatTensor` of shape `(batch_size, layers * heads, output_sequence_length, input_sequence_length)`):
                Batch of multi-head attention weights
            input_masks (`torch.BoolTensor` of shape `(batch_size, input_sequence_length)`):
                Input attention mask as booleans.
            output_masks (`torch.BoolTensor` of shape `(batch_size, output_sequence_length)`):
                Target attention mask as booleans.

        Returns:
            `torch.Tensor` with the loss value
        """
        guided_attn_masks = self._make_guided_attention_masks(input_masks, output_masks, attentions.device)
        masks = output_masks.unsqueeze(-1) & input_masks.unsqueeze(-2)
        masks = masks.to(attentions.device).unsqueeze(1)

        losses = guided_attn_masks * attentions
        loss = torch.mean(losses.masked_select(masks))
        return self.scale * loss

    def _make_guided_attention_masks(self, input_masks, output_masks, device):
        input_lengths = input_masks.sum(-1)
        output_lengths = output_masks.sum(-1)

        guided_attn_masks = torch.zeros(
            (len(input_masks), output_masks.shape[1], input_masks.shape[1]), device=device
        )

        for idx, (ilen, olen) in enumerate(zip(input_lengths, output_lengths)):
            guided_attn_masks[idx, :olen, :ilen] = self._make_guided_attention_mask(ilen, olen, self.sigma, device)

        return guided_attn_masks.unsqueeze(1)

    @staticmethod
    def _make_guided_attention_mask(input_length, output_length, sigma, device):
        grid_y, grid_x = torch.meshgrid(
            torch.arange(input_length, device=device),
            torch.arange(output_length, device=device),
            indexing="xy",
        )
        grid_y = grid_y.float() / input_length
        grid_x = grid_x.float() / output_length
        return 1.0 - torch.exp(-((grid_y - grid_x) ** 2) / (2 * sigma**2))
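

# Illustration, not part of the original module: the guided-attention weight
# for encoder step n and decoder step t is 1 - exp(-((n/N - t/T)^2) / (2*sigma^2)),
# near zero on the alignment diagonal and growing off-diagonal. Toy sizes only.
def _example_guided_attention_mask(input_length: int = 4, output_length: int = 6, sigma: float = 0.4):
    grid_y, grid_x = torch.meshgrid(
        torch.arange(input_length), torch.arange(output_length), indexing="xy"
    )
    return 1.0 - torch.exp(-((grid_y / input_length - grid_x / output_length) ** 2) / (2 * sigma**2))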


class SpeechT5SpectrogramLoss(nn.Module):
    """
    Loss computation used by SpeechT5ForTextToSpeech.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__()
        self.use_guided_attention_loss = config.use_guided_attention_loss
        self.guided_attention_loss_num_heads = config.guided_attention_loss_num_heads
        self.reduction_factor = config.reduction_factor

        self.l1_criterion = L1Loss()
        self.bce_criterion = BCEWithLogitsLoss(pos_weight=torch.tensor(5.0))

        if self.use_guided_attention_loss:
            self.attn_criterion = SpeechT5GuidedMultiheadAttentionLoss(config)

    def forward(
        self,
        attention_mask: torch.LongTensor,
        outputs_before_postnet: torch.FloatTensor,
        outputs_after_postnet: torch.FloatTensor,
        logits: torch.FloatTensor,
        labels: torch.FloatTensor,
        cross_attentions: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        padding_mask = labels != -100.0

        # mask out the padded portions
        labels = labels.masked_select(padding_mask)
        outputs_before_postnet = outputs_before_postnet.masked_select(padding_mask)
        outputs_after_postnet = outputs_after_postnet.masked_select(padding_mask)

        # spectrogram loss
        l1_loss = self.l1_criterion(outputs_after_postnet, labels) + self.l1_criterion(outputs_before_postnet, labels)

        # construct stop labels from the padding mask
        masks = padding_mask[:, :, 0]
        stop_labels = torch.cat([~masks * 1.0, torch.ones(masks.size(0), 1).to(masks.device)], dim=1)
        stop_labels = stop_labels[:, 1:].masked_select(masks)
        logits = logits.masked_select(masks)

        # stop token loss
        bce_loss = self.bce_criterion(logits, stop_labels)

        # combined loss
        loss = l1_loss + bce_loss

        # guided attention loss
        if self.use_guided_attention_loss:
            attn = torch.cat([x[:, : self.guided_attention_loss_num_heads] for x in cross_attentions], dim=1)
            input_masks = attention_mask == 1
            output_masks = padding_mask[:, :, 0]
            if self.reduction_factor > 1:
                output_masks = output_masks[:, self.reduction_factor - 1 :: self.reduction_factor]
            attn_loss = self.attn_criterion(attn, input_masks, output_masks)
            loss += attn_loss

        return loss


@auto_docstring(
    custom_intro="""
    The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets.
    """
)
class SpeechT5Model(SpeechT5PreTrainedModel):
    def __init__(
        self,
        config: SpeechT5Config,
        encoder: Optional[nn.Module] = None,
        decoder: Optional[nn.Module] = None,
    ):
        """
        encoder (`PreTrainedModel`, *optional*):
            The encoder model to use.
        decoder (`PreTrainedModel`, *optional*):
            The decoder model to use.
        """
        super().__init__(config)
        self.config = config
        self.encoder = SpeechT5EncoderWithoutPrenet(config) if encoder is None else encoder
        self.decoder = SpeechT5DecoderWithoutPrenet(config) if decoder is None else decoder

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        if isinstance(self.encoder, SpeechT5EncoderWithTextPrenet):
            return self.encoder.get_input_embeddings()
        if isinstance(self.decoder, SpeechT5DecoderWithTextPrenet):
            return self.decoder.get_input_embeddings()
        raise NotImplementedError

    def set_input_embeddings(self, value):
        if isinstance(self.encoder, SpeechT5EncoderWithTextPrenet):
            self.encoder.set_input_embeddings(value)
        if isinstance(self.decoder, SpeechT5DecoderWithTextPrenet):
            self.decoder.set_input_embeddings(value)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        if isinstance(self.encoder, SpeechT5EncoderWithSpeechPrenet):
            self.encoder.prenet.freeze_feature_encoder()

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_values: Optional[torch.Tensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
        r"""
        input_values (`torch.Tensor` of shape `(batch_size, sequence_length)`):
            Depending on which encoder is being used, the `input_values` are either: float values of the input raw
            speech waveform, or indices of input sequence tokens in the vocabulary, or hidden states.
        decoder_input_values (`torch.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Depending on which decoder is being used, the `decoder_input_values` are either: float values of log-mel
            filterbank features extracted from the raw speech waveform, or indices of decoder input sequence tokens in
            the vocabulary, or hidden states.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_values=input_values,
                attention_mask=attention_mask,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        # wrap a user-supplied tuple in a BaseModelOutput when return_dict=True
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # downsample encoder attention mask (only for encoders with speech input)
        if attention_mask is not None and isinstance(self.encoder, SpeechT5EncoderWithSpeechPrenet):
            encoder_attention_mask = self.encoder.prenet._get_feature_vector_attention_mask(
                encoder_outputs[0].shape[1], attention_mask
            )
        else:
            encoder_attention_mask = attention_mask

        if isinstance(self.decoder, SpeechT5DecoderWithSpeechPrenet):
            decoder_args = {"speaker_embeddings": speaker_embeddings}
        else:
            decoder_args = {}

        decoder_outputs = self.decoder(
            input_values=decoder_input_values,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=encoder_attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **decoder_args,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    SpeechT5 Model with a speech encoder and a text decoder.
    """
)
class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["text_decoder_postnet.lm_head.weight"]

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)

        if config.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that does not define the"
                " vocabulary size of the language model head. Please instantiate the model as follows:"
                " `SpeechT5ForSpeechToText.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of"
                " your model's configuration."
            )

        speech_encoder = SpeechT5EncoderWithSpeechPrenet(config)
        text_decoder = SpeechT5DecoderWithTextPrenet(config)
        self.speecht5 = SpeechT5Model(config, speech_encoder, text_decoder)

        self.text_decoder_postnet = SpeechT5TextDecoderPostnet(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.speecht5.get_encoder()

    def get_decoder(self):
        return self.speecht5.get_decoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.get_encoder().prenet.freeze_feature_encoder()

    def get_output_embeddings(self):
        return self.text_decoder_postnet.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.text_decoder_postnet.set_output_embeddings(new_embeddings)

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, Seq2SeqLMOutput]:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
            into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (*pip install
            soundfile*). To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            SpeechT5 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
            or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
            only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            Label indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToText
        >>> from datasets import load_dataset

        >>> dataset = load_dataset(
        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True
        ... )  # doctest: +IGNORE_RESULT
        >>> dataset = dataset.sort("id")
        >>> sampling_rate = dataset.features["audio"].sampling_rate

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
        >>> model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr")

        >>> # audio file is decoded on the fly
        >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
        >>> predicted_ids = model.generate(**inputs, max_length=100)

        >>> # transcribe speech
        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        >>> transcription[0]
        'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'
        ```

        ```python
        >>> inputs["labels"] = processor(text_target=dataset[0]["text"], return_tensors="pt").input_ids

        >>> # compute loss
        >>> loss = model(**inputs).loss
        >>> round(loss.item(), 2)
        19.68
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if decoder_input_ids is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs = self.speecht5(
            input_values=input_values,
            attention_mask=attention_mask,
            decoder_input_values=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        logits = self.text_decoder_postnet(outputs[0])

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


def _generate_speech(
    model: SpeechT5PreTrainedModel,
    input_values: torch.FloatTensor,
    speaker_embeddings: Optional[torch.FloatTensor] = None,
    attention_mask: Optional[torch.LongTensor] = None,
    threshold: float = 0.5,
    minlenratio: float = 0.0,
    maxlenratio: float = 20.0,
    vocoder: Optional[nn.Module] = None,
    output_cross_attentions: bool = False,
    return_output_lengths: bool = False,
) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, torch.FloatTensor]]:
    if speaker_embeddings is None:
        raise ValueError(
            """`speaker_embeddings` must be specified. For example, you can use a speaker embeddings by following
                    the code snippet provided in this link:
                    https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors
                    """
        )

    if attention_mask is None:
        encoder_attention_mask = 1 - (input_values == model.config.pad_token_id).int()
    else:
        encoder_attention_mask = attention_mask

    bsz = input_values.size(0)

    encoder_out = model.speecht5.encoder(
        input_values=input_values,
        attention_mask=encoder_attention_mask,
        return_dict=True,
    )

    encoder_last_hidden_state = encoder_out.last_hidden_state

    # downsample encoder attention mask (only for encoders with a speech prenet)
    if isinstance(model.speecht5.encoder, SpeechT5EncoderWithSpeechPrenet):
        encoder_attention_mask = model.speecht5.encoder.prenet._get_feature_vector_attention_mask(
            encoder_out[0].shape[1], encoder_attention_mask
        )

    maxlen = int(encoder_last_hidden_state.size(1) * maxlenratio / model.config.reduction_factor)
    minlen = int(encoder_last_hidden_state.size(1) * minlenratio / model.config.reduction_factor)

    # Start the output sequence with a mel spectrum that is all zeros.
    output_sequence = encoder_last_hidden_state.new_zeros(bsz, 1, model.config.num_mel_bins)

    spectrogram = []
    cross_attentions = []
    past_key_values = None
    idx = 0
    result_spectrogram = {}

    while True:
        idx += 1

        # Run the decoder prenet on the entire output sequence.
        decoder_hidden_states = model.speecht5.decoder.prenet(output_sequence, speaker_embeddings)
        # Run the decoder layers on the last element of the prenet output.
        decoder_out = model.speecht5.decoder.wrapped_decoder(
            hidden_states=decoder_hidden_states[:, -1:],
            attention_mask=None,
            encoder_hidden_states=encoder_last_hidden_state,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=True,
            output_attentions=output_cross_attentions,
            return_dict=True,
        )

        if output_cross_attentions:
            cross_attentions.append(torch.cat(decoder_out.cross_attentions, dim=0))

        last_decoder_output = decoder_out.last_hidden_state.squeeze(1)
        past_key_values = decoder_out.past_key_values

        # Predict the new mel spectrum for this step in the sequence.
        spectrum = model.speech_decoder_postnet.feat_out(last_decoder_output)
        spectrum = spectrum.view(bsz, model.config.reduction_factor, model.config.num_mel_bins)
        spectrogram.append(spectrum)

        # Extend the output sequence with the new mel spectrum.
        new_spectrogram = spectrum[:, -1, :].view(bsz, 1, model.config.num_mel_bins)
        output_sequence = torch.cat((output_sequence, new_spectrogram), dim=1)

        # Predict the probability that this is the stop token.
        prob = torch.sigmoid(model.speech_decoder_postnet.prob_out(last_decoder_output))

        if idx < minlen:
            continue
        else:
            # If the generation loop is shorter than the maximum length, collect the batch entries that
            # have met the probability threshold. Otherwise finalize all remaining entries.
            if idx < maxlen:
                meet_thresholds = torch.sum(prob, dim=-1) >= threshold
                meet_indexes = torch.where(meet_thresholds)[0].tolist()
            else:
                meet_indexes = range(len(prob))
            meet_indexes = [i for i in meet_indexes if i not in result_spectrogram]
            if len(meet_indexes) > 0:
                spectrograms = torch.stack(spectrogram)
                spectrograms = spectrograms.transpose(0, 1).flatten(1, 2)
                spectrograms = model.speech_decoder_postnet.postnet(spectrograms)
                for meet_index in meet_indexes:
                    result_spectrogram[meet_index] = spectrograms[meet_index]
            if len(result_spectrogram) >= bsz:
                break

    spectrograms = [result_spectrogram[i] for i in range(len(result_spectrogram))]
    if not return_output_lengths:
        spectrogram = spectrograms[0] if bsz == 1 else torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
        if vocoder is not None:
            outputs = vocoder(spectrogram)
        else:
            outputs = spectrogram
        if output_cross_attentions:
            cross_attentions = torch.cat(cross_attentions, dim=2)
            if bsz > 1:
                cross_attentions = cross_attentions.view(
                    bsz, int(cross_attentions.size(0) / bsz), *cross_attentions.size()[-3:]
                )
            outputs = (outputs, cross_attentions)
    else:
        spectrogram_lengths = []
        for i in range(bsz):
            spectrogram_lengths.append(spectrograms[i].size(0))
        if vocoder is None:
            spectrograms = torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
            outputs = (spectrograms, spectrogram_lengths)
        else:
            spectrograms = torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
            waveforms = vocoder(spectrograms)
            waveform_lengths = [int(waveforms.size(1) / max(spectrogram_lengths)) * i for i in spectrogram_lengths]
            outputs = (waveforms, waveform_lengths)
        if output_cross_attentions:
            cross_attentions = torch.cat(cross_attentions, dim=2)
            cross_attentions = cross_attentions.view(
                bsz, int(cross_attentions.size(0) / bsz), *cross_attentions.size()[-3:]
            )
            outputs = (*outputs, cross_attentions)
    return outputs
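# Illustrative note on _generate_speech: each batch element stops once the summed sigmoid of its
# stop logits reaches `threshold`, but never before minlen = input_length * minlenratio /
# reduction_factor steps; after maxlen = input_length * maxlenratio / reduction_factor steps all
# remaining elements are finalized through the postnet regardless of the stop probability.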
@auto_docstring(
    custom_intro="""
    SpeechT5 Model with a text encoder and a speech decoder.
    """
)
class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel):
    main_input_name = "input_ids"

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)

        if config.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that does not define the"
                " vocabulary size of the language model head. Please instantiate the model as follows:"
                " `SpeechT5ForTextToSpeech.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of"
                " your model's configuration."
            )

        text_encoder = SpeechT5EncoderWithTextPrenet(config)
        speech_decoder = SpeechT5DecoderWithSpeechPrenet(config)
        self.speecht5 = SpeechT5Model(config, text_encoder, speech_decoder)

        self.speech_decoder_postnet = SpeechT5SpeechDecoderPostnet(config)

        # Initialize weights and apply final processing
        self.post_init()

    @classmethod
    def can_generate(cls) -> bool:
        # this class has a non-standard `generate` method and does not inherit from `GenerationMixin`,
        # so the base `can_generate()` would return `False`; override it to keep `GenerationConfig`
        # handling working across the codebase
        return True

    def get_encoder(self):
        return self.speecht5.get_encoder()

    def get_decoder(self):
        return self.speecht5.get_decoder()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_values: Optional[torch.FloatTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.FloatTensor] = None,
        stop_labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, Seq2SeqSpectrogramOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
            [`~PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
            Float values of input mel spectrogram.

            SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
            Float values of target mel spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
            computation. Spectrograms can be obtained using [`SpeechT5Processor`]. See [`SpeechT5Processor.__call__`]
            for details.
        stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Binary tensor indicating the position of the stop token in the sequence.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, set_seed
        >>> import torch

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        >>> model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate(inputs["input_ids"], speaker_embeddings=speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([15872])
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if decoder_input_values is None:
                decoder_input_values, decoder_attention_mask = shift_spectrograms_right(
                    labels, self.config.reduction_factor, decoder_attention_mask
                )
            if self.config.use_guided_attention_loss:
                output_attentions = True

        outputs = self.speecht5(
            input_values=input_ids,
            attention_mask=attention_mask,
            decoder_input_values=decoder_input_values,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            use_cache=use_cache,
            speaker_embeddings=speaker_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        outputs_before_postnet, outputs_after_postnet, logits = self.speech_decoder_postnet(outputs[0])

        loss = None
        if labels is not None:
            criterion = SpeechT5SpectrogramLoss(self.config)
            loss = criterion(
                attention_mask,
                outputs_before_postnet,
                outputs_after_postnet,
                logits,
                labels,
                outputs.cross_attentions,
            )

        if not return_dict:
            output = (outputs_after_postnet,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSpectrogramOutput(
            loss=loss,
            spectrogram=outputs_after_postnet,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.LongTensor] = None,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        threshold: float = 0.5,
        minlenratio: float = 0.0,
        maxlenratio: float = 20.0,
        vocoder: Optional[nn.Module] = None,
        output_cross_attentions: bool = False,
        return_output_lengths: bool = False,
        **kwargs,
    ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, torch.FloatTensor]]:
        r"""
        Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
        speech waveform using a vocoder.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
                [`~PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Attention mask from the tokenizer, required for batched inference to signal to the model where to
                ignore padded tokens from the input_ids.
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `List[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `List[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        Nr   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch_size.r   r  r#   r  )rj   r   r*   r  r  r  r  r  r  r  kwargsrS   r%   r%   r&   generateH
  s(   E
z SpeechT5ForTextToSpeech.generatec
                 C   s^   |dur"| d}
| d|
kr"| ddkr||
d}ntdt| |||||||||	
S )a  
        Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
        speech waveform using a vocoder.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
                [`~PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*, defaults to `None`):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `List[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `List[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        """
        if speaker_embeddings is not None:
            batch_size = input_ids.size(0)
            if speaker_embeddings.size(0) != batch_size:
                if speaker_embeddings.size(0) == 1:
                    speaker_embeddings = speaker_embeddings.repeat(batch_size, 1)
                else:
                    raise ValueError(
                        "The first dimension of speaker_embeddings must be either 1 or the same as batch size."
                    )

        return _generate_speech(
            self,
            input_ids,
            speaker_embeddings,
            attention_mask,
            threshold,
            minlenratio,
            maxlenratio,
            vocoder,
            output_cross_attentions,
            return_output_lengths,
        )


@auto_docstring(
    custom_intro="""
    SpeechT5 Model with a speech encoder and a speech decoder.
    """
)
class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel):
    def __init__(self, config: SpeechT5Config):
        super().__init__(config)

        speech_encoder = SpeechT5EncoderWithSpeechPrenet(config)
        speech_decoder = SpeechT5DecoderWithSpeechPrenet(config)
        self.speecht5 = SpeechT5Model(config, speech_encoder, speech_decoder)

        self.speech_decoder_postnet = SpeechT5SpeechDecoderPostnet(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.speecht5.get_encoder()

    def get_decoder(self):
        return self.speecht5.get_decoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.get_encoder().prenet.freeze_feature_encoder()

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_values: Optional[torch.FloatTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.FloatTensor] = None,
        stop_labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, Seq2SeqSpectrogramOutput]:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
            into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (*pip install
            soundfile*). To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
        decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
            Float values of input mel spectrogram.

            SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
            Float values of target mel spectrogram. Spectrograms can be obtained using [`SpeechT5Processor`]. See
            [`SpeechT5Processor.__call__`] for details.
        stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Binary tensor indicating the position of the stop token in the sequence.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, set_seed
        >>> from datasets import load_dataset
        >>> import torch

        >>> dataset = load_dataset(
        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True
        ... )  # doctest: +IGNORE_RESULT
        >>> dataset = dataset.sort("id")
        >>> sampling_rate = dataset.features["audio"].sampling_rate

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
        >>> model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> # audio file is decoded on the fly
        >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

        >>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([77824])
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if decoder_input_values is None:
                decoder_input_values, decoder_attention_mask = shift_spectrograms_right(
                    labels, self.config.reduction_factor, decoder_attention_mask
                )

        outputs = self.speecht5(
            input_values=input_values,
            attention_mask=attention_mask,
            decoder_input_values=decoder_input_values,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            use_cache=use_cache,
            speaker_embeddings=speaker_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        _, spectrogram, logits = self.speech_decoder_postnet(outputs[0])

        loss = None

        if not return_dict:
            output = (spectrogram,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSpectrogramOutput(
            loss=loss,
            spectrogram=spectrogram,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    @torch.no_grad()
    def generate_speech(
        self,
        input_values: torch.FloatTensor,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        threshold: float = 0.5,
        minlenratio: float = 0.0,
        maxlenratio: float = 20.0,
        vocoder: Optional[nn.Module] = None,
        output_cross_attentions: bool = False,
        return_output_lengths: bool = False,
    ) -> torch.FloatTensor:
        r"""
        Converts a raw speech waveform into a sequence of mel spectrograms, which are subsequently turned back into a
        speech waveform using a vocoder.

        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                Float values of input raw speech waveform.

                Values can be obtained by loading a *.flac* or *.wav* audio file into an array of type `List[float]` or
                a `numpy.ndarray`, *e.g.* via the soundfile library (*pip install soundfile*). To prepare the array
                into `input_values`, the [`SpeechT5Processor`] should be used for padding and conversion into a tensor
                of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*, defaults to `None`):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `List[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `List[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        N)r   i   r   )r   rJ   r   r  )
rj   r(   r  r*   r  r  r  r  r  r  r%   r%   r&   r    s   Iz)SpeechT5ForSpeechToSpeech.generate_speechr  r  )rv   rw   rx   r   r^   r  r  r   r   r   r   r  r
  r   r   rK   r   r   rr   r   r   r   r  r  ry   r%   r%   rm   r&   r    s    	

~	
r  c                       s@   e Zd Zd fdd	ZdddZd	d
 Zdd Zdd Z  ZS )HifiGanResidualBlockr
   r   r
      皙?c                    sb   t    |_t fddttD _t fddttD _d S )Nc                    s2   g | ]}t j  d | | dqS r   )r[   dilationr   r   rb   get_paddingr   channelsr  rZ   rj   r%   r&   r>     s    	z1HifiGanResidualBlock.__init__.<locals>.<listcomp>c                    s*   g | ]}t j  d d d dqS r  r  r;   )r  rZ   rj   r%   r&   r>     s    	
)	r]   r^   leaky_relu_sloper   r   rI   rN   convs1convs2)rj   r  rZ   r  r   rm   r  r&   r^     s   

	
	
zHifiGanResidualBlock.__init__r   c                 C   s   || | d S r   r%   )rj   rZ   r  r%   r%   r&   r    r  z HifiGanResidualBlock.get_paddingc                 C   sL   t jj}tt jjdrt jjj}| jD ]}|| q| jD ]}|| qd S Nr   )r   r   r   r   r   r!  r"  rj   r   r   r%   r%   r&   apply_weight_norm   s   




z&HifiGanResidualBlock.apply_weight_normc                 C   s4   | j D ]}tj| q| jD ]}tj| qd S ro   )r!  r   r   remove_weight_normr"  rj   r   r%   r%   r&   r&  *  s
   

z'HifiGanResidualBlock.remove_weight_normc                 C   sX   t | j| jD ]"\}}|}tj|| j}||}tj|| j}||}|| }q|S ro   )r  r!  r"  r   r  
leaky_relur   )rj   rq   Zconv1Zconv2rc  r%   r%   r&   rr   0  s   
zHifiGanResidualBlock.forward)r
   r  r  r  )	rv   rw   rx   r^   r  r%  r&  rr   ry   r%   r%   rm   r&   r    s    

r  z
@auto_docstring(
    custom_intro="""
    HiFi-GAN vocoder.
    """
)
class SpeechT5HifiGan(PreTrainedModel):
    config_class = SpeechT5HifiGanConfig
    main_input_name = "spectrogram"

    def __init__(self, config: SpeechT5HifiGanConfig):
        super().__init__(config)
        self.num_kernels = len(config.resblock_kernel_sizes)
        self.num_upsamples = len(config.upsample_rates)
        self.conv_pre = nn.Conv1d(
            config.model_in_dim,
            config.upsample_initial_channel,
            kernel_size=7,
            stride=1,
            padding=3,
        )

        self.upsampler = nn.ModuleList()
        for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
            self.upsampler.append(
                nn.ConvTranspose1d(
                    config.upsample_initial_channel // (2**i),
                    config.upsample_initial_channel // (2 ** (i + 1)),
                    kernel_size=kernel_size,
                    stride=upsample_rate,
                    padding=(kernel_size - upsample_rate) // 2,
                )
            )

        self.resblocks = nn.ModuleList()
        for i in range(len(self.upsampler)):
            channels = config.upsample_initial_channel // (2 ** (i + 1))
            for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
                self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))

        self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3)

        self.register_buffer("mean", torch.zeros(config.model_in_dim))
        self.register_buffer("scale", torch.ones(config.model_in_dim))

        # Initialize weights and apply final processing
        self.post_init()

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def apply_weight_norm(self):
        weight_norm = nn.utils.weight_norm
        if hasattr(nn.utils.parametrizations, "weight_norm"):
            weight_norm = nn.utils.parametrizations.weight_norm
        weight_norm(self.conv_pre)
        for layer in self.upsampler:
            weight_norm(layer)
        for layer in self.resblocks:
            layer.apply_weight_norm()
        weight_norm(self.conv_post)

    def remove_weight_norm(self):
        nn.utils.remove_weight_norm(self.conv_pre)
        for layer in self.upsampler:
            nn.utils.remove_weight_norm(layer)
        for layer in self.resblocks:
            layer.remove_weight_norm()
        nn.utils.remove_weight_norm(self.conv_post)

    @auto_docstring(
        custom_intro="""
        Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
        of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
        waveform.
        """
    )
    def forward(self, spectrogram: torch.FloatTensor) -> torch.FloatTensor:
        r"""
        spectrogram (`torch.FloatTensor`):
            Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
            config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.

        Returns:
            `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
            shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
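
        Example (a minimal sketch; the checkpoint name follows the other examples in this file):

        ```python
        >>> import torch
        >>> from transformers import SpeechT5HifiGan

        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        >>> log_mel = torch.randn(100, vocoder.config.model_in_dim)  # (sequence_length, model_in_dim)
        >>> waveform = vocoder(log_mel)
        >>> waveform.dim()
        1
        ```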
        """
        if self.config.normalize_before:
            spectrogram = (spectrogram - self.mean) / self.scale

        is_batched = spectrogram.dim() == 3
        if not is_batched:
            spectrogram = spectrogram.unsqueeze(0)

        hidden_states = spectrogram.transpose(2, 1)

        hidden_states = self.conv_pre(hidden_states)
        for i in range(self.num_upsamples):
            hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope)
            hidden_states = self.upsampler[i](hidden_states)

            res_state = self.resblocks[i * self.num_kernels](hidden_states)
            for j in range(1, self.num_kernels):
                res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
            hidden_states = res_state / self.num_kernels

        hidden_states = nn.functional.leaky_relu(hidden_states)
        hidden_states = self.conv_post(hidden_states)
        hidden_states = torch.tanh(hidden_states)

        if not is_batched:
            # remove batch dim and collapse tensor to 1-d audio waveform
            waveform = hidden_states.squeeze(0).transpose(1, 0).flatten()
        else:
            # remove seq-len dim since this collapses to 1
            waveform = hidden_states.squeeze(1)

        return waveform


__all__ = [
    "SpeechT5ForSpeechToText",
    "SpeechT5ForSpeechToSpeech",
    "SpeechT5ForTextToSpeech",
    "SpeechT5Model",
    "SpeechT5PreTrainedModel",
    "SpeechT5HifiGan",
]