o
    ZŽh;G  ã                   @   s–  d dl Z d dlZd dlmZ d dlmZmZmZ d dlZd dl	m
Z
 ddlmZmZ ddlmZ ddlmZmZ dd	lmZmZmZmZmZmZmZmZmZ d
dlmZ e e ¡Z!eG dd„ deƒƒZ"G dd„ deƒZ#G dd„ deƒZ$G dd„ deƒZ%G dd„ deƒZ&G dd„ deƒZ'G dd„ deƒZ(eG dd„ deƒƒZ)eZ*G dd„ de)eƒZ+eddG d d!„ d!e)ƒƒZ,G d"d#„ d#eƒZ-G d$d%„ d%eƒZ.g d&¢Z/dS )'é    N)Ú	dataclass)ÚOptionalÚTupleÚUnioné   )ÚModelOutputÚWav2Vec2BaseModelOutput)ÚPreTrainedModel)Úauto_docstringÚloggingé   )	ÚWav2Vec2EncoderÚWav2Vec2EncoderStableLayerNormÚWav2Vec2FeatureEncoderÚWav2Vec2FeatureProjectionÚWav2Vec2ForCTCÚ!Wav2Vec2ForSequenceClassificationÚWav2Vec2GumbelVectorQuantizerÚWav2Vec2ModelÚWav2Vec2PositionalConvEmbeddingé   )ÚUniSpeechConfigc                   @   s†   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeeej  ed< dZeeej  ed< dS )	ÚUniSpeechForPreTrainingOutputaL  
    Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions.

    Args:
        loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
            paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss.
        projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
            Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
            projected quantized states.
        projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
            Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
            target vectors for contrastive loss.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    NÚlossÚprojected_statesÚprojected_quantized_statesÚcodevector_perplexityÚhidden_statesÚ
attentions)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   ÚtorchÚFloatTensorÚ__annotations__r   r   r   r   r   r   © r&   r&   ú^/var/www/auris/lib/python3.10/site-packages/transformers/models/unispeech/modular_unispeech.pyr      s   
 r   c                   @   ó   e Zd ZdS )Ú UniSpeechPositionalConvEmbeddingN©r   r    r!   r&   r&   r&   r'   r)   A   ó    r)   c                   @   r(   )ÚUniSpeechFeatureEncoderNr*   r&   r&   r&   r'   r,   E   r+   r,   c                   @   r(   )ÚUniSpeechFeatureProjectionNr*   r&   r&   r&   r'   r-   I   r+   r-   c                   @   r(   )ÚUniSpeechEncoderNr*   r&   r&   r&   r'   r.   M   r+   r.   c                   @   r(   )ÚUniSpeechEncoderStableLayerNormNr*   r&   r&   r&   r'   r/   Q   r+   r/   c                   @   s    e Zd Zedd„ ƒZdd„ ZdS )ÚUniSpeechGumbelVectorQuantizerc                 C   s8   | j dd}t tj|t |d ¡ dd ¡ ¡ }|S )Nr   ©ÚdimgH¯¼šò×z>éÿÿÿÿ)Úmeanr#   ÚexpÚsumÚlog)ZprobsZmarginal_probsÚ
perplexityr&   r&   r'   Ú_compute_perplexityV   s   (z2UniSpeechGumbelVectorQuantizer._compute_perplexityc                 C   s  |j \}}}|  |¡}| || | j d¡}| jr?tjj| ¡ | j	dd 
|¡}tj| || | jd¡ ¡ dd}|  |¡}n$|jdd}|j|j Ž  d| dd¡d¡}| || | jd¡}|  |¡}| || d¡}| d¡| j }	|	 || | j| jd¡}
|
 d¡ ||d¡}
|
|fS )Nr3   T)ÚtauZhardr1   r   ç      ð?éþÿÿÿ)ÚshapeÚweight_projÚviewZ
num_groupsZtrainingÚnnZ
functionalZgumbel_softmaxÚfloatÚtemperatureÚtype_asr#   Zsoftmaxr9   ZargmaxZ	new_zerosZscatter_Ú	unsqueezeÚcodevectorsZnum_varsr6   )Úselfr   Ú
batch_sizeZsequence_lengthÚhidden_sizeZcodevector_probsZcodevector_soft_distr8   Zcodevector_idxZcodevectors_per_grouprE   r&   r&   r'   Úforward\   s0   
ÿþÿÿ
z&UniSpeechGumbelVectorQuantizer.forwardN)r   r    r!   Ústaticmethodr9   rI   r&   r&   r&   r'   r0   U   s    
r0   c                   @   sX   e Zd ZeZdZdZdZdZdZ	dd„ Z
deejef fdd„Zd	ed
ejfdd„ZdS )ÚUniSpeechPreTrainedModelÚ	unispeechÚinput_valuesTc              	   C   s´  t |tƒr|jjjjddd |jjj ¡  tj	 
|j¡ dS t |tƒrItj	j|jjddt d|jjd |jj  ¡ d tj	 |jjd¡ dS t |tƒrqt d|jj ¡}tj	j
|jj| |d tj	j
|jj| |d dS t |tjƒr‘|jjjd| jjd |jdur|jj ¡  dS dS t |tjtjfƒr©|jj ¡  |jj d¡ dS t |tjƒrÖtj	 |j¡ |jdurØt |j|j|jd   ¡}tj	j
|j| |d dS dS dS )	zInitialize the weightsç        r   )r4   Zstdr   r   )ÚaÚbNr;   )Ú
isinstancer0   r>   ÚweightÚdataZnormal_ZbiasZzero_r@   ÚinitÚuniform_rE   r)   ÚconvÚmathÚsqrtÚkernel_sizeZin_channelsZ	constant_r-   Z
projectionZin_featuresÚLinearÚconfigZinitializer_rangeZ	LayerNormZ	GroupNormÚfill_ZConv1dZkaiming_normal_Úgroups)rF   ÚmoduleÚkr&   r&   r'   Ú_init_weights‹   s<   

 ý

ÿ
ûz&UniSpeechPreTrainedModel._init_weightsÚinput_lengthsc                 C   s4   dd„ }t | jj| jjƒD ]
\}}||||ƒ}q|S )zH
        Computes the output length of the convolutional layers
        c                 S   s   t j| | |ddd S )NÚfloor)Zrounding_moder   )r#   Údiv)Zinput_lengthrY   Ústrider&   r&   r'   Ú_conv_out_length±   s   zSUniSpeechPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length)Úzipr[   Zconv_kernelZconv_stride)rF   ra   re   rY   rd   r&   r&   r'   Ú _get_feat_extract_output_lengths¬   s   z9UniSpeechPreTrainedModel._get_feat_extract_output_lengthsÚfeature_vector_lengthÚattention_maskc                 C   s   |j ddd d …df }|  |¡ tj¡}|jd }tj||f|j|jd}d|tj	|jd |jd|d f< | 
dg¡  d¡ 
dg¡ ¡ }|S )Nr3   r1   r   )ÚdtypeÚdevicer   )rk   )Zcumsumrg   Útor#   Úlongr=   Zzerosrj   rk   ZarangeÚflipÚbool)rF   rh   ri   Znon_padded_lengthsZoutput_lengthsrG   r&   r&   r'   Ú"_get_feature_vector_attention_mask»   s   
ÿ"z;UniSpeechPreTrainedModel._get_feature_vector_attention_maskN)r   r    r!   r   Zconfig_classZbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingZ_supports_flash_attn_2Z_supports_sdpar`   r   r#   Z
LongTensorÚintrg   rp   r&   r&   r&   r'   rK   ‚   s    !rK   c                   @   s‚   e Zd Zdefdd„Zdd„ Zdd„ Z					dd	eej	 d
eej	 deej
 dee dee dee deeef fdd„ZdS )ÚUniSpeechModelr[   c                 C   sx   t  |¡ || _t|ƒ| _t|ƒ| _|jdks|jdkr(t	 
t |j¡ ¡ ¡| _|jr1t|ƒ| _nt|ƒ| _|  ¡  d S )NrN   )rK   Ú__init__r[   r,   Úfeature_extractorr-   Úfeature_projectionZmask_time_probZmask_feature_probr@   Ú	Parameterr#   ÚTensorrH   rU   Zmasked_spec_embedZdo_stable_layer_normr/   Úencoderr.   Ú	post_init©rF   r[   r&   r&   r'   rs   Ï   s   



zUniSpeechModel.__init__c                 C   ó   t dƒ‚©NzNot needed for UniSpeech©ÚAttributeError©rF   r&   r&   r'   Úfreeze_feature_extractorà   ó   z'UniSpeechModel.freeze_feature_extractorc                 C   r{   r|   r}   r   r&   r&   r'   Úfreeze_feature_encoderã   r   z%UniSpeechModel.freeze_feature_encoderNrM   ri   Úmask_time_indicesÚoutput_attentionsÚoutput_hidden_statesÚreturn_dictÚreturnc           
      C   sÒ   |dur|n| j j}|dur|n| j j}|dur|n| j j}|  |¡}| dd¡}|dur6|  |jd |¡}|  |¡\}}| j	|||d}| j
|||||d}	|	d }|s_||f|	dd…  S t|||	j|	jdS )a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   r   )rƒ   ri   ©ri   r„   r…   r†   r   )Zlast_hidden_stateÚextract_featuresr   r   )r[   r„   r…   Úuse_return_dictrt   Ú	transposerp   r=   ru   Z_mask_hidden_statesrx   ÚUniSpeechBaseModelOutputr   r   )
rF   rM   ri   rƒ   r„   r…   r†   r‰   r   Zencoder_outputsr&   r&   r'   rI   æ   s8   ÿ
ÿûüzUniSpeechModel.forward)NNNNN)r   r    r!   r   rs   r€   r‚   r   r#   rw   r$   ro   r   r   rŒ   rI   r&   r&   r&   r'   rr   Î   s0    ùþýüûúù
ørr   zZ
    UniSpeech Model with a vector-quantization module and ctc loss for pre-training.
    )Zcustom_introc                       s¸   e Zd Zdef‡ fdd„Zdefdd„Zdd„ Zd	d
„ Ze		dde
jde
jde
jdefdd„ƒZe				ddee
j dee
j dee dee dee deeef fdd„ƒZ‡  ZS )ÚUniSpeechForPreTrainingr[   c                    s~   t ƒ  |¡ t|ƒ| _t |j¡| _t|ƒ| _	t 
|j|j¡| _t 
|j|j¡| _t 
|j|j¡| _t |j¡| _|  ¡  d S )N)Úsuperrs   rr   rL   r@   ZDropoutZfeat_quantizer_dropoutÚdropout_featuresr0   Ú	quantizerrZ   Zcodevector_dimZproj_codevector_dimÚ	project_qrH   Úproject_hidZnum_ctc_classesÚctc_projZfinal_dropoutÚdropoutry   rz   ©Ú	__class__r&   r'   rs   !  s   

z UniSpeechForPreTraining.__init__rB   c                 C   s   || j _dS )zb
        Set the Gumbel softmax temperature to a given value. Only necessary for training
        N)r   rB   )rF   rB   r&   r&   r'   Úset_gumbel_temperature0  s   z.UniSpeechForPreTraining.set_gumbel_temperaturec                 C   s   t  dt¡ |  ¡  dS )z©
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        zžThe method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.N)ÚwarningsÚwarnÚFutureWarningr‚   r   r&   r&   r'   r€   6  s
   ýz0UniSpeechForPreTraining.freeze_feature_extractorc                 C   s   | j j ¡  dS )z¨
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)rL   rt   Z_freeze_parametersr   r&   r&   r'   r‚   B  s   z.UniSpeechForPreTraining.freeze_feature_encoderr   Útarget_featuresÚnegative_featuresÚpredicted_featuresc                 C   s@   t j| |gdd} t j| ¡ |  ¡ dd}| | ¡}|| }|S )zé
        Compute logits for contrastive loss based using cosine similarity as the distance measure between
        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
        r   r1   r3   )r#   ÚcatZcosine_similarityrA   rC   )r›   rœ   r   rB   Úlogitsr&   r&   r'   Úcompute_contrastive_logitsI  s
   
z2UniSpeechForPreTraining.compute_contrastive_logitsNrM   ri   r„   r…   r†   r‡   c                 C   sJ  |dur|n| j j}| j|||||d}|d }|  |d ¡}|  |¡\}	}
|  |	 | jjj¡¡}	|  	|	¡}	t
 | d¡| d¡¡ | j j¡}| dd¡}t
 |¡ ¡  |j¡}| dd¡}| d¡}| |d¡|	 | d¡ }|  |¡}|  |¡}d}|s™|durŽ|||	|
f|dd…  S ||	|
f|dd…  S t|||	|
|j|jdS )	a›  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> # TODO: Add full pretraining example
        ```Nrˆ   r   r   r3   rN   r   )r   r   r   r   r   r   )r[   rŠ   rL   r   r   r‘   rl   rR   rj   r’   r#   ÚemptyÚsizer\   Zreplace_probr‹   Z	bernoulliro   rk   rD   Zmasked_fillr”   r“   r   r   r   )rF   rM   ri   r„   r…   r†   ZoutputsZtransformer_featuresr‰   Zquantized_featuresr   Zprob_replace_matrixZsampled_replace_matrixrŸ   r   r&   r&   r'   rI   ]  sL   û
ÿ

ÿ

úzUniSpeechForPreTraining.forward)r   )NNNN)r   r    r!   r   rs   rq   r—   r€   r‚   rJ   r#   r$   r    r
   r   rw   ro   r   r   r   rI   Ú__classcell__r&   r&   r•   r'   r     sD    üÿþýüúþýüûú
ùr   c                   @   r(   )ÚUniSpeechForCTCNr*   r&   r&   r&   r'   r¤   ¥  r+   r¤   c                   @   r(   )Ú"UniSpeechForSequenceClassificationNr*   r&   r&   r&   r'   r¥   ©  r+   r¥   )r¤   r   r¥   rr   rK   )0rW   r˜   Údataclassesr   Útypingr   r   r   r#   Ztorch.nnr@   Zmodeling_outputsr   r   Zmodeling_utilsr	   Úutilsr
   r   Zwav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   Zconfiguration_unispeechr   Z
get_loggerr   Úloggerr   r)   r,   r-   r.   r/   r0   rK   rŒ   rr   r   r¤   r¥   Ú__all__r&   r&   r&   r'   Ú<module>   s@    ,
#-HMÿ 