o
    Zh4,                     @   sN  d dl mZmZmZ d dlZd dlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ ddlmZ d	d
lmZmZmZmZmZmZmZ ddlmZ dZG dd dejZG dd deZG dd deZG dd dejZ G dd deZ!G dd deZ"eG dd deZ#G dd dee#Z$G dd deZ%G dd  d eZ&g d!Z'dS )"    )OptionalTupleUnionN   )ACT2FN)is_deepspeed_zero3_enabled)BaseModelOutput)PreTrainedModel)auto_docstring   )Wav2Vec2EncoderWav2Vec2EncoderStableLayerNormWav2Vec2FeatureEncoderWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2ModelWav2Vec2SamePadLayer   )HubertConfigc                       $   e Zd Z fddZdd Z  ZS )HubertPositionalConvEmbeddingc                    s@  t    tj|j|j|j|jd |jd| _d | _|j	r%t
|j| _nmtjj}ttjjdr5tjjj}t rdd l}|jj| jjdd || jddd| _W d    n1 sZw   Y  t| jdrr| jjjj}| jjjj}n| jj}| jj}|j| | |j| | n	|| jddd| _t|j| _t|j | _d S )	Nr   )kernel_sizepaddinggroupsweight_normr   Zmodifier_rankweight)namedimparametrizations)super__init__nnConv1dhidden_sizeZnum_conv_pos_embeddingsZnum_conv_pos_embedding_groupsconv
batch_normZconv_pos_batch_normBatchNorm1dutilsr   hasattrr   r   	deepspeedzeroGatheredParametersr   Z	original0Z	original1weight_gweight_vZregister_external_parameterHubertSamePadLayerr   r   Zfeat_extract_activation
activation)selfconfigr   r*   r-   r.   	__class__ X/var/www/auris/lib/python3.10/site-packages/transformers/models/hubert/modular_hubert.pyr!      s:   

z&HubertPositionalConvEmbedding.__init__c                 C   sN   | dd}| jd ur| |}| |}| |}| |}| dd}|S )Nr   r   )	transposer&   r%   r   r0   r1   hidden_statesr5   r5   r6   forward@   s   




z%HubertPositionalConvEmbedding.forward__name__
__module____qualname__r!   r:   __classcell__r5   r5   r3   r6   r      s    %r   c                   @      e Zd ZdS )r/   Nr<   r=   r>   r5   r5   r5   r6   r/   L       r/   c                   @   r@   )HubertFeatureEncoderNrA   r5   r5   r5   r6   rC   P   rB   rC   c                       r   )HubertFeatureProjectionc                    sX   t    |j| _| jrtj|jd |jd| _t|jd |j	| _
t|j| _d S )N)Zeps)r    r!   feat_proj_layer_normr"   	LayerNormZconv_dimZlayer_norm_eps
layer_normLinearr$   
projectionZDropoutZfeat_proj_dropoutdropoutr1   r2   r3   r5   r6   r!   U   s   
z HubertFeatureProjection.__init__c                 C   s(   | j r| |}| |}| |}|S )N)rF   rH   rJ   rK   r8   r5   r5   r6   r:   ]   s
   


zHubertFeatureProjection.forwardr;   r5   r5   r3   r6   rD   T   s    rD   c                   @   r@   )HubertEncoderNrA   r5   r5   r5   r6   rM   f   rB   rM   c                   @   r@   )HubertEncoderStableLayerNormNrA   r5   r5   r5   r6   rN   j   rB   rN   c                   @   sX   e Zd ZeZdZdZdZdZdZ	dd Z
deejef fddZd	ed
ejfddZdS )HubertPreTrainedModelZhubertinput_valuesTc                 C   s  t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
tjtjfr:|jj	  |jjd dS t |tjrt rddl}t|drvt|drv|jj|j|jgdd tj|jj W d   n1 spw   Y  n*|jj|jdd tj|jj W d   n1 sw   Y  ntj|jj |jdur|jj	  dS dS t |trt|d	r|jj  dS dS t |trt|d
r|jjd| jjd   dS dS dS )zInitialize the weights        )meanZstdNg      ?r   r.   r-   r   masked_spec_embedlayer_weightsr   )
isinstancer"   rI   r   dataZnormal_r2   Zinitializer_rangeZbiasZzero_rG   Z	GroupNormr'   Zfill_r#   r   r*   r)   r+   r,   r.   r-   initZkaiming_normal_HubertModelrS   uniform_HubertForSequenceClassificationrT   Znum_hidden_layers)r1   moduler*   r5   r5   r6   _init_weightsw   sB   





z#HubertPreTrainedModel._init_weightsinput_lengthsc                 C   s4   dd }t | jj| jjD ]
\}}||||}q|S )zH
        Computes the output length of the convolutional layers
        c                 S   s   t j| | |ddd S )Nfloor)Zrounding_moder   )torchdiv)Zinput_lengthr   strider5   r5   r6   _conv_out_length   s   zPHubertPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length)zipr2   Zconv_kernelZconv_stride)r1   r]   rb   r   ra   r5   r5   r6    _get_feat_extract_output_lengths   s   z6HubertPreTrainedModel._get_feat_extract_output_lengthsfeature_vector_lengthattention_maskc                 C   s~   |  |dtj}|jd }tj||f|j|jd}d|tj	|jd |jd|d f< |
dgd
dg }|S )NrE   r   )dtypedevicer   )rh   )rd   sumtor_   longshapeZzerosrg   rh   ZarangeflipZcumsumbool)r1   re   rf   Zoutput_lengthsZ
batch_sizer5   r5   r6   "_get_feature_vector_attention_mask   s   
"z8HubertPreTrainedModel._get_feature_vector_attention_maskN)r<   r=   r>   r   Zconfig_classZbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingZ_supports_flash_attn_2Z_supports_sdpar\   r   r_   Z
LongTensorintrd   ro   r5   r5   r5   r6   rO   n   s    !rO   c                       s   e Zd Zdef fddZdd Zdd Z					dd	eej	 d
eej	 deej
 dee dee dee deeef fddZ  ZS )rX   r2   c                    s~   t  | || _t|| _t|| _|jdks|jdkr)t	
t|j | _|jr2t|| _nt|| _|   | `d S )NrQ   )r    r!   r2   rC   feature_extractorrD   feature_projectionZmask_time_probZmask_feature_probr"   	Parameterr_   Tensorr$   rY   rS   Zdo_stable_layer_normrN   encoderrM   Z	post_initadapterrL   r3   r5   r6   r!      s   


zHubertModel.__init__c                 C      t dNzNot needed for HubertAttributeErrorr1   r5   r5   r6   freeze_feature_extractor      z$HubertModel.freeze_feature_extractorc                 C   rw   rx   ry   r{   r5   r5   r6   freeze_feature_encoder   r}   z"HubertModel.freeze_feature_encoderNrP   rf   mask_time_indicesoutput_attentionsoutput_hidden_statesreturn_dictreturnc           
      C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| |}|dd}|dur6| |jd |}| |}| j	||d}| j
|||||d}	|	d }|s[|f|	dd  S t||	j|	jdS )an  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.

        Example:

        ```python
        >>> from transformers import AutoProcessor, HubertModel
        >>> from datasets import load_dataset
        >>> import soundfile as sf

        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
        >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


        >>> def map_to_array(batch):
        ...     speech, _ = sf.read(batch["file"])
        ...     batch["speech"] = speech
        ...     return batch


        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
        >>> hidden_states = model(input_values).last_hidden_state
        ```Nr   r   )r   )rf   r   r   r   r   )Zlast_hidden_stater9   
attentions)r2   r   r   Zuse_return_dictrq   r7   ro   rl   rr   Z_mask_hidden_statesru   r   r9   r   )
r1   rP   rf   r   r   r   r   Zextract_featuresr9   Zencoder_outputsr5   r5   r6   r:      s2   %

zHubertModel.forward)NNNNN)r<   r=   r>   r   r!   r|   r~   r   r_   rt   ZFloatTensorrn   r   r   r   r:   r?   r5   r5   r3   r6   rX      s0    
rX   c                   @   r@   )HubertForCTCNrA   r5   r5   r5   r6   r     rB   r   c                   @   r@   )rZ   NrA   r5   r5   r5   r6   rZ     rB   rZ   )r   rZ   rX   rO   )(typingr   r   r   r_   Ztorch.nnr"   Zactivationsr   Zintegrations.deepspeedr   Zmodeling_outputsr   Zmodeling_utilsr	   r(   r
   Zwav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   Zconfiguration_hubertr   Z_HIDDEN_STATES_START_POSITIONModuler   r/   rC   rD   rM   rN   rO   rX   r   rZ   __all__r5   r5   r5   r6   <module>   s.    $	2Ec