o
    Zh!                     @   s  d dl Z d dlZd dlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
mZmZmZmZmZmZmZmZmZmZ d	d
lmZ G dd dejZG dd deZG dd dejZG dd dejZG dd deejZG dd deZG dd deZG dd de
ZG dd deeZ eZ!G dd de eZ"G dd  d e eZ#G d!d" d"eZ$G d#d$ d$eZ%G d%d& d&eZ&g d'Z'dS )(    N)nn   )ACT2FN)Wav2Vec2BaseModelOutput)PreTrainedModel   )Wav2Vec2AdapterWav2Vec2EncoderWav2Vec2FeatureEncoderWav2Vec2FeatureProjection#Wav2Vec2ForAudioFrameClassificationWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2ForXVectorWav2Vec2ModelWav2Vec2PreTrainedModelWav2Vec2SamePadLayer   )Data2VecAudioConfigc                       s&   e Zd Zd fdd	Zdd Z  ZS )Data2VecAudioConvLayerr   c                    s|   t    |dkr|j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
tj| jdd| _t|j | _d S )Nr   r   )kernel_sizeZstridebiasTZelementwise_affine)super__init__Zconv_dimZin_conv_dimZout_conv_dimr   Conv1dZconv_kernelZconv_strideZ	conv_biasconv	LayerNorm
layer_normr   feat_extract_activation
activation)selfconfiglayer_id	__class__ b/var/www/auris/lib/python3.10/site-packages/transformers/models/data2vec/modular_data2vec_audio.pyr      s   
zData2VecAudioConvLayer.__init__c                 C   s:   |  |}|dd}| |}|dd}| |}|S )N)r   	transposer   r    r!   hidden_statesr&   r&   r'   forward)   s   


zData2VecAudioConvLayer.forward)r   __name__
__module____qualname__r   r-   __classcell__r&   r&   r$   r'   r      s    r   c                   @      e Zd ZdS )Data2VecAudioPadLayerNr/   r0   r1   r&   r&   r&   r'   r4   4       r4   c                       $   e Zd Z fddZdd Z  ZS ) Data2VecAudioPositionalConvLayerc                    s\   t    tj|j|j|j|jd |jd| _t|j| _	t
|j | _tj|jdd| _d S )Nr   )r   paddinggroupsFr   )r   r   r   r   hidden_sizeZconv_pos_kernel_sizeZnum_conv_pos_embedding_groupsr   r4   r9   r   r   r    r   r   r!   r"   r$   r&   r'   r   9   s   
z)Data2VecAudioPositionalConvLayer.__init__c                 C   sD   |  |}| |}|dd}| |}|dd}| |}|S Nr   r   )r   r9   r*   r   r    r+   r&   r&   r'   r-   H   s   



z(Data2VecAudioPositionalConvLayer.forwardr.   r&   r&   r$   r'   r8   8   s    r8   c                       r7   )$Data2VecAudioPositionalConvEmbeddingc                    s.   t    t fddt jD | _d S )Nc                    s   g | ]}t  qS r&   )r8   ).0_r"   r&   r'   
<listcomp>W   s    zAData2VecAudioPositionalConvEmbedding.__init__.<locals>.<listcomp>)r   r   r   
ModuleListrangeZnum_conv_pos_embeddingslayersr<   r$   rA   r'   r   T   s   

z-Data2VecAudioPositionalConvEmbedding.__init__c                 C   s0   | dd}| jD ]}||}q	| dd}|S r=   )r*   rE   )r!   r,   layerr&   r&   r'   r-   Z   s
   

z,Data2VecAudioPositionalConvEmbedding.forwardr.   r&   r&   r$   r'   r>   S   s    r>   c                   @   s   e Zd Zdd ZdS )Data2VecAudioFeatureEncoderc                    s:   t j  t  fddt jD | _d| _d| _d S )Nc                    s   g | ]}t  |d qS ))r#   )r   )r?   irA   r&   r'   rB   f   s    z8Data2VecAudioFeatureEncoder.__init__.<locals>.<listcomp>FT)	r   Moduler   rC   rD   Znum_feat_extract_layersZconv_layersZgradient_checkpointingZ_requires_gradr<   r&   rA   r'   r   c   s   

z$Data2VecAudioFeatureEncoder.__init__N)r/   r0   r1   r   r&   r&   r&   r'   rG   b   s    rG   c                   @   r3   )Data2VecAudioFeatureProjectionNr5   r&   r&   r&   r'   rJ   l   r6   rJ   c                   @   r3   )Data2VecAudioEncoderNr5   r&   r&   r&   r'   rK   p   r6   rK   c                   @   r3   )Data2VecAudioAdapterNr5   r&   r&   r&   r'   rL   t   r6   rL   c                   @   sD   e Zd ZeZdZdZdZdZdZ	dd Z
dd Zdd	 Zd
d ZdS )Data2VecAudioPreTrainedModeldata2vec_audioZinput_valuesTc                 C   sZ  t |tr(td|jj }tjj|jj	| |d tjj|jj
| |d dS t |tr8tj|jj
d dS t |tjrX|j	jjd| jjd |j
durV|j
j  dS dS t |tjtjfr||j
durl|j
j  |j	durz|j	jd dS dS t |tjrtj|j	 |j
durt|j|j|jd   }tjj|j
| |d dS dS dS )zInitialize the weightsr   )abr           )meanZstdNg      ?)
isinstancerJ   mathsqrtZ
projectionZin_featuresr   inituniform_weightr   r8   Z	constant_r   LineardataZnormal_r"   Zinitializer_rangeZzero_r   Z	GroupNormZfill_r   Zkaiming_normal_r:   Zin_channelsr   )r!   modulekr&   r&   r'   _init_weights   s0   





z*Data2VecAudioPreTrainedModel._init_weightsc                 C      t dNzNot needed for Data2VecAudioAttributeErrorr!   r&   r&   r'   _get_adapters      z*Data2VecAudioPreTrainedModel._get_adaptersc                 C   r^   r_   r`   rb   r&   r&   r'   init_adapter_layers   rd   z0Data2VecAudioPreTrainedModel.init_adapter_layersc                 C   r^   r_   r`   rb   r&   r&   r'   load_adapter   rd   z)Data2VecAudioPreTrainedModel.load_adapterN)r/   r0   r1   r   Zconfig_classZbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingZ_supports_flash_attn_2Z_supports_sdpar]   rc   re   rf   r&   r&   r&   r'   rM   x   s    rM   c                       s:   e Zd ZdefddZdd Zdd Z fdd	Z  ZS )
Data2VecAudioModelr"   c                 C   sz   t | || _t|| _t|| _|jdks|jdkr(t	
t|j | _t|| _|jr4t|nd | _|   d S )NrQ   )rM   r   r"   rG   feature_extractorrJ   Zfeature_projectionZmask_time_probZmask_feature_probr   	ParametertorchZTensorr;   rW   Zmasked_spec_embedrK   encoderadd_adapterrL   adapter	post_initr<   r&   r&   r'   r      s   



zData2VecAudioModel.__init__c                 C   r^   r_   r`   rb   r&   r&   r'   freeze_feature_extractor   rd   z+Data2VecAudioModel.freeze_feature_extractorc                 C   s   | j   dS )z
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)rh   Z_freeze_parametersrb   r&   r&   r'   freeze_feature_encoder   s   z)Data2VecAudioModel.freeze_feature_encoderc                       t  jdi |S Nr&   r   r-   r!   Zsuper_kwargsr$   r&   r'   r-         zData2VecAudioModel.forward)	r/   r0   r1   r   r   ro   rp   r-   r2   r&   r&   r$   r'   rg      s
    rg   c                       s4   e Zd Zdd Zdd Zdd Z fddZ  ZS )	Data2VecAudioForCTCc                 C   sv   t | t|| _t|j| _|jd u rt	d| j
 dt|dr*|jr*|jn|j}t||j| _|   d S )NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `Data2VecAudioForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.rl   )rM   r   rg   rN   r   ZDropoutZfinal_dropoutZdropoutZ
vocab_size
ValueErrorr%   hasattrrl   output_hidden_sizer;   rY   Zlm_headrn   )r!   r"   ry   r&   r&   r'   r      s   


zData2VecAudioForCTC.__init__c                 C   r^   r_   r`   rb   r&   r&   r'   freeze_base_model   rd   z%Data2VecAudioForCTC.freeze_base_modelc                 C   r^   r_   r`   rb   r&   r&   r'   tie_weights   rd   zData2VecAudioForCTC.tie_weightsc                    rq   rr   rs   rt   r$   r&   r'   r-      ru   zData2VecAudioForCTC.forward)r/   r0   r1   r   rz   r{   r-   r2   r&   r&   r$   r'   rv      s
    rv   c                   @   r3   )&Data2VecAudioForSequenceClassificationNr5   r&   r&   r&   r'   r|      r6   r|   c                   @   r3   )(Data2VecAudioForAudioFrameClassificationNr5   r&   r&   r&   r'   r}      r6   r}   c                   @   r3   )Data2VecAudioForXVectorNr5   r&   r&   r&   r'   r~      r6   r~   )r}   rv   r|   r~   rg   rM   )(rT   rj   r   Zactivationsr   Zmodeling_outputsr   Zmodeling_utilsr   Zwav2vec2.modeling_wav2vec2r   r	   r
   r   r   r   r   r   r   r   r   Zconfiguration_data2vec_audior   rI   r   r4   r8   r>   rG   rJ   rK   rL   rM   ZData2VecAudioBaseModelOutputrg   rv   r|   r}   r~   __all__r&   r&   r&   r'   <module>   s0    4
+  