o
    Zh:]                     @   s  d dl Z d dlmZmZmZ d dlZd dlmZ d dlm  m	Z
 ddlmZ ddlmZ ddlmZmZ ddlmZ ddlmZ d	d
lmZmZmZmZmZmZmZmZmZ ddl m!Z! e"e#Z$G dd deZ%G dd deZ&G dd dej'Z(G dd deZ)G dd dej'Z*G dd dej'Z+G dd dej'Z,G dd dej'Z-G dd dej'Z.G dd  d eeZ/eZ0G d!d" d"eZ1G d#d$ d$eZ2G d%d& d&eZ3G d'd( d(eZ4G d)d* d*eZ5g d+Z6dS ),    N)OptionalTupleUnion   )is_deepspeed_zero3_enabled)is_fsdp_managed_module)BaseModelOutputWav2Vec2BaseModelOutput)PreTrainedModel)logging   )	Wav2Vec2FeatureProjectionWav2Vec2FeedForward#Wav2Vec2ForAudioFrameClassificationWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2ForXVectorWav2Vec2ModelWav2Vec2PositionalConvEmbeddingWav2Vec2PreTrainedModel   )WavLMConfigc                   @      e Zd ZdS )WavLMPositionalConvEmbeddingN__name__
__module____qualname__ r   r   V/var/www/auris/lib/python3.10/site-packages/transformers/models/wavlm/modular_wavlm.pyr          r   c                   @   r   )WavLMFeatureProjectionNr   r   r   r   r   r!   "   r    r!   c                       s   e Zd ZdZ				d"dededed	ed
edef fddZ				d#dej	de
ej	 de
ej	 dedeej	e
ej	 e
eej	  f f
ddZdejdeejejf dejdedejejff
ddZdededejfddZdejdejfd d!Z  ZS )$WavLMAttentionz=Multi-headed attention from 'Attention Is All You Need' paper        @     T	embed_dim	num_headsdropoutnum_bucketsmax_distancehas_relative_position_biasc                    s   t    || _|| _|| _|| | _| j| | jkr'td| j d| d| jd | _t	||| _
t	||| _t	||| _t	||| _|| _|| _ttd| jdd| _t	| jd| _|rqt| j| j| _d S d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r      )super__init__r&   r'   r(   Zhead_dim
ValueErrorZscalingnnLineark_projv_projq_projout_projr)   r*   	ParametertorchZonesgru_rel_pos_constgru_rel_pos_linearZ	Embeddingrel_attn_embed)selfr&   r'   r(   r)   r*   r+   	__class__r   r   r.   )   s.   
	

zWavLMAttention.__init__NFr   hidden_statesattention_maskposition_biasoutput_attentionsreturnc                 C   s  |  \}}}|du r$| ||}|d|ddd|| j ||}||jdd | jdf }	|	dddd}	| |	}
|
|	jdd d 	d}
t
|
jddd\}}||| j d	  d
 }||| j dd| }|d||f}| ||||\}}|||fS )z'Attention layer with relative attentionNr   r   r   r   )r      dim      ?g       @)sizecompute_bias	unsqueezerepeatviewr'   shapepermuter9   sumr7   Zsigmoidchunkr8   torch_multi_head_self_attention)r;   r>   r?   r@   rA   indexZbszZtgt_len_Zgated_hidden_statesZrelative_position_projZgate_aZgate_bZgate_outputgated_position_biasattn_outputattn_weightsr   r   r   forwardM   s"   	$

zWavLMAttention.forwardrT   c                 C   s   | dd } }}|dur|dnd}d }	}
d}tj|||| j| jtdgt| j	j
| jj
| jj
f|	|
|| j| jj| jj
| j|||d| j	j| jj| jjd\}}| dd}|durz|dddf |jdd | jf |jdd  }||fS )zCsimple wrapper around torch's multi_head_attention_forward functionr   r   NFT)Zuse_separate_proj_weightZq_proj_weightZk_proj_weightZv_proj_weight)Z	transposeneFZmulti_head_attention_forwardr&   r'   r7   emptycatr4   biasr2   r3   r(   r5   weighttrainingZbroadcast_torM   )r;   r>   r?   rT   rA   querykeyvalueZkey_padding_maskZbias_kZbias_vZadd_zero_attnrU   rV   r   r   r   rQ   v   sB   	

"z.WavLMAttention.torch_multi_head_self_attentionquery_length
key_lengthc                 C   sv   t j|t jdd d d f }t j|t jdd d d f }|| }| |}|| jjj}| |}|g d}|S )N)Zdtype)r   r   r   )	r7   Zarangelong_relative_positions_buckettor:   r]   ZdevicerN   )r;   rb   rc   Zcontext_positionZmemory_positionZrelative_positionZrelative_position_bucketvaluesr   r   r   rI      s   

zWavLMAttention.compute_biasrelative_positionsc                 C   s   | j d }|dktj| }t|}|d }||k }t| | }|t| j|  }|||  }|| tj}t	|t
||d }|t|||7 }|S )Nr   r   r   )r)   rf   r7   rd   abslogfloatmathr*   minZ	full_likewhere)r;   rh   r)   Zrelative_bucketsZ	max_exactZis_smallZrelative_positions_if_largeZrelative_position_if_larger   r   r   re      s   

z)WavLMAttention._relative_positions_bucket)r#   r$   r%   TNNFr   )r   r   r   __doc__intrk   boolr.   r7   ZTensorr   r   rW   FloatTensorr   Z
LongTensorZ
BoolTensorrQ   rI   re   __classcell__r   r   r<   r   r"   &   s^    '
)

7
r"   c                   @   r   )WavLMFeedForwardNr   r   r   r   r   ru      r    ru   c                       s2   e Zd Zddedef fddZdd	d
Z  ZS )WavLMEncoderLayerTconfigr+   c                    n   t    t|j|j|j|j|j|d| _t	
|j| _t	j|j|jd| _t|| _t	j|j|jd| _d S N)r&   r'   r(   r)   r*   r+   Zepsr-   r.   r"   hidden_sizeZnum_attention_headsZattention_dropoutr)   Zmax_bucket_distance	attentionr0   Dropouthidden_dropoutr(   	LayerNormlayer_norm_eps
layer_normru   feed_forwardfinal_layer_normr;   rw   r+   r<   r   r   r.         

zWavLMEncoderLayer.__init__NFr   c           	      C   sl   |}| j |||||d\}}}| |}|| }| |}|| | }| |}||f}|r4||f7 }|S )Nr?   r@   rA   rR   )r}   r(   r   r   r   )	r;   r>   r?   r@   rA   rR   attn_residualrV   outputsr   r   r   rW      s"   



zWavLMEncoderLayer.forwardTro   r   r   r   r   rr   r.   rW   rt   r   r   r<   r   rv          rv   c                       s2   e Zd Zd
dedef fddZddd	Z  ZS ) WavLMEncoderLayerStableLayerNormTrw   r+   c                    rx   ry   r{   r   r<   r   r   r.      r   z)WavLMEncoderLayerStableLayerNorm.__init__NFc                 C   sf   |}|  |}| j||||d\}}}| |}|| }|| | | }||f}|r1||f7 }|S )N)r?   r@   rA   )r   r}   r(   r   r   )r;   r>   r?   r@   rA   r   rV   r   r   r   r   rW   	  s   


z(WavLMEncoderLayerStableLayerNorm.forwardr   )NNFr   r   r   r<   r   r      r   r   c                       .   e Zd Z fddZ				dddZ  ZS )	WavLMEncoderc                    f   t     | _t | _tj j jd| _	t
 j| _t fddt jD | _d| _d S )Nrz   c                       g | ]
}t  |d kdqS r   )r+   )rv   .0irw   r   r   
<listcomp>&  s    z)WavLMEncoder.__init__.<locals>.<listcomp>Fr-   r.   rw   r   pos_conv_embedr0   r   r|   r   r   r~   r   r(   Z
ModuleListrangeZnum_hidden_layerslayersgradient_checkpointingr;   rw   r<   r   r   r.     s   


zWavLMEncoder.__init__NFTc                 C   s`  |rdnd }|r
dnd }|d ur"| ddd|jd }d|| < | |}	||	 }| |}| |}t p;t| }
d }t| j	D ]P\}}|rN||f }t
g }| jo_|dko_|| jjk }|rd|
r| jru| jru| |j||||}n	||||||d}|d d \}}|rd}|r||d f }qC|r||f }|stdd	 |||fD S t|||d
S )Nr   rC   r   r   r   r   NNNc                 s       | ]	}|d ur|V  qd S Nr   r   vr   r   r   	<genexpr>i      z'WavLMEncoder.forward.<locals>.<genexpr>Zlast_hidden_stater>   Z
attentions)rJ   rK   rM   r   r   r(   r   r   	enumerater   r7   randr^   rw   	layerdropr   _gradient_checkpointing_func__call__tupler   r;   r>   r?   rA   Zoutput_hidden_statesZreturn_dictZall_hidden_statesZall_self_attentionsZexpand_attention_maskZposition_embeddingsZsynced_gpusr@   r   layerZdropout_probabilityZskip_the_layerZlayer_outputsr   r   r   rW   *  s^   






zWavLMEncoder.forwardNFFTr   r   r   r.   rW   rt   r   r   r<   r   r     s    r   c                       r   )	WavLMEncoderStableLayerNormc                    r   )Nrz   c                    r   r   )r   r   r   r   r   r   y  s    z8WavLMEncoderStableLayerNorm.__init__.<locals>.<listcomp>Fr   r   r<   r   r   r.   r  s   



z$WavLMEncoderStableLayerNorm.__init__NFTc                 C   s^  |rdnd }|r
dnd }|d ur"| ddd|jd }d|| < | |}	||	 }| |}t p6t| }
d }t| jD ]O\}}|rI||f }t	
g }| joZ|dkoZ|| jjk }|r_|
r| jrp| jrp| |j||||}n|||||d}|d d \}}|rd}|r||d f }q>| |}|r||f }|stdd	 |||fD S t|||d
S )Nr   rC   r   r   r   )r?   rA   r@   r   c                 s   r   r   r   r   r   r   r   r     r   z6WavLMEncoderStableLayerNorm.forward.<locals>.<genexpr>r   )rJ   rK   rM   r   r(   r   r   r   r   r7   r   r^   rw   r   r   r   r   r   r   r   r   r   r   r   rW     sX   






z#WavLMEncoderStableLayerNorm.forwardr   r   r   r   r<   r   r   q  s    r   c                       s4   e Zd ZdZ fddZedd Zdd Z  ZS )WavLMGumbelVectorQuantizerz
    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information.
    c                    s   t    |j| _|j| _|j| j dkr"td|j d| j dt	t
d| j| j |j| j | _t|jd | j| j | _d| _d S )Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenation.r   rC   r   )r-   r.   Znum_codevector_groups
num_groupsZnum_codevectors_per_groupnum_varsZcodevector_dimr/   r0   r6   r7   rs   codevectorsr1   Zconv_dimweight_projtemperaturer   r<   r   r   r.     s   


z#WavLMGumbelVectorQuantizer.__init__c                 C   s8   | j dd}ttj|t|d  dd  }|S )Nr   rE   gHz>rC   )meanr7   exprO   rj   )ZprobsZmarginal_probs
perplexityr   r   r   _compute_perplexity  s   (z.WavLMGumbelVectorQuantizer._compute_perplexityc                 C   s  |j \}}}| |}||| | j d}| jrAtjj| | j	dd}|
|}tj||| | jd dd}| |}n$|jdd}|j|j  d|ddd}||| | jd}| |}||| d}|d| j }	|	|| | j| jd}
|
d||d}
|
|fS )NrC   T)tauZhardrE   r   rG   )rM   r   rL   r   r^   r0   
functionalZgumbel_softmaxrk   r   Ztype_asr7   Zsoftmaxr   ZargmaxZ	new_zerosZscatter_rJ   r   r   rO   )r;   r>   Z
batch_sizeZsequence_lengthr|   Zcodevector_probsZcodevector_soft_distr   Zcodevector_idxZcodevectors_per_groupr   r   r   r   rW     s*   


z"WavLMGumbelVectorQuantizer.forward)	r   r   r   rp   r.   staticmethodr   rW   rt   r   r   r<   r   r     s    
r   c                   @   sD   e Zd ZeZdZdZdZdZdZ	dd Z
dd Zd	d
 Zdd ZdS )WavLMPreTrainedModelZwavlmZinput_valuesTFc              	   C   s  t |tr|jjjjddd |jjj  tj	
|j dS t |trItj	j|jjddtd|jjd |jj   d tj	|jjd dS t |trqtd|jj }tj	j
|jj| |d tj	j
|jj| |d dS t |tjr|jjjd| jjd |jdur|jj  dS dS t |tjtjfr|jj  |jjd dS t |tjrtj	|j |jdurt|j|j|jd   }tj	j
|j| |d dS dS dS )	zInitialize the weightsr#   r   )r   Zstdr   r   )abNrG   )
isinstancer   r   r]   dataZnormal_r\   Zzero_r0   initZuniform_r   r   convrl   sqrtZkernel_sizeZin_channelsZ	constant_r!   Z
projectionZin_featuresr1   rw   Zinitializer_ranger   Z	GroupNormZfill_ZConv1dZkaiming_normal_groups)r;   modulekr   r   r   _init_weights  s<   

 


z"WavLMPreTrainedModel._init_weightsc                 C      t dNzNot needed for WavLMAttributeErrorr;   r   r   r   _get_adapters4     z"WavLMPreTrainedModel._get_adaptersc                 C   r   r   r   r   r   r   r   init_adapter_layers7  r   z(WavLMPreTrainedModel.init_adapter_layersc                 C   r   r   r   r   r   r   r   load_adapter:  r   z!WavLMPreTrainedModel.load_adapterN)r   r   r   r   Zconfig_classZbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingZ_supports_flash_attn_2Z_supports_sdpar   r   r   r   r   r   r   r   r     s    !r   c                   @   r   )
WavLMModelNr   r   r   r   r   r   A  r    r   c                   @   r   )WavLMForCTCNr   r   r   r   r   r   E  r    r   c                   @   r   )WavLMForSequenceClassificationNr   r   r   r   r   r   I  r    r   c                   @   r   ) WavLMForAudioFrameClassificationNr   r   r   r   r   r   M  r    r   c                   @   r   )WavLMForXVectorNr   r   r   r   r   r   Q  r    r   )r   r   r   r   r   r   )7rl   typingr   r   r   r7   Ztorch.nnr0   Ztorch.nn.functionalr   rY   Zintegrations.deepspeedr   Zintegrations.fsdpr   Zmodeling_outputsr   r	   Zmodeling_utilsr
   utilsr   Zwav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   Zconfiguration_wavlmr   Z
get_loggerr   loggerr   r!   Moduler"   ru   rv   r   r   r   r   r   ZWavLMBaseModelOutputr   r   r   r   r   __all__r   r   r   r   <module>   s>    ,
 ')%STF3