o
    Zh4                    @   s  d dl Z d dlZd dlmZ d dlmZmZmZ d dlZ	d dl
Z
d dlmZ d dlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZmZ ddlmZ ddlm Z m!Z! ddl"m#Z# e rqddlm$Z$ e!%e&Z'eG dd deZ(G dd dej)Z*G dd dej)Z+G dd dej)Z,G dd dej)Z-G dd dej)Z.G dd dej)Z/G dd dej)Z0G d d! d!ej)Z1G d"d# d#e1Z2G d$d% d%e1Z3G d&d' d'ej)Z4e1e3e2d(Z5G d)d* d*ej)Z6G d+d, d,ej)Z7G d-d. d.ej)Z8G d/d0 d0ej)Z9G d1d2 d2ej)Z:G d3d4 d4ej)Z;e G d5d6 d6eZ<		 dMd7ee=e=f d8e>d9e=d:ee
j? d;e=d<e	j@fd=d>ZAeZBe G d?d@ d@e<ZCe dAdBG dCdD dDe<ZDdEZEe dFdBG dGdH dHe<ZFe dIdBG dJdK dKe<ZGg dLZHdS )N    N)	dataclass)OptionalTupleUnion)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)!flash_attn_supports_top_left_maskis_flash_attn_available)BaseModelOutputCausalLMOutputModelOutputSequenceClassifierOutputWav2Vec2BaseModelOutput)PreTrainedModel)auto_docstringlogging   )UniSpeechConfig)_flash_attention_forwardc                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeeej  ed< dZeeej  ed< dS )	UniSpeechForPreTrainingOutputaL  
    Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions.

    Args:
        loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
            paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss.
        projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
            Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
            projected quantized states.
        projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
            Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
            target vectors for contrastive loss.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlossprojected_statesprojected_quantized_statescodevector_perplexityhidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r   r   r    r&   r&   _/var/www/auris/lib/python3.10/site-packages/transformers/models/unispeech/modeling_unispeech.pyr   (   s   
 r   c                       $   e Zd Z fddZdd Z  ZS )UniSpeechSamePadLayerc                    s*   t    |d dkrd| _d S d| _d S )N   r   r   )super__init__num_pad_remove)selfnum_conv_pos_embeddings	__class__r&   r'   r,   M   s   
 zUniSpeechSamePadLayer.__init__c                 C   s,   | j dkr|d d d d d | j  f }|S Nr   )r-   r.   r   r&   r&   r'   forwardQ   s   
zUniSpeechSamePadLayer.forwardr   r    r!   r,   r4   __classcell__r&   r&   r0   r'   r)   L   s    r)   c                       r(   ) UniSpeechPositionalConvEmbeddingc                    s$  t    tj|j|j|j|jd |jd| _tjj	}t
tjjdr'tjjj	}t r{dd l}|jj| jjdd || jddd| _W d    n1 sLw   Y  t
| jdrd| jjjj}| jjjj}n| jj}| jj}|j| | |j| | n	|| jddd| _t|j| _t|j | _d S )	Nr*   )kernel_sizepaddinggroupsweight_normr   )Zmodifier_rankweight)namedimparametrizations)r+   r,   nnConv1dhidden_sizer/   Znum_conv_pos_embedding_groupsconvutilsr;   hasattrr?   r	   	deepspeedzeroZGatheredParametersr<   Z	original0Z	original1weight_gweight_vZregister_external_parameterr)   r9   r   feat_extract_activation
activation)r.   configr;   rF   rH   rI   r0   r&   r'   r,   X   s4   

z)UniSpeechPositionalConvEmbedding.__init__c                 C   s:   | dd}| |}| |}| |}| dd}|S Nr   r*   )	transposerC   r9   rK   r3   r&   r&   r'   r4   y   s   


z(UniSpeechPositionalConvEmbedding.forwardr5   r&   r&   r0   r'   r7   W   s    !r7   c                       &   e Zd Zd fdd	Zdd Z  ZS )UniSpeechNoLayerNormConvLayerr   c                    sj   t    |dkr|j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
t|j | _d S )Nr   r   r8   stridebias)r+   r,   conv_dimin_conv_dimout_conv_dimr@   rA   conv_kernelconv_stride	conv_biasrC   r   rJ   rK   r.   rL   layer_idr0   r&   r'   r,      s   
z&UniSpeechNoLayerNormConvLayer.__init__c                 C   s   |  |}| |}|S N)rC   rK   r3   r&   r&   r'   r4      s   

z%UniSpeechNoLayerNormConvLayer.forwardr   r5   r&   r&   r0   r'   rP      s    rP   c                       rO   )UniSpeechLayerNormConvLayerr   c                    s|   t    |dkr|j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
tj| jdd| _t|j | _d S )Nr   r   rQ   T)Zelementwise_affine)r+   r,   rT   rU   rV   r@   rA   rW   rX   rY   rC   	LayerNorm
layer_normr   rJ   rK   rZ   r0   r&   r'   r,      s   
z$UniSpeechLayerNormConvLayer.__init__c                 C   s:   |  |}|dd}| |}|dd}| |}|S )N)rC   rN   r`   rK   r3   r&   r&   r'   r4      s   


z#UniSpeechLayerNormConvLayer.forwardr]   r5   r&   r&   r0   r'   r^      s    r^   c                       rO   )UniSpeechGroupNormConvLayerr   c                    s   t    |dkr|j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
t|j | _tj| j| jdd| _d S )Nr   r   rQ   T)
num_groupsZnum_channelsZaffine)r+   r,   rT   rU   rV   r@   rA   rW   rX   rY   rC   r   rJ   rK   	GroupNormr`   rZ   r0   r&   r'   r,      s   
z$UniSpeechGroupNormConvLayer.__init__c                 C   s"   |  |}| |}| |}|S r\   )rC   r`   rK   r3   r&   r&   r'   r4      s   


z#UniSpeechGroupNormConvLayer.forwardr]   r5   r&   r&   r0   r'   rc      s    rc   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )UniSpeechFeatureEncoderz.Construct the features from raw audio waveformc                    s   t     jdkr t ddg fddt jd D  }n jdkr2 fddt jD }n	td	 j d
t|| _	d| _
d| _d S )Ngroupr   r[   c                    s   g | ]
}t  |d  dqS )r   rh   )rP   .0irL   r&   r'   
<listcomp>   s    z4UniSpeechFeatureEncoder.__init__.<locals>.<listcomp>r   layerc                    s   g | ]}t  |d qS )rh   )r^   ri   rl   r&   r'   rm      s    z`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r+   r,   Zfeat_extract_normrc   rangeZnum_feat_extract_layers
ValueErrorr@   
ModuleListconv_layersgradient_checkpointing_requires_grad)r.   rL   rr   r0   rl   r'   r,      s   





z UniSpeechFeatureEncoder.__init__c                 C   s   |   D ]}d|_qd| _d S NF)
parametersrequires_gradrt   r.   paramr&   r&   r'   _freeze_parameters   s   
z*UniSpeechFeatureEncoder._freeze_parametersc                 C   s\   |d d d f }| j r| jrd|_| jD ]}| j r'| jr'| jr'| |j|}q||}q|S )NT)rt   trainingrw   rr   rs   _gradient_checkpointing_func__call__)r.   input_valuesr   Z
conv_layerr&   r&   r'   r4      s   

zUniSpeechFeatureEncoder.forward)r   r    r!   r"   r,   rz   r4   r6   r&   r&   r0   r'   rf      s
    rf   c                       r(   )UniSpeechFeatureProjectionc                    sJ   t    tj|jd |jd| _t|jd |j| _	t
|j| _d S )Nrb   Zeps)r+   r,   r@   r_   rT   layer_norm_epsr`   LinearrB   
projectionDropoutZfeat_proj_dropoutdropoutr.   rL   r0   r&   r'   r,      s   
z#UniSpeechFeatureProjection.__init__c                 C   s&   |  |}| |}| |}||fS r\   )r`   r   r   )r.   r   Znorm_hidden_statesr&   r&   r'   r4     s   


z"UniSpeechFeatureProjection.forwardr5   r&   r&   r0   r'   r      s    r   c                       s   e Zd ZdZ					ddededed	ed
ededee f fddZ	de
jdedefddZ					dde
jdee
j deee
j  dee
j dee
j dedee
jee
j eee
j  f fddZ  ZS )UniSpeechAttentionz=Multi-headed attention from 'Attention Is All You Need' paper        FTN	embed_dim	num_headsr   
is_decoderrS   	is_causalrL   c                    s   t    || _|| _|| _|| | _|| _| j| | jkr*td| j d| d| jd | _|| _	|| _
tj|||d| _tj|||d| _tj|||d| _tj|||d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      )rS   )r+   r,   r   r   r   head_dimrL   rp   scalingr   r   r@   r   k_projv_projq_projout_proj)r.   r   r   r   r   rS   r   rL   r0   r&   r'   r,     s&   



zUniSpeechAttention.__init__tensorseq_lenbszc                 C   s    | ||| j| jdd S rM   )viewr   r   rN   
contiguousr.   r   r   r   r&   r&   r'   _shape,  s    zUniSpeechAttention._shaper   key_value_statespast_key_valueattention_masklayer_head_maskoutput_attentionsreturnc                 C   sr  |du}|  \}}	}
| || j }|r.|dur.|d jd |jd kr.|d }|d }nZ|rE| | |d|}| | |d|}nC|durt| | |d|}| | |d|}tj|d |gdd}tj|d |gdd}n| | |d|}| | |d|}| j	r||f}|| j
 d| jf}| ||	|j| }|j| }|j| }| d}t||dd}|  || j
 |	|fkrtd|| j
 |	|f d|   |dur|  |d|	|fkrtd	|d|	|f d|   ||| j
|	|| }||| j
 |	|}tjj|dd}|durL|  | j
fkr1td
| j
f d|   |dddd||| j
|	| }||| j
 |	|}|rc||| j
|	|}||| j
 |	|}nd}tjj|| j| jd}t||}|  || j
 |	| jfkrtd|| j
 |	| jf d|   ||| j
|	| j}|dd}|||	| j}| |}|||fS )#Input shape: Batch x Time x ChannelNr   r*   r   rb   r>   z$Attention weights should be of size 	, but is z!Attention mask should be of size z/Head mask for a single layer should be of size )pr{    `attn_output` should be of size )sizer   r   shaper   r   r   r#   catr   r   r   r   reshapeZbmmrN   rp   r@   
functionalsoftmaxr   r{   r   r   )r.   r   r   r   r   r   r   is_cross_attentionr   tgt_len_query_states
key_statesvalue_statesZ
proj_shapeZsrc_lenattn_weightsZattn_weights_reshapedZ
attn_probsattn_outputr&   r&   r'   r4   /  s   





"

zUniSpeechAttention.forward)r   FTFNNNNNF)r   r    r!   r"   intfloatboolr   r   r,   r#   Tensorr   r   r4   r6   r&   r&   r0   r'   r   
  sV    r   c                       s   e Zd ZdZ fddZdejdedefddZ									
ddejde	ej de	e
ej  de	ej de	ej dede
eje	ej e	e
ej  f fddZ  ZS )UniSpeechFlashAttention2aN  
    UniSpeech flash attention module. This module inherits from `UniSpeechAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    c                    s   t  j|i | t | _d S r\   )r+   r,   r   _flash_attn_uses_top_left_mask)r.   argskwargsr0   r&   r'   r,     s   z!UniSpeechFlashAttention2.__init__r   r   r   c                 C   s   | ||| j| jS r\   )r   r   r   r   r&   r&   r'   _reshape  s   z!UniSpeechFlashAttention2._reshapeNFr   r   r   r   r   r   r   c              
   C   s>  |d u}|  \}}	}
| | |d|}|r8|d ur8|d jd |jd kr8|d dd}|d dd}nb|rO| | |d|}| | |d|}nK|d ur| | |d|}| | |d|}tj|d dd|gdd}tj|d dd|gdd}n| | |d|}| | |d|}| j	r|dd|ddf}|jd }|d ur||d jd 7 }|j
}|tjkrt rt }nt| jdr| jj}n| jjj
}td| d	 ||}||}||}t|||||	| jr| jnd
| j| jd}|||	d}| |}|sd }|||fS )Nrb   r   r*   r   r   ra   _pre_quantization_dtypezThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .r   )r   r   Zuse_top_left_mask)r   r   r   r   rN   r   r   r#   r   r   dtypefloat32Zis_autocast_enabledZget_autocast_gpu_dtyperE   rL   r   r<   loggerwarning_oncetor   r{   r   r   r   r   r   )r.   r   r   r   r   r   r   r   r   Zq_lenr   r   r   r   Z
kv_seq_lenZinput_dtypeZtarget_dtyper   r   r&   r&   r'   r4     sl    









z UniSpeechFlashAttention2.forwardr   )r   r    r!   r"   r,   r#   r   r   r   r   r   r   r4   r6   r&   r&   r0   r'   r     s0    r   c                       s   e Zd Z					ddejdeej deeej  deej deej ded	eejeej eeej  f f fd
dZ  Z	S )UniSpeechSdpaAttentionNFr   r   r   r   r   r   r   c                    s  |rt d t j|||||dS |du}| \}}	}
| |}|r=|dur=|d jd |jd kr=|d }|d }nZ|rT| | |d|}| | 	|d|}nC|dur| | |d|}| | 	|d|}t
j|d |gdd}t
j|d |gdd}n| | |d|}| | 	|d|}| jr||f}| ||	|}| jr|du r|	dkrd	nd
}t
jjj||||| jr| jnd|d}| || j|	| jfkrtd|| j|	| jf d|  |dd}|||	| j}| |}|d|fS )r   a  UniSpeechModel is using UniSpeechSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` . Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.)r   r   r   r   Nr   r*   r   rb   r   TFr   )Z	attn_maskZ	dropout_pr   r   r   )r   r   r+   r4   r   r   r   r   r   r   r#   r   r   r   r@   r   Zscaled_dot_product_attentionr{   r   r   r   rp   rN   r   r   r   )r.   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r0   r&   r'   r4   #  sh   



	

zUniSpeechSdpaAttention.forwardr   )
r   r    r!   r#   r   r   r   r   r4   r6   r&   r&   r0   r'   r   "  s*    r   c                       r(   )UniSpeechFeedForwardc                    sp   t    t|j| _t|j|j| _	t
|jtr"t|j | _n|j| _t|j|j| _t|j| _d S r\   )r+   r,   r@   r   Zactivation_dropoutintermediate_dropoutr   rB   Zintermediate_sizeintermediate_dense
isinstanceZ
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr   r0   r&   r'   r,     s   
zUniSpeechFeedForward.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S r\   )r   r   r   r   r   r3   r&   r&   r'   r4     s   




zUniSpeechFeedForward.forwardr5   r&   r&   r0   r'   r     s    r   )eagerZsdpaflash_attention_2c                       s&   e Zd Z fddZdddZ  ZS )UniSpeechEncoderLayerc                    sl   t    t|j |j|j|jdd| _t	|j
| _tj|j|jd| _t|| _tj|j|jd| _d S )NFr   r   r   r   r   )r+   r,   UNISPEECH_ATTENTION_CLASSES_attn_implementationrB   num_attention_headsattention_dropout	attentionr@   r   r   r   r_   r   r`   r   feed_forwardfinal_layer_normr   r0   r&   r'   r,     s   

zUniSpeechEncoderLayer.__init__NFc                 C   sf   |}| j |||d\}}}| |}|| }| |}|| | }| |}|f}|r1||f7 }|S Nr   r   )r   r   r`   r   r   r.   r   r   r   Zattn_residualr   r   outputsr&   r&   r'   r4     s   



zUniSpeechEncoderLayer.forwardru   r5   r&   r&   r0   r'   r     s    r   c                       sL   e Zd Z fddZ				ddejdeej ded	ed
ef
ddZ	  Z
S )UniSpeechEncoderc                    r   t     | _t | _tj j jd| _	t
 j| _t fddt jD | _d| _ jdk| _d S )Nr   c                       g | ]}t  qS r&   )r   rj   r   rl   r&   r'   rm         z-UniSpeechEncoder.__init__.<locals>.<listcomp>Fr   r+   r,   rL   r7   pos_conv_embedr@   r_   rB   r   r`   r   r   r   rq   ro   num_hidden_layerslayersrs   r   _use_flash_attention_2r   r0   rl   r'   r,     s   

 zUniSpeechEncoder.__init__NFTr   r   r   output_hidden_statesreturn_dictc                 C   s  |rdnd }|r
dnd }|d ur_| ddd|jd }d|| < | jr2|d ur/d|v r/|nd }n-d|d d d d d d f j|jd }|t|jj }|	|jd d|jd |jd }| 
|}	||	 }| |}| |}t pxt| }
| jD ]G}|r||f }tg }| jr|| jjk rdnd	}|r|
r| jr| jr| |j|||}n||||d
}|d }|rd}|r||d f }q||r||f }|stdd |||fD S t|||dS )Nr&   rb   r   r*   r         ?r   TFr   NNc                 s       | ]	}|d ur|V  qd S r\   r&   rj   vr&   r&   r'   	<genexpr>      z+UniSpeechEncoder.forward.<locals>.<genexpr>last_hidden_stater   r   )	unsqueezerepeatr   r   r   r   r#   finfominexpandr   r`   r   r	   r
   r   randr{   rL   	layerdroprs   r|   r}   tupler   r.   r   r   r   r   r   Zall_hidden_statesZall_self_attentionsZexpand_attention_maskZposition_embeddingsZsynced_gpusrn   Zdropout_probabilityZskip_the_layerZlayer_outputsr&   r&   r'   r4     s`   
&






zUniSpeechEncoder.forwardNFFT)r   r    r!   r,   r#   r   r   r   r   r4   r6   r&   r&   r0   r'   r     s"    r   c                       s,   e Zd Z fddZdejfddZ  ZS )UniSpeechAttnAdapterLayerc                    sZ   t    |j| _|j| _t| j| _t	| j| j| _
t | _t	| j| j| _dS )z
        Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
        up training throughput.
        N)r+   r,   adapter_attn_dimZ	input_dimrB   Z
hidden_dimr@   r_   normr   linear_1ZReLUact_fnlinear_2r   r0   r&   r'   r,   #  s   

z"UniSpeechAttnAdapterLayer.__init__r   c                 C   s,   |  |}| |}| |}| |}|S r\   )r   r   r  r  r3   r&   r&   r'   r4   1  s
   



z!UniSpeechAttnAdapterLayer.forward)r   r    r!   r,   r#   r$   r4   r6   r&   r&   r0   r'   r   "  s    r   c                       s@   e Zd Z fddZ		d
dejdeej defdd	Z  Z	S )$UniSpeechEncoderLayerStableLayerNormc                    s   t    t|j |j|j|jdd| _t	|j
| _tj|j|jd| _t|| _tj|j|jd| _t|dd d urCt|| _d S d | _d S )NFr   r   r   )r+   r,   r   r   rB   r   r   r   r@   r   r   r   r_   r   r`   r   r   r   getattrr   adapter_layerr   r0   r&   r'   r,   <  s   


z-UniSpeechEncoderLayerStableLayerNorm.__init__NFr   r   r   c                 C   sz   |}|  |}| j|||d\}}}| |}|| }|| | | }| jd ur1|| | }|f}|r;||f7 }|S r   )r`   r   r   r   r   r  r   r&   r&   r'   r4   N  s   



z,UniSpeechEncoderLayerStableLayerNorm.forwardru   )
r   r    r!   r,   r#   r   r   r   r4   r6   r&   r&   r0   r'   r  ;  s    r  c                       s.   e Zd Z fddZ				dddZ  ZS )	UniSpeechEncoderStableLayerNormc                    r   )Nr   c                    r   r&   )r  r   rl   r&   r'   rm   p  r   z<UniSpeechEncoderStableLayerNorm.__init__.<locals>.<listcomp>Fr   r   r   r0   rl   r'   r,   i  s   

z(UniSpeechEncoderStableLayerNorm.__init__NFTc                 C   s  |rdnd }|r
dnd }|d urc| ddd|jd }||j|jd }| jr6|d ur3d|v r3|nd }n-d|d d d d d d f j|jd }|t|jj }|	|jd d|jd |jd }| 
|}	||	 }| |}t pwt| }
| jD ]G}|r||f }tg }| jr|| jjk rdnd	}|r|
r| jr| jr| |j|||}n||||d
}|d }|rd}|r||d f }q{| |}|r||f }|stdd |||fD S t|||dS )Nr&   rb   r   r*   r   r   r   TFr   r   c                 s   r   r\   r&   r   r&   r&   r'   r     r   z:UniSpeechEncoderStableLayerNorm.forward.<locals>.<genexpr>r   )r   r   r   r   r   r   r#   r   r   r   r   r   r	   r
   r   r   r{   rL   r   rs   r|   r}   r`   r   r   r   r&   r&   r'   r4   u  s`   &






z'UniSpeechEncoderStableLayerNorm.forwardr   r5   r&   r&   r0   r'   r  h  s    r  c                       s4   e Zd ZdZ fddZedd Zdd Z  ZS )UniSpeechGumbelVectorQuantizerz
    Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information.
    c                    s   t    |j| _|j| _|j| j dkr"td|j d| j dt	t
d| j| j |j| j | _t|jd | j| j | _d| _d S )Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenationr   rb   r*   )r+   r,   Znum_codevector_groupsrd   Znum_codevectors_per_groupnum_varscodevector_dimrp   r@   	Parameterr#   r$   codevectorsr   rT   weight_projtemperaturer   r0   r&   r'   r,     s   


z'UniSpeechGumbelVectorQuantizer.__init__c                 C   s8   | j dd}ttj|t|d  dd  }|S )Nr   r   gHz>rb   )meanr#   expsumlog)ZprobsZmarginal_probs
perplexityr&   r&   r'   _compute_perplexity  s   (z2UniSpeechGumbelVectorQuantizer._compute_perplexityc                 C   s  |j \}}}| |}||| | j d}| jr?tjj| | j	dd
|}tj||| | jd dd}| |}n$|jdd}|j|j  d|ddd}||| | jd}| |}||| d}|d| j }	|	|| | j| jd}
|
d||d}
|
|fS )Nrb   T)tauZhardr   r   r   ra   )r   r  r   rd   r{   r@   r   Zgumbel_softmaxr   r  type_asr#   r   r  ZargmaxZ	new_zerosZscatter_r   r  r  r  )r.   r   
batch_sizesequence_lengthrB   Zcodevector_probsZcodevector_soft_distr  Zcodevector_idxZcodevectors_per_groupr  r&   r&   r'   r4     s0   

z&UniSpeechGumbelVectorQuantizer.forward)	r   r    r!   r"   r,   staticmethodr  r4   r6   r&   r&   r0   r'   r    s    
r  c                   @   sX   e Zd ZeZdZdZdZdZdZ	dd Z
deejef fddZd	ed
ejfddZdS )UniSpeechPreTrainedModel	unispeechr~   Tc              	   C   s  t |tr|jjjjddd |jjj  tj	
|j dS t |trItj	j|jjddtd|jjd |jj   d tj	|jjd dS t |trqtd|jj }tj	j
|jj| |d tj	j
|jj| |d dS t |tjr|jjjd| jjd |jdur|jj  dS dS t |tjtjfr|jj  |jjd dS t |tjrtj	|j |jdurt|j|j|jd   }tj	j
|j| |d dS dS dS )	zInitialize the weightsr   r   )r  Zstdr   r*   )abNr   )r   r  r  r<   dataZnormal_rS   Zzero_r@   inituniform_r  r7   rC   mathsqrtr8   Zin_channelsZ	constant_r   r   Zin_featuresr   rL   Zinitializer_ranger_   re   fill_rA   Zkaiming_normal_r:   )r.   modulekr&   r&   r'   _init_weights  s<   

 


z&UniSpeechPreTrainedModel._init_weightsinput_lengthsc                 C   s4   dd }t | jj| jjD ]
\}}||||}q|S )zH
        Computes the output length of the convolutional layers
        c                 S   s   t j| | |ddd S )Nfloor)Zrounding_moder   )r#   div)input_lengthr8   rR   r&   r&   r'   _conv_out_length6  s   zSUniSpeechPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length)ziprL   rW   rX   )r.   r&  r*  r8   rR   r&   r&   r'    _get_feat_extract_output_lengths1  s   z9UniSpeechPreTrainedModel._get_feat_extract_output_lengthsfeature_vector_lengthr   c                 C   s   |j ddd d df }| |tj}|jd }tj||f|j|jd}d|tj	|jd |jd|d f< |
dg d
dg }|S )Nrb   r   r   )r   devicer   )r.  )Zcumsumr,  r   r#   longr   zerosr   r.  arangeflipr   )r.   r-  r   Znon_padded_lengthsZoutput_lengthsr  r&   r&   r'   "_get_feature_vector_attention_mask@  s   
"z;UniSpeechPreTrainedModel._get_feature_vector_attention_maskN)r   r    r!   r   Zconfig_classZbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingZ_supports_flash_attn_2Z_supports_sdpar%  r   r#   
LongTensorr   r,  r3  r&   r&   r&   r'   r    s    !r  r   	mask_probmask_lengthr   	min_masksr   c                    s  | \}dk rt dkrt d d dtjd   fdd}|dur:| d	 n
fd
dt|D }tj	|ft
d}g }	|}
|
dkrZ|S |D ];}||}tjjt|d  |dd}t|dkr}d }n|d }t|tj|
| tjd| g}|	| q\t|	}	t|	dddddf ||
f}	|	||
 }	tddddf }t|||
f||
 }|	| }	|	 d krd |	|	d k< t||	dd	 |S )af  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    sX   t |     }t|}| kr }| d  |k r*t| d  d}|S )z;Given input length, compute how many spans should be maskedr   r   )r   max)r)  num_masked_spanepsilonr6  r5  r7  r  r&   r'   compute_num_masked_spanv  s   
z6_compute_mask_indices.<locals>.compute_num_masked_spanNrb   c                    s   g | ]} qS r&   r&   r   )r  r&   r'   rm     s    z)_compute_mask_indices.<locals>.<listcomp>r   r   F)replace)rp   nprandomr   itemdetachr  tolistro   r0  r   choicer1  lenZconcatenateonesZint32appendarrayZbroadcast_tor   r9  Zput_along_axis)r   r5  r6  r   r7  r  r=  r&  Zspec_aug_maskZspec_aug_mask_idxsZmax_num_masked_spanr)  r:  Zspec_aug_mask_idxZdummy_mask_idxoffsetsr&   r;  r'   _compute_mask_indicesP  s\   

rJ  c                       s   e Zd Zdef fddZ		ddejdeej deej fdd	Z	e
					dd
eej deej deej dee dee dee deeef fddZ  ZS )UniSpeechModelrL   c                    sz   t  | || _t|| _t|| _|jdks|jdkr)t	
t|j | _|jr2t|| _nt|| _|   d S )Nr   )r+   r,   rL   rf   feature_extractorr   feature_projectionmask_time_probmask_feature_probr@   r
  r#   r   rB   r  masked_spec_embedZdo_stable_layer_normr  encoderr   	post_initr   r0   r&   r'   r,     s   


zUniSpeechModel.__init__Nr   mask_time_indicesr   c                 C   s  t | jdds	|S | \}}}|dur| j|j||< n-| jjdkrK| jrKt||f| jj| jj	|| jj
d}tj||jtjd}| j|j||< | jjdkr| jrt||f| jj| jj| jjd}tj||jtjd}|dddf d|d}d||< |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        Zapply_spec_augmentTNr   )r5  r6  r   r7  )r.  r   )r5  r6  r7  rb   )r  rL   r   rP  r   r   rN  r{   rJ  Zmask_time_lengthZmask_time_min_masksr#   r   r.  r   rO  Zmask_feature_lengthZmask_feature_min_masksr   )r.   r   rS  r   r  r  rB   Zmask_feature_indicesr&   r&   r'   _mask_hidden_states  s4   z"UniSpeechModel._mask_hidden_statesr~   r   r   r   r   c           
      C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| |}|dd}|dur6| |jd |}| |\}}| j	|||d}| j
|||||d}	|	d }|s_||f|	dd  S t|||	j|	jdS )a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   r*   )rS  r   r   r   r   r   r   )r   extract_featuresr   r   )rL   r   r   use_return_dictrL  rN   r3  r   rM  rT  rQ  UniSpeechBaseModelOutputr   r   )
r.   r~   r   rS  r   r   r   rV  r   Zencoder_outputsr&   r&   r'   r4     s8   
zUniSpeechModel.forwardr   NNNNN)r   r    r!   r   r,   r#   r$   r   r4  rT  r   r   r   r   r   rX  r4   r6   r&   r&   r0   r'   rK    s@    
.
rK  zZ
    UniSpeech Model with a vector-quantization module and ctc loss for pre-training.
    )Zcustom_introc                       s   e Zd Zdef fddZdefddZdd Zd	d
 Ze		dde
jde
jde
jdefddZe				ddee
j dee
j dee dee dee deeef fddZ  ZS )UniSpeechForPreTrainingrL   c                    s~   t  | t|| _t|j| _t|| _	t
|j|j| _t
|j|j| _t
|j|j| _t|j| _|   d S r\   )r+   r,   rK  r  r@   r   Zfeat_quantizer_dropoutdropout_featuresr  	quantizerr   r	  Zproj_codevector_dim	project_qrB   project_hidZnum_ctc_classesctc_projfinal_dropoutr   rR  r   r0   r&   r'   r,   G  s   

z UniSpeechForPreTraining.__init__r  c                 C   s   || j _dS )zb
        Set the Gumbel softmax temperature to a given value. Only necessary for training
        N)r\  r  )r.   r  r&   r&   r'   set_gumbel_temperatureV  s   z.UniSpeechForPreTraining.set_gumbel_temperaturec                 C      t dt |   dS z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.NwarningswarnFutureWarningfreeze_feature_encoderr.   r&   r&   r'   freeze_feature_extractor\  
   z0UniSpeechForPreTraining.freeze_feature_extractorc                 C      | j j  dS 
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        Nr  rL  rz   rj  r&   r&   r'   ri  h     z.UniSpeechForPreTraining.freeze_feature_encoderr   target_featuresnegative_featurespredicted_featuresc                 C   s@   t j| |gdd} t j| |  dd}|| }|| }|S )z
        Compute logits for contrastive loss based using cosine similarity as the distance measure between
        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
        r   r   rb   )r#   r   Zcosine_similarityr   r  )rr  rs  rt  r  logitsr&   r&   r'   compute_contrastive_logitso  s
   
z2UniSpeechForPreTraining.compute_contrastive_logitsNr~   r   r   r   r   r   c                 C   sJ  |dur|n| j j}| j|||||d}|d }| |d }| |\}	}
| |	| jjj}	| 	|	}	t
|d|d| j j}|dd}t
| |j}|dd}|d}||d|	| d }| |}| |}d}|s|dur|||	|
f|dd  S ||	|
f|dd  S t|||	|
|j|jdS )	a  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> # TODO: Add full pretraining example
        ```NrU  r   r   rb   r   r*   )r   r   r   r   r   r   )rL   rW  r  r[  r\  r]  r   r<   r   r^  r#   emptyr   r"  Zreplace_probrN   Z	bernoullir   r.  r   Zmasked_fillr   r_  r   r   r   )r.   r~   r   r   r   r   r   Ztransformer_featuresrV  Zquantized_featuresr   Zprob_replace_matrixZsampled_replace_matrixru  r   r&   r&   r'   r4     sL   




zUniSpeechForPreTraining.forward)r   )NNNN)r   r    r!   r   r,   r   ra  rk  ri  r  r#   r$   rv  r   r   r   r   r   r   r   r4   r6   r&   r&   r0   r'   rZ  A  sD    
rZ  r*   zq
    UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    c                       s   e Zd Zddee f fddZdd Zdd Zd	d
 Zdd Z	e
					ddeej deej dee dee dee deej deeef fddZ  ZS )UniSpeechForCTCNtarget_langc                    s~   t  | t|| _t|j| _|| _|j	du r#t
d| j dt|dr.|jr.|jn|j}t||j	| _|   dS )a3  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `UniSpeechForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.add_adapter)r+   r,   rK  r  r@   r   r`  r   ry  
vocab_sizerp   r1   rE   rz  output_hidden_sizerB   r   lm_headrR  )r.   rL   ry  r|  r0   r&   r'   r,     s   

zUniSpeechForCTC.__init__c                 C   sv   | j }|durt| jdddu rtd| d|du r,t| jdddur,td dS |dur9| j|dd dS dS )a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        Nr   zCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)Z
force_load)ry  r  rL   rp   r   infoZload_adapter)r.   ry  r&   r&   r'   tie_weights  s   zUniSpeechForCTC.tie_weightsc                 C   rb  )ro  rd  Nre  rj  r&   r&   r'   rk    rl  z(UniSpeechForCTC.freeze_feature_extractorc                 C   rm  rn  rp  rj  r&   r&   r'   ri    rq  z&UniSpeechForCTC.freeze_feature_encoderc                 C      | j  D ]}d|_qdS z
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNr  rv   rw   rx   r&   r&   r'   freeze_base_model     z!UniSpeechForCTC.freeze_base_modelr~   r   r   r   r   labelsr   c              
   C   s|  |dur|n| j j}|dur| | j jkrtd| j j | j|||||d}|d }| |}| |}	d}
|dur|durC|ntj	|tj
d}| |dtj
}|dk}|d}||}tjj|	dtjddd}tjjjd	d
 tjj||||| j j| j j| j jd}
W d   n1 sw   Y  |s|	f|td  }|
dur|
f| S |S t|
|	|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: rU  r   r   rb   )r>   r   r   F)enabled)blankZ	reductionZzero_infinityr   ru  r   r   )rL   rW  r9  r{  rp   r  r   r}  r#   Z	ones_liker/  r,  r  r   Zmasked_selectr@   r   Zlog_softmaxr   rN   backendsZcudnnflagsZctc_lossZpad_token_idZctc_loss_reductionZctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   r   r   )r.   r~   r   r   r   r   r  r   r   ru  r   r&  Zlabels_maskZtarget_lengthsZflattened_targetsZ	log_probsoutputr&   r&   r'   r4   !  sN   



zUniSpeechForCTC.forwardr\   rY  )r   r    r!   r   r   r,   r  rk  ri  r  r   r#   r   r   r   r   r   r4   r6   r&   r&   r0   r'   rx    s6    
rx  z
    UniSpeech Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                       s   e Zd Z fddZdd Zdd Zdd Ze										dd
ee	j
 dee	j
 dee dee dee dee	j
 deeef fddZ  ZS )"UniSpeechForSequenceClassificationc                    s   t  | t|dr|jrtdt|| _|jd }|jr*t	
t|| | _t	|j|j| _t	|j|j| _|   d S )Nrz  z`Sequence classification does not support the use of UniSpeech adapters (config.add_adapter=True)r   )r+   r,   rE   rz  rp   rK  r  r   use_weighted_layer_sumr@   r
  r#   rF  layer_weightsr   rB   Zclassifier_proj_size	projector
num_labels
classifierrR  )r.   rL   Z
num_layersr0   r&   r'   r,   p  s   

z+UniSpeechForSequenceClassification.__init__c                 C   rb  rc  re  rj  r&   r&   r'   rk    rl  z;UniSpeechForSequenceClassification.freeze_feature_extractorc                 C   rm  rn  rp  rj  r&   r&   r'   ri    rq  z9UniSpeechForSequenceClassification.freeze_feature_encoderc                 C   r  r  r  rx   r&   r&   r'   r    r  z4UniSpeechForSequenceClassification.freeze_base_modelNr~   r   r   r   r   r  r   c                 C   sz  |dur|n| j j}| j jrdn|}| j|||||d}| j jrB|t }tj|dd}tjj	| j
dd}	||	ddd jdd}n|d }| |}|du rV|jdd}
n+| |jd |}|ddd|jd }d	|| < |jdd|jdddd }
| |
}d}|durt }||d| j j|d}|s|f|td  }|dur|f| S |S t|||j|jd
S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
            soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
            conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTrU  r   r   rb   r   r*   r   r  )rL   rW  r  r  r  r#   stackr@   r   r   r  r   r  r  r  r3  r   r   r   r  r   r  r   r   r   )r.   r~   r   r   r   r   r  r   r   Znorm_weightsZpooled_outputZpadding_maskZexpand_padding_maskru  r   Zloss_fctr  r&   r&   r'   r4     sH   

 
z*UniSpeechForSequenceClassification.forwardrY  )r   r    r!   r,   rk  ri  r  r   r   r#   r   r   r   r   r   r4   r6   r&   r&   r0   r'   r  i  s4    
r  )rx  rZ  r  rK  r  r2   )Ir   rf  dataclassesr   typingr   r   r   numpyr?  r#   Ztorch.nnr@   r   Zactivationsr   Zintegrations.deepspeedr	   Zintegrations.fsdpr
   Zmodeling_flash_attention_utilsr   r   Zmodeling_outputsr   r   r   r   r   Zmodeling_utilsr   rD   r   r   Zconfiguration_unispeechr   r   Z
get_loggerr   r   r   Moduler)   r7   rP   r^   rc   rf   r   r   r   r   r   r   r   r   r   r  r  r  r  r   r   r4  ZndarrayrJ  rX  rK  rZ  r  rx  r  __all__r&   r&   r&   r'   <module>   s   
#-/ zi#U-YFL

wv  r