"""TensorFlow Wav2Vec2 model."""

from __future__ import annotations

import warnings
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union

import numpy as np
import tensorflow as tf

from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput, TFSequenceClassifierOutput
from ...modeling_tf_utils import (
    TFPreTrainedModel,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import shape_list, stable_softmax
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_wav2vec2 import Wav2Vec2Config


logger = logging.get_logger(__name__)


_HIDDEN_STATES_START_POSITION = 2

_CHECKPOINT_FOR_DOC = "facebook/wav2vec2-base-960h"
_CONFIG_FOR_DOC = "Wav2Vec2Config"

LARGE_NEGATIVE = -1e8


@dataclass
class TFWav2Vec2BaseModelOutput(ModelOutput):
    """
    Output type of [`TFWav2Vec2BaseModelOutput`], with potential hidden states and attentions.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        extract_features (`tf.Tensor` of shape `(batch_size, sequence_length, conv_dim[-1])`):
            Sequence of extracted feature vectors of the last convolutional layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    NzOptional[tf.Tensor]last_hidden_stateextract_featureszTuple[tf.Tensor] | Nonehidden_states
attentions)	__name__
__module____qualname____doc__r   __annotations__r   r   r     r&   r&   `/var/www/auris/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_tf_wav2vec2.pyr   :   s   
 r   c                 C  s8   t jt jt| dd }t j| | |\}}|S )z
    Categorical sampling without replacement is currently not implemented. The gumbel-max trick will do for now - see
    https://github.com/tensorflow/tensorflow/issues/9260 for more info
    """
    z = -tf.math.log(tf.random.uniform(shape_list(distribution), 0, 1))
    _, indices = tf.nn.top_k(distribution + z, num_samples)
    return indices


def _scatter_values_on_batch_indices(values, batch_indices, output_shape):
    """
    Scatter function as in PyTorch with indices in format (batch_dim, indices)
    """
    indices_shape = shape_list(batch_indices)
    # broadcast the batch dimension to `indices_shape`
    broad_casted_batch_dims = tf.reshape(
        tf.broadcast_to(tf.expand_dims(tf.range(indices_shape[0]), axis=-1), indices_shape), [1, -1]
    )
    # transform `batch_indices` to `pair_indices`
    pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0))
    # scatter the values onto the pair indices
    return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), output_shape)


def _compute_mask_indices(
    shape: Tuple[int, int],
    mask_prob: float,
    mask_length: int,
    min_masks: int = 0,
) -> tf.Tensor:
    """
    Computes random mask spans for a given shape

    Args:
        shape: the shape for which to compute masks.
            should be of size 2 where first element is batch size and 2nd is timesteps
        attention_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
        mask_prob:
            probability for each token to be chosen as start of the span to be masked. this will be multiplied by
            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
            however due to overlaps, the actual number will be smaller (unless no_overlap is True)
        mask_length: size of the mask
        min_masks: minimum number of masked spans

    Adapted from [fairseq's
    data_utils.py](https://github.com/pytorch/fairseq/blob/e0788f7007a8473a76db573985031f3c94201e79/fairseq/data/data_utils.py#L376).
    """
    batch_size, sequence_length = shape

    if mask_length < 1:
        raise ValueError("`mask_length` has to be bigger than 0.")

    tf.debugging.assert_less(
        mask_length,
        sequence_length,
        message=(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and"
            f" `sequence_length`: {sequence_length}`"
        ),
    )

    # compute number of masked spans in batch
    num_masked_spans = mask_prob * tf.cast(sequence_length, tf.float32) / mask_length + tf.random.uniform((1,))
    num_masked_spans = tf.maximum(num_masked_spans, min_masks)
    num_masked_spans = tf.cast(num_masked_spans, tf.int32)

    # make sure the number of masked indices is <= sequence_length
    num_masked_spans = tf.math.minimum(sequence_length // mask_length, num_masked_spans)
    num_masked_spans = tf.squeeze(num_masked_spans)

    # SpecAugment mask to fill
    spec_aug_mask = tf.zeros((batch_size, sequence_length), dtype=tf.int32)

    # uniform distribution to sample from, making sure that offset samples are < sequence_length
    uniform_dist = tf.ones((batch_size, sequence_length - (mask_length - 1)))

    # get random indices to mask
    spec_aug_mask_idxs = _sample_without_replacement(uniform_dist, num_masked_spans)

    # expand masked indices to masked spans
    spec_aug_mask_idxs = tf.expand_dims(spec_aug_mask_idxs, -1)
    spec_aug_mask_idxs = tf.tile(spec_aug_mask_idxs, (1, 1, mask_length))
    spec_aug_mask_idxs = tf.reshape(spec_aug_mask_idxs, (batch_size, num_masked_spans * mask_length))

    offsets = tf.range(mask_length)[tf.newaxis, tf.newaxis, :]
    offsets = tf.tile(offsets, (batch_size, num_masked_spans, 1))
    offsets = tf.reshape(offsets, (batch_size, num_masked_spans * mask_length))

    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets

    # scatter indices to the mask
    spec_aug_mask = _scatter_values_on_batch_indices(
        tf.ones_like(spec_aug_mask_idxs), spec_aug_mask_idxs, tf.shape(spec_aug_mask)
    )

    return spec_aug_mask
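# A minimal usage sketch for `_compute_mask_indices` (illustrative only: the shape,
# probability and span length below are made-up values, not library defaults):
#
#     mask = _compute_mask_indices((2, 100), mask_prob=0.05, mask_length=10)
#     # `mask` is a (2, 100) int32 tensor in which each sampled span of 10
#     # consecutive time steps is filled with ones.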
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    src_len = shape_list(mask)[1]
    tgt_len = tgt_len if tgt_len is not None else src_len
    one_cst = tf.constant(1.0)
    mask = tf.cast(mask, dtype=one_cst.dtype)
    expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))

    return (one_cst - expanded_mask) * LARGE_NEGATIVE


class TFWav2Vec2GroupNorm(keras.layers.Layer):
    """
    From tensorflow-addons https://www.tensorflow.org/addons/api_docs/python/tfa/layers/GroupNormalization
    """

    def __init__(
        self,
        groups: int = 32,
        axis: int = -1,
        epsilon: float = 1e-3,
        center: bool = True,
        scale: bool = True,
        beta_initializer: keras.initializers.Initializer = "zeros",
        gamma_initializer: keras.initializers.Initializer = "ones",
        beta_regularizer: keras.regularizers.Regularizer = None,
        gamma_regularizer: keras.regularizers.Regularizer = None,
        beta_constraint: keras.constraints.Constraint = None,
        gamma_constraint: keras.constraints.Constraint = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.supports_masking = True
        self.groups = groups
        self.axis = axis
        self.epsilon = epsilon
        self.center = center
        self.scale = scale
        self.beta_initializer = keras.initializers.get(beta_initializer)
        self.gamma_initializer = keras.initializers.get(gamma_initializer)
        self.beta_regularizer = keras.regularizers.get(beta_regularizer)
        self.gamma_regularizer = keras.regularizers.get(gamma_regularizer)
        self.beta_constraint = keras.constraints.get(beta_constraint)
        self.gamma_constraint = keras.constraints.get(gamma_constraint)
        self._check_axis()

    def build(self, input_shape):
        self._check_if_input_shape_is_none(input_shape)
        self._set_number_of_groups_for_instance_norm(input_shape)
        self._check_size_of_dimensions(input_shape)
        self._create_input_spec(input_shape)

        self._add_gamma_weight(input_shape)
        self._add_beta_weight(input_shape)
        self.built = True
        super().build(input_shape)

    def call(self, inputs):
        input_shape = keras.backend.int_shape(inputs)
        tensor_input_shape = tf.shape(inputs)

        reshaped_inputs, group_shape = self._reshape_into_groups(inputs, input_shape, tensor_input_shape)

        normalized_inputs = self._apply_normalization(reshaped_inputs, input_shape)

        is_instance_norm = (input_shape[self.axis] // self.groups) == 1
        if not is_instance_norm:
            outputs = tf.reshape(normalized_inputs, tensor_input_shape)
        else:
            outputs = normalized_inputs

        return outputs

    def get_config(self):
        config = {
            "groups": self.groups,
            "axis": self.axis,
            "epsilon": self.epsilon,
            "center": self.center,
            "scale": self.scale,
            "beta_initializer": keras.initializers.serialize(self.beta_initializer),
            "gamma_initializer": keras.initializers.serialize(self.gamma_initializer),
            "beta_regularizer": keras.regularizers.serialize(self.beta_regularizer),
            "gamma_regularizer": keras.regularizers.serialize(self.gamma_regularizer),
            "beta_constraint": keras.constraints.serialize(self.beta_constraint),
            "gamma_constraint": keras.constraints.serialize(self.gamma_constraint),
        }
        base_config = super().get_config()
        return {**base_config, **config}

    def compute_output_shape(self, input_shape):
        return input_shape

    def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape):
        group_shape = [tensor_input_shape[i] for i in range(len(input_shape))]
        is_instance_norm = (input_shape[self.axis] // self.groups) == 1
        if not is_instance_norm:
            group_shape[self.axis] = input_shape[self.axis] // self.groups
            group_shape.insert(self.axis, self.groups)
            group_shape = tf.stack(group_shape)
            reshaped_inputs = tf.reshape(inputs, group_shape)
            return reshaped_inputs, group_shape
        else:
            return inputs, group_shape

    def _apply_normalization(self, reshaped_inputs, input_shape):
        group_shape = keras.backend.int_shape(reshaped_inputs)
        group_reduction_axes = list(range(1, len(group_shape)))
        is_instance_norm = (input_shape[self.axis] // self.groups) == 1
        if not is_instance_norm:
            axis = -2 if self.axis == -1 else self.axis - 1
        else:
            axis = -1 if self.axis == -1 else self.axis - 1
        group_reduction_axes.pop(axis)

        mean, variance = tf.nn.moments(reshaped_inputs, group_reduction_axes, keepdims=True)

        gamma, beta = self._get_reshaped_weights(input_shape)
        normalized_inputs = tf.nn.batch_normalization(
            reshaped_inputs,
            mean=mean,
            variance=variance,
            scale=gamma,
            offset=beta,
            variance_epsilon=self.epsilon,
        )
        return normalized_inputs

    def _get_reshaped_weights(self, input_shape):
        broadcast_shape = self._create_broadcast_shape(input_shape)
        gamma = None
        beta = None
        if self.scale:
            gamma = tf.reshape(self.gamma, broadcast_shape)

        if self.center:
            beta = tf.reshape(self.beta, broadcast_shape)
        return gamma, beta

    def _check_if_input_shape_is_none(self, input_shape):
        dim = input_shape[self.axis]
        if dim is None:
            raise ValueError(
                "Axis "
                + str(self.axis)
                + " of input tensor should have a defined dimension but the layer received an input with shape "
                + str(input_shape)
                + "."
            )

    def _set_number_of_groups_for_instance_norm(self, input_shape):
        dim = input_shape[self.axis]

        if self.groups == -1:
            self.groups = dim

    def _check_size_of_dimensions(self, input_shape):
        dim = input_shape[self.axis]
        if dim < self.groups:
            raise ValueError(
                "Number of groups ("
                + str(self.groups)
                + ") cannot be more than the number of channels ("
                + str(dim)
                + ")."
            )

        if dim % self.groups != 0:
            raise ValueError(
                "Number of groups ("
                + str(self.groups)
                + ") must be a multiple of the number of channels ("
                + str(dim)
                + ")."
            )

    def _check_axis(self):
        if self.axis == 0:
            raise ValueError(
                "You are trying to normalize your batch axis. Do you want to use tf.layer.batch_normalization instead"
            )

    def _create_input_spec(self, input_shape):
        dim = input_shape[self.axis]
        self.input_spec = keras.layers.InputSpec(ndim=len(input_shape), axes={self.axis: dim})

    def _add_gamma_weight(self, input_shape):
        dim = input_shape[self.axis]
        shape = (dim,)

        if self.scale:
            self.gamma = self.add_weight(
                shape=shape,
                name="gamma",
                initializer=self.gamma_initializer,
                regularizer=self.gamma_regularizer,
                constraint=self.gamma_constraint,
            )
        else:
            self.gamma = None

    def _add_beta_weight(self, input_shape):
        dim = input_shape[self.axis]
        shape = (dim,)

        if self.center:
            self.beta = self.add_weight(
                shape=shape,
                name="beta",
                initializer=self.beta_initializer,
                regularizer=self.beta_regularizer,
                constraint=self.beta_constraint,
            )
        else:
            self.beta = None

    def _create_broadcast_shape(self, input_shape):
        broadcast_shape = [1] * len(input_shape)
        is_instance_norm = (input_shape[self.axis] // self.groups) == 1
        if not is_instance_norm:
            broadcast_shape[self.axis] = input_shape[self.axis] // self.groups
            broadcast_shape.insert(self.axis, self.groups)
        else:
            broadcast_shape[self.axis] = self.groups
        return broadcast_shape


class TFWav2Vec2WeightNormConv1D(keras.layers.Conv1D):
    """Adapted from https://www.tensorflow.org/probability/api_docs/python/tfp/layers/weight_norm/WeightNorm"""

    def __init__(self, filters, kernel_size, groups, explicit_padding, **kwargs):
        super().__init__(
            filters=filters,
            kernel_size=kernel_size,
            groups=groups,
            padding="valid",
            use_bias=True,
            bias_initializer="he_normal",
            **kwargs,
        )
        self.explicit_padding = explicit_padding
        self.filter_axis = 2
        self.kernel_norm_axes = tf.constant([0, 1])

    def _init_norm(self):
        """Set the norm of the weight vector."""
        kernel_norm = tf.sqrt(tf.reduce_sum(tf.square(self.weight_v), axis=self.kernel_norm_axes))
        self.weight_g.assign(kernel_norm[:, tf.newaxis, tf.newaxis])

    def _normalize_kernel(self):
        """Generate normalized weights."""
        kernel = tf.nn.l2_normalize(self.weight_v, axis=self.kernel_norm_axes) * tf.transpose(self.weight_g)
        self.kernel = tf.transpose(kernel)

    def build(self, input_shape):
        if not self.built:
            super().build(input_shape)

            # split the kernel into a direction (`weight_v`) and a magnitude (`weight_g`)
            self.kernel = tf.Variable(tf.transpose(self.kernel), name="weight_v", trainable=True)
            self.weight_v = self.kernel

            self.weight_g = self.add_weight(
                name="weight_g",
                shape=(int(self.weight_v.shape[self.filter_axis]), 1, 1),
                initializer="ones",
                dtype=self.weight_v.dtype,
                trainable=True,
            )
            self._init_norm()
            self.bias = self.add_weight(name="bias", shape=(self.filters,), initializer="zeros", trainable=True)

    def call(self, inputs):
        # recompute the normalized kernel from `weight_v`/`weight_g` on every forward pass
        self._normalize_kernel()

        padded_inputs = tf.pad(inputs, ((0, 0), (self.explicit_padding, self.explicit_padding), (0, 0)))
        output = super().call(padded_inputs)

        return output


class TFWav2Vec2NoLayerNormConvLayer(keras.layers.Layer):
    def __init__(self, config: Wav2Vec2Config, layer_id: int = 0, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = keras.layers.Conv1D(
            filters=self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            strides=config.conv_stride[layer_id],
            use_bias=config.conv_bias,
            name="conv",
        )
        self.activation = get_tf_activation(config.feat_extract_activation)

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        hidden_states = self.conv(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "conv", None) is not None:
            with tf.name_scope(self.conv.name):
                self.conv.build([None, None, self.in_conv_dim])


class TFWav2Vec2LayerNormConvLayer(keras.layers.Layer):
    def __init__(self, config: Wav2Vec2Config, layer_id: int = 0, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = keras.layers.Conv1D(
            filters=self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            strides=config.conv_stride[layer_id],
            use_bias=config.conv_bias,
            name="conv",
        )
        self.layer_norm = keras.layers.LayerNormalization(name="layer_norm", epsilon=config.layer_norm_eps)
        self.activation = get_tf_activation(config.feat_extract_activation)

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        hidden_states = self.conv(hidden_states)
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "conv", None) is not None:
            with tf.name_scope(self.conv.name):
                self.conv.build([None, None, self.in_conv_dim])
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.out_conv_dim])


class TFWav2Vec2GroupNormConvLayer(keras.layers.Layer):
    def __init__(self, config: Wav2Vec2Config, layer_id: int = 0, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = keras.layers.Conv1D(
            filters=self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            strides=config.conv_stride[layer_id],
            use_bias=config.conv_bias,
            name="conv",
        )
        self.activation = get_tf_activation(config.feat_extract_activation)
        self.layer_norm = TFWav2Vec2GroupNorm(
            groups=self.out_conv_dim, epsilon=config.layer_norm_eps, name="layer_norm"
        )

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        hidden_states = self.conv(hidden_states)
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "conv", None) is not None:
            with tf.name_scope(self.conv.name):
                self.conv.build([None, None, self.in_conv_dim])
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.out_conv_dim])


class TFWav2Vec2PositionalConvEmbedding(keras.layers.Layer):
    def __init__(self, config: Wav2Vec2Config, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        self.conv = TFWav2Vec2WeightNormConv1D(
            filters=config.hidden_size,
            kernel_size=config.num_conv_pos_embeddings,
            groups=config.num_conv_pos_embedding_groups,
            explicit_padding=config.num_conv_pos_embeddings // 2,
            name="conv",
        )
        self.padding = TFWav2Vec2SamePadLayer(config.num_conv_pos_embeddings)
        self.activation = get_tf_activation(config.feat_extract_activation)
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        hidden_states = self.conv(hidden_states)
        hidden_states = self.padding(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "conv", None) is not None:
            with tf.name_scope(self.conv.name):
                self.conv.build([None, None, self.config.hidden_size])


class TFWav2Vec2SamePadLayer(keras.layers.Layer):
    def __init__(self, num_conv_pos_embeddings, **kwargs):
        super().__init__(**kwargs)
        self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0

    def call(self, hidden_states):
        if self.num_pad_remove > 0:
            hidden_states = hidden_states[:, : -self.num_pad_remove, :]
        return hidden_states


class TFWav2Vec2FeatureEncoder(keras.layers.Layer):
    def __init__(self, config: Wav2Vec2Config, **kwargs: Any) -> None:
        super().__init__(**kwargs)

        if config.feat_extract_norm == "group":
            conv_layers = [TFWav2Vec2GroupNormConvLayer(config, layer_id=0, name=f"conv_layers.{0}")] + [
                TFWav2Vec2NoLayerNormConvLayer(config, layer_id=i + 1, name=f"conv_layers.{i + 1}")
                for i in range(config.num_feat_extract_layers - 1)
            ]
        elif config.feat_extract_norm == "layer":
            conv_layers = [
                TFWav2Vec2LayerNormConvLayer(config, layer_id=i, name=f"conv_layers.{i}")
                for i in range(config.num_feat_extract_layers)
            ]
        else:
            raise ValueError(
                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
            )
        self.conv_layers = conv_layers

    def call(self, input_values):
        hidden_states = tf.expand_dims(input_values, -1)
        for conv_layer in self.conv_layers:
            hidden_states = conv_layer(hidden_states)
        return hidden_states

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "conv_layers", None) is not None:
            for conv_layer in self.conv_layers:
                with tf.name_scope(conv_layer.name):
                    conv_layer.build(None)


class TFWav2Vec2FeatureExtractor(TFWav2Vec2FeatureEncoder):
    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)
        warnings.warn(
            f"The class `{self.__class__.__name__}` has been depreciated "
            "and will be removed in Transformers v5. "
            f"Use `{self.__class__.__bases__[0].__name__}` instead.",
            FutureWarning,
        )


class TFWav2Vec2FeatureProjection(keras.layers.Layer):
    def __init__(self, config: Wav2Vec2Config, **kwargs):
        super().__init__(**kwargs)

        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        self.projection = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer="zeros",
            name="projection",
        )
        self.dropout = keras.layers.Dropout(rate=config.feat_proj_dropout)
        self.config = config

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        norm_hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.projection(norm_hidden_states)
        hidden_states = self.dropout(hidden_states, training=training)
        return hidden_states, norm_hidden_states

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.config.conv_dim[-1]])
        if getattr(self, "projection", None) is not None:
            with tf.name_scope(self.projection.name):
                self.projection.build([None, None, self.config.conv_dim[-1]])


class TFWav2Vec2Attention(keras.layers.Layer):
    """Multi-headed attention from "Attention Is All You Need"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = keras.layers.Dropout(dropout)
        self.head_dim = embed_dim // num_heads
        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
        self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
        self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
        self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")

    def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
        return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))

    def call(
        self,
        hidden_states: tf.Tensor,
        key_value_states: tf.Tensor | None = None,
        past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
        attention_mask: tf.Tensor | None = None,
        layer_head_mask: tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Tuple[tf.Tensor, tf.Tensor | None]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None
        bsz, tgt_len, embed_dim = shape_list(hidden_states)

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling
        # get key, value proj
        if is_cross_attention and past_key_value is not None:
            # reuse k, v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = tf.concat([past_key_value[0], key_states], axis=2)
            value_states = tf.concat([past_key_value[1], value_states], axis=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            past_key_value = (key_states, value_states)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = tf.reshape(self._shape(query_states, tgt_len, bsz), proj_shape)
        key_states = tf.reshape(key_states, proj_shape)
        value_states = tf.reshape(value_states, proj_shape)

        src_len = shape_list(key_states)[1]
        attn_weights = tf.matmul(query_states, key_states, transpose_b=True)

        tf.debugging.assert_equal(
            shape_list(attn_weights),
            [bsz * self.num_heads, tgt_len, src_len],
            message=(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {shape_list(attn_weights)}"
            ),
        )

        if attention_mask is not None:
            tf.debugging.assert_equal(
                shape_list(attention_mask),
                [bsz, 1, tgt_len, src_len],
                message=(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                    f" {shape_list(attention_mask)}"
                ),
            )

            attention_mask = tf.cast(attention_mask, dtype=attn_weights.dtype)
            attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask
            attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))

        attn_weights = stable_softmax(attn_weights, axis=-1)

        if layer_head_mask is not None:
            tf.debugging.assert_equal(
                shape_list(layer_head_mask),
                [self.num_heads],
                message=(
                    f"Head mask for a single layer should be of size {(self.num_heads)}, but is"
                    f" {shape_list(layer_head_mask)}"
                ),
            )

            attn_weights = tf.reshape(layer_head_mask, (1, -1, 1, 1)) * tf.reshape(
                attn_weights, (bsz, self.num_heads, tgt_len, src_len)
            )
            attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))

        attn_probs = self.dropout(attn_weights, training=training)
        attn_output = tf.matmul(attn_probs, value_states)

        tf.debugging.assert_equal(
            shape_list(attn_output),
            [bsz * self.num_heads, tgt_len, self.head_dim],
            message=(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {shape_list(attn_output)}"
            ),
        )

        attn_output = tf.transpose(
            tf.reshape(attn_output, (bsz, self.num_heads, tgt_len, self.head_dim)), (0, 2, 1, 3)
        )
        attn_output = tf.reshape(attn_output, (bsz, tgt_len, embed_dim))

        attn_output = self.out_proj(attn_output)
        attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len))

        return attn_output, attn_weights, past_key_value

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "k_proj", None) is not None:
            with tf.name_scope(self.k_proj.name):
                self.k_proj.build([None, None, self.embed_dim])
        if getattr(self, "q_proj", None) is not None:
            with tf.name_scope(self.q_proj.name):
                self.q_proj.build([None, None, self.embed_dim])
        if getattr(self, "v_proj", None) is not None:
            with tf.name_scope(self.v_proj.name):
                self.v_proj.build([None, None, self.embed_dim])
        if getattr(self, "out_proj", None) is not None:
            with tf.name_scope(self.out_proj.name):
                self.out_proj.build([None, None, self.embed_dim])


class TFWav2Vec2FeedForward(keras.layers.Layer):
    def __init__(self, config: Wav2Vec2Config, **kwargs):
        super().__init__(**kwargs)

        self.intermediate_dropout = keras.layers.Dropout(config.activation_dropout)

        self.intermediate_dense = keras.layers.Dense(
            units=config.intermediate_size,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer="zeros",
            name="intermediate_dense",
        )
        self.intermediate_act_fn = get_tf_activation(config.hidden_act)

        self.output_dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer="zeros",
            name="output_dense",
        )
        self.output_dropout = keras.layers.Dropout(config.hidden_dropout)
        self.config = config

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        hidden_states = self.intermediate_dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        hidden_states = self.intermediate_dropout(hidden_states, training=training)

        hidden_states = self.output_dense(hidden_states)
        hidden_states = self.output_dropout(hidden_states, training=training)
        return hidden_states

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "intermediate_dense", None) is not None:
            with tf.name_scope(self.intermediate_dense.name):
                self.intermediate_dense.build([None, None, self.config.hidden_size])
        if getattr(self, "output_dense", None) is not None:
            with tf.name_scope(self.output_dense.name):
                self.output_dense.build([None, None, self.config.intermediate_size])


class TFWav2Vec2EncoderLayer(keras.layers.Layer):
    def __init__(self, config: Wav2Vec2Config, **kwargs):
        super().__init__(**kwargs)
        self.attention = TFWav2Vec2Attention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=False,
            name="attention",
        )
        self.dropout = keras.layers.Dropout(config.hidden_dropout)
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        self.feed_forward = TFWav2Vec2FeedForward(config, name="feed_forward")
        self.final_layer_norm = keras.layers.LayerNormalization(
            epsilon=config.layer_norm_eps, name="final_layer_norm"
        )
        self.config = config

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = False,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        attn_residual = hidden_states
        hidden_states, attn_weights, _ = self.attention(
            hidden_states, attention_mask=attention_mask, training=training
        )
        hidden_states = self.dropout(hidden_states, training=training)
        hidden_states = attn_residual + hidden_states

        hidden_states = self.layer_norm(hidden_states)
        hidden_states = hidden_states + self.feed_forward(hidden_states)
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "attention", None) is not None:
            with tf.name_scope(self.attention.name):
                self.attention.build(None)
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.config.hidden_size])
        if getattr(self, "feed_forward", None) is not None:
            with tf.name_scope(self.feed_forward.name):
                self.feed_forward.build(None)
        if getattr(self, "final_layer_norm", None) is not None:
            with tf.name_scope(self.final_layer_norm.name):
                self.final_layer_norm.build([None, None, self.config.hidden_size])


class TFWav2Vec2EncoderLayerStableLayerNorm(keras.layers.Layer):
    def __init__(self, config: Wav2Vec2Config, **kwargs):
        super().__init__(**kwargs)
        self.attention = TFWav2Vec2Attention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=False,
            name="attention",
        )
        self.dropout = keras.layers.Dropout(config.hidden_dropout)
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        self.feed_forward = TFWav2Vec2FeedForward(config, name="feed_forward")
        self.final_layer_norm = keras.layers.LayerNormalization(
            epsilon=config.layer_norm_eps, name="final_layer_norm"
        )
        self.config = config

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = False,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        attn_residual = hidden_states
        hidden_states = self.layer_norm(hidden_states)
        hidden_states, attn_weights, _ = self.attention(
            hidden_states, attention_mask=attention_mask, training=training
        )
        hidden_states = self.dropout(hidden_states, training=training)
        hidden_states = attn_residual + hidden_states
        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "attention", None) is not None:
            with tf.name_scope(self.attention.name):
                self.attention.build(None)
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.config.hidden_size])
        if getattr(self, "feed_forward", None) is not None:
            with tf.name_scope(self.feed_forward.name):
                self.feed_forward.build(None)
        if getattr(self, "final_layer_norm", None) is not None:
            with tf.name_scope(self.final_layer_norm.name):
                self.final_layer_norm.build([None, None, self.config.hidden_size])


class TFWav2Vec2Encoder(keras.layers.Layer):
    def __init__(self, config: Wav2Vec2Config, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.pos_conv_embed = TFWav2Vec2PositionalConvEmbedding(config, name="pos_conv_embed")
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        self.dropout = keras.layers.Dropout(config.hidden_dropout)
        self.layer = [TFWav2Vec2EncoderLayer(config, name=f"layers.{i}") for i in range(config.num_hidden_layers)]

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if attention_mask is not None:
            hidden_states = hidden_states * tf.expand_dims(attention_mask, -1)
            attention_mask = _expand_mask(attention_mask)
        else:
            attention_mask = None

        position_embeddings = self.pos_conv_embed(hidden_states)
        hidden_states = hidden_states + position_embeddings
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states, training=training)

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = np.random.uniform(0, 1)
            if training and (dropout_probability < self.config.layerdrop):  # skip the layer
                continue

            layer_outputs = layer_module(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                training=training,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        # add last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return TFBaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "pos_conv_embed", None) is not None:
            with tf.name_scope(self.pos_conv_embed.name):
                self.pos_conv_embed.build(None)
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.config.hidden_size])
        if getattr(self, "layer", None) is not None:
            for layer in self.layer:
                with tf.name_scope(layer.name):
                    layer.build(None)


class TFWav2Vec2EncoderStableLayerNorm(keras.layers.Layer):
    def __init__(self, config: Wav2Vec2Config, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.pos_conv_embed = TFWav2Vec2PositionalConvEmbedding(config, name="pos_conv_embed")
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        self.dropout = keras.layers.Dropout(config.hidden_dropout)
        self.layer = [
            TFWav2Vec2EncoderLayerStableLayerNorm(config, name=f"layers.{i}")
            for i in range(config.num_hidden_layers)
        ]

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if attention_mask is not None:
            hidden_states = hidden_states * tf.expand_dims(attention_mask, -1)
            attention_mask = _expand_mask(attention_mask)
        else:
            attention_mask = None

        position_embeddings = self.pos_conv_embed(hidden_states)
        hidden_states = hidden_states + position_embeddings
        hidden_states = self.dropout(hidden_states, training=training)

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = np.random.uniform(0, 1)
            if training and (dropout_probability < self.config.layerdrop):  # skip the layer
                continue

            layer_outputs = layer_module(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                training=training,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        hidden_states = self.layer_norm(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return TFBaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "pos_conv_embed", None) is not None:
            with tf.name_scope(self.pos_conv_embed.name):
                self.pos_conv_embed.build(None)
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.config.hidden_size])
        if getattr(self, "layer", None) is not None:
            for layer in self.layer:
                with tf.name_scope(layer.name):
                    layer.build(None)


@keras_serializable
class TFWav2Vec2MainLayer(keras.layers.Layer):
    config_class = Wav2Vec2Config

    def __init__(self, config: Wav2Vec2Config, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.feature_extractor = TFWav2Vec2FeatureEncoder(config, name="feature_extractor")
        self.feature_projection = TFWav2Vec2FeatureProjection(config, name="feature_projection")

        if config.do_stable_layer_norm:
            self.encoder = TFWav2Vec2EncoderStableLayerNorm(config, name="encoder")
        else:
            self.encoder = TFWav2Vec2Encoder(config, name="encoder")

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if self.config.mask_time_prob > 0.0 or self.config.mask_feature_prob > 0.0:
            self.masked_spec_embed = self.add_weight(
                shape=(self.config.hidden_size,), initializer="uniform", trainable=True, name="masked_spec_embed"
            )
        if getattr(self, "feature_extractor", None) is not None:
            with tf.name_scope(self.feature_extractor.name):
                self.feature_extractor.build(None)
        if getattr(self, "feature_projection", None) is not None:
            with tf.name_scope(self.feature_projection.name):
                self.feature_projection.build(None)
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)

    def _get_feat_extract_output_lengths(self, input_lengths: tf.Tensor):
        """
        Computes the output length of the convolutional layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            return (input_length - kernel_size) // stride + 1

        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)

        return input_lengths

    def _mask_hidden_states(self, hidden_states: tf.Tensor, mask_time_indices: tf.Tensor | None = None):
        """
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        """
        batch_size, sequence_length, hidden_size = shape_list(hidden_states)

        # `config.apply_spec_augment` can set masking to False
        if not getattr(self.config, "apply_spec_augment", True):
            return hidden_states

        if mask_time_indices is not None:
            # apply SpecAugment along time axis with given mask_time_indices
            hidden_states = tf.where(
                tf.cast(mask_time_indices[:, :, tf.newaxis], tf.bool),
                self.masked_spec_embed[tf.newaxis, tf.newaxis, :],
                hidden_states,
            )
        elif self.config.mask_time_prob > 0:
            # generate indices & apply SpecAugment along time axis
            mask_time_indices = _compute_mask_indices(
                (batch_size, sequence_length),
                mask_prob=self.config.mask_time_prob,
                mask_length=self.config.mask_time_length,
                min_masks=2,
            )
            hidden_states = tf.where(
                tf.cast(mask_time_indices[:, :, tf.newaxis], tf.bool),
                self.masked_spec_embed[tf.newaxis, tf.newaxis, :],
                hidden_states,
            )

        # apply SpecAugment along feature axis
        if self.config.mask_feature_prob > 0:
            mask_feature_indices = _compute_mask_indices(
                (batch_size, hidden_size),
                mask_prob=self.config.mask_feature_prob,
                mask_length=self.config.mask_feature_length,
            )
            hidden_states = tf.where(
                tf.cast(mask_feature_indices[:, tf.newaxis, :], tf.bool),
                0.0,
                hidden_states,
            )

        return hidden_states

    @unpack_inputs
    def call(
        self,
        input_values: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        token_type_ids: tf.Tensor | None = None,
        position_ids: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
        **kwargs: Any,
    ):
        extract_features = self.feature_extractor(tf.cast(input_values, tf.float32), training=training)

        if attention_mask is not None:
            # compute real output lengths according to convolution formula
            output_lengths = self._get_feat_extract_output_lengths(tf.reduce_sum(attention_mask, -1))

            attention_mask = tf.sequence_mask(
                output_lengths, maxlen=shape_list(extract_features)[1], dtype=extract_features.dtype
            )

        hidden_states, extract_features = self.feature_projection(extract_features, training=training)

        mask_time_indices = kwargs.get("mask_time_indices", None)
        if training:
            hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices)

        encoder_outputs = self.encoder(
            hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        hidden_states = encoder_outputs[0]

        if not return_dict:
            return (hidden_states, extract_features) + encoder_outputs[1:]

        return TFWav2Vec2BaseModelOutput(
            last_hidden_state=hidden_states,
            extract_features=extract_features,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class TFWav2Vec2PreTrainedModel(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = Wav2Vec2Config
    base_model_prefix = "wav2vec2"
    main_input_name = "input_values"

    @property
    def input_signature(self):
        return {
            "input_values": tf.TensorSpec((None, None), tf.float32, name="input_values"),
            "attention_mask": tf.TensorSpec((None, None), tf.float32, name="attention_mask"),
        }

    @property
    def dummy_inputs(self):
        return {
            "input_values": tf.random.uniform(shape=(1, 16000), dtype=tf.float32),
            "attention_mask": tf.ones(shape=(1, 16000), dtype=tf.float32),
        }

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        logger.warning(
            f"\n{self.__class__.__name__} has backpropagation operations that are NOT supported on CPU. If you wish "
            "to train/fine-tune this model, you need a GPU or a TPU"
        )

    def _get_feat_extract_output_lengths(self, input_lengths, add_adapter=None):
        """
        Computes the output length of the convolutional layers
        """
        add_adapter = self.config.add_adapter if add_adapter is None else add_adapter

        def _conv_out_length(input_length, kernel_size, stride):
            return tf.math.floordiv(input_length - kernel_size, stride) + 1

        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)

        if add_adapter:
            for _ in range(self.config.num_adapter_layers):
                input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)

        return input_lengths

    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: tf.Tensor):
        non_padded_lengths = tf.math.cumsum(attention_mask, axis=-1)[:, -1]
        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths)
        output_lengths = tf.cast(output_lengths, tf.int32)
        batch_size = tf.shape(attention_mask)[0]

        attention_mask = tf.zeros((batch_size, feature_vector_length), dtype=attention_mask.dtype)
        # these two operations make sure that all values before the output length indices are attended to
        attention_mask = tf.tensor_scatter_nd_update(
            attention_mask,
            indices=tf.stack([tf.range(batch_size), output_lengths - 1], axis=1),
            updates=tf.ones([batch_size], dtype=attention_mask.dtype),
        )
        attention_mask = tf.reverse(attention_mask, axis=[-1])
        attention_mask = tf.cumsum(attention_mask, axis=-1)
        attention_mask = tf.reverse(attention_mask, axis=[-1])
        attention_mask = tf.cast(attention_mask, tf.bool)
        return attention_mask


WAV2VEC2_START_DOCSTRING = r"""
    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_values` only and nothing else: `model(input_values)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_values, attention_mask])` or `model([input_values, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_values": input_values, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Args:
        config ([`Wav2Vec2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

WAV2VEC2_INPUTS_DOCSTRING = r"""
    Args:
        input_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_values` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_values` indices into associated vectors
            than the model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
"""


@add_start_docstrings(
    "The bare TFWav2Vec2 Model transformer outputting raw hidden-states without any specific head on top.",
    WAV2VEC2_START_DOCSTRING,
)
class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
    def __init__(self, config: Wav2Vec2Config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.config = config
        self.wav2vec2 = TFWav2Vec2MainLayer(config, name="wav2vec2")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(WAV2VEC2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFBaseModelOutput, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        input_values: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        token_type_ids: tf.Tensor | None = None,
        position_ids: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        """

        Returns:

        Example:

        ```python
        >>> from transformers import AutoProcessor, TFWav2Vec2Model
        >>> from datasets import load_dataset
        >>> import soundfile as sf

        >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
        >>> model = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")


        >>> def map_to_array(batch):
        ...     speech, _ = sf.read(batch["file"])
        ...     batch["speech"] = speech
        ...     return batch


        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
        >>> hidden_states = model(input_values).last_hidden_state
        ```"""

        output_hidden_states = output_hidden_states if output_hidden_states else self.config.output_hidden_states
        output_attentions = output_attentions if output_attentions else self.config.output_attentions
        return_dict = return_dict if return_dict else self.config.return_dict

        outputs = self.wav2vec2(
            input_values=input_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        return outputs

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "wav2vec2", None) is not None:
            with tf.name_scope(self.wav2vec2.name):
                self.wav2vec2.build(None)
ed		
	
	
	
	
	
	
	
	
	d!d"ddZd#ddZ  ZS )$TFWav2Vec2ForCTCr   r   c                   st   t  j|g|R i | t|dd| _tj|j| _tjj	|j
dd| _t|dr4|jr4|j| _d S |j| _d S )Nr  rG  lm_headr  )rt   ru   ru  r  r   r   r  Zfinal_dropoutr  r  
vocab_sizer  hasattrr  output_hidden_sizer   r  r}   r&   r'   ru   1  s   zTFWav2Vec2ForCTC.__init__c                 C     t dt |   dS z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        zThe method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.Nr  r  r  freeze_feature_encoderr   r&   r&   r'   freeze_feature_extractor;  
   z)TFWav2Vec2ForCTC.freeze_feature_extractorc                 C     d| j j_dS z
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        FNr  rv  r   r   r&   r&   r'   r  G     z'TFWav2Vec2ForCTC.freeze_feature_encoderr  NFr  rE   r3  r0  r  r  r  r  rK  r5  labelsr^  r_  r  rD   )Union[TFCausalLMOutput, Tuple[tf.Tensor]]c                 C  s^  |durt || jjkrtd| jj | j||||||||	|
|d
}|d }| j||d}| |}|dur|dur?|nt j|t j	d}| j
t j|dd}t |dkt j}t j|dd}t jj||||| jjd	d
}| jjdkr{t |}| jjdkrt |}t |d}nd}|
s|f|td  }|dur|f| S |S t|||j|jdS )a  
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_values` docstring) Tokens with indices set to `-100` are ignored (masked),
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Returns:

        Example:

        ```python
        >>> import tensorflow as tf
        >>> from transformers import AutoProcessor, TFWav2Vec2ForCTC
        >>> from datasets import load_dataset
        >>> import soundfile as sf

        >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
        >>> model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")


        >>> def map_to_array(batch):
        ...     speech, _ = sf.read(batch["file"])
        ...     batch["speech"] = speech
        ...     return batch


        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
        >>> logits = model(input_values).logits
        >>> predicted_ids = tf.argmax(logits, axis=-1)

        >>> transcription = processor.decode(predicted_ids[0])

        >>> # compute loss
        >>> target_transcription = "A MAN SAID TO THE UNIVERSE SIR I EXIST"

        >>> # Pass transcription as `text` to encode labels
        >>> labels = processor(text=transcription, return_tensors="tf").input_ids

        >>> loss = model(input_values, labels=labels).loss
        ```Nz$Label values must be <= vocab_size: r  r   r  rJ   r3   r4   F)logitsr  Zlogit_lengthZlabel_lengthZblank_indexZlogits_time_majorsumr   rI   lossr  r   r    )r(   Z
reduce_maxr   r  rL   r  r  r  rW   rO   r  r   rN   rQ   r-   Zctc_lossZpad_token_idZctc_loss_reductionreduce_meanr6   _HIDDEN_STATES_START_POSITIONr   r   r    )r{   r  r3  r  r  r  r  rK  r  r^  r_  r  r   r   r  r}  Zlabels_maskZtarget_lengthsr  r   r&   r&   r'   r   N  s\   <
	

zTFWav2Vec2ForCTC.callc                 C  s   | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urYt| jj | jd d | jg W d    d S 1 sRw   Y  d S d S )NTr  r  )	r   r   r(   r   r  r   r   r  r  r   r&   r&   r'   r     s   "zTFWav2Vec2ForCTC.buildr  )
NNNNNNNNNF)r  rE   r3  r0  r  r0  r  r0  r  r0  r  r0  rK  r5  r  r0  r^  r5  r_  r5  r  r5  rD   r  r   )r!   r"   r#   ru   r  r  r   r   r  r   r   r  r   r   r   r&   r&   r}   r'   r  ,  s&    

ur  c                      sZ   e Zd Z fddZdd Zdd Zdd Ze											
ddddZdddZ	  Z
S )#TFWav2Vec2ForSequenceClassificationc                   s   t  | t|dd| _|jd | _t|   |j	r*| j
| jfdddd| _W d    n1 s4w   Y  || _tjj|jdd	| _tjj|jd d
d| _d S )Nr  rG  r   rT   Tlayer_weightsrz  	projector)r  r   
classifier)r  r   r   )rt   ru   ru  r  r]  Z
num_layersr(   r   Z_name_scopeuse_weighted_layer_sumr   r  r   r   r   r  classifier_proj_sizer  
num_labelsr  )r{   r   r}   r&   r'   ru     s   z,TFWav2Vec2ForSequenceClassification.__init__c                 C  r  r  r  r   r&   r&   r'   r    r  z<TFWav2Vec2ForSequenceClassification.freeze_feature_extractorc                 C  r  r  r  r   r&   r&   r'   r    r  z:TFWav2Vec2ForSequenceClassification.freeze_feature_encoderc                 C  s   | j jD ]}d|_qdS )z
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FN)r  r   r   )r{   r  r&   r&   r'   freeze_base_model  s   z5TFWav2Vec2ForSequenceClassification.freeze_base_modelNFr  rE   r3  r0  rK  bool | Noner^  r_  r  r  ri   rD   -TFSequenceClassifierOutput | Tuple[tf.Tensor]c                 C  s  |d ur|n| j j}| j jrdn|}| j||||||d}| j jrE|t }	tj|	dd}	tjj| j	dd}
tj
|	t|
g d dd}	n|d }	| |	}	|d u rZtj|	dd}n1| t|	d |}t||	j}t|	tj|dd}	ttj
|	ddtjtj
|dddd}| |}d }|d urtjjdd}|t|dgt|d| j jg}|s|f|td   }|d ur|f| S |S t|||j|jd	S )
NTr  r   r4   r3   )r3   r   r   r   )Zfrom_logitsr  )r   Zuse_return_dictr  r  r  r(   r   r-   Zsoftmaxr  r   r6   r  r  r  r   rN   rK   multiplyr7   divider  r   ZlossesZSparseCategoricalCrossentropyr  r   r   r    )r{   r  r3  rK  r^  r_  r  r  r   r   Znorm_weightsZpooled_outputZpadding_maskZpadding_mask_floatr  r  Zloss_fnr   r&   r&   r'   r     sN    
"
$z(TFWav2Vec2ForSequenceClassification.callc                 C  s  | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urWt| jj | jd d | jj	g W d    n1 sRw   Y  t| dd d urt| j
j | j
d d | jjg W d    d S 1 s}w   Y  d S d S )NTr  r  r  )r   r   r(   r   r  r   r   r  r   r   r  r  r   r&   r&   r'   r   3  s    "z)TFWav2Vec2ForSequenceClassification.build)NNNNNF)r  rE   r3  r0  rK  r  r^  r  r_  r  r  r0  r  ri   rD   r  r   )r!   r"   r#   ru   r  r  r  r   r   r   r   r&   r&   r}   r'   r    s    7r  )r  r  r  r  r   )
r=   r>   r?   r@   rA   rB   rC   rB   rD   rE   r   )r\   rE   r]   r^   )Mr$   
__future__r   r  dataclassesr   typingr   r   r   r   numpyri  Z
tensorflowr(   Zactivations_tfr	   Zmodeling_tf_outputsr
   r   r   Zmodeling_tf_utilsr   r   r   r   r   Ztf_utilsr   r   utilsr   r   r   r   r   Zconfiguration_wav2vec2r   Z
get_loggerr!   r  r  Z_CHECKPOINT_FOR_DOCr  r`   r   r2   r<   r[   rb   r   ZLayerrc   r   r   r   r   r   r   r   r  r
  r  r  r9  rB  rU  rW  rt  ru  r  ZWAV2VEC2_START_DOCSTRINGr  r  r  r  __all__r&   r&   r&   r'   <module>   s|   

K Y8"$$! +,;9PR H*8N #p
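# Usage sketch for the sequence-classification head (illustrative only; unlike the
# CTC and base models above there is no docstring example for it, and the checkpoint
# name below is a placeholder rather than a guaranteed Hub id):
#
#     import tensorflow as tf
#     from transformers import AutoFeatureExtractor, TFWav2Vec2ForSequenceClassification
#
#     extractor = AutoFeatureExtractor.from_pretrained("<audio-classification-checkpoint>")
#     model = TFWav2Vec2ForSequenceClassification.from_pretrained("<audio-classification-checkpoint>")
#     inputs = extractor(raw_speech, sampling_rate=16000, return_tensors="tf")
#     logits = model(**inputs).logits
#     predicted_class_id = int(tf.argmax(logits, axis=-1)[0])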