import math
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.nn as nn

from ...modeling_outputs import ModelOutput, Wav2Vec2BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from ..wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Encoder,
    Wav2Vec2EncoderStableLayerNorm,
    Wav2Vec2FeatureEncoder,
    Wav2Vec2FeatureProjection,
    Wav2Vec2ForCTC,
    Wav2Vec2ForSequenceClassification,
    Wav2Vec2GumbelVectorQuantizer,
    Wav2Vec2Model,
    Wav2Vec2PositionalConvEmbedding,
)
from .configuration_unispeech import UniSpeechConfig


logger = logging.get_logger(__name__)


@dataclass
class UniSpeechForPreTrainingOutput(ModelOutput):
    """
    Output type of [`UniSpeechForPreTraining`], with potential hidden states and attentions.

    Args:
        loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the
            [official paper](https://arxiv.org/abs/2006.11477).
        projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
            Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
            projected quantized states.
        projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
            Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
            target vectors for contrastive loss.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
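        codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
            The perplexity of the codevector distribution, used as a measure of codebook diversity.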
Nlossprojected_statesprojected_quantized_statescodevector_perplexityhidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations__r   r   r   r    r   r!   __static_attributes__r"       g/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/unispeech/modular_unispeech.pyr   r      s    4 )-D(5$$
%,48hu0018>B):): ;B9=8E$5$56=8<M8E%"3"345<59Ju00129r,   r   c                       \ rS rSrSrg) UniSpeechPositionalConvEmbeddingA   r"   Nr#   r$   r%   r&   r+   r"   r,   r-   r/   r/   A       r,   r/   c                       \ rS rSrSrg)UniSpeechFeatureEncoderE   r"   Nr1   r"   r,   r-   r4   r4   E   r2   r,   r4   c                       \ rS rSrSrg)UniSpeechFeatureProjectionI   r"   Nr1   r"   r,   r-   r7   r7   I   r2   r,   r7   c                       \ rS rSrSrg)UniSpeechEncoderM   r"   Nr1   r"   r,   r-   r:   r:   M   r2   r,   r:   c                       \ rS rSrSrg)UniSpeechEncoderStableLayerNormQ   r"   Nr1   r"   r,   r-   r=   r=   Q   r2   r,   r=   c                   *    \ rS rSr\S 5       rS rSrg)UniSpeechGumbelVectorQuantizerU   c           	          U R                  SS9n[        R                  " [        R                  " U[        R                  " US-   5      -  SS9* 5      R                  5       nU$ )Nr   dimgHz>)meanr(   expsumlog)probsmarginal_probs
perplexitys      r-   _compute_perplexity2UniSpeechGumbelVectorQuantizer._compute_perplexityV   sR    *YY		.599^VZEZ;[*[ac ddeiik
r,   c                    UR                   u  p#nU R                  U5      nUR                  X#-  U R                  -  S5      nU R                  (       a  [
        R                  R                  UR                  5       U R                  SS9R                  U5      n[        R                  " UR                  X#-  U R                  S5      R                  5       SS9nU R                  U5      nOyUR                  SS9nUR                  " UR                   6 R!                  SUR                  SS5      S5      nUR                  X#-  U R                  S5      nU R                  U5      nUR                  X#-  S5      nUR#                  S5      U R$                  -  n	U	R                  X#-  U R                  U R&                  S5      n
U
R)                  S5      R                  X#S5      n
X4$ )NrE   T)tauhardrC   r         ?)shapeweight_projview
num_groupstrainingnn
functionalgumbel_softmaxfloattemperaturetype_asr(   softmaxrM   argmax	new_zerosscatter_	unsqueezecodevectorsnum_varsrH   )selfr    
batch_sizesequence_lengthhidden_sizecodevector_probscodevector_soft_distrL   codevector_idxcodevectors_per_grouprd   s              r-   forward&UniSpeechGumbelVectorQuantizer.forward\   s   3@3F3F0
[ ((7%**:+G$//+Y[]^==!}};;##%4+;+;$  <  gm$ 
 $)=="":#?RTU[[]ce$  112FGJ +11b19N,668K8KLUUN''A.   044Z5QSWSbSbdfg112BCJ+001MrR 0 : :2 >AQAQ Q+001Mt`d`m`moqr!oob)..zBO&&r,   r"   N)r#   r$   r%   r&   staticmethodrM   rn   r+   r"   r,   r-   r@   r@   U   s     
#'r,   r@   c                       \ rS rSr\rSrSrSrSr	Sr
S rS\\R                  \4   4S jrS\S	\R                  4S
 jrSrg)UniSpeechPreTrainedModel   	unispeechinput_valuesTc           
         [        U[        5      (       a  UR                  R                  R                  R                  SSS9  UR                  R                  R                  R                  5         [        R                  R                  UR                  5        g[        U[        5      (       a  [        R                  R                  UR                  R                  SS[        R                  " SUR                  R                   S   UR                  R"                  -  -  5      -  S9  [        R                  R%                  UR                  R                  S5        g[        U[&        5      (       a  [        R                  " SUR(                  R*                  -  5      n[        R                  R                  UR(                  R                  U* US9  [        R                  R                  UR(                  R                  U* US9  g[        U[        R,                  5      (       ak  UR                  R                  R                  SU R.                  R0                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R2                  [        R4                  45      (       aJ  UR                  R                  R                  5         UR                  R                  R7                  S5        g[        U[        R8                  5      (       a  [        R                  R;                  UR                  5        UR                  bh  [        R                  " UR<                  UR"                  UR                   S   -  -  5      n[        R                  R                  UR                  U* US9  ggg)	zInitialize the weights        r   )rF   stdr   r   )abNrR   )
isinstancer@   rU   weightdatanormal_biaszero_rY   inituniform_rd   r/   convmathsqrtkernel_sizein_channels	constant_r7   
projectionin_featuresLinearconfiginitializer_range	LayerNorm	GroupNormfill_Conv1dkaiming_normal_groups)rf   moduleks      r-   _init_weights&UniSpeechPreTrainedModel._init_weights   s    f<==%%**222C##((..0GGV//0 @AAGGOO""		!v{{'>'>q'AFKKD[D['["\]]  
 GGfkk..2 :;;		!f//;;;<AGGV..55!qAGGV..33rQ?		**MM&&CT[[5R5R&S{{&  &&( 'r|| <==KK""$MM$$S)		**GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8 ' +r,   input_lengthsc                     S n[        U R                  R                  U R                  R                  5       H  u  p4U" XU5      nM     U$ )z8

    def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
        """
        Computes the output length of the convolutional layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)

        return input_lengths

    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
        # effectively attention_mask.sum(-1), but not in-place to be able to run in inference mode
        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long)
        batch_size = attention_mask.shape[0]

        attention_mask = torch.zeros(
            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
        )
        # these two operations make sure that all values before the output length indices are attended to
        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
        return attention_mask


UniSpeechBaseModelOutput = Wav2Vec2BaseModelOutput


class UniSpeechModel(UniSpeechPreTrainedModel, Wav2Vec2Model):
    def __init__(self, config: UniSpeechConfig):
        UniSpeechPreTrainedModel.__init__(self, config)
        self.config = config
        self.feature_extractor = UniSpeechFeatureEncoder(config)
        self.feature_projection = UniSpeechFeatureProjection(config)

        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
            self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())

        if config.do_stable_layer_norm:
            self.encoder = UniSpeechEncoderStableLayerNorm(config)
        else:
            self.encoder = UniSpeechEncoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def freeze_feature_extractor(self):
        raise AttributeError("Not needed for UniSpeech")

    def freeze_feature_encoder(self):
        raise AttributeError("Not needed for UniSpeech")

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, UniSpeechBaseModelOutput]:
        r"""
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        extract_features = self.feature_extractor(input_values)
        extract_features = extract_features.transpose(1, 2)

        if attention_mask is not None:
            # compute reduced attention_mask corresponding to feature vectors
            attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)

        hidden_states, extract_features = self.feature_projection(extract_features)
        hidden_states = self._mask_hidden_states(
            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
        )

        encoder_outputs = self.encoder(
            hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = encoder_outputs[0]

        if not return_dict:
            return (hidden_states, extract_features) + encoder_outputs[1:]

        return UniSpeechBaseModelOutput(
            last_hidden_state=hidden_states,
            extract_features=extract_features,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    UniSpeech Model with a vector-quantization module and ctc loss for pre-training.
    """
)
class UniSpeechForPreTraining(UniSpeechPreTrainedModel):
    def __init__(self, config: UniSpeechConfig):
        super().__init__(config)
        self.unispeech = UniSpeechModel(config)
        self.dropout_features = nn.Dropout(config.feat_quantizer_dropout)

        self.quantizer = UniSpeechGumbelVectorQuantizer(config)
        self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim)
        self.project_hid = nn.Linear(config.proj_codevector_dim, config.hidden_size)

        self.ctc_proj = nn.Linear(config.hidden_size, config.num_ctc_classes)
        self.dropout = nn.Dropout(config.final_dropout)

        # Initialize weights and apply final processing
        self.post_init()

    def set_gumbel_temperature(self, temperature: int):
        """
        Set the Gumbel softmax temperature to a given value. Only necessary for training
        """
        self.quantizer.temperature = temperature

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        """
        self.unispeech.feature_extractor._freeze_parameters()

    @staticmethod
    def compute_contrastive_logits(
        target_features: torch.FloatTensor,
        negative_features: torch.FloatTensor,
        predicted_features: torch.FloatTensor,
        temperature: int = 1,
    ):
        """
        Compute logits for contrastive loss using cosine similarity as the distance measure between
        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, a temperature can be applied.
        """
        target_features = torch.cat([target_features, negative_features], dim=0)

        logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1)
        logits = logits.type_as(target_features)

        # apply temperature
        logits = logits / temperature
        return logits

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, UniSpeechForPreTrainingOutput]:
        r"""
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> # TODO: Add full pretraining example
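
        >>> # a minimal forward sketch (illustrative only, assuming 1 second of 16 kHz audio)
        >>> dummy_input = torch.randn(1, 16000)
        >>> with torch.no_grad():
        ...     outputs = model(dummy_input)
        >>> projected_states = outputs.projected_states  # (batch, num_frames, feature_dim)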
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.unispeech(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        transformer_features = outputs[0]

        # quantize all (unmasked) extracted features and project to final vq dim
        extract_features = self.dropout_features(outputs[1])
        quantized_features, codevector_perplexity = self.quantizer(extract_features)

        # project quantized features twice
        quantized_features = self.project_q(quantized_features.to(self.project_q.weight.dtype))
        quantized_features = self.project_hid(quantized_features)

        # randomly choose positions where transformer features are replaced by the quantized ones
        prob_replace_matrix = torch.empty(transformer_features.size(0), transformer_features.size(1)).fill_(
            self.config.replace_prob
        )
        prob_replace_matrix = prob_replace_matrix.transpose(0, 1)
        sampled_replace_matrix = torch.bernoulli(prob_replace_matrix).bool().to(transformer_features.device)
        sampled_replace_matrix = sampled_replace_matrix.transpose(0, 1)
        sampled_replace_matrix = sampled_replace_matrix.unsqueeze(-1)
        logits = transformer_features.masked_fill(sampled_replace_matrix, 0.0) + (
            quantized_features.masked_fill(~sampled_replace_matrix, 0.0)
        )

        # project to ctc units
        logits = self.dropout(logits)
        logits = self.ctc_proj(logits)

        # TODO(PVP) - add negative sampling & loss computation
        loss = None
        if not return_dict:
            if loss is not None:
                return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:]
            return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:]

        return UniSpeechForPreTrainingOutput(
            loss=loss,
            projected_states=transformer_features,
            projected_quantized_states=quantized_features,
            codevector_perplexity=codevector_perplexity,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class UniSpeechForCTC(Wav2Vec2ForCTC):
    pass


class UniSpeechForSequenceClassification(Wav2Vec2ForSequenceClassification):
    pass


__all__ = [
    "UniSpeechForCTC",
    "UniSpeechForPreTraining",
    "UniSpeechForSequenceClassification",
    "UniSpeechModel",
    "UniSpeechPreTrainedModel",
]