
    fTh                     4   S SK r S SKrS SKJrJrJr  S SKrS SKrS SK	J
r
  S SKJ
s  Jr  S SK	Jr  SSKJr  SSKJr  SSKJr  SSKJrJrJrJrJrJr  SS	KJr  SS
KJrJ r J!r!  SSK"J#r#  \!RH                  " \%5      r& " S S\
RN                  5      r( " S S\
RN                  5      r) " S S\
RN                  5      r* " S S\
RN                  5      r+ " S S\
RN                  5      r, " S S\
RN                  5      r- " S S\
RN                  5      r. " S S\
RN                  5      r/ " S S\
RN                  5      r0 " S S \
RN                  5      r1\ " S! S"\5      5       r2 " S# S$\
RN                  5      r3 " S% S&\
RN                  5      r4 " S' S(\
RN                  5      r5 " S) S*\
RN                  5      r6 " S+ S,\
RN                  5      r7 " S- S.\
RN                  5      r8  SJS/\\9\94   S0\:S1\9S2\\Rv                     S3\9S4\Rx                  4S5 jjr=\r>\ " S6 S7\25      5       r?S8r@\" S9S:9 " S; S<\25      5       rA\" S=S:9 " S> S?\25      5       rB\ " S@ SA\25      5       rC " SB SC\
RN                  5      rD " SD SE\
RN                  5      rE\" SFS:9 " SG SH\25      5       rF/ SIQrGg)K    N)OptionalTupleUnion)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)BaseModelOutputCausalLMOutputSequenceClassifierOutputTokenClassifierOutputWav2Vec2BaseModelOutputXVectorOutput)PreTrainedModel)auto_docstringis_peft_availablelogging   )WavLMConfigc                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )WavLMSamePadLayer$   c                 R   > [         TU ]  5         US-  S:X  a  SU l        g SU l        g N   r   r   )super__init__num_pad_remove)selfnum_conv_pos_embeddings	__class__s     `/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/wavlm/modeling_wavlm.pyr   WavLMSamePadLayer.__init__%   s)    #:Q#>!#Ca    c                 X    U R                   S:  a  US S 2S S 2S U R                   * 24   nU$ Nr   r   r    hidden_statess     r#   forwardWavLMSamePadLayer.forward)   s6    ")!Q0F43F3F2F0F*FGMr%   r(   __name__
__module____qualname____firstlineno__r   r+   __static_attributes____classcell__r"   s   @r#   r   r   $   s    K r%   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )WavLMPositionalConvEmbedding/   c                   > [         TU ]  5         [        R                  " UR                  UR                  UR
                  UR
                  S-  UR                  S9U l        [        R                  R                  n[        [        R                  R                  S5      (       a$  [        R                  R                  R                  n[        5       (       Ga%  SS KnUR                  R                  U R                  R                   SS9   U" U R                  SSS9U l        S S S 5        [        U R                  S5      (       aU  U R                  R                  R                   R"                  nU R                  R                  R                   R$                  nO,U R                  R&                  nU R                  R(                  nUR                  R+                  X5        UR                  R+                  X5        OU" U R                  SSS9U l        [-        UR
                  5      U l        [0        UR2                     U l        g ! , (       d  f       GN,= f)	Nr   )kernel_sizepaddinggroupsweight_normr   )modifier_rankweight)namedimparametrizations)r   r   nnConv1dhidden_sizer!   num_conv_pos_embedding_groupsconvutilsr<   hasattrrA   r	   	deepspeedzeroGatheredParametersr>   	original0	original1weight_gweight_vregister_external_parameterr   r:   r   feat_extract_activation
activation)r    configr<   rI   rN   rO   r"   s         r#   r   %WavLMPositionalConvEmbedding.__init__0   s   II6622a777
	 hh**288,,m<<((33??K%''224993C3CST2U'		aH	 Vtyy"4559955<<FF9955<<FF99--99--NN66tFNN66tF#DIIH!DDI()G)GH !?!?@ VUs   I
Ic                     UR                  SS5      nU R                  U5      nU R                  U5      nU R                  U5      nUR                  SS5      nU$ Nr   r   )	transposerF   r:   rR   r)   s     r#   r+   $WavLMPositionalConvEmbedding.forwardQ   sV    %//15		-0]36%//15r%   )rR   rF   r:   r-   r4   s   @r#   r6   r6   /   s    AB r%   r6   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )WavLMFeatureProjection\   c                 4  > [         TU ]  5         [        R                  " UR                  S   UR
                  S9U l        [        R                  " UR                  S   UR                  5      U l	        [        R                  " UR                  5      U l        g )Neps)r   r   rB   	LayerNormconv_dimlayer_norm_eps
layer_normLinearrD   
projectionDropoutfeat_proj_dropoutdropoutr    rS   r"   s     r#   r   WavLMFeatureProjection.__init__]   sf    ,,vr':@U@UV))FOOB$79K9KLzz&":":;r%   c                 n    U R                  U5      nU R                  U5      nU R                  U5      nX4$ N)rc   re   rh   )r    r*   norm_hidden_statess      r#   r+   WavLMFeatureProjection.forwardc   s7    !__];(:;]300r%   )rh   rc   re   r-   r4   s   @r#   rZ   rZ   \   s    <1 1r%   rZ   c                   (  ^  \ rS rSrSr    SS\S\S\S\S\S\4U 4S	 jjjr    SS
\	R                  S\\	R                     S\\	R                     S\S\\	R                  \\	R                     \\\	R                        4   4
S jjrS
\	R                  S\\	R                   \	R"                  4   S\	R                  S\S\	R                  \	R                  44
S jrS\S\S\	R                  4S jrS\	R                  S\	R                  4S jrSrU =r$ )WavLMAttentionk   z=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsrh   num_bucketsmax_distancehas_relative_position_biasc                   > [         TU ]  5         Xl        X l        X0l        X-  U l        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        [        R                  " X5      U l
        [        R                  " X5      U l        [        R                  " X5      U l        [        R                  " X5      U l        X@l        XPl        [        R                   " ["        R$                  " SU R                  SS5      5      U l        [        R                  " U R
                  S5      U l        U(       a1  [        R*                  " U R                  U R                  5      U l        g g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r      )r   r   rr   rs   rh   head_dim
ValueErrorscalingrB   rd   k_projv_projq_projout_projrt   ru   	Parametertorchonesgru_rel_pos_constgru_rel_pos_linear	Embeddingrel_attn_embed)r    rr   rs   rh   rt   ru   rv   r"   s          r#   r   WavLMAttention.__init__n   s#    	""!.MMI%$..8MdnnM]$YKr3  }}d*ii	5ii	5ii	5		)7&(!#ejjDNNAq.Q!R"$))DMM1"=%"$,,t/?/?"PD &r%   r*   attention_maskposition_biasoutput_attentionsreturnc                    UR                  5       u  pgnUcP  U R                  Xw5      nUR                  S5      R                  USSS5      R	                  X`R
                  -  Xw5      nUR	                  UR                  SS U R
                  S4-   5      n	U	R                  SSSS5      n	U R                  U	5      n
U
R	                  U	R                  SS S-   5      R                  S5      n
[        R                  " U
5      R                  SSS9u  pXU R                  -  S	-
  -  S
-   nUR	                  X`R
                  -  SS5      U-  nUR	                  SXw45      nU R                  XX5      u  nnUUU4$ )z'Attention layer with relative attentionNr   r   r]   r   r   )r      r@         ?g       @)sizecompute_bias	unsqueezerepeatviewrs   shapepermuter   sumr   sigmoidchunkr   torch_multi_head_self_attention)r    r*   r   r   r   indexbsztgt_len_gated_hidden_statesrelative_position_projgate_agate_bgate_outputgated_position_biasattn_outputattn_weightss                    r#   r+   WavLMAttention.forward   s    (,,.a   --g?M''*11#q!Q?DDS>>EY[bl  ,001D1DSb1IT^^]_L`1`a199!Q1E "&!8!89L!M!7!<!<=P=V=VWZXZ=[^d=d!e!i!ijl!m '=>DDQBDO)?)? ?# EFL *..s^^/CRKm[166G7MN$($H$H+>%
!\ L-77r%   r   c                 T   UR                  SS5      =n=pgUb  UR                  S5      OSnS=pSn[        R                  " UUUU R                  U R
                  [        R                  " S/5      [        R                  " U R                  R                  U R                  R                  U R                  R                  45      U	U
UU R                  U R                  R                  U R                  R                  U R                   UUUSU R                  R                  U R                  R                  U R                  R                  S9u  pUR                  SS5      nUbC  USS2S4   R#                  UR$                  SS U R
                  4-   UR$                  SS -   5      nX4$ )zCsimple wrapper around torch's multi_head_attention_forward functionr   r   NFT)use_separate_proj_weightq_proj_weightk_proj_weightv_proj_weight)rW   neFmulti_head_attention_forwardrr   rs   r   emptycatr~   biasr|   r}   rh   r   r>   trainingbroadcast_tor   )r    r*   r   r   r   querykeyvaluekey_padding_maskbias_kbias_vadd_zero_attnr   r   s                 r#   r   .WavLMAttention.torch_multi_head_self_attention   s    ,55a;;;3A3M>,,Q/SW  %&$B$BNNNNKKIIt{{'')9)94;;;K;KLMLLMM  MMMM%)++,,++,,++,,+%
!2 "++Aq1# (40==""2A&$..)::\=O=OPQPR=SSL ((r%   query_length
key_lengthc                 ~   [         R                  " U[         R                  S9S S 2S 4   n[         R                  " U[         R                  S9S S S 24   nXC-
  nU R                  U5      nUR	                  U R
                  R                  R                  5      nU R                  U5      nUR                  / SQ5      nU$ )Ndtype)r   r   r   )	r   arangelong_relative_positions_buckettor   r>   devicer   )r    r   r   context_positionmemory_positionrelative_positionrelative_position_bucketvaluess           r#   r   WavLMAttention.compute_bias   s     <<EJJG4P,,zDT1WM+>#'#B#BCT#U #;#>#>t?R?R?Y?Y?`?`#a $$%=>	*r%   relative_positionsc                 &   U R                   S-  nUS:  R                  [        R                  5      U-  n[        R                  " U5      nUS-  nX:  n[        R
                  " UR                  5       U-  5      nU[        R
                  " U R                  U-  5      -  nXbU-
  -  nXF-   R                  [        R                  5      n[        R                  " U[        R                  " XrS-
  5      5      nU[        R                  " XQU5      -  nU$ r   )rt   r   r   r   abslogfloatmathru   min	full_likewhere)r    r   rt   relative_buckets	max_exactis_smallrelative_positions_if_largerelative_position_if_larges           r#   r   )WavLMAttention._relative_positions_bucket   s    &&!+.266uzzB[P"YY'9:1$	%1&+ii0B0H0H0JY0V&W#&ADHHTM^M^ajMjDk&k#&AS\E\&]#&/&M%Q%QRWR\R\%]"%*YY&8RbcTc(d&
" 	EKKF`aar%   )rh   rr   r   r   ry   r|   ru   rt   rs   r   r~   r   r{   r}   )        i@  i   TNNFr   )r.   r/   r0   r1   __doc__intr   boolr   r   Tensorr   r   r+   FloatTensorr   
LongTensor
BoolTensorr   r   r   r2   r3   r4   s   @r#   rp   rp   k   s   G +/"Q"Q "Q 	"Q
 "Q "Q %)"Q "QN 2604"''8||'8 !.'8  -	'8
  '8 
u||Xell3XeELL>Q5RR	S'8R5)((5) e..0@0@@A5) #..	5)
  5) 

U..	/5)n # %BSBS  U=N=N  SXSdSd    r%   rp   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )WavLMFeedForwardi  c                   > [         TU ]  5         [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l	        [        UR                  [        5      (       a  [        UR                     U l        OUR                  U l        [        R                  " UR                  UR                  5      U l        [        R                  " UR                   5      U l        g rl   )r   r   rB   rf   activation_dropoutintermediate_dropoutrd   rD   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutri   s     r#   r   WavLMFeedForward.__init__  s    $&JJv/H/H$I!"$))F,>,>@X@X"Yf''--'-f.?.?'@D$'-'8'8D$IIf&>&>@R@RS jj)>)>?r%   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ rl   )r   r   r   r   r   r)   s     r#   r+   WavLMFeedForward.forward  sX    //>00?11-@))-8++M:r%   )r   r   r   r   r   r-   r4   s   @r#   r   r     s    @ r%   r   c                   B   ^  \ rS rSrSS\S\4U 4S jjjrSS jrSrU =r	$ )	WavLMEncoderLayeri)  rS   rv   c           	        > [         TU ]  5         [        UR                  UR                  UR
                  UR                  UR                  US9U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR                  S9U l        [!        U5      U l        [        R                  " UR                  UR                  S9U l        g N)rr   rs   rh   rt   ru   rv   r^   r   r   rp   rD   num_attention_headsattention_dropoutrt   max_bucket_distance	attentionrB   rf   r   rh   r`   rb   rc   r   feed_forwardfinal_layer_normr    rS   rv   r"   s      r#   r   WavLMEncoderLayer.__init__*      '((00,,**33'A
 zz&"7"78,,v'9'9v?T?TU,V4 "V-?-?VEZEZ [r%   c                     UnU R                  UUUUUS9u  pnU R                  U5      nXa-   nU R                  U5      nXR                  U5      -   nU R	                  U5      nX4nU(       a  X4-  nU$ )Nr   r   r   r   )r   rh   rc   r   r   )	r    r*   r   r   r   r   attn_residualr   outputss	            r#   r+   WavLMEncoderLayer.forward9  s    %59^^)'/ 6D 6
2] ]3%56%(9(9-(HH--m< 0&Gr%   r   rh   r   r   rc   Tr   
r.   r/   r0   r1   r   r   r   r+   r2   r3   r4   s   @r#   r   r   )  s)    \{ \ \ \ r%   r   c                   B   ^  \ rS rSrSS\S\4U 4S jjjrSS jrSrU =r	$ )	 WavLMEncoderLayerStableLayerNormiR  rS   rv   c           	        > [         TU ]  5         [        UR                  UR                  UR
                  UR                  UR                  US9U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR                  S9U l        [!        U5      U l        [        R                  " UR                  UR                  S9U l        g r   r   r  s      r#   r   )WavLMEncoderLayerStableLayerNorm.__init__S  r  r%   c                     UnU R                  U5      nU R                  UUUUS9u  pnU R                  U5      nXQ-   nXR                  U R	                  U5      5      -   nX4nU(       a  Xv4-  nU$ )N)r   r   r   )rc   r   rh   r   r   )r    r*   r   r   r   r  r   r  s           r#   r+   (WavLMEncoderLayerStableLayerNorm.forwardb  s    %659^^)'/	 6D 6
2] ]3%5%(9(9$:O:OP]:^(__ 0&Gr%   r	  r
  )NNFr  r4   s   @r#   r  r  R  s)    \{ \ \ \ r%   r  c                   :   ^  \ rS rSrU 4S jr    SS jrSrU =r$ )WavLMEncoderiw  c           
        > [         TU ]  5         Xl        [        U5      U l        [
        R                  " UR                  UR                  S9U l	        [
        R                  " UR                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[!        XS:H  S9PM     sn5      U l        SU l        g s  snf Nr^   r   )rv   F)r   r   rS   r6   pos_conv_embedrB   r`   rD   rb   rc   rf   r   rh   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointingr    rS   ir"   s      r#   r   WavLMEncoder.__init__x  s    :6B,,v'9'9v?T?TUzz&"7"78mmUZ[a[s[sUtuUtPQv6KUtu
 ',# v    C
c           	         U(       a  SOS nU(       a  SOS nUb4  UR                  S5      R                  SSUR                  S   5      nSX) '   U R                  U5      n	X-   nU R	                  U5      nU R                  U5      n[        5       =(       d    [        U 5      n
S n[        U R                  5       H  u  pU(       a  Xa4-   n[        R                  " / 5      nU R                  =(       a$    US:  =(       a    XR                  R                  :  nU(       a  U
(       aS  U R                  (       a1  U R                  (       a   U R!                  UR"                  UUUU5      nO
U" UUUUUS9nUS S u  pU(       a  SnU(       d  M  UWS   4-   nM     U(       a  Xa4-   nU(       d  [%        S XU4 5       5      $ ['        UUUS	9$ )
N r]   r   r   r   r  NNNc              3   .   #    U  H  oc  M  Uv   M     g 7frl   r!  .0vs     r#   	<genexpr>'WavLMEncoder.forward.<locals>.<genexpr>       m$[q$[   	last_hidden_stater*   
attentions)r   r   r   r  rc   rh   r	   r
   	enumerater  r   randr   rS   	layerdropr  _gradient_checkpointing_func__call__tupler   r    r*   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusr   r  layerdropout_probabilityskip_the_layerlayer_outputss                    r#   r+   WavLMEncoder.forward  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!45M01"11-@%;6]302R6LT6R!$++.HA#$58H$H! #(**R.!]]fq1uf:MP[P[PePe:eN![..4==$($E$E%&%)%M %*%'5&3*;%M 0=Ra/@, 2  &9]1=M<O&O#C /F   14D Dm]GZ$[mmm++*
 	
r%   rS   rh   r  rc   r  r  NFFTr-   r4   s   @r#   r  r  w  s"    	, "D
 D
r%   r  c                   :   ^  \ rS rSrU 4S jr    SS jrSrU =r$ )WavLMEncoderStableLayerNormi  c           
        > [         TU ]  5         Xl        [        U5      U l        [
        R                  " UR                  UR                  S9U l	        [
        R                  " UR                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[!        XS:H  S9PM     sn5      U l        SU l        g s  snf r  )r   r   rS   r6   r  rB   r`   rD   rb   rc   rf   r   rh   r  r  r  r  r  r  r  s      r#   r   $WavLMEncoderStableLayerNorm.__init__  s    :6B,,v'9'9v?T?TUzz&"7"78mm v7788A 1Z[U[]8
 ',#r  c                    U(       a  SOS nU(       a  SOS nUb4  UR                  S5      R                  SSUR                  S   5      nSX) '   U R                  U5      n	X-   nU R	                  U5      n[        5       =(       d    [        U 5      n
S n[        U R                  5       H  u  pU(       a  Xa4-   n[        R                  " / 5      nU R                  =(       a$    US:  =(       a    XR                  R                  :  nU(       a  U
(       aR  U R                  (       a1  U R                  (       a   U R                  UR                   UUUU5      nO	U" UUUUS9nUS S u  pU(       a  SnU(       d  M  UWS   4-   nM     U R#                  U5      nU(       a  Xa4-   nU(       d  [%        S XU4 5       5      $ ['        XUS	9$ )
Nr!  r]   r   r   r   )r   r   r   r"  c              3   .   #    U  H  oc  M  Uv   M     g 7frl   r!  r$  s     r#   r'  6WavLMEncoderStableLayerNorm.forward.<locals>.<genexpr>  r)  r*  r+  )r   r   r   r  rh   r	   r
   r.  r  r   r/  r   rS   r0  r  r1  r2  rc   r3  r   r4  s                    r#   r+   #WavLMEncoderStableLayerNorm.forward  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!45M01"11-@%;]302R6LT6R!$++.HA#$58H$H! #(**R.!]]fq1uf:MP[P[PePe:eN![ ..4==$($E$E%&%)%M %*%'5*;&3	%M 0=Ra/@, 2  &9]1=M<O&O#A /D 6 14D Dm]GZ$[mmm+Yl
 	
r%   rA  rB  r-   r4   s   @r#   rD  rD    s"    ," "B
 B
r%   rD  c                   B   ^  \ rS rSrSrU 4S jr\S 5       rS rSr	U =r
$ )WavLMGumbelVectorQuantizeri  z
Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information.
c                 8  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U R                  -  S:w  a&  [        SUR                   SU R                   S35      e[        R                  " [        R                  " SU R                  U R
                  -  UR                  U R                  -  5      5      U l        [        R                  " UR                  S   U R                  U R
                  -  5      U l        SU l        g )Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenation.r   r]   r   )r   r   num_codevector_groups
num_groupsnum_codevectors_per_groupnum_varscodevector_dimrz   rB   r   r   r   codevectorsrd   ra   weight_projtemperatureri   s     r#   r   #WavLMGumbelVectorQuantizer.__init__$  s     6688  4??2a7)&*?*?)@ A66:oo5F G%%  <<a4==!@&BWBW[_[j[jBjk
 99V__R%8$//DMM:YZ r%   c           	          U R                  SS9n[        R                  " [        R                  " U[        R                  " US-   5      -  SS9* 5      R                  5       nU$ )Nr   r   gHz>r]   )meanr   expr   r   )probsmarginal_probs
perplexitys      r#   _compute_perplexity.WavLMGumbelVectorQuantizer._compute_perplexity9  sR    *YY		.599^VZEZ;[*[ac ddeiik
r%   c                    UR                   u  p#nU R                  U5      nUR                  X#-  U R                  -  S5      nU R                  (       a  [
        R                  R                  UR                  5       U R                  SS9nUR                  U5      n[        R                  " UR                  X#-  U R                  S5      R                  5       SS9nU R                  U5      nOyUR                  SS9nUR                  " UR                   6 R!                  SUR                  SS5      S5      nUR                  X#-  U R                  S5      nU R                  U5      nUR                  X#-  S5      nUR#                  S5      U R$                  -  n	U	R                  X#-  U R                  U R&                  S5      n
U
R)                  S5      R                  X#S5      n
X4$ )Nr]   T)tauhardr   r   r   )r   rT  r   rO  r   rB   
functionalgumbel_softmaxr   rU  type_asr   softmaxr]  argmax	new_zerosscatter_r   rS  rQ  r   )r    r*   
batch_sizesequence_lengthrD   codevector_probscodevector_soft_distr\  codevector_idxcodevectors_per_grouprS  s              r#   r+   "WavLMGumbelVectorQuantizer.forward?  s   3@3F3F0
[ ((7%**:+G$//+Y[]^==!}};;M<O<O<QW[WgWgnr;s/77F $)=="":#?RTU[[]ce$  112FGJ +11b19N,668K8KLUUN''A.   044Z5QSWSbSbdfg112BCJ+001MrR 0 : :2 >AQAQ Q+001Mt`d`m`moqr!oob)..zBO&&r%   )rS  rO  rQ  rU  rT  )r.   r/   r0   r1   r   r   staticmethodr]  r+   r2   r3   r4   s   @r#   rL  rL    s+    
*  
"' "'r%   rL  c                       \ rS rSr\rSrSrSrSr	Sr
S r SS\\R                  \4   S	\\   4S
 jjr SS\S\R                  4S jjrSrg)WavLMPreTrainedModelid  wavlminput_valuesTFc           
         [        U[        5      (       a  UR                  R                  R                  R                  SSS9  UR                  R                  R                  R                  5         [        R                  R                  UR                  5        g[        U[        5      (       a  [        R                  R                  UR                  R                  SS[        R                  " SUR                  R                   S   UR                  R"                  -  -  5      -  S9  [        R                  R%                  UR                  R                  S5        g[        U[&        5      (       a  [        R                  " SUR(                  R*                  -  5      n[        R                  R                  UR(                  R                  U* US9  [        R                  R                  UR(                  R                  U* US9  g[        U[        R,                  5      (       ak  UR                  R                  R                  SU R.                  R0                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R2                  [        R4                  45      (       aJ  UR                  R                  R                  5         UR                  R                  R7                  S5        g[        U[        R8                  5      (       a  [        R                  R;                  UR                  5        UR                  bh  [        R                  " UR<                  UR"                  UR                   S   -  -  5      n[        R                  R                  UR                  U* US9  ggg)	zInitialize the weightsr   r   )rX  stdr   r   )abNr   )r   rL  rT  r>   datanormal_r   zero_rB   inituniform_rS  r6   rF   r   sqrtr9   in_channels	constant_rZ   re   in_featuresrd   rS   initializer_ranger`   	GroupNormfill_rC   kaiming_normal_r;   )r    moduleks      r#   _init_weights"WavLMPreTrainedModel._init_weightsm  s    f899%%**222C##((..0GGV//0 <==GGOO""		!v{{'>'>q'AFKKD[D['["\]]  
 GGfkk..2 677		!f//;;;<AGGV..55!qAGGV..33rQ?		**MM&&CT[[5R5R&S{{&  &&( 'r|| <==KK""$MM$$S)		**GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8 ' +r%   Ninput_lengthsadd_adapterc                 d   Uc  U R                   R                  OUnS n[        U R                   R                  U R                   R                  5       H  u  pEU" XU5      nM     U(       aD  [        U R                   R                  5       H!  nU" USU R                   R                  5      nM#     U$ )z8
Computes the output length of the convolutional layers
c                 8    [         R                  " X-
  USS9S-   $ )Nfloor)rounding_moder   )r   divinput_lengthr9   strides      r#   _conv_out_lengthOWavLMPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length  s      99\7wWZ[[[r%   r   )rS   r  zipconv_kernelconv_strider  num_adapter_layersadapter_stride)r    r  r  r  r9   r  r   s          r#    _get_feat_extract_output_lengths5WavLMPreTrainedModel._get_feat_extract_output_lengths  s     2=1Ddkk--+	\
 $'t{{'>'>@W@W#XK,]PM $Y 4;;99: 04;;C]C] ^ ; r%   feature_vector_lengthr   c                    UR                  SS9S S 2S4   nU R                  XCS9nUR                  [        R                  5      nUR
                  S   n[        R                  " Xa4UR                  UR                  S9nSU[        R                  " UR
                  S   UR                  S9US-
  4'   UR                  S/5      R                  S5      R                  S/5      R                  5       nU$ )Nr]   r   r  r   )r   r   r   )r   )cumsumr  r   r   r   r   zerosr   r   r   flipr   )r    r  r   r  non_padded_lengthsoutput_lengthsrj  s          r#   "_get_feature_vector_attention_mask7WavLMPreTrainedModel._get_feature_vector_attention_mask  s    
 ,22r2:1b5A>>?Q>k'**5::6#))!,
/~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr%   r!  rl   )r.   r/   r0   r1   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_2_supports_sdpar  r   r   r   r   r   r   r  r  r2   r!  r%   r#   rs  rs  d  s    L$O&*#"N9D Z^"5#3#3S#89HPQU0 Y]%(:?:J:J r%   rs  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )WavLMNoLayerNormConvLayeri  c                 b  > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [        UR                     U l        g )Nr   r   r9   r  r   )r   r   ra   in_conv_dimout_conv_dimrB   rC   r  r  	conv_biasrF   r   rQ   rR   r    rS   layer_idr"   s      r#   r   "WavLMNoLayerNormConvLayer.__init__  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@r%   c                 J    U R                  U5      nU R                  U5      nU$ rl   )rF   rR   r)   s     r#   r+   !WavLMNoLayerNormConvLayer.forward  s$    		-06r%   )rR   rF   r  r  r   r-   r4   s   @r#   r  r    s    A r%   r  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )WavLMLayerNormConvLayeri  c                   > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [
        R                  " U R                  SS9U l        [        UR                     U l        g )Nr   r   r  T)elementwise_affine)r   r   ra   r  r  rB   rC   r  r  r  rF   r`   rc   r   rQ   rR   r  s      r#   r    WavLMLayerNormConvLayer.__init__  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@r%   c                     U R                  U5      nUR                  SS5      nU R                  U5      nUR                  SS5      nU R                  U5      nU$ )Nrb  r]   )rF   rW   rc   rR   r)   s     r#   r+   WavLMLayerNormConvLayer.forward  sV    		-0%//B76%//B76r%   rR   rF   r  rc   r  r  r-   r4   s   @r#   r  r    s    A r%   r  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )WavLMGroupNormConvLayeri  c                   > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [        UR                     U l        [
        R                  " U R                  U R                  SS9U l        g )Nr   r   r  T)rO  num_channelsaffine)r   r   ra   r  r  rB   rC   r  r  r  rF   r   rQ   rR   r  rc   r  s      r#   r    WavLMGroupNormConvLayer.__init__  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@,,$2C2CRVRcRclpqr%   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rl   )rF   rc   rR   r)   s     r#   r+   WavLMGroupNormConvLayer.forward  s2    		-066r%   r  r  r-   r4   s   @r#   r  r    s    r  r%   r  c                   8   ^  \ rS rSrSrU 4S jrS rS rSrU =r	$ )WavLMFeatureEncoderi  z.Construct the features from raw audio waveformc           	        > [         TU ]  5         UR                  S:X  a@  [        USS9/[	        UR
                  S-
  5       Vs/ s H  n[        XS-   S9PM     sn-   nOVUR                  S:X  a-  [	        UR
                  5       Vs/ s H  n[        XS9PM     nnO[        SUR                   S35      e[        R                  " U5      U l        SU l        S	U l        g s  snf s  snf )
Ngroupr   )r  r   r<  z`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r   r   feat_extract_normr  r  num_feat_extract_layersr  r  rz   rB   r  conv_layersr  _requires_grad)r    rS   r  r  r"   s       r#   r   WavLMFeatureEncoder.__init__  s    ##w.26AFGKPQWQoQorsQsKtKKta)&q5AKtK K %%0PUV\VtVtPuvPu126FPuKvK01I1I0JJst  ==5&+#"K ws   C C%c                 N    U R                  5        H
  nSUl        M     SU l        g )NF)
parametersrequires_gradr  r    params     r#   _freeze_parameters&WavLMFeatureEncoder._freeze_parameters  s#    __&E"'E '#r%   c                 B   US S 2S 4   nU R                   (       a  U R                  (       a  SUl        U R                   H\  nU R                   (       a@  U R                  (       a/  U R                  (       a  U R                  UR                  U5      nMT  U" U5      nM^     U$ )NT)r  r   r  r  r  r1  r2  )r    ru  r*   
conv_layers       r#   r+   WavLMFeatureEncoder.forward  s    $QW- 4==*.M'**J""t'B'Bt}} $ A A''!!
 !+= 9 + r%   )r  r  r  )
r.   r/   r0   r1   r   r   r  r+   r2   r3   r4   s   @r#   r  r    s    8#"$
 r%   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )WavLMAdapterLayeri.  c                    > [         TU ]  5         [        R                  " UR                  SUR                  -  UR
                  UR                  SS9U l        g )Nr   r   )r  r:   )r   r   rB   rC   output_hidden_sizeadapter_kernel_sizer  rF   ri   s     r#   r   WavLMAdapterLayer.__init__/  sJ    II%%)))&&((
	r%   c                 d    U R                  U5      n[        R                  R                  USS9nU$ )Nr   r   )rF   rB   rc  glur)   s     r#   r+   WavLMAdapterLayer.forward9  s/    		-0))-Q)?r%   )rF   r-   r4   s   @r#   r  r  .  s    
 r%   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )WavLMAdapteri@  c                   >^ [         TU ]  5         TR                  TR                  :w  aV  [        R
                  " TR                  TR                  5      U l        [        R                  " TR                  5      U l        OS =U l        U l        [        R                  " U4S j[        TR                  5       5       5      U l        TR                  U l        g )Nc              3   :   >#    U  H  n[        T5      v   M     g 7frl   )r  )r%  r   rS   s     r#   r'  (WavLMAdapter.__init__.<locals>.<genexpr>K  s     #hGg!$5f$=$=Ggs   )r   r   r  rD   rB   rd   projr`   proj_layer_normr  r  r  r  r0  ri   s    `r#   r   WavLMAdapter.__init__A  s     $$(:(::		&"4"4f6O6OPDI#%<<0I0I#JD /33DI,mm#huVMfMfGg#hh))r%   c                 |   U R                   b/  U R                  b"  U R                  U5      nU R                  U5      nUR                  SS5      nU R                   HK  n[        R
                  R                  5       nU R                  (       a  X0R                  :  d  MC  U" U5      nMM     UR                  SS5      nU$ rV   )r  r  rW   r  nprandomr   r0  )r    r*   r<  layerdrop_probs       r#   r+   WavLMAdapter.forwardN  s    99 T%9%9%E IIm4M 00?M%//15[[EYY--/N==^nn%D %m 4 !
 &//15r%   )r0  r  r  r  r-   r4   s   @r#   r  r  @  s    * r%   r  r   	mask_probmask_lengthr   	min_masksr   c           	        ^^^^^ U u  nmTS:  a  [        S5      eTT:  a  [        ST ST S35      e[        R                  R                  S5      R	                  5       mUUUUU4S jnUb-  UR                  5       R                  S5      R                  5       O[        U5       Vs/ s H  nTPM     snn[        R                  " UT4[        S	9n	/ n
U" T5      nUS
:X  a  U	$ U H  nU" U5      n[        R                  R                  [        R                  " UTS-
  -
  5      USS9n[        U5      S
:X  a  TS-
  nOUS
   n[        R                  " U[        R                  " X-
  [        R                   S	9U-  /5      nU
R#                  U5        M     [        R$                  " U
5      n
[        R&                  " U
SS2SS2S4   X[T45      n
U
R)                  X[T-  5      n
[        R                  " T5      SSSS24   n[        R&                  " UX[T45      R)                  X[T-  5      nU
U-   n
U
R+                  5       TS-
  :  a  TS-
  XTS-
  :  '   [        R,                  " XSS5        U	$ s  snf )a*  
Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
CPU as part of the preprocessing during training.

Args:
    shape: The shape for which to compute masks. This should be of a tuple of size 2 where
           the first element is the batch size and the second element is the length of the axis to span.
    mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                independently generated mask spans of length `mask_length` is computed by
                `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                actual percentage will be smaller.
    mask_length: size of the mask
    min_masks: minimum number of masked spans
    attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                    each batch dimension.
r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    > [        TU -  T-  T-   5      n[        UT5      nUT-  T:  a  TT-  nU TS-
  -
  U:  a  [        U TS-
  -
  S5      nU$ )z;Given input length, compute how many spans should be maskedr   r   )r   max)r  num_masked_spanepsilonr  r  r  rk  s     r#   compute_num_masked_span6_compute_mask_indices.<locals>.compute_num_masked_span  so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr%   Nr]   r   r   F)replace)rz   r  r  r/  itemdetachr   tolistr  r  r   choicer   lenconcatenater   int32appendarrayr   reshaper  put_along_axis)r   r  r  r   r  rj  r  r   r  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr  r  spec_aug_mask_idxdummy_mask_idxoffsetsr  rk  s    `` `            @@r#   _compute_mask_indicesr  _  s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89'8!o'89  HHj/:$GM1/Ba%1,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;(MUWU]U] ^ao op
 	!!"34/ &2 "45 1a:&+(V ,33JVa@ab ii$T4]3Goog
'UV^^+5G ,g5 /A"55GVYZGZ!0CCD mB?w :s   (I0c                   J  ^  \ rS rSrS\4U 4S jjrS rS r  SS\R                  S\
\R                     S\
\R                     4S	 jjr\     SS
\
\R                     S\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\\\4   4S jj5       rSrU =r$ )
WavLMModeli  rS   c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        UR                  S:  d  UR                  S:  aG  [        R                  " [        R                  " UR                  5      R                  5       5      U l        UR                   (       a  [#        U5      U l        O['        U5      U l        UR(                  (       a  [+        U5      OS U l        U R/                  5         g )Nr   )r   r   rS   r  feature_extractorrZ   feature_projectionmask_time_probmask_feature_probrB   r   r   r   rD   r~  masked_spec_embeddo_stable_layer_normrD  encoderr  r  r  adapter	post_initri   s     r#   r   WavLMModel.__init__  s     !4V!<"8"@   3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"&&6v>DL'/DL/5/A/A|F+t 	r%   c                 Z    [         R                  " S[        5        U R                  5         gz
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
not be updated during training.
The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.NwarningswarnFutureWarningfreeze_feature_encoderr    s    r#   freeze_feature_extractor#WavLMModel.freeze_feature_extractor  '    
 	Q	

 	##%r%   c                 8    U R                   R                  5         g
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
N)r  r  r'  s    r#   r&  !WavLMModel.freeze_feature_encoder  s    
 	113r%   r*   mask_time_indicesr   c                    [        U R                  SS5      (       d  U$ UR                  5       u  pEnUb(  U R                  R	                  UR
                  5      X'   OU R                  R                  S:  a  U R                  (       a  [        XE4U R                  R                  U R                  R                  UU R                  R                  S9n[        R                  " X!R                  [        R                  S9nU R                  R	                  UR
                  5      X'   U R                  R                  S:  a  U R                  (       a  [        XF4U R                  R                  U R                  R                   U R                  R"                  S9n[        R                  " XqR                  [        R                  S9nUSS2S4   R%                  SUS5      nSX'   U$ )	z
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://arxiv.org/abs/1904.08779).
apply_spec_augmentTNr   )r  r  r   r  )r   r   )r  r  r  r]   )getattrrS   r   r  r   r   r  r   r  mask_time_lengthmask_time_min_masksr   tensorr   r   r  mask_feature_lengthmask_feature_min_masksexpand)r    r*   r/  r   rj  rk  rD   mask_feature_indicess           r#   _mask_hidden_statesWavLMModel._mask_hidden_states  s    t{{$8$??   4A3E3E3G0
[(/3/E/E/H/HI\I\/]M,[[''!+ 5-++44 KK88-++99! !&->G[G[chcmcm n/3/E/E/H/HI\I\/]M,;;((1,#8)++77 KK;;++<<	$  $)<<0DMaMainisis#t #74#@#G#GO]_#` 23M/r%   ru  r   r5  r6  r   c                 >   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  U5      nUR                  SS5      nUb  U R                  UR                  S   USS9nU R                  U5      u  pU R                  XUS9nU R                  UUUUUS9n	U	S   nU R                  b  U R                  U5      nU(       d	  X4U	SS -   $ [        UUU	R                  U	R                  S	9$ )
a  
mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
    masked extracted features in *config.proj_codevector_dim* space.
Nr   r   Fr  )r/  r   r   r   r5  r6  r   )r,  extract_featuresr*   r-  )rS   r   r5  use_return_dictr  rW   r  r   r  r:  r  r  WavLMBaseModelOutputr*   r-  )
r    ru  r   r/  r   r5  r6  r>  r*   encoder_outputss
             r#   r+   WavLMModel.forward0  sY    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]11,?+55a;%!DD &&q)>u E N +/*A*ABR*S'00~ 1 
 ,,)/!5# ' 
 (*<<# LL7M!4qr7JJJ#+-)77&11	
 	
r%   )r  rS   r  r  r  r  )NNNNNNN)r.   r/   r0   r1   r   r   r(  r&  r   r   r   r   r:  r   r   r   r   r   r@  r+   r2   r3   r4   s   @r#   r  r    s    { (
&4 :>59	,((, $E$5$56, !!1!12	,\  269=,0/3&*7
u||,7
 !.7
 $E$5$56	7

 $D>7
 'tn7
 d^7
 
u**	+7
 7
r%   r  r   zm
    WavLM Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    )custom_introc                      ^  \ rS rSrSS\\   4U 4S jjjrS rS rS r	S r
\     SS\\R                     S	\\R                     S
\\   S\\   S\\   S\\R                     S\\\4   4S jj5       rSrU =r$ )WavLMForCTCin  target_langc                   > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        X l        UR                  c  [        SU R                   S35      e[        US5      (       a  UR                  (       a  UR                  OUR                  n[        R                   " X1R                  5      U l        U R%                  5         g)a  
target_lang (`str`, *optional*):
    Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
    adapter.<lang>.bin. Only relevant when using an instance of [`WavLMForCTC`] with adapters. Uses 'eng' by
    default.
NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `WavLMForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.r  )r   r   r  rt  rB   rf   final_dropoutrh   rG  
vocab_sizerz   r"   rH   r  r  rD   rd   lm_headr  )r    rS   rG  r  r"   s       r#   r   WavLMForCTC.__init__t  s     	 '
zz&"6"67&$00@ AH H  *1)G)GFL^L^F%%djdvdv 	 yy!35F5FG 	r%   c                     U R                   nUb'  [        U R                  SS5      c  [        SU S35      eUc.  [        U R                  SS5      b  [        R                  S5        gUb  U R                  USS9  gg)a  
This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
passing `target_lang=...` to `from_pretrained(...)`.

This method is **not** supposed to be called by the user and is prone to be changed in the future.
Nadapter_attn_dimzCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)
force_load)rG  r2  rS   rz   loggerinfoload_adapter)r    rG  s     r#   tie_weightsWavLMForCTC.tie_weights  s     &&"wt{{<NPT'U']:;-Gtuvv WT[[:Ld%S%_KKCD$kd; %r%   c                 Z    [         R                  " S[        5        U R                  5         gr-  r!  Nr"  r'  s    r#   r(  $WavLMForCTC.freeze_feature_extractor  r*  r%   c                 L    U R                   R                  R                  5         gr,  rt  r  r  r'  s    r#   r&  "WavLMForCTC.freeze_feature_encoder      
 	

$$779r%   c                 T    U R                   R                  5        H
  nSUl        M     gz
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
FNrt  r  r  r  s     r#   freeze_base_modelWavLMForCTC.freeze_base_model  #    
 ZZ**,E"'E -r%   ru  r   r   r5  r6  labelsr   c                    Ub  UOU R                   R                  nUbJ  UR                  5       U R                   R                  :  a"  [	        SU R                   R                   35      eU R                  UUUUUS9nUS   nU R                  U5      nU R                  U5      n	Sn
UGbX  Ub  UO"[        R                  " U[        R                  S9nU R                  UR                  S5      5      R                  [        R                  5      nUS:  nUR                  S5      nUR                  U5      n[        R                   R#                  U	S[        R$                  S9R'                  SS5      n[        R(                  R*                  R-                  S	S
9   [        R                   R/                  UUUUU R                   R0                  U R                   R2                  U R                   R4                  S9n
SSS5        U(       d  U	4U[6        S -   nU
b  U
4U-   $ U$ [9        XUR:                  UR<                  S9$ ! , (       d  f       NL= f)a  
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
    Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
    the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
    All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
    config.vocab_size - 1]`.
Nz$Label values must be <= vocab_size: r=  r   r   r]   )r@   r   r   F)enabled)blank	reductionzero_infinitylosslogitsr*   r-  )rS   r?  r  rJ  rz   rt  rh   rK  r   	ones_liker   r  r   r   masked_selectrB   rc  log_softmaxfloat32rW   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   r*   r-  )r    ru  r   r   r5  r6  rb  r  r*   rj  ri  r  labels_masktarget_lengthsflattened_targets	log_probsoutputs                    r#   r+   WavLMForCTC.forward  s   " &1%<k$++B]B]&**,$++2H2H"HCDKKDZDZC[\]]**)/!5#  
  
]3m, #1"<%//R^fkfpfpBq  !AA.BTBTUWBXY\\]b]g]ghM !A+K(__R0N & 4 4[ A 11&b1V``abdefI%%++E+:}}--%!"++22"kk<<"&++"?"? .  ; Y)F)G!HHF)-)9TGf$EvEG4I4IV]VhVh
 	
 ;:s   A H??
I)rh   rK  rG  rt  rl   rC  )r.   r/   r0   r1   r   r   r   rS  r(  r&  r_  r   r   r   r   r   r   r   r+   r2   r3   r4   s   @r#   rF  rF  n  s    HSM  :<*
&:(  26,0/3&*)-D
u||,D
 !.D
 $D>	D

 'tnD
 d^D
 &D
 
un$	%D
 D
r%   rF  z
    WavLM Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                      ^  \ rS rSrU 4S jrS rS rS r\     SS\	\
R                     S\	\
R                     S\	\   S	\	\   S
\	\   S\	\
R                     S\\\4   4S jj5       rSrU =r$ )WavLMForSequenceClassificationi	  c                 "  > [         TU ]  U5        [        US5      (       a  UR                  (       a  [	        S5      e[        U5      U l        UR                  S-   nUR                  (       a2  [        R                  " [        R                  " U5      U-  5      U l        [        R                  " UR                  UR                   5      U l        [        R                  " UR                   UR$                  5      U l        U R)                  5         g )Nr  z\Sequence classification does not support the use of WavLM adapters (config.add_adapter=True)r   )r   r   rH   r  rz   r  rt  r  use_weighted_layer_sumrB   r   r   r   layer_weightsrd   rD   classifier_proj_size	projector
num_labels
classifierr  r    rS   
num_layersr"   s      r#   r   'WavLMForSequenceClassification.__init__  s     6=))f.@.@n   '
--1
((!#ejj.Dz.Q!RD6#5#5v7R7RS))F$?$?ARARS 	r%   c                 Z    [         R                  " S[        5        U R                  5         gr   r"  r'  s    r#   r(  7WavLMForSequenceClassification.freeze_feature_extractor!  r*  r%   c                 L    U R                   R                  R                  5         gr,  rY  r'  s    r#   r&  5WavLMForSequenceClassification.freeze_feature_encoder-  r[  r%   c                 T    U R                   R                  5        H
  nSUl        M     gr]  r^  r  s     r#   r_  0WavLMForSequenceClassification.freeze_base_model4  ra  r%   ru  r   r   r5  r6  rb  r   c                 0   Ub  UOU R                   R                  nU R                   R                  (       a  SOUnU R                  UUUUUS9nU R                   R                  (       ai  U[           n[
        R                  " USS9n[        R                  R                  U R                  SS9n	XR                  SSS5      -  R                  SS9nOUS   nU R                  U5      nUc  UR                  SS9n
OU R                  UR                   S   U5      nUR#                  S5      R%                  SSUR                   S   5      nS	X) '   UR                  SS9UR                  SS9R                  SS5      -  n
U R'                  U
5      nSnUbF  [)        5       nU" UR                  SU R                   R*                  5      UR                  S5      5      nU(       d  U4U[        S -   nUb  U4U-   $ U$ [-        UUUR.                  UR0                  S
9$ )f  
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
    Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
    into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
    conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
NTr=  r   r   r]   r   r   r   rh  )rS   r?  r  rt  rv  r   stackrB   rc  rf  r  r   r   r  rX  r  r   r   r   r  r   r  r   r*   r-  )r    ru  r   r   r5  r6  rb  r  r*   norm_weightspooled_outputpadding_maskexpand_padding_maskrj  ri  loss_fctr{  s                    r#   r+   &WavLMForSequenceClassification.forward<  s   , &1%<k$++B]B]'+{{'I'ItOc**)/!5#  
 ;;--#$ABM!KK1=M==001C1C0LL*->->r1a-HHMMRSMTM#AJM}5!)..1.5MBB=CVCVWXCY[ijL"."8"8"<"C"CAq-J]J]^_J`"a25M./)--!-4|7G7GA7G7N7S7STVXY7ZZM/')HFKKDKK,B,BCV[[QS_UDY)F)G!HHF)-)9TGf$EvE'!//))	
 	
r%   )r  r  r  rt  rC  )r.   r/   r0   r1   r   r(  r&  r_  r   r   r   r   r   r   r   r   r+   r2   r3   r4   s   @r#   r~  r~  	  s    "
&:(  26,0/3&*)-A
u||,A
 !.A
 $D>	A

 'tnA
 d^A
 &A
 
u..	/A
 A
r%   r~  c                      ^  \ rS rSrU 4S jrS rS rS r\     SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\   S
\	\   S\	\   S\\\4   4S jj5       rSrU =r$ ) WavLMForAudioFrameClassificationi  c                   > [         TU ]  U5        [        US5      (       a  UR                  (       a  [	        S5      e[        U5      U l        UR                  S-   nUR                  (       a2  [        R                  " [        R                  " U5      U-  5      U l        [        R                  " UR                  UR                   5      U l        UR                   U l        U R%                  5         g )Nr  z_Audio frame classification does not support the use of WavLM adapters (config.add_adapter=True)r   )r   r   rH   r  rz   r  rt  r  r  rB   r   r   r   r  rd   rD   r  r  init_weightsr  s      r#   r   )WavLMForAudioFrameClassification.__init__  s     6=))f.@.@q   '
--1
((!#ejj.Dz.Q!RD))F$6$68I8IJ ++r%   c                 Z    [         R                  " S[        5        U R                  5         grV  r"  r'  s    r#   r(  9WavLMForAudioFrameClassification.freeze_feature_extractor  r*  r%   c                 L    U R                   R                  R                  5         gr,  rY  r'  s    r#   r&  7WavLMForAudioFrameClassification.freeze_feature_encoder  r[  r%   c                 T    U R                   R                  5        H
  nSUl        M     gr]  r^  r  s     r#   r_  2WavLMForAudioFrameClassification.freeze_base_model  ra  r%   ru  r   rb  r   r5  r6  r   c           	         Ub  UOU R                   R                  nU R                   R                  (       a  SOUnU R                  UUUUUS9nU R                   R                  (       ai  U[           n[
        R                  " USS9n[        R                  R                  U R                  SS9n	XR                  SSS5      -  R                  SS9nOUS   nU R                  U5      n
SnUbZ  [        5       nU" U
R                  SU R                  5      [
        R                   " UR                  SU R                  5      SS95      nU(       d  U
4U[        S -   nU$ [#        UU
UR$                  UR&                  S	9$ )
r  NTr=  r   r   r]   r   )axisrh  )rS   r?  r  rt  rv  r   r  rB   rc  rf  r  r   r   r  r   r  rg  r   r*   r-  )r    ru  r   rb  r   r5  r6  r  r*   r  rj  ri  r  r{  s                 r#   r+   (WavLMForAudioFrameClassification.forward  sf   , &1%<k$++B]B]'+{{'I'ItOc**)/!5#  
 ;;--#$ABM!KK1=M==001C1C0LL*->->r1a-HHMMRSMTM#AJM/')HFKKDOO<ell6;;WY[_[j[jKkrs>tuDY)F)G!HHFM$!//))	
 	
r%   )r  r  r  rt  rC  )r.   r/   r0   r1   r   r(  r&  r_  r   r   r   r   r   r   r   r   r+   r2   r3   r4   s   @r#   r  r    s     
&:(  26)-,0/3&*8
u||,8
 !.8
 &	8

 $D>8
 'tn8
 d^8
 
u++	,8
 8
r%   r  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )AMSoftmaxLossi  c                    > [         [        U ]  5         X0l        X@l        X l        [        R                  " [        R                  " X5      SS9U l
        [        R                  " 5       U l        g )NT)r  )r   r  r   scalemarginr  rB   r   r   randnr>   r   ri  )r    	input_dimr  r  r  r"   s        r#   r   AMSoftmaxLoss.__init__  sK    mT+-
$ll5;;y#EUYZ'')	r%   c                    UR                  5       n[        R                  R                  U R                  SS9n[        R                  R                  USS9n[
        R                  " X5      nX@R                  -
  n[        R                  R                  X R                  5      nU R                  [
        R                  " UR                  5       XT5      -  nU R                  Xr5      nU$ )Nr   r   r   )flattenrB   rc  	normalizer>   r   mmr  one_hotr  r  r   r   ri  )	r    r*   rb  r>   	cos_thetapsionehotrj  ri  s	            r#   r+   AMSoftmaxLoss.forward  s    !((!(<//1/EHH]3	++%&&v?ekk&++-HHyy(r%   )ri  r  r  r  r>   )g      >@g?r-   r4   s   @r#   r  r    s    * r%   r  c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )	TDNNLayeri  c                   > [         TU ]  5         US:  a  UR                  US-
     OUR                  U   U l        UR                  U   U l        UR
                  U   U l        UR                  U   U l        [        R                  " U R                  U R                  -  U R                  5      U l        [        R                  " 5       U l        g )Nr   r   )r   r   tdnn_dimr  r  tdnn_kernelr9   tdnn_dilationdilationrB   rd   kernelReLUrR   r  s      r#   r   TDNNLayer.__init__  s    <DqL6??8a<8foo^fNg"OOH5!--h7,,X6ii 0 043C3C CTEVEVW'')r%   r*   r   c                 >   [        5       (       a  SSKJn  [        5       (       a1  [        U R                  W5      (       a  [
        R                  " S5        UR                  SS5      nU R                  R                  R                  U R                  U R                  U R                  5      R                  SS5      n[        R                  R                  XU R                  R                   U R"                  S9nUR                  SS5      nU R%                  U5      nU$ )Nr   )	LoraLayerzDetected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. You should exclude TDNNLayer from LoRA's target modules.r   r   )r  )r   peft.tuners.lorar  r   r  r#  r$  rW   r>   r   r  r9   r  rB   rc  conv1dr   r  rR   )r    r*   r  r>   s       r#   r+   TDNNLayer.forward  s    2$++y11O &//15##(():):D<L<LdN^N^_iijkmno,,]DKKDTDT_c_l_l,m%//156r%   )rR   r  r  r  r9   r  r  )
r.   r/   r0   r1   r   r   r   r+   r2   r3   r4   s   @r#   r  r    s(    $U\\ ell  r%   r  zi
    WavLM Model with an XVector feature extraction head on top for tasks like Speaker Verification.
    c                     ^  \ rS rSrU 4S jrS rS rS rS\\	R                  \4   4S jr\     SS\\	R                     S	\\	R                     S
\\   S\\   S\\   S\\	R                     S\\\4   4S jj5       rSrU =r$ )WavLMForXVectori!  c                 2  > [         TU ]  U5        [        U5      U l        UR                  S-   nUR
                  (       a2  [        R                  " [        R                  " U5      U-  5      U l
        [        R                  " UR                  UR                  S   5      U l        [        [!        UR                  5      5       Vs/ s H  n[#        X5      PM     nn[        R$                  " U5      U l        [        R                  " UR                  S   S-  UR(                  5      U l        [        R                  " UR(                  UR(                  5      U l        [/        UR(                  UR0                  5      U l        U R5                  5         g s  snf )Nr   r   r]   r   )r   r   r  rt  r  r  rB   r   r   r   r  rd   rD   r  r  r  r  r  r  tdnnxvector_output_dimr  r  r  r  	objectiver  )r    rS   r  r  tdnn_layersr"   s        r#   r   WavLMForXVector.__init__'  s    '
--1
((!#ejj.Dz.Q!RD6#5#5vq7IJ5:3v;O5PQ5Py+5PQMM+.	!#6??2+>+BFD]D]!^))F$=$=v?X?XY&v'@'@&BSBST Rs   Fc                 Z    [         R                  " S[        5        U R                  5         grV  r"  r'  s    r#   r(  (WavLMForXVector.freeze_feature_extractor:  r*  r%   c                 L    U R                   R                  R                  5         gr,  rY  r'  s    r#   r&  &WavLMForXVector.freeze_feature_encoderF  r[  r%   c                 T    U R                   R                  5        H
  nSUl        M     gr]  r^  r  s     r#   r_  !WavLMForXVector.freeze_base_modelM  ra  r%   r  c                 X    S nU R                   R                   H  nU" XS5      nM     U$ )z/
Computes the output length of the TDNN layers
c                     X-
  U-  S-   $ )Nr   r!  r  s      r#   r  BWavLMForXVector._get_tdnn_output_lengths.<locals>._conv_out_lengthZ  s     !.69A==r%   r   )rS   r  )r    r  r  r9   s       r#   _get_tdnn_output_lengths(WavLMForXVector._get_tdnn_output_lengthsU  s1    
	>
  ;;22K,]KM 3 r%   ru  r   r   r5  r6  rb  r   c                    Ub  UOU R                   R                  nU R                   R                  (       a  SOUnU R                  UUUUUS9nU R                   R                  (       ai  U[           n[
        R                  " USS9n[        R                  R                  U R                  SS9n	XR                  SSS5      -  R                  SS9nOUS   nU R                  U5      nU R                   H  n
U
" U5      nM     Uc  UR                  SS9nUR!                  SS9nOU R#                  UR                  SS95      nU R%                  U5      n/ n/ n['        U5       HN  u  nnUR)                  XSU24   R                  SS95        UR)                  XSU24   R!                  SS95        MP     [
        R                  " U5      n[
        R                  " U5      n[
        R*                  " X/SS9nU R-                  U5      nU R/                  U5      nSnUb  U R1                  UU5      nU(       d  UU4U[        S -   nUb  U4U-   $ U$ [3        UUUUR4                  UR6                  S9$ )	r  NTr=  r   r   r]   r   )ri  rj  
embeddingsr*   r-  )rS   r?  r  rt  rv  r   r  rB   rc  rf  r  r   r   r  r  rX  rw  r  r  r.  r  r   r  r  r  r   r*   r-  )r    ru  r   r   r5  r6  rb  r  r*   r  
tdnn_layermean_featuresstd_featuresfeat_extract_output_lengthstdnn_output_lengthsr  lengthstatistic_poolingoutput_embeddingsrj  ri  r{  s                         r#   r+   WavLMForXVector.forwardd  s   , &1%<k$++B]B]'+{{'I'ItOc**)/!5#  
 ;;--#$ABM!KK1=M==001C1C0LL*->->r1a-HHMMRSMTM#AJM}5))J&}5M $ !)..1.5M(,,,3L*.*O*OP^PbPbghPbPi*j'"&"?"?@["\ML&':;	6$$]gvg:%>%C%C%C%JK##MWfW*$=$A$Aa$A$HI < "KK6M ;;|4L!II}&CL 223DE!23>>&&1D/07;X;Y3ZZF)-)9TGf$EvE(!//))
 	
r%   )r  r  r  r  r  r  rt  rC  )r.   r/   r0   r1   r   r(  r&  r_  r   r   r   r   r  r   r   r   r   r   r   r+   r2   r3   r4   s   @r#   r  r  !  s    &
&:(eE<L<Lc<Q6R   26,0/3&*)-N
u||,N
 !.N
 $D>	N

 'tnN
 d^N
 &N
 
um#	$N
 N
r%   r  )r  rF  r~  r  r  rs  r'   )Hr   r#  typingr   r   r   numpyr  r   torch.nnrB   torch.nn.functionalrc  r   r   activationsr   integrations.deepspeedr	   integrations.fsdpr
   modeling_outputsr   r   r   r   r   r   modeling_utilsr   rG   r   r   r   configuration_wavlmr   
get_loggerr.   rP  Moduler   r6   rZ   rp   r   r   r  r  rD  rL  rs  r  r  r  r  r  r  r   r   r   ndarrayr  r@  r  rv  rF  r~  r  r  r  r  __all__r!  r%   r#   <module>r     s     ) )      % ! @ 7  . ? ? , 
		H	%		 *299 *Z1RYY 1c RYY c Lryy 0&		 &R"ryy "JP
299 P
fQ
")) Q
hC' C'L R? R Rj		 *bii 6bii 0)")) )X		 $299 F 26tc?tt t U--.	t
 t ZZtn /  N
% N
 N
b !"  
S
& S

S
l o
%9 o
o
d e
'; e
 e
PBII .		 @ 
M
* M

M
`r%   