
"""PyTorch SEW model."""

import math
import warnings
from typing import Optional, Tuple, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...integrations.fsdp import is_fsdp_managed_module
from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from ...utils.deprecation import deprecate_kwarg
from .configuration_sew import SEWConfig


if is_flash_attn_available():
    from ...modeling_flash_attention_utils import _flash_attention_forward


logger = logging.get_logger(__name__)

_HIDDEN_STATES_START_POSITION = 1


def _compute_mask_indices(
    shape: Tuple[int, int],
    mask_prob: float,
    mask_length: int,
    attention_mask: Optional[torch.LongTensor] = None,
    min_masks: int = 0,
) -> np.ndarray:
    """
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    """
    batch_size, sequence_length = shape

    if mask_length < 1:
        raise ValueError("`mask_length` has to be bigger than 0.")

    if mask_length > sequence_length:
        raise ValueError(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
            f" and `sequence_length`: {sequence_length}`"
        )

    # epsilon is used for probabilistic rounding
    epsilon = np.random.rand(1).item()

    def compute_num_masked_span(input_length):
        """Given input length, compute how many spans should be masked"""
        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
        num_masked_span = max(num_masked_span, min_masks)

        # make sure num masked span <= sequence_length
        if num_masked_span * mask_length > sequence_length:
            num_masked_span = sequence_length // mask_length

        # make sure num_masked span is also <= input_length - (mask_length - 1)
        if input_length - (mask_length - 1) < num_masked_span:
            num_masked_span = max(input_length - (mask_length - 1), 0)

        return num_masked_span

    # compute number of masked spans in batch
    input_lengths = (
        attention_mask.detach().sum(-1).tolist()
        if attention_mask is not None
        else [sequence_length for _ in range(batch_size)]
    )

    # SpecAugment mask to fill
    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
    spec_aug_mask_idxs = []

    max_num_masked_span = compute_num_masked_span(sequence_length)

    if max_num_masked_span == 0:
        return spec_aug_mask

    for input_length in input_lengths:
        # compute num of masked spans for this input
        num_masked_span = compute_num_masked_span(input_length)

        # get random indices to mask
        spec_aug_mask_idx = np.random.choice(
            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
        )

        # pick first sampled index that will serve as a dummy index to pad the vector
        # to ensure the same dimension for all batches due to probabilistic rounding
        if len(spec_aug_mask_idx) == 0:
            # this case can only happen if `input_length` is strictly smaller than
            # `sequence_length`, in which case the last token has to be a padding
            # token which we can use as a dummy mask id
            dummy_mask_idx = sequence_length - 1
        else:
            dummy_mask_idx = spec_aug_mask_idx[0]

        spec_aug_mask_idx = np.concatenate(
            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
        )
        spec_aug_mask_idxs.append(spec_aug_mask_idx)

    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)

    # expand masked indices to masked spans
    spec_aug_mask_idxs = np.broadcast_to(
        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)

    # add offsets to the starting indexes so that indexes now create a span
    offsets = np.arange(mask_length)[None, None, :]
    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
        batch_size, max_num_masked_span * mask_length
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets

    # ensure that we cannot have indices larger than sequence_length
    if spec_aug_mask_idxs.max() > sequence_length - 1:
        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1

    # scatter indices to mask
    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)

    return spec_aug_mask


class SEWNoLayerNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states


class SEWLayerNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)

        hidden_states = hidden_states.transpose(-2, -1)
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = hidden_states.transpose(-2, -1)

        hidden_states = self.activation(hidden_states)
        return hidden_states


class SEWGroupNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.activation = ACT2FN[config.feat_extract_activation]
        self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states


class SEWPositionalConvEmbedding(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.conv = nn.Conv1d(
            config.hidden_size,
            config.hidden_size,
            kernel_size=config.num_conv_pos_embeddings,
            padding=config.num_conv_pos_embeddings // 2,
            groups=config.num_conv_pos_embedding_groups,
            stride=config.squeeze_factor,
        )

        weight_norm = nn.utils.weight_norm
        if hasattr(nn.utils.parametrizations, "weight_norm"):
            weight_norm = nn.utils.parametrizations.weight_norm

        if is_deepspeed_zero3_enabled():
            import deepspeed

            with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
                self.conv = weight_norm(self.conv, name="weight", dim=2)
            if hasattr(self.conv, "parametrizations"):
                weight_g = self.conv.parametrizations.weight.original0
                weight_v = self.conv.parametrizations.weight.original1
            else:
                weight_g = self.conv.weight_g
                weight_v = self.conv.weight_v
            deepspeed.zero.register_external_parameter(self, weight_v)
            deepspeed.zero.register_external_parameter(self, weight_g)
        else:
            self.conv = weight_norm(self.conv, name="weight", dim=2)

        self.padding = SEWSamePadLayer(config.num_conv_pos_embeddings)
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)
        hidden_states = self.padding(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states


class SEWSamePadLayer(nn.Module):
    def __init__(self, num_conv_pos_embeddings):
        super().__init__()
        self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0

    def forward(self, hidden_states):
        if self.num_pad_remove > 0:
            hidden_states = hidden_states[:, :, : -self.num_pad_remove]
        return hidden_states


class SEWUpsampling(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.projection = nn.Linear(config.hidden_size, config.hidden_size * config.squeeze_factor)
        self.activation = ACT2FN[config.feat_extract_activation]
        self.squeeze_factor = config.squeeze_factor

    def forward(self, hidden_states):
        hidden_states = self.projection(hidden_states)
        hidden_states = self.activation(hidden_states)

        if self.squeeze_factor > 1:
            # transform embedding channels to sequence length
            bsz, src_len, src_embed_dim = hidden_states.size()
            tgt_len = src_len * self.squeeze_factor
            tgt_embed_dim = src_embed_dim // self.squeeze_factor
            hidden_states = hidden_states.reshape(bsz, src_len, self.squeeze_factor, tgt_embed_dim)
            hidden_states = hidden_states.reshape(bsz, tgt_len, tgt_embed_dim)

        return hidden_states


class SEWFeatureEncoder(nn.Module):
    """Construct the features from raw audio waveform"""

    def __init__(self, config):
        super().__init__()

        if config.feat_extract_norm == "group":
            conv_layers = [SEWGroupNormConvLayer(config, layer_id=0)] + [
                SEWNoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
            ]
        elif config.feat_extract_norm == "layer":
            conv_layers = [SEWLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)]
        else:
            raise ValueError(
                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
            )
        self.conv_layers = nn.ModuleList(conv_layers)
        self.gradient_checkpointing = False
        self._requires_grad = True

    def _freeze_parameters(self):
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def forward(self, input_values):
        hidden_states = input_values[:, None]

        # make sure hidden_states require grad for gradient_checkpointing
        if self._requires_grad and self.training:
            hidden_states.requires_grad = True

        for conv_layer in self.conv_layers:
            if self._requires_grad and self.gradient_checkpointing and self.training:
                hidden_states = self._gradient_checkpointing_func(
                    conv_layer.__call__,
                    hidden_states,
                )
            else:
                hidden_states = conv_layer(hidden_states)

        return hidden_states


class SEWFeatureExtractor(SEWFeatureEncoder):
    def __init__(self, config):
        super().__init__(config)
        warnings.warn(
            f"The class `{self.__class__.__name__}` has been depreciated "
            "and will be removed in Transformers v5. "
            f"Use `{self.__class__.__bases__[0].__name__}` instead.",
            FutureWarning,
        )


class SEWAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: Optional[SEWConfig] = None,
        layer_idx: Optional[int] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal
        self.layer_idx = layer_idx
        if layer_idx is None and self.is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended "
                "and will lead to errors during the forward call, if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    @deprecate_kwarg("key_value_states", version="4.55")
    @deprecate_kwarg("past_key_value", version="4.55")
    @deprecate_kwarg("cache_position", version="4.55")
    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, _ = hidden_states.size()

        # get query/key/value projections
        query_states = self.q_proj(hidden_states).view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
        query_states = query_states * self.scaling
        key_states = self.k_proj(hidden_states).view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = query_states.reshape(*proj_shape)
        key_states = key_states.reshape(*proj_shape)
        value_states = value_states.reshape(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            causal_mask = attention_mask[:, :, :, : key_states.shape[1]]
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # this operation is a bit awkward, but it's required to make sure that attn_weights
            # keeps its gradient: it has to be reshaped twice and reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, None


class SEWFlashAttention2(SEWAttention):
    """
    SEW flash attention module. This module inherits from `SEWAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is
        # bottom-right alignment; this flag records which behavior the installed version uses.
        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()

    @deprecate_kwarg("key_value_states", version="4.55")
    @deprecate_kwarg("past_key_value", version="4.55")
    @deprecate_kwarg("cache_position", version="4.55")
    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if output_attentions:
            raise ValueError(
                "SEWFlashAttention2 attention does not support `output_attentions`. Use the argument"
                " `attn_implementation='eager'` when loading the model."
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states).view(bsz, -1, self.num_heads, self.head_dim)
        key_states = self.k_proj(hidden_states).view(bsz, -1, self.num_heads, self.head_dim)
        value_states = self.v_proj(hidden_states).view(bsz, -1, self.num_heads, self.head_dim)

        # In PEFT, the layer norms are usually cast to float32 for training stability, so the input
        # hidden states may have been silently upcast; cast back to the expected compute dtype.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                "The input hidden states seems to be silently casted in float32, this might be related to the fact"
                " you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = _flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            dropout=self.dropout if self.training else 0.0,
            is_causal=self.is_causal,
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
        )

        attn_output = attn_output.reshape(bsz, q_len, -1)
        attn_output = self.out_proj(attn_output)

        return attn_output, None, None


class SEWSdpaAttention(SEWAttention):
    @deprecate_kwarg("key_value_states", version="4.55")
    @deprecate_kwarg("past_key_value", version="4.55")
    @deprecate_kwarg("cache_position", version="4.55")
    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""
        if output_attentions:
            logger.warning_once(
                "SEWModel is using SEWSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not"
                " support `output_attentions=True`. Falling back to the manual attention implementation, but"
                " specifying the manual implementation will be required from Transformers version v5.0.0 onwards."
                ' This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states,
                attention_mask=attention_mask,
                layer_head_mask=layer_head_mask,
                output_attentions=output_attentions,
            )

        bsz, tgt_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states).view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

        causal_mask = attention_mask
        if attention_mask is not None:
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous
        # inputs on CUDA when a custom attn_mask is passed, so make the inputs contiguous.
        if query_states.device.type == "cuda" and causal_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # dispatch to the causal kernel only when no explicit mask is given and the sequence is longer than 1
        is_causal = True if self.is_causal and causal_mask is None and tgt_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, None, None


SEW_ATTENTION_CLASSES = {
    "eager": SEWAttention,
    "sdpa": SEWSdpaAttention,
    "flash_attention_2": SEWFlashAttention2,
}


class SEWFeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.intermediate_dropout = nn.Dropout(config.activation_dropout)

        self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

        self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.output_dropout = nn.Dropout(config.hidden_dropout)

    def forward(self, hidden_states):
        hidden_states = self.intermediate_dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        hidden_states = self.intermediate_dropout(hidden_states)

        hidden_states = self.output_dense(hidden_states)
        hidden_states = self.output_dropout(hidden_states)
        return hidden_states


class SEWEncoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = SEW_ATTENTION_CLASSES[config._attn_implementation](
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=False,
        )
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.feed_forward = SEWFeedForward(config)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states, attention_mask=None, output_attentions=False):
        attn_residual = hidden_states
        hidden_states, attn_weights, _ = self.attention(
            hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
        )
        hidden_states = self.dropout(hidden_states)
        hidden_states = attn_residual + hidden_states

        hidden_states = self.layer_norm(hidden_states)
        hidden_states = hidden_states + self.feed_forward(hidden_states)
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class SEWEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.pos_conv_embed = SEWPositionalConvEmbedding(config)
        self.pool = nn.AvgPool1d(config.squeeze_factor, config.squeeze_factor)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layers = nn.ModuleList([SEWEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.upsample = SEWUpsampling(config)
        self.gradient_checkpointing = False
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if attention_mask is not None:
            expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
            if self._use_flash_attention_2:
                # make sure padded tokens output 0
                hidden_states[~expand_attention_mask] = 0.0
                # 2d mask is passed through the layers
                attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
            else:
                # make sure padded tokens output 0
                hidden_states[~expand_attention_mask] = 0.0
                input_lengths = (attention_mask.long()).sum(-1)
                # apply pooling formula to get real output_lengths after the squeeze pooling
                output_lengths = input_lengths // self.config.squeeze_factor
                max_encoder_length = hidden_states.shape[1] // self.config.squeeze_factor
                attention_ids = (
                    torch.arange(max_encoder_length, device=output_lengths.device)
                    .view(1, -1)
                    .expand(output_lengths.shape[0], -1)
                )
                attention_mask = (attention_ids < output_lengths.view(-1, 1)).long()

                # extend attention_mask to an additive 4d mask
                attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
                attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
                attention_mask = attention_mask.expand(
                    attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
                )

        n_input_timesteps = hidden_states.shape[1]

        hidden_states = hidden_states.transpose(1, 2)
        position_embeddings = self.pos_conv_embed(hidden_states)
        pooled_hidden_states = self.pool(hidden_states)
        min_length = min(position_embeddings.size(-1), pooled_hidden_states.size(-1))
        hidden_states = pooled_hidden_states[..., :min_length] + position_embeddings[..., :min_length]
        hidden_states = hidden_states.transpose(1, 2)

        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)

        for layer in self.layers:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = torch.rand([])

            skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
            if not skip_the_layer or synced_gpus:
                # under fsdp or deepspeed zero3 all gpus must run in sync
                if self.gradient_checkpointing and self.training:
                    layer_outputs = self._gradient_checkpointing_func(
                        layer.__call__,
                        hidden_states,
                        attention_mask,
                        output_attentions,
                    )
                else:
                    layer_outputs = layer(
                        hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
                    )
                hidden_states = layer_outputs[0]

            if skip_the_layer:
                layer_outputs = (None, None)

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        hidden_states = self.upsample(hidden_states)
        if hidden_states.shape[1] < n_input_timesteps:
            hidden_states = nn.functional.pad(hidden_states, (0, 0, 0, n_input_timesteps - hidden_states.shape[1]))

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring
class SEWPreTrainedModel(PreTrainedModel):
    config_class = SEWConfig
    base_model_prefix = "sew"
    main_input_name = "input_values"
    supports_gradient_checkpointing = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, SEWPositionalConvEmbedding):
            nn.init.normal_(
                module.conv.weight,
                mean=0,
                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
            )
            nn.init.constant_(module.conv.bias, 0)
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Conv1d):
            if is_deepspeed_zero3_enabled():
                import deepspeed

                if hasattr(module, "weight_v") and hasattr(module, "weight_g"):
                    with deepspeed.zero.GatheredParameters([module.weight_v, module.weight_g], modifier_rank=0):
                        nn.init.kaiming_normal_(module.weight.data)
                else:
                    with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0):
                        nn.init.kaiming_normal_(module.weight.data)
            else:
                nn.init.kaiming_normal_(module.weight.data)

        if isinstance(module, (nn.Linear, nn.Conv1d)) and module.bias is not None:
            module.bias.data.zero_()

    def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
        """
        Computes the output length of the convolutional layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)

        return input_lengths

    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
        output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
        batch_size = attention_mask.shape[0]

        attention_mask = torch.zeros(
            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
        )
        # these two operations make sure that all values before the output lengths indices are attended to
        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
        return attention_mask


@auto_docstring
class SEWModel(SEWPreTrainedModel):
    def __init__(self, config: SEWConfig):
        super().__init__(config)
        self.config = config
        self.feature_extractor = SEWFeatureEncoder(config)
        self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)

        self.project_features = config.conv_dim[-1] != config.hidden_size
        if self.project_features:
            self.feature_projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
        self.feature_dropout = nn.Dropout(config.feat_proj_dropout)

        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
            self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())

        self.encoder = SEWEncoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def _mask_hidden_states(
        self,
        hidden_states: torch.FloatTensor,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    ):
        """
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        """

        # `config.apply_spec_augment` can set masking to False
        if not getattr(self.config, "apply_spec_augment", True):
            return hidden_states

        # generate indices & apply SpecAugment along time axis
        batch_size, sequence_length, hidden_size = hidden_states.size()

        if mask_time_indices is not None:
            # apply SpecAugment along time axis with given mask_time_indices
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
        elif self.config.mask_time_prob > 0 and self.training:
            mask_time_indices = _compute_mask_indices(
                (batch_size, sequence_length),
                mask_prob=self.config.mask_time_prob,
                mask_length=self.config.mask_time_length,
                attention_mask=attention_mask,
                min_masks=self.config.mask_time_min_masks,
            )
            mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)

        if self.config.mask_feature_prob > 0 and self.training:
            # generate indices & apply SpecAugment along feature axis
            mask_feature_indices = _compute_mask_indices(
                (batch_size, hidden_size),
                mask_prob=self.config.mask_feature_prob,
                mask_length=self.config.mask_feature_length,
                min_masks=self.config.mask_feature_min_masks,
            )
            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
            mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
            hidden_states[mask_feature_indices] = 0

        return hidden_states

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        extract_features = self.feature_extractor(input_values)
        extract_features = extract_features.transpose(1, 2)
        extract_features = self.layer_norm(extract_features)

        if self.project_features:
            extract_features = self.feature_projection(extract_features)
        hidden_states = self.feature_dropout(extract_features)

        if attention_mask is not None:
            # compute reduced attention_mask corresponding to feature vectors
            attention_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)

        hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices)

        encoder_outputs = self.encoder(
            hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = encoder_outputs[0]

        if not return_dict:
            return (hidden_states,) + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    SEW Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    """
)
class SEWForCTC(SEWPreTrainedModel):
    def __init__(self, config, target_lang: Optional[str] = None):
        r"""
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`SEWForCTC`] with adapters. Uses 'eng' by
            default.
        """
        super().__init__(config)

        self.sew = SEWModel(config)
        self.dropout = nn.Dropout(config.final_dropout)

        self.target_lang = target_lang

        if config.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that does not define the"
                " vocabulary size of the language model head. Please instantiate the model as follows:"
                " `SEWForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's"
                " configuration."
            )
        output_hidden_size = (
            config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
        )
        self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    def tie_weights(self):
        """
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        """

        # `tie_weights` is usually used to tie input and output embedding weights. Here it is re-purposed to
        # correctly load adapter layers for SEW so that no new API has to be introduced on [`PreTrainedModel`].
        # SEW never has to tie input and output embeddings, so it is safe to repurpose this function here.
        target_lang = self.target_lang

        if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None:
            raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.")
        elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None:
            logger.info("By default `target_lang` is set to 'eng'.")
        elif target_lang is not None:
            self.load_adapter(target_lang, force_load=True)

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.sew.feature_extractor._freeze_parameters()

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.sew.parameters():
            param.requires_grad = False

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, CausalLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None and labels.max() >= self.config.vocab_size:
            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")

        outputs = self.sew(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        hidden_states = self.dropout(hidden_states)

        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # retrieve loss input_lengths from attention_mask
            attention_mask = (
                attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
            )
            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)

            # assuming that padded tokens are filled with -100 when not being attended to
            labels_mask = labels >= 0
            target_lengths = labels_mask.sum(-1)
            flattened_targets = labels.masked_select(labels_mask)

            # ctc_loss doesn't support fp16
            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)

            with torch.backends.cudnn.flags(enabled=False):
                loss = nn.functional.ctc_loss(
                    log_probs,
                    flattened_targets,
                    input_lengths,
                    target_lengths,
                    blank=self.config.pad_token_id,
                    reduction=self.config.ctc_loss_reduction,
                    zero_infinity=self.config.ctc_zero_infinity,
                )

        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )


@auto_docstring(
    custom_intro="""
    SEW Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like SUPERB
    Keyword Spotting.
    """
)
class SEWForSequenceClassification(SEWPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        if hasattr(config, "add_adapter") and config.add_adapter:
            raise ValueError(
                "Sequence classification does not support the use of SEW adapters (config.add_adapter=True)"
            )
        self.sew = SEWModel(config)
        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
        if config.use_weighted_layer_sum:
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
        self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
        self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        """
        self.sew.feature_extractor._freeze_parameters()

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.sew.parameters():
            param.requires_grad = False

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
            soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
            conversion into a tensor of type `torch.FloatTensor`. See [`SEWProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states

        outputs = self.sew(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.config.use_weighted_layer_sum:
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            hidden_states = outputs[0]

        hidden_states = self.projector(hidden_states)
        if attention_mask is None:
            pooled_output = hidden_states.mean(dim=1)
        else:
            padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
            expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
            hidden_states[~expand_padding_mask] = 0.0
            pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["SEWForCTC", "SEWForSequenceClassification", "SEWModel", "SEWPreTrainedModel"]