
import math
import typing as tp
from typing import Any, Dict, List, Optional

import torch
from torch import nn
from torch.nn import functional as F


class _ScaledEmbedding(torch.nn.Module):
    r"""Make continuous embeddings and boost learning rate

    Args:
        num_embeddings (int): number of embeddings
        embedding_dim (int): embedding dimensions
        scale (float, optional): amount to scale learning rate (Default: 10.0)
        smooth (bool, optional): choose to apply smoothing (Default: ``False``)
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, scale: float = 10.0, smooth: bool = False):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        if smooth:
            # Initialize with a cumulative sum so neighbouring entries start close to each other,
            # normalized by sqrt(n) to keep the magnitude roughly constant.
            weight = torch.cumsum(self.embedding.weight.data, dim=0)
            weight = weight / torch.arange(1, num_embeddings + 1).to(weight).sqrt()[:, None]
            self.embedding.weight.data[:] = weight
        # The weight is stored divided by ``scale`` and multiplied back on the way out,
        # which effectively boosts the learning rate of this embedding table.
        self.embedding.weight.data /= scale
        self.scale = scale

    @property
    def weight(self) -> torch.Tensor:
        return self.embedding.weight * self.scale

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        r"""Forward pass for embedding with scale.

        Args:
            x (torch.Tensor): input tensor of shape `(num_embeddings)`

        Returns:
            (Tensor):
                Embedding output of shape `(num_embeddings, embedding_dim)`
        """
        out = self.embedding(x) * self.scale
        return out
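

# Illustrative sketch: because the table above is stored divided by ``scale`` and multiplied
# back in ``forward`` and ``weight``, gradients w.r.t. the stored parameters are ``scale`` times
# larger, which acts like a boosted learning rate. The helper name and the sizes below are
# arbitrary, hypothetical example values, not part of the library API.
def _example_scaled_embedding() -> None:
    emb = _ScaledEmbedding(num_embeddings=512, embedding_dim=32, scale=10.0, smooth=True)
    out = emb(torch.arange(512))  # shape: (512, 32), already multiplied back by ``scale``
    assert out.shape == (512, 32)
    assert torch.allclose(out, emb.weight)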


class _HEncLayer(torch.nn.Module):
    r"""Encoder layer. This used both by the time and the frequency branch.

    Args:
        chin (int): number of input channels.
        chout (int): number of output channels.
        kernel_size (int, optional): Kernel size for encoder (Default: 8)
        stride (int, optional): Stride for encoder layer (Default: 4)
        norm_groups (int, optional): number of groups for group norm. (Default: 4)
        empty (bool, optional): used to make a layer with just the first conv. this is used
            before merging the time and freq. branches. (Default: ``False``)
        freq (bool, optional): boolean for whether conv layer is for frequency domain (Default: ``True``)
        norm_type (string, optional): Norm type, either ``group_norm`` or ``none`` (Default: ``group_norm``)
        context (int, optional): context size for the 1x1 conv. (Default: 0)
        dconv_kw (Dict[str, Any] or None, optional): dictionary of kwargs for the DConv class. (Default: ``None``)
        pad (bool, optional): true to pad the input. Padding is done so that the output size is
            always the input size / stride. (Default: ``True``)
    """

    def __init__(
        self,
        chin: int,
        chout: int,
        kernel_size: int = 8,
        stride: int = 4,
        norm_groups: int = 4,
        empty: bool = False,
        freq: bool = True,
        norm_type: str = "group_norm",
        context: int = 0,
        dconv_kw: Optional[Dict[str, Any]] = None,
        pad: bool = True,
    ):
        super().__init__()
        if dconv_kw is None:
            dconv_kw = {}
        norm_fn = lambda d: nn.Identity()  # noqa
        if norm_type == "group_norm":
            norm_fn = lambda d: nn.GroupNorm(norm_groups, d)  # noqa
        pad_val = kernel_size // 4 if pad else 0
        klass = nn.Conv1d
        self.freq = freq
        self.kernel_size = kernel_size
        self.stride = stride
        self.empty = empty
        self.pad = pad_val
        if freq:
            # Frequency layers convolve over the frequency axis only.
            kernel_size = [kernel_size, 1]
            stride = [stride, 1]
            pad_val = [pad_val, 0]
            klass = nn.Conv2d
        self.conv = klass(chin, chout, kernel_size, stride, pad_val)
        self.norm1 = norm_fn(chout)
        if self.empty:
            self.rewrite = nn.Identity()
            self.norm2 = nn.Identity()
            self.dconv = nn.Identity()
        else:
            self.rewrite = klass(chout, 2 * chout, 1 + 2 * context, 1, context)
            self.norm2 = norm_fn(2 * chout)
            self.dconv = _DConv(chout, **dconv_kw)

    def forward(self, x: torch.Tensor, inject: Optional[torch.Tensor] = None) -> torch.Tensor:
        r"""Forward pass for encoding layer.

        Size depends on whether frequency or time

        Args:
            x (torch.Tensor): tensor input of shape `(B, C, F, T)` for frequency and shape
                `(B, C, T)` for time
            inject (torch.Tensor, optional): on last layer, combine frequency and time branches through inject param,
                same shape as x (default: ``None``)

        Returns:
            Tensor
                output tensor after encoder layer of shape `(B, C, F / stride, T)` for frequency
                and shape `(B, C, ceil(T / stride))` for time
        """
        if not self.freq and x.dim() == 4:
            B, C, Fr, T = x.shape
            x = x.view(B, -1, T)

        if not self.freq:
            le = x.shape[-1]
            if not le % self.stride == 0:
                x = F.pad(x, (0, self.stride - (le % self.stride)))
        y = self.conv(x)
        if self.empty:
            return y
        if inject is not None:
            if inject.shape[-1] != y.shape[-1]:
                raise ValueError("Injection shapes do not align")
            if inject.dim() == 3 and y.dim() == 4:
                inject = inject[:, :, None]
            y = y + inject
        y = F.gelu(self.norm1(y))
        if self.freq:
            # Run the residual DConv branch over the time axis, treating frequencies as batch.
            B, C, Fr, T = y.shape
            y = y.permute(0, 2, 1, 3).reshape(-1, C, T)
            y = self.dconv(y)
            y = y.view(B, Fr, C, T).permute(0, 2, 1, 3)
        else:
            y = self.dconv(y)
        z = self.norm2(self.rewrite(y))
        z = F.glu(z, dim=1)
        return z


class _HDecLayer(torch.nn.Module):
    r"""Decoder layer. This used both by the time and the frequency branches.

    Args:
        chin (int): number of input channels.
        chout (int): number of output channels.
        last (bool, optional): whether current layer is final layer (Default: ``False``)
        kernel_size (int, optional): Kernel size for encoder (Default: 8)
        stride (int): Stride for encoder layer (Default: 4)
        norm_groups (int, optional): number of groups for group norm. (Default: 1)
        empty (bool, optional): used to make a layer with just the first conv. this is used
            before merging the time and freq. branches. (Default: ``False``)
        freq (bool, optional): boolean for whether conv layer is for frequency (Default: ``True``)
        norm_type (str, optional): Norm type, either ``group_norm`` or ``none`` (Default: ``group_norm``)
        context (int, optional): context size for the 1x1 conv. (Default: 1)
        dconv_kw (Dict[str, Any] or None, optional): dictionary of kwargs for the DConv class. (Default: ``None``)
        pad (bool, optional): true to pad the input. Padding is done so that the output size is
            always the input size / stride. (Default: ``True``)
    """

    def __init__(
        self,
        chin: int,
        chout: int,
        last: bool = False,
        kernel_size: int = 8,
        stride: int = 4,
        norm_groups: int = 1,
        empty: bool = False,
        freq: bool = True,
        norm_type: str = "group_norm",
        context: int = 1,
        dconv_kw: Optional[Dict[str, Any]] = None,
        pad: bool = True,
    ):
        super().__init__()
        if dconv_kw is None:
            dconv_kw = {}
        norm_fn = lambda d: nn.Identity()  # noqa
        if norm_type == "group_norm":
            norm_fn = lambda d: nn.GroupNorm(norm_groups, d)  # noqa
        if pad:
            if (kernel_size - stride) % 2 != 0:
                raise ValueError("Kernel size and stride do not align")
            pad = (kernel_size - stride) // 2
        else:
            pad = 0
        self.pad = pad
        self.last = last
        self.freq = freq
        self.chin = chin
        self.empty = empty
        self.stride = stride
        self.kernel_size = kernel_size
        klass = nn.Conv1d
        klass_tr = nn.ConvTranspose1d
        if freq:
            kernel_size = [kernel_size, 1]
            stride = [stride, 1]
            klass = nn.Conv2d
            klass_tr = nn.ConvTranspose2d
        self.conv_tr = klass_tr(chin, chout, kernel_size, stride)
        self.norm2 = norm_fn(chout)
        if self.empty:
            self.rewrite = nn.Identity()
            self.norm1 = nn.Identity()
        else:
            self.rewrite = klass(chin, 2 * chin, 1 + 2 * context, 1, context)
            self.norm1 = norm_fn(2 * chin)

    def forward(self, x: torch.Tensor, skip: Optional[torch.Tensor], length: int):
        r"""Forward pass for decoding layer.

        Size depends on whether frequency or time

        Args:
            x (torch.Tensor): tensor input of shape `(B, C, F, T)` for frequency and shape
                `(B, C, T)` for time
            skip (torch.Tensor, optional): on first layer, separate frequency and time branches using param
                (default: ``None``)
            length (int): Size of tensor for output

        Returns:
            (Tensor, Tensor):
                Tensor
                    output tensor after decoder layer of shape `(B, C, F * stride, T)` for frequency domain except last
                    frequency layer shape is `(B, C, kernel_size, T)`. Shape is `(B, C, stride * T)`
                    for time domain.
                Tensor
                    contains the output just before final transposed convolution, which is used when the
                    freq. and time branch separate. Otherwise, does not matter. Shape is
                    `(B, C, F, T)` for frequency and `(B, C, T)` for time.
        """
        if self.freq and x.dim() == 3:
            B, C, T = x.shape
            x = x.view(B, self.chin, -1, T)

        if not self.empty:
            x = x + skip
            y = F.glu(self.norm1(self.rewrite(x)), dim=1)
        else:
            y = x
            if skip is not None:
                raise ValueError("Skip must be none when empty is true.")

        z = self.norm2(self.conv_tr(y))
        if self.freq:
            if self.pad:
                z = z[..., self.pad : -self.pad, :]
        else:
            z = z[..., self.pad : self.pad + length]
            if z.shape[-1] != length:
                raise ValueError("Last index of z must be equal to length")
        if not self.last:
            z = F.gelu(z)

        return z, y
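

# Illustrative sketch: with the default ``kernel_size=8`` and ``stride=4``, a frequency-domain
# encoder layer divides the frequency axis by 4 and the matching decoder layer multiplies it
# back, additionally returning the pre-transposed-conv activation. The helper name, channel
# counts and the zero "skip connection" below are arbitrary, hypothetical example values.
def _example_enc_dec_layer_shapes() -> None:
    enc = _HEncLayer(chin=4, chout=48, freq=True)
    dec = _HDecLayer(chin=48, chout=4, freq=True)
    x = torch.randn(2, 4, 512, 100)  # (B, C, F, T)
    y = enc(x)
    assert y.shape == (2, 48, 128, 100)  # frequency axis divided by the stride
    z, _ = dec(y, torch.zeros_like(y), length=100)  # zeros stand in for the U-Net skip
    assert z.shape == (2, 4, 512, 100)  # frequency axis multiplied back by the stride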


class HDemucs(torch.nn.Module):
    r"""Hybrid Demucs model from
    *Hybrid Spectrogram and Waveform Source Separation* :cite:`defossez2021hybrid`.

    See Also:
        * :class:`torchaudio.pipelines.SourceSeparationBundle`: Source separation pipeline with pre-trained models.

    Args:
        sources (List[str]): list of source names. List can contain the following source
            options: [``"bass"``, ``"drums"``, ``"other"``, ``"mixture"``, ``"vocals"``].
        audio_channels (int, optional): input/output audio channels. (Default: 2)
        channels (int, optional): initial number of hidden channels. (Default: 48)
        growth (int, optional): increase the number of hidden channels by this factor at each layer. (Default: 2)
        nfft (int, optional): number of fft bins. Note that changing this requires careful computation of
            various shape parameters and will not work out of the box for hybrid models. (Default: 4096)
        depth (int, optional): number of layers in encoder and decoder (Default: 6)
        freq_emb (float, optional): add frequency embedding after the first frequency layer if > 0,
            the actual value controls the weight of the embedding. (Default: 0.2)
        emb_scale (int, optional): equivalent to scaling the embedding learning rate (Default: 10)
        emb_smooth (bool, optional): initialize the embedding with a smooth one (with respect to frequencies).
            (Default: ``True``)
        kernel_size (int, optional): kernel_size for encoder and decoder layers. (Default: 8)
        time_stride (int, optional): stride for the final time layer, after the merge. (Default: 2)
        stride (int, optional): stride for encoder and decoder layers. (Default: 4)
        context (int, optional): context for 1x1 conv in the decoder. (Default: 1)
        context_enc (int, optional): context for 1x1 conv in the encoder. (Default: 0)
        norm_starts (int, optional): layer at which group norm starts being used.
            decoder layers are numbered in reverse order. (Default: 4)
        norm_groups (int, optional): number of groups for group norm. (Default: 4)
        dconv_depth (int, optional): depth of residual DConv branch. (Default: 2)
        dconv_comp (int, optional): compression of DConv branch. (Default: 4)
        dconv_attn (int, optional): adds attention layers in DConv branch starting at this layer. (Default: 4)
        dconv_lstm (int, optional): adds a LSTM layer in DConv branch starting at this layer. (Default: 4)
        dconv_init (float, optional): initial scale for the DConv branch LayerScale. (Default: 1e-4)
    """

    def __init__(
        self,
        sources: List[str],
        audio_channels: int = 2,
        channels: int = 48,
        growth: int = 2,
        nfft: int = 4096,
        depth: int = 6,
        freq_emb: float = 0.2,
        emb_scale: int = 10,
        emb_smooth: bool = True,
        kernel_size: int = 8,
        time_stride: int = 2,
        stride: int = 4,
        context: int = 1,
        context_enc: int = 0,
        norm_starts: int = 4,
        norm_groups: int = 4,
        dconv_depth: int = 2,
        dconv_comp: int = 4,
        dconv_attn: int = 4,
        dconv_lstm: int = 4,
        dconv_init: float = 1e-4,
    ):
        super().__init__()
        self.depth = depth
        self.nfft = nfft
        self.audio_channels = audio_channels
        self.sources = sources
        self.kernel_size = kernel_size
        self.context = context
        self.stride = stride
        self.channels = channels
        self.hop_length = self.nfft // 4
        self.freq_emb = None

        self.freq_encoder = nn.ModuleList()
        self.freq_decoder = nn.ModuleList()
        self.time_encoder = nn.ModuleList()
        self.time_decoder = nn.ModuleList()

        chin = audio_channels
        chin_z = chin * 2  # the frequency branch sees stacked real/imaginary channels
        chout = channels
        chout_z = channels
        freqs = self.nfft // 2

        for index in range(self.depth):
            lstm = index >= dconv_lstm
            attn = index >= dconv_attn
            norm_type = "group_norm" if index >= norm_starts else "none"
            freq = freqs > 1
            stri = stride
            ker = kernel_size
            if not freq:
                if freqs != 1:
                    raise ValueError("When freq is false, freqs must be 1.")
                ker = time_stride * 2
                stri = time_stride

            pad = True
            last_freq = False
            if freq and freqs <= kernel_size:
                # Last frequency layer: collapse the remaining bins in one go.
                ker = freqs
                pad = False
                last_freq = True

            kw = {
                "kernel_size": ker,
                "stride": stri,
                "freq": freq,
                "pad": pad,
                "norm_type": norm_type,
                "norm_groups": norm_groups,
                "dconv_kw": {
                    "lstm": lstm,
                    "attn": attn,
                    "depth": dconv_depth,
                    "compress": dconv_comp,
                    "init": dconv_init,
                },
            }
            kwt = dict(kw)
            kwt["freq"] = 0
            kwt["kernel_size"] = kernel_size
            kwt["stride"] = stride
            kwt["pad"] = True
            kw_dec = dict(kw)

            if last_freq:
                chout_z = max(chout, chout_z)
                chout = chout_z

            enc = _HEncLayer(chin_z, chout_z, context=context_enc, **kw)
            if freq:
                tenc = _HEncLayer(chin, chout, context=context_enc, empty=last_freq, **kwt)
                self.time_encoder.append(tenc)
            self.freq_encoder.append(enc)

            if index == 0:
                chin = self.audio_channels * len(self.sources)
                chin_z = chin * 2
            dec = _HDecLayer(chout_z, chin_z, last=index == 0, context=context, **kw_dec)
            if freq:
                tdec = _HDecLayer(chout, chin, empty=last_freq, last=index == 0, context=context, **kwt)
                self.time_decoder.insert(0, tdec)
            self.freq_decoder.insert(0, dec)

            chin = chout
            chin_z = chout_z
            chout = int(growth * chout)
            chout_z = int(growth * chout_z)
            if freq:
                if freqs <= kernel_size:
                    freqs = 1
                else:
                    freqs //= stride
            if index == 0 and freq_emb:
                self.freq_emb = _ScaledEmbedding(freqs, chin_z, smooth=emb_smooth, scale=emb_scale)
                self.freq_emb_scale = freq_emb

        _rescale_module(self)

    def _spec(self, x):
        hl = self.hop_length
        nfft = self.nfft

        if hl != nfft // 4:
            raise ValueError("Hop length must be nfft // 4")
        le = int(math.ceil(x.shape[-1] / hl))
        pad = hl // 2 * 3
        # Pad so that the number of STFT frames only depends on the number of hops,
        # then crop the extra frames introduced by the padding.
        x = self._pad1d(x, pad, pad + le * hl - x.shape[-1], mode="reflect")

        z = _spectro(x, nfft, hl)[..., :-1, :]
        if z.shape[-1] != le + 4:
            raise ValueError("Spectrogram's last dimension must be 4 + input size divided by stride")
        z = z[..., 2 : 2 + le]
        return z

    def _ispec(self, z, length):
        hl = self.hop_length
        z = F.pad(z, [0, 0, 0, 1])
        z = F.pad(z, [2, 2])
        pad = hl // 2 * 3
        le = hl * int(math.ceil(length / hl)) + 2 * pad
        x = _ispectro(z, hl, length=le)
        x = x[..., pad : pad + length]
        return x

    def _pad1d(self, x: torch.Tensor, padding_left: int, padding_right: int, mode: str = "zero", value: float = 0.0):
        """Wrapper around F.pad, in order for reflect padding when num_frames is shorter than max_pad.
        Add extra zero padding around in order for padding to not break."""
        length = x.shape[-1]
        if mode == "reflect":
            max_pad = max(padding_left, padding_right)
            if length <= max_pad:
                x = F.pad(x, (0, max_pad - length + 1))
        return F.pad(x, (padding_left, padding_right), mode, value)

    def _magnitude(self, z) -> torch.Tensor:
        # Move the complex dimension into the channel dimension (real/imaginary stacking).
        B, C, Fr, T = z.shape
        m = torch.view_as_real(z).permute(0, 1, 4, 2, 3)
        m = m.reshape(B, C * 2, Fr, T)
        return m

    def _mask(self, m) -> torch.Tensor:
        # The network predicts the complex spectrogram directly (stacked real/imaginary
        # channels), so this only converts the prediction back to a complex tensor.
        B, S, C, Fr, T = m.shape
        out = m.view(B, S, -1, 2, Fr, T).permute(0, 1, 2, 4, 5, 3)
        out = torch.view_as_complex(out.contiguous())
        return out

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        r"""HDemucs forward call

        Args:
            input (torch.Tensor): input mixed tensor of shape `(batch_size, channel, num_frames)`

        Returns:
            Tensor
                output tensor split into sources of shape `(batch_size, num_sources, channel, num_frames)`
        """
        if input.ndim != 3:
            raise ValueError(f"Expected 3D tensor with dimensions (batch, channel, frames). Found: {input.shape}")

        if input.shape[1] != self.audio_channels:
            raise ValueError(
                f"The channel dimension of input Tensor must match `audio_channels` of HDemucs model. "
                f"Found:{input.shape[1]}."
            )

        x = input
        length = x.shape[-1]

        z = self._spec(x)
        mag = self._magnitude(z)
        x = mag

        B, C, Fq, T = x.shape

        # Normalize the frequency branch input.
        mean = x.mean(dim=(1, 2, 3), keepdim=True)
        std = x.std(dim=(1, 2, 3), keepdim=True)
        x = (x - mean) / (1e-5 + std)

        # Prepare and normalize the time branch input.
        xt = input
        meant = xt.mean(dim=(1, 2), keepdim=True)
        stdt = xt.std(dim=(1, 2), keepdim=True)
        xt = (xt - meant) / (1e-5 + stdt)

        saved = []  # skip connections, freq.
        saved_t = []  # skip connections, time.
        lengths = []  # saved lengths to properly remove padding, freq branch.
        lengths_t = []  # saved lengths for time branch.

        for idx, encode in enumerate(self.freq_encoder):
            lengths.append(x.shape[-1])
            inject = None
            if idx < len(self.time_encoder):
                # The branches have not yet been merged.
                lengths_t.append(xt.shape[-1])
                tenc = self.time_encoder[idx]
                xt = tenc(xt)
                if not tenc.empty:
                    saved_t.append(xt)
                else:
                    # tenc contains just the first conv., so that now time and freq.
                    # branches have the same shape and can be merged.
                    inject = xt
            x = encode(x, inject)
            if idx == 0 and self.freq_emb is not None:
                # Add a frequency embedding to allow for non equivariant convolutions
                # over the frequency axis.
                frs = torch.arange(x.shape[-2], device=x.device)
                emb = self.freq_emb(frs).t()[None, :, :, None].expand_as(x)
                x = x + self.freq_emb_scale * emb

            saved.append(x)

        # Initialize everything to zero (the signal flows through the U-Net skips).
        x = torch.zeros_like(x)
        xt = torch.zeros_like(x)

        for idx, decode in enumerate(self.freq_decoder):
            skip = saved.pop(-1)
            x, pre = decode(x, skip, lengths.pop(-1))
            # `pre` contains the output just before the final transposed convolution,
            # which is used when the freq. and time branches separate.
            offset = self.depth - len(self.time_decoder)
            if idx >= offset:
                tdec = self.time_decoder[idx - offset]
                length_t = lengths_t.pop(-1)
                if tdec.empty:
                    if pre.shape[2] != 1:
                        raise ValueError(f"If tdec empty is True, pre shape does not match {pre.shape}")
                    pre = pre[:, :, 0]
                    xt, _ = tdec(pre, None, length_t)
                else:
                    skip = saved_t.pop(-1)
                    xt, _ = tdec(xt, skip, length_t)

        # Make sure all the skip connections have been consumed.
        if len(saved) != 0:
            raise AssertionError("saved is not empty")
        if len(lengths_t) != 0:
            raise AssertionError("lengths_t is not empty")
        if len(saved_t) != 0:
            raise AssertionError("saved_t is not empty")

        S = len(self.sources)
        x = x.view(B, S, -1, Fq, T)
        x = x * std[:, None] + mean[:, None]

        zout = self._mask(x)
        x = self._ispec(zout, length)

        xt = xt.view(B, S, -1, length)
        xt = xt * stdt[:, None] + meant[:, None]
        x = xt + x
        return x


class _DConv(torch.nn.Module):
    r"""
    New residual branches in each encoder layer.
    This alternates dilated convolutions, potentially with LSTMs and attention.
    Also before entering each residual branch, dimension is projected on a smaller subspace,
    e.g. of dim `channels // compress`.

    Args:
        channels (int): input/output channels for residual branch.
        compress (float, optional): amount of channel compression inside the branch. (default: 4)
        depth (int, optional): number of layers in the residual branch. Each layer has its own
            projection, and potentially LSTM and attention. (default: 2)
        init (float, optional): initial scale for LayerNorm. (default: 1e-4)
        norm_type (str, optional): Norm type, either ``group_norm`` or ``none`` (Default: ``group_norm``)
        attn (bool, optional): use LocalAttention. (Default: ``False``)
        heads (int, optional): number of heads for the LocalAttention. (default: 4)
        ndecay (int, optional): number of decay controls in the LocalAttention. (default: 4)
        lstm (bool, optional): use LSTM. (Default: ``False``)
        kernel_size (int, optional): kernel size for the (dilated) convolutions. (default: 3)
    """

    def __init__(
        self,
        channels: int,
        compress: float = 4,
        depth: int = 2,
        init: float = 1e-4,
        norm_type: str = "group_norm",
        attn: bool = False,
        heads: int = 4,
        ndecay: int = 4,
        lstm: bool = False,
        kernel_size: int = 3,
    ):
        super().__init__()
        if kernel_size % 2 == 0:
            raise ValueError("Kernel size should not be divisible by 2")
        self.channels = channels
        self.compress = compress
        self.depth = abs(depth)
        dilate = depth > 0

        norm_fn: tp.Callable[[int], nn.Module]
        norm_fn = lambda d: nn.Identity()  # noqa
        if norm_type == "group_norm":
            norm_fn = lambda d: nn.GroupNorm(4, d)  # noqa

        hidden = int(channels / compress)

        act = nn.GELU

        self.layers = nn.ModuleList([])
        for d in range(self.depth):
            dilation = 2**d if dilate else 1
            padding = dilation * (kernel_size // 2)
            mods = [
                nn.Conv1d(channels, hidden, kernel_size, dilation=dilation, padding=padding),
                norm_fn(hidden),
                act(),
                nn.Conv1d(hidden, 2 * channels, 1),
                norm_fn(2 * channels),
                nn.GLU(1),
                _LayerScale(channels, init),
            ]
            if attn:
                mods.insert(3, _LocalState(hidden, heads=heads, ndecay=ndecay))
            if lstm:
                mods.insert(3, _BLSTM(hidden, layers=2, skip=True))
            layer = nn.Sequential(*mods)
            self.layers.append(layer)

    def forward(self, x):
        r"""DConv forward call

        Args:
            x (torch.Tensor): input tensor for convolution

        Returns:
            Tensor
                Output after being run through layers.
        """
        for layer in self.layers:
            x = x + layer(x)
        return x


class _BLSTM(torch.nn.Module):
    r"""
    BiLSTM with same hidden units as input dim.
    If `max_steps` is not None, input will be splitting in overlapping
    chunks and the LSTM applied separately on each chunk.

    Args:
        dim (int): dimensions at LSTM layer.
        layers (int, optional): number of LSTM layers. (default: 1)
        skip (bool, optional): (default: ``False``)
    """

    def __init__(self, dim: int, layers: int = 1, skip: bool = False):
        super().__init__()
        self.max_steps = 200
        self.lstm = nn.LSTM(bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim)
        self.linear = nn.Linear(2 * dim, dim)
        self.skip = skip

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        r"""BLSTM forward call

        Args:
            x (torch.Tensor): input tensor for BLSTM shape is `(batch_size, dim, time_steps)`

        Returns:
            Tensor
                Output after being run through bidirectional LSTM. Shape is `(batch_size, dim, time_steps)`
        """
        B, C, T = x.shape
        y = x
        framed = False
        width = 0
        stride = 0
        nframes = 0
        if self.max_steps is not None and T > self.max_steps:
            # Split long sequences into overlapping frames so the LSTM sees bounded lengths.
            width = self.max_steps
            stride = width // 2
            frames = _unfold(x, width, stride)
            nframes = frames.shape[2]
            framed = True
            x = frames.permute(0, 2, 1, 3).reshape(-1, C, width)

        x = x.permute(2, 0, 1)
        x = self.lstm(x)[0]
        x = self.linear(x)
        x = x.permute(1, 2, 0)
        if framed:
            # Stitch the overlapping frames back together, keeping the center of each frame.
            out = []
            frames = x.reshape(B, -1, C, width)
            limit = stride // 2
            for k in range(nframes):
                if k == 0:
                    out.append(frames[:, k, :, :-limit])
                elif k == nframes - 1:
                    out.append(frames[:, k, :, limit:])
                else:
                    out.append(frames[:, k, :, limit:-limit])
            out = torch.cat(out, -1)
            out = out[..., :T]
            x = out
        if self.skip:
            x = x + y
        return x


class _LocalState(nn.Module):
    """Local state allows to have attention based only on data (no positional embedding),
    but while setting a constraint on the time window (e.g. decaying penalty term).
    Also a failed experiments with trying to provide some frequency based attention.
    """

    def __init__(self, channels: int, heads: int = 4, ndecay: int = 4):
        r"""
        Args:
            channels (int): Size of Conv1d layers.
            heads (int, optional): (default: 4)
            ndecay (int, optional): (default: 4)
        """
        super().__init__()
        if channels % heads != 0:
            raise ValueError("Channels must be divisible by heads.")
        self.heads = heads
        self.ndecay = ndecay
        self.content = nn.Conv1d(channels, channels, 1)
        self.query = nn.Conv1d(channels, channels, 1)
        self.key = nn.Conv1d(channels, channels, 1)

        self.query_decay = nn.Conv1d(channels, heads * ndecay, 1)
        if ndecay:
            # Initialize the decay close to zero (there is a sigmoid), for a maximum initial window.
            self.query_decay.weight.data *= 0.01
            if self.query_decay.bias is None:
                raise ValueError("bias must not be None.")
            self.query_decay.bias.data[:] = -2
        self.proj = nn.Conv1d(channels + heads * ndecay, channels, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        r"""LocalState forward call

        Args:
            x (torch.Tensor): input tensor for LocalState

        Returns:
            Tensor
                Output after being run through LocalState layer.
        """
        B, C, T = x.shape
        heads = self.heads
        indexes = torch.arange(T, device=x.device, dtype=x.dtype)
        # Left index are keys, right index are queries.
        delta = indexes[:, None] - indexes[None, :]

        queries = self.query(x).view(B, heads, -1, T)
        keys = self.key(x).view(B, heads, -1, T)
        # `t` are keys, `s` are queries.
        dots = torch.einsum("bhct,bhcs->bhts", keys, queries)
        dots = dots / math.sqrt(keys.shape[2])
        if self.ndecay:
            decays = torch.arange(1, self.ndecay + 1, device=x.device, dtype=x.dtype)
            decay_q = self.query_decay(x).view(B, heads, -1, T)
            decay_q = torch.sigmoid(decay_q) / 2
            decay_kernel = -decays.view(-1, 1, 1) * delta.abs() / math.sqrt(self.ndecay)
            dots = dots + torch.einsum("fts,bhfs->bhts", decay_kernel, decay_q)

        # Kill self reference.
        dots.masked_fill_(torch.eye(T, device=dots.device, dtype=torch.bool), -100)
        weights = torch.softmax(dots, dim=2)

        content = self.content(x).view(B, heads, -1, T)
        result = torch.einsum("bhts,bhct->bhcs", weights, content)
        result = result.reshape(B, -1, T)
        return x + self.proj(result)


class _LayerScale(nn.Module):
    """Layer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
    This rescales diagonally residual outputs close to 0 initially, then learnt.
    """

    def __init__(self, channels: int, init: float = 0):
        r"""
        Args:
            channels (int): Size of rescaling
            init (float, optional): Scale to default to (default: 0)
        """
        super().__init__()
        self.scale = nn.Parameter(torch.zeros(channels, requires_grad=True))
        self.scale.data[:] = init

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        r"""LayerScale forward call

        Args:
            x (torch.Tensor): input tensor for LayerScale

        Returns:
            Tensor
                Output after rescaling tensor.
        """
        return self.scale[:, None] * x


def _unfold(a: torch.Tensor, kernel_size: int, stride: int) -> torch.Tensor:
    """Given input of size [*OT, T], output Tensor of size [*OT, F, K]
    with K the kernel size, by extracting frames with the given stride.
    This will pad the input so that `F = ceil(T / K)`.
    see https://github.com/pytorch/pytorch/issues/60466
    """
    shape = list(a.shape[:-1])
    length = int(a.shape[-1])
    n_frames = math.ceil(length / stride)
    tgt_length = (n_frames - 1) * stride + kernel_size
    a = F.pad(a, (0, tgt_length - length))
    strides = [a.stride(dim) for dim in range(a.dim())]
    if strides[-1] != 1:
        raise ValueError("Data should be contiguous.")
    strides = strides[:-1] + [stride, 1]
    shape.append(n_frames)
    shape.append(kernel_size)
    return a.as_strided(shape, strides)


def _rescale_module(module):
    r"""
    Rescales initial weight scale for all models within the module.
    """
    for sub in module.modules():
        if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d, nn.Conv2d, nn.ConvTranspose2d)):
            std = sub.weight.std().detach()
            scale = (std / 0.1) ** 0.5
            sub.weight.data /= scale
            if sub.bias is not None:
                sub.bias.data /= scale


def _spectro(x: torch.Tensor, n_fft: int = 512, hop_length: int = 0, pad: int = 0) -> torch.Tensor:
    other = list(x.shape[:-1])
    length = int(x.shape[-1])
    x = x.reshape(-1, length)
    z = torch.stft(
        x,
        n_fft * (1 + pad),
        hop_length,
        window=torch.hann_window(n_fft).to(x),
        win_length=n_fft,
        normalized=True,
        center=True,
        return_complex=True,
        pad_mode="reflect",
    )
    _, freqs, frame = z.shape
    other.extend([freqs, frame])
    return z.view(other)


def _ispectro(z: torch.Tensor, hop_length: int = 0, length: int = 0, pad: int = 0) -> torch.Tensor:
    other = list(z.shape[:-2])
    freqs = int(z.shape[-2])
    frames = int(z.shape[-1])

    n_fft = 2 * freqs - 2
    z = z.view(-1, freqs, frames)
    win_length = n_fft // (1 + pad)

    x = torch.istft(
        z,
        n_fft,
        hop_length,
        window=torch.hann_window(win_length).to(z.real),
        win_length=win_length,
        normalized=True,
        length=length,
        center=True,
    )
    _, length = x.shape
    other.append(length)
    return x.view(other)


def hdemucs_low(sources: List[str]) -> HDemucs:
    """Builds low nfft (1024) version of :class:`HDemucs`, suitable for sample rates around 8 kHz.

    Args:
        sources (List[str]): See :py:func:`HDemucs`.

    Returns:
        HDemucs:
            HDemucs model.
    """
    return HDemucs(sources=sources, nfft=1024, depth=5)


def hdemucs_medium(sources: List[str]) -> HDemucs:
    """Builds medium nfft (2048) version of :class:`HDemucs`, suitable for sample rates of 16-32 kHz.

    .. note::

        Medium HDemucs has not been tested against the original Hybrid Demucs as this nfft and depth configuration is
        not compatible with the original implementation in https://github.com/facebookresearch/demucs

    Args:
        sources (List[str]): See :py:func:`HDemucs`.

    Returns:
        HDemucs:
            HDemucs model.
    """
    return HDemucs(sources=sources, nfft=2048, depth=6)


def hdemucs_high(sources: List[str]) -> HDemucs:
    """Builds high nfft (4096) version of :class:`HDemucs`, suitable for sample rates of 44.1-48 kHz.

    Args:
        sources (List[str]): See :py:func:`HDemucs`.

    Returns:
        HDemucs:
            HDemucs model.
    """
    return HDemucs(sources=sources, nfft=4096, depth=6)