
    h                        S SK r S SKJrJrJr  S SKrS/rS\R                  S\R                  4S jr SS\R                  S\R                  S	\R                  S\R                  S
\R                  S\\R                     S\\R                     4S jjr	S\
S\R                  R                  4S jrS\\
   S\S\\\      4S jrS\\   S\\   S\S\R$                  S\R                  4
S jr " S S\R                  R                  5      r " S S\R                  R                  5      r " S S\R                  R                  5      r " S S\5      rg)    N)ListOptionalTupleEmformerlengthsreturnc                    U R                   S   n[        [        R                  " U 5      R	                  5       5      n[        R
                  " X R                  U R                  S9R                  X5      U R                  S5      :  nU$ )Nr   )devicedtype   )
shapeinttorchmaxitemaranger
   r   expand	unsqueeze)r   
batch_size
max_lengthpadding_masks       R/var/www/auris/envauris/lib/python3.13/site-packages/torchaudio/models/emformer.py_lengths_to_padding_maskr   
   sm    q!JUYYw',,./J<<
>>W^^			1	L     	utteranceright_contextsummarymemsleft_context_keyc                    UR                  S5      U R                  S5      -   UR                  S5      -   nUR                  S5      nUS:X  a  S nU$ U[        R                  " U5      R                  5       -
  UR                  S5      -
  n	Ub  UR                  S5      OSn
X4R                  S5      -   U	-   U
-   n[	        US9nU$ )Nr   r   )r   )sizer   r   r   r   )r   r   r   r   r   r   TBr   right_context_blocks_lengthleft_context_blocks_lengthklengthss               r   _gen_padding_maskr'      s     	1	q 11GLLOCA1AAv 	 '(%))G*<*@*@*B&BW\\RS_&T#AQA]%5%:%:1%=cd"YYq\),GGJdd/Ar   
activationc                     U S:X  a  [         R                  R                  5       $ U S:X  a  [         R                  R                  5       $ U S:X  a  [         R                  R	                  5       $ [        SU  35      e)NrelugelusiluzUnsupported activation )r   nnReLUGELUSiLU
ValueError)r(   s    r   _get_activation_moduler2   '   s]    Vxx}}	v	xx}}	v	xx}}2:,?@@r   weight_init_scale_strategy
num_layersc                 d   U c  [        U5       Vs/ s H  nS PM     sn$ U S:X  a6  [        U5       Vs/ s H  nS[        R                  " US-   5      -  PM!     sn$ U S:X  a3  [        U5       Vs/ s H  nS[        R                  " S5      -  PM     sn$ [        SU  35      es  snf s  snf s  snf )N	depthwiseg      ?r   constant   z-Unsupported weight_init_scale_strategy value )rangemathsqrtr1   )r3   r4   _	layer_idxs       r   _get_weight_init_gainsr>   2   s    !)#J/0//00	#{	2@Ej@QR@Q9dii	A..@QRR	#z	149*4EF4Eydiil"4EFFHIcHdeff 1RFs   B#&B(/#B-
col_widthscol_masknum_rowsr
   c           
         [        U 5      [        U5      :w  a  [        S5      e[        X5       VVs/ s H6  u  pEU(       a  [        R                  " X$US9O[        R
                  " X$US9PM8     nnn[        R                  " USS9$ s  snnf )Nz0Length of col_widths must match that of col_maskr
   r   dim)lenr1   zipr   oneszeroscat)r?   r@   rA   r
   	col_widthis_ones_col
mask_blocks          r   _gen_attention_mask_blockrN   =   s     :#h-'KLL '**&?	 '@"I  	

8v6[[V<	= '@	   99ZQ''s   =Bc                     ^  \ rS rSrSr    SS\S\S\S\\   S\S\4U 4S	 jjjr	S
\
R                  S\
R                  S\\
R                  \
R                  4   4S jrS\
R                  S\
R                  S\\
R                     S\
R                  4S jr  SS\
R                  S\
R                  S\
R                  S\
R                  S\
R                  S\
R                  S\\
R                     S\\
R                     S\\
R                  \
R                  \
R                  \
R                  4   4S jjrS\
R                  S\
R                  S\
R                  S\
R                  S\
R                  S\
R                  S\\
R                  \
R                  4   4S jr\
R"                  R$                  S\
R                  S\
R                  S\
R                  S\
R                  S\
R                  S\
R                  S\
R                  S\\
R                  \
R                  \
R                  \
R                  4   4S j5       rSrU =r$ )_EmformerAttentionL   a;  Emformer layer attention module.

Args:
    input_dim (int): input dimension.
    num_heads (int): number of attention heads in each Emformer layer.
    dropout (float, optional): dropout probability. (Default: 0.0)
    weight_init_gain (float or None, optional): scale factor to apply when initializing
        attention module parameters. (Default: ``None``)
    tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
    negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)
	input_dim	num_headsdropoutweight_init_gaintanh_on_memnegative_infc                   > [         TU ]  5         X-  S:w  a  [        SU SU S35      eXl        X l        X0l        XPl        X`l        U R                  U R                  -  S-  U l        [        R                  R                  USU-  SS9U l        [        R                  R                  XSS9U l        [        R                  R                  XSS9U l        U(       ay  [        R                  R                  R!                  U R                  R"                  US	9  [        R                  R                  R!                  U R                  R"                  US	9  g g )
Nr   zinput_dim (z") is not a multiple of num_heads (z).g      r8   T)bias)gain)super__init__r1   rR   rS   rT   rV   rW   scalingr   r-   Linearemb_to_key_valueemb_to_queryout_projinitxavier_uniform_weight)selfrR   rS   rT   rU   rV   rW   	__class__s          r   r\   _EmformerAttention.__init__Y   s    	 A%{9+5WXaWbbdeff""&($..8TA %	1y=t T!HHOOItOL	4HHHMM))$*?*?*F*FM])^HHMM))$*;*;*B*BIY)Z r   inputr   r   c                     UR                   u  n  nUR                  S5      S-   nUS X5-
   n[        R                  " X&/5      nU R	                  U5      R                  SSS9u  pX4$ )Nr   r   r8   chunksrE   )r   r!   r   rJ   r_   chunk)
re   rh   r   r"   r<   summary_lengthright_ctx_utterance_blockmems_right_ctx_utterance_blockkeyvalues
             r   _gen_key_value!_EmformerAttention._gen_key_valuew   ss    ++1a1)$)*>A,>$?!).D3T)U&**+IJPPXY_`Pa
zr   attention_weightsattention_maskr   c                    UR                  5       nUR                  UR                  S5      U R                  5      nUR	                  S5      nUR	                  S5      U R
                  -  nUb  UR                  X`R
                  US5      nUR                  UR                  S5      R                  S5      R                  [        R                  5      U R                  5      nUR                  X`R
                  -  US5      n[        R                  R                  R                  USS9R                  U5      n[        R                  R                  R                  U[        U R                  5      U R                  S9$ )Nr   r   r8   rD   )ptraining)floatmasked_fillr   rW   r!   rS   viewtor   boolr-   
functionalsoftmaxtype_asrT   ry   )re   rt   ru   r   attention_weights_floatr"   r#   attention_probss           r   _gen_attention_probs'_EmformerAttention._gen_attention_probs   sC    #4"9"9";"9"E"EnF^F^_`Facgctct"u""1%""1%7#&=&B&B1nnVWY[&\#&=&I&I&&q)33A699%**EtGXGX'# '>&B&B1~~CUWXZ\&]#((--556MSU5V^^_pqxx""**?eDLL>Q\`\i\i*jjr   r   r   r   r   r   left_context_valc	           	      f   UR                  S5      n	UR                  S5      UR                  S5      -   UR                  S5      -   n
U R                  [        R                  " X1U/5      5      nU R	                  [        R                  " XSU/5      5      R                  SSS9u  pUb  Ub  U
[        R                  " U5      R                  5       -
  UR                  S5      -
  n[        R                  " US UR                  S5      U-    UXR                  S5      U-   S  /5      n[        R                  " US UR                  S5      U-    UXR                  S5      U-   S  /5      nXU4 Vs/ s HW  nUR                  5       R                  SXR                  -  U R                  U R                  -  5      R                  SS5      PMY     snu  nnn[        R                  " UU R                  -  UR                  SS5      5      n[        XXBXW5      nU R!                  UUU5      n[        R                  " UU5      nUR"                  XR                  -  U
U R                  U R                  -  4:w  a  [%        S5      eUR                  SS5      R                  5       R                  XU R                  5      nU R'                  U5      nUR                  S5      nUS U
U-
   nUU
U-
  S  nU R(                  (       a  [        R*                  " U5      nO[        R,                  " USSS	9nUUX4$ s  snf )
Nr   r   r8   rj   rw   z+Computed attention has incorrect dimensionsi
   )minr   )r!   r`   r   rJ   r_   rl   r   r   
contiguousr|   rS   rR   	transposebmmr]   r'   r   r   AssertionErrorra   rV   tanhclamp)re   r   r   r   r   r   ru   r   r   r#   r"   queryrp   rq   r$   tensorreshaped_queryreshaped_keyreshaped_valuert   r   r   	attentionoutput_right_context_memsrm   output_right_contextoutput_memss                              r   _forward_impl _EmformerAttention._forward_impl   s	    NN1q!INN1$55QG !!%))]w,O"PQ **599d95U+VW]]eflm]n
',<,H*+eii.@.D.D.F*FVW*X'))D$))A,)DDE$		!'BBDEC IIFDIIaL+FFG$))A,)DDFGE !u-8
- $$R^^);T^^t~~=]^hhijlmn-8
4n "IInt||&C\E[E[\]_`Eab )7UYl 334E~Wcd IIo~>	??NNdnn,
 

 !!NOO''1-88:??dnnU	 %)MM)$<! a89M1~;MN/N0B0DE**[1K++ksCK#[#<<C8
s   .AL.c                 <    U R                  XX4XV5      u  px  n	XxSS 4$ )a  Forward pass for training.

B: batch size;
D: feature dimension of each frame;
T: number of utterance frames;
R: number of right context frames;
S: number of summary elements;
M: number of memory elements.

Args:
    utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
    lengths (torch.Tensor): with shape `(B,)` and i-th element representing
        number of valid frames for i-th batch element in ``utterance``.
    right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
    summary (torch.Tensor): summary elements, with shape `(S, B, D)`.
    mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
    attention_mask (torch.Tensor): attention mask for underlying attention module.

Returns:
    (Tensor, Tensor):
        Tensor
            output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`.
        Tensor
            updated memory elements, with shape `(M, B, D)`.
Nrw   )r   )
re   r   r   r   r   r   ru   outputr   r<   s
             r   forward_EmformerAttention.forward   s1    D %)$6$6y=cg$x!Q3B'''r   c                 l   UR                  S5      UR                  S5      -   UR                  S5      -   nUR                  S5      UR                  S5      -   UR                  S5      -   UR                  S5      -   n	[        R                  " X5      R                  [        R                  UR
                  S9n
SU
SSUR                  S5      24'   U R                  UUUUUU
UUS9u  ppUUXR                  S5      UR                  S5      -   S XR                  S5      UR                  S5      -   S 4$ )a  Forward pass for inference.

B: batch size;
D: feature dimension of each frame;
T: number of utterance frames;
R: number of right context frames;
S: number of summary elements;
M: number of memory elements.

Args:
    utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
    lengths (torch.Tensor): with shape `(B,)` and i-th element representing
        number of valid frames for i-th batch element in ``utterance``.
    right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
    summary (torch.Tensor): summary elements, with shape `(S, B, D)`.
    mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
    left_context_key (torch.Tensor): left context attention key computed from preceding invocation.
    left_context_val (torch.Tensor): left context attention value computed from preceding invocation.

Returns:
    (Tensor, Tensor, Tensor, and Tensor):
        Tensor
            output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`.
        Tensor
            updated memory elements, with shape `(M, B, D)`.
        Tensor
            attention key computed for left context and utterance.
        Tensor
            attention value computed for left context and utterance.
r   r   r
   Trw   N)r   r   )r!   r   rI   r}   r~   r
   r   )re   r   r   r   r   r   r   r   	query_dimkey_dimru   r   r   rp   rq   s                  r   infer_EmformerAttention.infer   s7   R "&&q)INN1,==QO	$$Q')..*;;diilJM]MbMbcdMeeY8;;%**U^UeUe;f-1r>TYYq\>)**.*<*<-- += 	+
'S 		!}11!4467))A,!3!3A!6689	
 	
r   )	rT   r_   r`   rR   rW   rS   ra   r]   rV   )        NF    ח)NN)__name__
__module____qualname____firstlineno____doc__r   rz   r   r~   r\   r   Tensorr   rr   r   r   r   jitexportr   __static_attributes____classcell__rf   s   @r   rP   rP   L   s   
  ,0!"[[ [ 	[
 #5/[ [ [ [<ELL  u||]b]i]iOiIj k <<k k u||,	k
 
k6 4837G=<<G= G= ||	G=
 G= llG= G= #5<<0G= #5<<0G= 
u||U\\5<<E	FG=R#(<<#( #( ||	#(
 #( ll#( #( 
u||U\\)	*#(J YY;
<<;
 ;
 ||	;

 ;
 ll;
  ,,;
  ,,;
 
u||U\\5<<E	F;
 ;
r   rP   c                     ^  \ rS rSrSr       S'S\S\S\S\S\S\S	\S
\S\\   S\	S\4U 4S jjjr
S\S\\R                     S\\R                     4S jrS\\R                     S\\R                  \R                  \R                  4   4S jrS\R                  S\R                  S\S\R                  S\\R                     S\\R                     4S jrS\R                  S\R                  S\R                  S\R                  4S jrS\R                  S\R                  S\\R                  \R                  4   4S jrS\R                  S\R                  S\R                  S\\R                  \R                  4   4S jrS\R                  S \R                  S\R                  S\R                  S!\\R                     S\\R                  \R                  4   4S" jrS\R                  S \R                  S\R                  S\R                  S\\\R                        S\\R                  \R                  \\R                     4   4S# jrS\R                  S \R                  S\R                  S\R                  S!\R                  S\\R                  \R                  \R                  4   4S$ jr\R2                  R4                  S\R                  S \R                  S\R                  S\\\R                        S\R                  S\\R                  \R                  \\R                     \R                  4   4S% j5       rS&rU =r$ )(_EmformerLayeri?  a  Emformer layer that constitutes Emformer.

Args:
    input_dim (int): input dimension.
    num_heads (int): number of attention heads.
    ffn_dim: (int): hidden layer dimension of feedforward network.
    segment_length (int): length of each input segment.
    dropout (float, optional): dropout probability. (Default: 0.0)
    activation (str, optional): activation function to use in feedforward network.
        Must be one of ("relu", "gelu", "silu"). (Default: "relu")
    left_context_length (int, optional): length of left context. (Default: 0)
    max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
    weight_init_gain (float or None, optional): scale factor to apply when initializing
        attention module parameters. (Default: ``None``)
    tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
    negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)
rR   rS   ffn_dimsegment_lengthrT   r(   left_context_lengthmax_memory_sizerU   rV   rW   c           
      4  > [         TU ]  5         [        UUUU	U
US9U l        [        R
                  R                  U5      U l        [        R
                  R                  XDSS9U l	        [        U5      n[        R
                  R                  [        R
                  R                  U5      [        R
                  R                  X5      U[        R
                  R                  U5      [        R
                  R                  X15      [        R
                  R                  U5      5      U l        [        R
                  R                  U5      U l        [        R
                  R                  U5      U l        Xpl        X@l        Xl        Xl        US:  U l        g )N)rR   rS   rT   rU   rV   rW   Tkernel_sizestride	ceil_moder   )r[   r\   rP   r   r   r-   DropoutrT   	AvgPool1d	memory_opr2   
Sequential	LayerNormr^   pos_fflayer_norm_inputlayer_norm_outputr   r   r   rR   use_mem)re   rR   rS   r   r   rT   r(   r   r   rU   rV   rW   activation_modulerf   s                r   r\   _EmformerLayer.__init__R  s/    	+-#%
 xx''0++im+n2:>hh))HHy)HHOOI/HHW%HHOOG/HHW%
 !& 2 29 =!&!3!3I!>#6 ,."&*r   r   r
   r   c                 P   [         R                  " U R                  XR                  US9n[         R                  " U R                  XR                  US9n[         R                  " U R                  XR                  US9n[         R                  " SU[         R
                  US9nX4XV/$ )NrC   r   r   )r   rI   r   rR   r   int32)re   r   r
   empty_memoryr   r   past_lengths          r   _init_state_EmformerLayer._init_state  s    {{4#7#7^^\bc ;;t'?'?^^djk ;;t'?'?^^djkkk!Zu{{6R0@NNr   statec                 R   US   S   S   R                  5       n[        U R                  U5      n[        U R                  [        R
                  " X R                  -  5      5      nUS   U R                  U-
  S  nUS   U R                  U-
  S  nUS   U R                  U-
  S  nXVU4$ )N   r   r   r8   )r   r   r   r   r:   ceilr   )re   r   r   past_left_context_lengthpast_mem_lengthpre_memslc_keylc_vals           r   _unpack_state_EmformerLayer._unpack_state  s    Ahqk!n))+#&t'?'?#M d22DIIkL_L_>_4`a8D00?BDEq$225MMOPq$225MMOP''r   next_knext_vupdate_lengthr   c                 d   [         R                  " US   U/5      n[         R                  " US   U/5      n[         R                  " US   U/5      U R                  * S  US'   XfR                  S   U R                  -
  S  US'   XwR                  S   U R                  -
  S  US'   US   U-   US'   U$ )Nr   r8   r   r   )r   rJ   r   r   r   )re   r   r   r   r   r   new_knew_vs           r   _pack_state_EmformerLayer._pack_state  s     		58V,-		58V,-99eAh-.0D0D/D/FGaQ$*B*BBDEaQ$*B*BBDEa8m+ar   	rc_outputr   r   c                     U R                  U5      [        R                  " X2/5      -   nU R                  U5      U-   nU R	                  U5      nU$ N)rT   r   rJ   r   r   )re   r   r   r   results        r   _process_attention_output(_EmformerLayer._process_attention_output  sK     i(599m5O+PPV$v-''/r   c                     U R                  [        R                  " X!/5      5      nX2R                  S5      S  US UR                  S5       4$ Nr   )r   r   rJ   r!   )re   r   r   r   s       r   _apply_pre_attention_layer_norm._EmformerLayer._apply_pre_attention_layer_norm  sU      00M;U1VW//2454}11!45
 	
r   c                 t    U R                  XU5      nXR                  S5      S  US UR                  S5       4$ r   )r   r!   )re   r   r   r   s       r   _apply_post_attention_ffn(_EmformerLayer._apply_post_attention_ffn  sF     229W	++A.019=T}?Q?QRS?T3UUUr   r   ru   c           	      J   Uc  [        S5      eU R                  (       a4  U R                  UR                  SSS5      5      R                  SSS5      nO8[        R
                  " S5      R                  UR                  UR                  S9nU R                  UUUUUUS9u  pxXx4$ )Nz;attention_mask must be not None when for_inference is Falser   r8   r   r   )r   r   r   r   r   ru   )
r1   r   r   permuter   emptyr}   r   r
   r   )	re   r   r   r   r   ru   r   r   next_ms	            r   _apply_attention_forward'_EmformerLayer._apply_attention_forward  s     !Z[[<<nnY%6%6q!Q%?@HHAqQGkk!n''iooiFVFV'WG NN') + 
	   r   c           
         Uc)  U R                  UR                  S5      UR                  S9nU R                  U5      u  pgnU R                  (       a9  U R                  UR                  SSS5      5      R                  SSS5      n	U	S S n	O8[        R                  " S5      R                  UR                  UR                  S9n	U R                  R                  UUUU	UUUS9u  ppU R                  XUR                  S5      XE5      nXU4$ )Nr   rC   r8   r   r   )r   r   r   r   r   r   r   )r   r!   r
   r   r   r   r   r   r   r}   r   r   r   r   )re   r   r   r   r   r   r   r   r   r   r   r   r   r   s                 r   _apply_attention_infer%_EmformerLayer._apply_attention_infer  s    =$$Y^^A%6y?O?O$PE#'#5#5e#< &<<nnY%6%6q!Q%?@HHAqQGbqkGkk!n''iooiFVFV'WG,0NN,@,@'## -A -
)	6   1BDP%''r   c                     U R                  X5      u  nnU R                  UUUUU5      u  pU R                  XU5      u  pXU	4$ )a  Forward pass for training.

B: batch size;
D: feature dimension of each frame;
T: number of utterance frames;
R: number of right context frames;
M: number of memory elements.

Args:
    utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
    lengths (torch.Tensor): with shape `(B,)` and i-th element representing
        number of valid frames for i-th batch element in ``utterance``.
    right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
    mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
    attention_mask (torch.Tensor): attention mask for underlying attention module.

Returns:
    (Tensor, Tensor, Tensor):
        Tensor
            encoded utterance frames, with shape `(T, B, D)`.
        Tensor
            updated right context frames, with shape `(R, B, D)`.
        Tensor
            updated memory elements, with shape `(M, B, D)`.
)r   r   r   )re   r   r   r   r   ru   layer_norm_utterancelayer_norm_right_contextr   r   output_utterancer   s               r   r   _EmformerLayer.forward  se    H 00J	
 $!%!>!> $"
	 261O1OPYfs1t.{BBr   c                     U R                  X5      u  nnU R                  XbXuU5      u  pn
U R                  XU5      u  pXX4$ )aj  Forward pass for inference.

B: batch size;
D: feature dimension of each frame;
T: number of utterance frames;
R: number of right context frames;
M: number of memory elements.

Args:
    utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
    lengths (torch.Tensor): with shape `(B,)` and i-th element representing
        number of valid frames for i-th batch element in ``utterance``.
    right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
    state (List[torch.Tensor] or None): list of tensors representing layer internal state
        generated in preceding invocation of ``infer``.
    mems (torch.Tensor): memory elements, with shape `(M, B, D)`.

Returns:
    (Tensor, Tensor, List[torch.Tensor], Tensor):
        Tensor
            encoded utterance frames, with shape `(T, B, D)`.
        Tensor
            updated right context frames, with shape `(R, B, D)`.
        List[Tensor]
            list of tensors representing layer internal state
            generated in current invocation of ``infer``.
        Tensor
            updated memory elements, with shape `(M, B, D)`.
)r   r   r   )re   r   r   r   r   r   r   r   r   r   output_stater   r   s                r   r   _EmformerLayer.infer  s`    R 00J	
 $/3/J/J +C50
,	 261O1OPYfs1t.|PPr   )r   rT   rR   r   r   r   r   r   r   r   r   )r   r*   r   r   NFr   )r   r   r   r   r   r   rz   strr   r~   r\   r   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   s   @r   r   r   ?  s   0  #$ ,0!",+,+ ,+ 	,+
 ,+ ,+ ,+ !,+ ,+ #5/,+ ,+ ,+ ,+\Oc O8ELL3I OdSXS_S_N` O(4#5 (%ell\a\h\h@h:i (  	
 ll ELL! 
ell	 	<<	 <<	 ||		
 
	

6;ll
	u||U\\)	*
VV27,,VOT||V	u||U\\)	*V!<<! ! ||	!
 ll! !.! 
u||U\\)	*!2(<<( ( ||	(
 ll( U\\*+( 
u||U\\4+==	>(8-C<<-C -C ||	-C
 ll-C -C 
u||U\\5<<7	8-C^ YY-Q<<-Q -Q ||	-Q
 U\\*+-Q ll-Q 
u||U\\4+=u||K	L-Q -Qr   r   c                   f  ^  \ rS rSr   SS\R
                  R                  S\S\S\S\4
U 4S jjjrS\R                  S	\R                  4S
 jr
S\S\S	\\   4S jrS\R                  S	\R                  4S jrS\R                  S\R                  S	\\R                  \R                  4   4S jr\R                   R"                   SS\R                  S\R                  S\\\\R                           S	\\R                  \R                  \\\R                        4   4S jj5       rSrU =r$ )_EmformerImpliO  emformer_layersr   r   right_context_lengthr   c                    > [         TU ]  5         US:  U l        [        R                  R                  UUSS9U l        Xl        X0l        X@l	        X l
        XPl        g )Nr   Tr   )r[   r\   r   r   r-   r   r   r  r   r  r   r   )re   r  r   r   r  r   rf   s         r   r\   _EmformerImpl.__init__P  s`     	&*++&! , 

  /#6 $8!,.r   rh   r   c                 ~   UR                   S   n[        R                  " X R                  -
  U R                  -  5      n/ n[        US-
  5       H6  nUS-   U R                  -  nX`R                  -   nUR                  XU 5        M8     UR                  XU R                  -
  S  5        [        R                  " U5      $ Nr   r   )	r   r:   r   r  r   r9   appendr   rJ   )re   rh   r"   num_segsright_context_blocksseg_idxstartends           r   _gen_right_context _EmformerImpl._gen_right_contextf  s    KKN99a";";;t?R?RRS!X\*Gq[D$7$77E333C ''C(89 + 	##Ed.G.G*G*I$JKyy-..r   r  utterance_lengthc           
         [         R                  " X R                  -  5      nU R                  nU R                  nX-  nXd-   n[        XR                  -  U-
  S5      n[        US-   U R                  -  U5      n	U R                  U-  n
U R                  (       a4  [        XR                  -
  S5      nUS-
  nUX-
  X-
  UUX-
  UX-
  X)-
  /	nU$ UUX-
  UX-
  X)-
  /nU$ r  )	r:   r   r   r  r   r   r   r   r   )re   r  r  r	  rclcrc_startrc_end	seg_startseg_end	rc_lengthm_start
mem_lengthr?   s                 r   _gen_attention_mask_col_widths,_EmformerImpl._gen_attention_mask_col_widthsq  s   99-0C0CCD&&%%<"5"55:A>	w{d&9&99;KL--8	<<'$8$88!<G!AJ!$"# *
J*  "# *J r   c                    UR                  S5      n[        R                  " X R                  -  5      n/ n/ n/ nU R                  (       aA  Sn[        U5       Vs/ s H  oS;   PM	     n	n[        U5       Vs/ s H  oS;   PM	     n
nXEU/nO$Sn[        U5       Vs/ s H  oS;   PM	     n	nS n
XE/n[        U5       H  nU R                  X5      n[        XU R                  UR                  5      nUR                  U5        [        UU	[        U R                  X,U R                  -  -
  5      UR                  5      nUR                  U5        U
c  M  [        XSUR                  5      nUR                  U5        M     S[        R                  " U Vs/ s H  n[        R                  " U5      PM     sn5      -
  R                  [        R                  5      nU$ s  snf s  snf s  snf s  snf )Nr   	   )r         )r  r      )r   r  r   )r!   r:   r   r   r   r9   r  rN   r  r
   r  r   r   rJ   r}   r~   )re   rh   r  r	  rc_mask
query_masksummary_masknum_colsidxrc_q_cols_masks_cols_maskmasks_to_concatr  r?   rc_mask_blockquery_mask_blocksummary_mask_blockmaskru   s                      r   _gen_attention_mask!_EmformerImpl._gen_attention_mask  s    ::a=99-0C0CCD
<<H:?/J/3Y./NJ49(ODOS&=OKD&LAOH7<XGVmNGK&3OXG<<WWJ5D,E,Eu||M NN=)8''$1D1D'DD   ./&%>zXY[`[g[g%h"##$67+ '. eii_(U_T4_(UVVZZ[`[e[efG KD
 H6 )Vs   G7G G% G*
r   c                    UR                  SSS5      nU R                  U5      nUSUR                  S5      U R                  -
   nU R	                  U5      nU R
                  (       a6  U R                  UR                  SSS5      5      R                  SSS5      SS O7[        R                  " S5      R                  UR                  UR                  S9nUnU R                   H  nU" XrX6U5      u  psnM     UR                  SSS5      U4$ )a  Forward pass for training and non-streaming inference.

B: batch size;
T: max number of input frames in batch;
D: feature dimension of each frame.

Args:
    input (torch.Tensor): utterance frames right-padded with right context frames, with
        shape `(B, T + right_context_length, D)`.
    lengths (torch.Tensor): with shape `(B,)` and i-th element representing
        number of valid utterance frames for i-th batch element in ``input``.

Returns:
    (Tensor, Tensor):
        Tensor
            output frames, with shape `(B, T, D)`.
        Tensor
            output lengths, with shape `(B,)` and i-th element representing
            number of valid frames for i-th batch element in output frames.
r   r   r8   Nrw   r   )r   r  r!   r  r.  r   r   r   r   r}   r   r
   r  )	re   rh   r   r   r   ru   r   r   layers	            r   r   _EmformerImpl.forward  s    * aA&//6EEJJqMD,E,EEF	11)< || NN9,,Q156>>q!QGLQ""U\\"J 	
 ))E*/Vd*e'F4 *~~aA&//r   statesc                 F   UR                  S5      U R                  U R                  -   :w  a8  [        SU R                  U R                  -    SUR                  S5       S35      eUR	                  SSS5      nUR                  S5      U R                  -
  nXS nUSU n[
        R                  " X R                  -
  SS9nU R                  (       a3  U R                  UR	                  SSS5      5      R	                  SSS5      O7[
        R                  " S5      R                  UR                  UR                  S	9nUn	/ n
[        U R                  5       H5  u  pUR                  U	UUUc  SOX;   U5      u  ppU
R!                  U5        M7     U	R	                  SSS5      Xz4$ )
a  Forward pass for streaming inference.

B: batch size;
D: feature dimension of each frame.

Args:
    input (torch.Tensor): utterance frames right-padded with right context frames, with
        shape `(B, segment_length + right_context_length, D)`.
    lengths (torch.Tensor): with shape `(B,)` and i-th element representing
        number of valid frames for i-th batch element in ``input``.
    states (List[List[torch.Tensor]] or None, optional): list of lists of tensors
        representing internal state generated in preceding invocation of ``infer``. (Default: ``None``)

Returns:
    (Tensor, Tensor, List[List[Tensor]]):
        Tensor
            output frames, with shape `(B, segment_length, D)`.
        Tensor
            output lengths, with shape `(B,)` and i-th element representing
            number of valid frames for i-th batch element in output frames.
        List[List[Tensor]]
            output states; list of lists of tensors representing internal state
            generated in current invocation of ``infer``.
r   zIPer configured segment_length and right_context_length, expected size of z# for dimension 1 of input, but got .r   r8   N)r   r   )r!   r   r  r1   r   r   r   r   r   r   r}   r   r
   	enumerater  r   r  )re   rh   r   r3  right_context_start_idxr   r   output_lengthsr   r   output_statesr=   r1  r   s                 r   r   _EmformerImpl.infer  s   > ::a=D//$2K2KKK&&*&9&9D<U<U&U%V W"ZZ]O1. 
 aA&"'**Q-$2K2K"K67223	W/H/H%HaP || NN9,,Q156>>q!QGQ""U\\"J 	
 24 )$*>*> ?I8=F,=95F<   . !@ ~~aA&EEr   )r  r   r   r   r  r   r   )r   r   r   r   )r   r   r   r   r   r-   
ModuleListr   r\   r   r  r   r  r.  r   r   r   r   r   r   r   r   r   s   @r   r  r  O  so   
 $%$% /,,/ / !	/
 "/ / /,	/ 	/ 	/"c "S "UYZ]U^ "H. .%,, .`!0U\\ !0ELL !0U5<<Y^YeYeKeEf !0F YY
 6:	:F||:F :F d5<<012	:F
 
u||U\\4U\\0B+CC	D:F :Fr   r  c                   ~   ^  \ rS rSrSr        SS\S\S\S\S\S\S	\S
\S\S\S\\   S\	S\4U 4S jjjr
SrU =r$ )r   i&  a  Emformer architecture introduced in
*Emformer: Efficient Memory Transformer Based Acoustic Model for Low Latency Streaming Speech Recognition*
:cite:`shi2021emformer`.

See Also:
    * :func:`~torchaudio.models.emformer_rnnt_model`,
      :func:`~torchaudio.models.emformer_rnnt_base`: factory functions.
    * :class:`torchaudio.pipelines.RNNTBundle`: ASR pipelines with pretrained model.

Args:
    input_dim (int): input dimension.
    num_heads (int): number of attention heads in each Emformer layer.
    ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network.
    num_layers (int): number of Emformer layers to instantiate.
    segment_length (int): length of each input segment.
    dropout (float, optional): dropout probability. (Default: 0.0)
    activation (str, optional): activation function to use in each Emformer layer's
        feedforward network. Must be one of ("relu", "gelu", "silu"). (Default: "relu")
    left_context_length (int, optional): length of left context. (Default: 0)
    right_context_length (int, optional): length of right context. (Default: 0)
    max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
    weight_init_scale_strategy (str or None, optional): per-layer weight initialization scaling
        strategy. Must be one of ("depthwise", "constant", ``None``). (Default: "depthwise")
    tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
    negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)

Examples:
    >>> emformer = Emformer(512, 8, 2048, 20, 4, right_context_length=1)
    >>> input = torch.rand(128, 400, 512)  # batch, num_frames, feature_dim
    >>> lengths = torch.randint(1, 200, (128,))  # batch
    >>> output, lengths = emformer(input, lengths)
    >>> input = torch.rand(128, 5, 512)
    >>> lengths = torch.ones(128) * 5
    >>> output, lengths, states = emformer.infer(input, lengths, None)
rR   rS   r   r4   r   rT   r(   r   r  r   r3   rV   rW   c                    > [        X5      n[        R                  R                  [	        U5       Vs/ s H  n[        UUUUUUUU
X   UUS9PM     sn5      n[        TU ]  UUUU	U
S9  g s  snf )N)rT   r(   r   r   rU   rV   rW   )r   r  r   )r>   r   r-   r;  r9   r   r[   r\   )re   rR   rS   r   r4   r   rT   r(   r   r  r   r3   rV   rW   weight_init_gainsr=   r  rf   s                    r   r\   Emformer.__init__K  s      33MZ((-- "'z!2 "3I "#)(;$3%6%A +!- "3
$ 	 3!5+ 	 	
#s   A+ )r   r*   r   r   r   r6   Fr   )r   r   r   r   r   r   rz   r   r   r~   r\   r   r   r   s   @r   r   r   &  s    "V  #$$% 4?!")
)
 )
 	)

 )
 )
 )
 )
 !)
 ")
 )
 %-SM)
 )
 )
 )
r   r   )r:   typingr   r   r   r   __all__r   r   r'   r   r-   Moduler2   r   rz   r>   r~   r
   rN   rP   r   r  r   r@  r   r   <module>rD     s    ( (  ,ell u||  04||<< \\ \\	
 ,, u||, ell(As Auxx Agx} gRU gZ^_ghm_nZo g(S	(%)$Z(;>(HM(
\\(p
 p
fMQUXX__ MQ`TFEHHOO TFnN
} N
r   