from abc import ABC, abstractmethod
from typing import List, Optional, Tuple

import torch
from torchaudio.models import Emformer

__all__ = ["RNNT", "emformer_rnnt_base", "emformer_rnnt_model"]


class _TimeReduction(torch.nn.Module):
    r"""Coalesces frames along time dimension into a
    fewer number of frames with higher feature dimensionality.

    Args:
        stride (int): number of frames to merge for each output frame.
    """

    def __init__(self, stride: int) -> None:
        super().__init__()
        self.stride = stride

    def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        r"""Forward pass.

        B: batch size;
        T: maximum input sequence length in batch;
        D: feature dimension of each input sequence frame.

        Args:
            input (torch.Tensor): input sequences, with shape `(B, T, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.

        Returns:
            (torch.Tensor, torch.Tensor):
                torch.Tensor
                    output sequences, with shape
                    `(B, T  // stride, D * stride)`
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output sequences.
        """
        B, T, D = input.shape
        num_frames = T - (T % self.stride)
        input = input[:, :num_frames, :]
        lengths = lengths.div(self.stride, rounding_mode="trunc")
        T_max = num_frames // self.stride

        output = input.reshape(B, T_max, D * self.stride)
        output = output.contiguous()
        return output, lengths


class _CustomLSTM(torch.nn.Module):
    r"""Custom long-short-term memory (LSTM) block that applies layer normalization
    to internal nodes.

    Args:
        input_dim (int): input dimension.
        hidden_dim (int): hidden dimension.
        layer_norm (bool, optional): if ``True``, enables layer normalization. (Default: ``False``)
        layer_norm_epsilon (float, optional):  value of epsilon to use in
            layer normalization layers (Default: 1e-5)
    """

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        layer_norm: bool = False,
        layer_norm_epsilon: float = 1e-5,
    ) -> None:
        super().__init__()
        self.x2g = torch.nn.Linear(input_dim, 4 * hidden_dim, bias=(not layer_norm))
        self.p2g = torch.nn.Linear(hidden_dim, 4 * hidden_dim, bias=False)
        if layer_norm:
            self.c_norm = torch.nn.LayerNorm(hidden_dim, eps=layer_norm_epsilon)
            self.g_norm = torch.nn.LayerNorm(4 * hidden_dim, eps=layer_norm_epsilon)
        else:
            self.c_norm = torch.nn.Identity()
            self.g_norm = torch.nn.Identity()

        self.hidden_dim = hidden_dim

    def forward(
        self, input: torch.Tensor, state: Optional[List[torch.Tensor]]
    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        r"""Forward pass.

        B: batch size;
        T: maximum sequence length in batch;
        D: feature dimension of each input sequence element.

        Args:
            input (torch.Tensor): with shape `(T, B, D)`.
            state (List[torch.Tensor] or None): list of tensors
                representing internal state generated in preceding invocation
                of ``forward``.

        Returns:
            (torch.Tensor, List[torch.Tensor]):
                torch.Tensor
                    output, with shape `(T, B, hidden_dim)`.
                List[torch.Tensor]
                    list of tensors representing internal state generated
                    in current invocation of ``forward``.
        """
        if state is None:
            B = input.size(1)
            h = torch.zeros(B, self.hidden_dim, device=input.device, dtype=input.dtype)
            c = torch.zeros(B, self.hidden_dim, device=input.device, dtype=input.dtype)
        else:
            h, c = state

        gated_input = self.x2g(input)
        outputs = []
        for gates in gated_input.unbind(0):
            gates = gates + self.p2g(h)
            gates = self.g_norm(gates)
            input_gate, forget_gate, cell_gate, output_gate = gates.chunk(4, 1)
            input_gate = input_gate.sigmoid()
            forget_gate = forget_gate.sigmoid()
            cell_gate = cell_gate.tanh()
            output_gate = output_gate.sigmoid()
            c = forget_gate * c + input_gate * cell_gate
            c = self.c_norm(c)
            h = output_gate * c.tanh()
            outputs.append(h)

        output = torch.stack(outputs, dim=0)
        state = [h, c]

        return output, state
% %&0\\0*243E*F0	u||T%,,//	00 0r   r7   c                   T   \ rS rSr\S\R                  S\R                  S\\R                  \R                  4   4S j5       r\S\R                  S\R                  S\	\
\
\R                           S\\R                  \R                  \
\
\R                        4   4S j5       rSrg	)
_Transcriber   r   r   r   c                     g r    )r   r   r   s      r   r)   _Transcriber.forward   s    r   statesc                     g r   rk   )r   r   r   rm   s       r   infer_Transcriber.infer   s     	r   rk   N)r+   r,   r-   r.   r   r1   r2   r   r)   r   r   ro   r3   rk   r   r   rh   rh      s    U\\ ELL U5<<Y^YeYeKeEf   ||  d5<<012	
 
u||U\\4U\\0B+CC	D r   rh   c            !         ^  \ rS rSrSrSSSSSS.S	\S
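
# The interface above pairs a training-mode ``forward`` with a streaming ``infer``;
# _EmformerEncoder below is the concrete transcription network implementing it.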


class _EmformerEncoder(torch.nn.Module, _Transcriber):
    r"""Emformer-based recurrent neural network transducer (RNN-T) encoder (transcription network).

    Args:
        input_dim (int): feature dimension of each input sequence element.
        output_dim (int): feature dimension of each output sequence element.
        segment_length (int): length of input segment expressed as number of frames.
        right_context_length (int): length of right context expressed as number of frames.
        time_reduction_input_dim (int): dimension to scale each element in input sequences to
            prior to applying time reduction block.
        time_reduction_stride (int): factor by which to reduce length of input sequence.
        transformer_num_heads (int): number of attention heads in each Emformer layer.
        transformer_ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network.
        transformer_num_layers (int): number of Emformer layers to instantiate.
        transformer_left_context_length (int): length of left context.
        transformer_dropout (float, optional): transformer dropout probability. (Default: 0.0)
        transformer_activation (str, optional): activation function to use in each Emformer layer's
            feedforward network. Must be one of ("relu", "gelu", "silu"). (Default: "relu")
        transformer_max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
        transformer_weight_init_scale_strategy (str, optional): per-layer weight initialization scaling
            strategy. Must be one of ("depthwise", "constant", ``None``). (Default: "depthwise")
        transformer_tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
    """

    def __init__(
        self,
        *,
        input_dim: int,
        output_dim: int,
        segment_length: int,
        right_context_length: int,
        time_reduction_input_dim: int,
        time_reduction_stride: int,
        transformer_num_heads: int,
        transformer_ffn_dim: int,
        transformer_num_layers: int,
        transformer_left_context_length: int,
        transformer_dropout: float = 0.0,
        transformer_activation: str = "relu",
        transformer_max_memory_size: int = 0,
        transformer_weight_init_scale_strategy: str = "depthwise",
        transformer_tanh_on_mem: bool = False,
    ) -> None:
        super().__init__()
        self.input_linear = torch.nn.Linear(
            input_dim,
            time_reduction_input_dim,
            bias=False,
        )
        self.time_reduction = _TimeReduction(time_reduction_stride)
        transformer_input_dim = time_reduction_input_dim * time_reduction_stride
        self.transformer = Emformer(
            transformer_input_dim,
            transformer_num_heads,
            transformer_ffn_dim,
            transformer_num_layers,
            segment_length // time_reduction_stride,
            dropout=transformer_dropout,
            activation=transformer_activation,
            left_context_length=transformer_left_context_length,
            right_context_length=right_context_length // time_reduction_stride,
            max_memory_size=transformer_max_memory_size,
            weight_init_scale_strategy=transformer_weight_init_scale_strategy,
            tanh_on_mem=transformer_tanh_on_mem,
        )
        self.output_linear = torch.nn.Linear(transformer_input_dim, output_dim)
        self.layer_norm = torch.nn.LayerNorm(output_dim)

    def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        r"""Forward pass for training.

        B: batch size;
        T: maximum input sequence length in batch;
        D: feature dimension of each input sequence frame (input_dim).

        Args:
            input (torch.Tensor): input frame sequences right-padded with right context, with
                shape `(B, T + right context length, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.

        Returns:
            (torch.Tensor, torch.Tensor):
                torch.Tensor
                    output frame sequences, with
                    shape `(B, T // time_reduction_stride, output_dim)`.
                torch.Tensor
                    output input lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output frame sequences.
        """
        input_linear_out = self.input_linear(input)
        time_reduction_out, time_reduction_lengths = self.time_reduction(input_linear_out, lengths)
        transformer_out, transformer_lengths = self.transformer(time_reduction_out, time_reduction_lengths)
        output_linear_out = self.output_linear(transformer_out)
        layer_norm_out = self.layer_norm(output_linear_out)
        return layer_norm_out, transformer_lengths

    @torch.jit.export
    def infer(
        self,
        input: torch.Tensor,
        lengths: torch.Tensor,
        states: Optional[List[List[torch.Tensor]]],
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        r"""Forward pass for inference.

        B: batch size;
        T: maximum input sequence segment length in batch;
        D: feature dimension of each input sequence frame (input_dim).

        Args:
            input (torch.Tensor): input frame sequence segments right-padded with right context, with
                shape `(B, T + right context length, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.
            state (List[List[torch.Tensor]] or None): list of lists of tensors
                representing internal state generated in preceding invocation
                of ``infer``.

        Returns:
            (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    output frame sequences, with
                    shape `(B, T // time_reduction_stride, output_dim)`.
                torch.Tensor
                    output input lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing internal state generated in current invocation
                    of ``infer``.
        """
        input_linear_out = self.input_linear(input)
        time_reduction_out, time_reduction_lengths = self.time_reduction(input_linear_out, lengths)
        (
            transformer_out,
            transformer_lengths,
            transformer_states,
        ) = self.transformer.infer(time_reduction_out, time_reduction_lengths, states)
        output_linear_out = self.output_linear(transformer_out)
        layer_norm_out = self.layer_norm(output_linear_out)
        return layer_norm_out, transformer_lengths, transformer_states
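
# Example (illustrative sketch; the constructor arguments are arbitrary small values
# chosen for illustration): per streaming step, ``infer`` consumes segment_length +
# right_context_length frames and shrinks the time axis by time_reduction_stride.
#
#     >>> encoder = _EmformerEncoder(
#     ...     input_dim=80,
#     ...     output_dim=256,
#     ...     segment_length=16,
#     ...     right_context_length=4,
#     ...     time_reduction_input_dim=128,
#     ...     time_reduction_stride=4,
#     ...     transformer_num_heads=8,
#     ...     transformer_ffn_dim=1024,
#     ...     transformer_num_layers=2,
#     ...     transformer_left_context_length=30,
#     ... )
#     >>> frames = torch.rand(2, 20, 80)               # 16 segment + 4 right-context frames
#     >>> lengths = torch.tensor([20, 20])
#     >>> output, output_lengths, states = encoder.infer(frames, lengths, None)
#     >>> output.shape                                 # 16 // 4 segment frames survive
#     torch.Size([2, 4, 256])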


class _Predictor(torch.nn.Module):
    r"""Recurrent neural network transducer (RNN-T) prediction network.

    Args:
        num_symbols (int): size of target token lexicon.
        output_dim (int): feature dimension of each output sequence element.
        symbol_embedding_dim (int): dimension of each target token embedding.
        num_lstm_layers (int): number of LSTM layers to instantiate.
        lstm_hidden_dim (int): output dimension of each LSTM layer.
        lstm_layer_norm (bool, optional): if ``True``, enables layer normalization
            for LSTM layers. (Default: ``False``)
        lstm_layer_norm_epsilon (float, optional): value of epsilon to use in
            LSTM layer normalization layers. (Default: 1e-5)
        lstm_dropout (float, optional): LSTM dropout probability. (Default: 0.0)

    """

    def __init__(
        self,
        num_symbols: int,
        output_dim: int,
        symbol_embedding_dim: int,
        num_lstm_layers: int,
        lstm_hidden_dim: int,
        lstm_layer_norm: bool = False,
        lstm_layer_norm_epsilon: float = 1e-5,
        lstm_dropout: float = 0.0,
    ) -> None:
        super().__init__()
        self.embedding = torch.nn.Embedding(num_symbols, symbol_embedding_dim)
        self.input_layer_norm = torch.nn.LayerNorm(symbol_embedding_dim)
        self.lstm_layers = torch.nn.ModuleList(
            [
                _CustomLSTM(
                    symbol_embedding_dim if idx == 0 else lstm_hidden_dim,
                    lstm_hidden_dim,
                    layer_norm=lstm_layer_norm,
                    layer_norm_epsilon=lstm_layer_norm_epsilon,
                )
                for idx in range(num_lstm_layers)
            ]
        )
        self.dropout = torch.nn.Dropout(p=lstm_dropout)
        self.linear = torch.nn.Linear(lstm_hidden_dim, output_dim)
        self.output_layer_norm = torch.nn.LayerNorm(output_dim)

        self.lstm_dropout = lstm_dropout

    def forward(
        self,
        input: torch.Tensor,
        lengths: torch.Tensor,
        state: Optional[List[List[torch.Tensor]]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        r"""Forward pass.

        B: batch size;
        U: maximum sequence length in batch;
        D: feature dimension of each input sequence element.

        Args:
            input (torch.Tensor): target sequences, with shape `(B, U)` and each element
                mapping to a target symbol, i.e. in range `[0, num_symbols)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.
            state (List[List[torch.Tensor]] or None, optional): list of lists of tensors
                representing internal state generated in preceding invocation
                of ``forward``. (Default: ``None``)

        Returns:
            (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    output encoding sequences, with shape `(B, U, output_dim)`
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output encoding sequences.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing internal state generated in current invocation of ``forward``.
        """
        input_tb = input.permute(1, 0)
        embedding_out = self.embedding(input_tb)
        input_layer_norm_out = self.input_layer_norm(embedding_out)

        lstm_out = input_layer_norm_out
        state_out: List[List[torch.Tensor]] = []
        for layer_idx, lstm in enumerate(self.lstm_layers):
            lstm_out, lstm_state_out = lstm(lstm_out, None if state is None else state[layer_idx])
            lstm_out = self.dropout(lstm_out)
            state_out.append(lstm_state_out)

        linear_out = self.linear(lstm_out)
        output_layer_norm_out = self.output_layer_norm(linear_out)
        return output_layer_norm_out.permute(1, 0, 2), lengths, state_out


class _Joiner(torch.nn.Module):
    r"""Recurrent neural network transducer (RNN-T) joint network.

    Args:
        input_dim (int): source and target input dimension.
        output_dim (int): output dimension.
        activation (str, optional): activation function to use in the joiner.
            Must be one of ("relu", "tanh"). (Default: "relu")

    """

    def __init__(self, input_dim: int, output_dim: int, activation: str = "relu") -> None:
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim, bias=True)
        if activation == "relu":
            self.activation = torch.nn.ReLU()
        elif activation == "tanh":
            self.activation = torch.nn.Tanh()
        else:
            raise ValueError(f"Unsupported activation {activation}")

    def forward(
        self,
        source_encodings: torch.Tensor,
        source_lengths: torch.Tensor,
        target_encodings: torch.Tensor,
        target_lengths: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        r"""Forward pass for training.

        B: batch size;
        T: maximum source sequence length in batch;
        U: maximum target sequence length in batch;
        D: dimension of each source and target sequence encoding.

        Args:
            source_encodings (torch.Tensor): source encoding sequences, with
                shape `(B, T, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                valid sequence length of i-th batch element in ``source_encodings``.
            target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`.
            target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                valid sequence length of i-th batch element in ``target_encodings``.

        Returns:
            (torch.Tensor, torch.Tensor, torch.Tensor):
                torch.Tensor
                    joint network output, with shape `(B, T, U, output_dim)`.
                torch.Tensor
                    output source lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 1 for i-th batch element in joint network output.
                torch.Tensor
                    output target lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 2 for i-th batch element in joint network output.
        """
        joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous()
        activation_out = self.activation(joint_encodings)
        output = self.linear(activation_out)
        return output, source_lengths, target_lengths
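
# Example (illustrative sketch; dimensions are arbitrary): the predictor encodes
# target token sequences, and the joiner then combines source and target encodings
# into the RNN-T lattice.
#
#     >>> predictor = _Predictor(
#     ...     num_symbols=100, output_dim=256, symbol_embedding_dim=64,
#     ...     num_lstm_layers=2, lstm_hidden_dim=64,
#     ... )
#     >>> targets = torch.randint(0, 100, (2, 7))      # (B, U)
#     >>> target_lengths = torch.tensor([7, 5])
#     >>> target_enc, _, state = predictor(targets, target_lengths)
#     >>> joiner = _Joiner(256, 100)
#     >>> source_enc = torch.rand(2, 4, 256)           # (B, T, D), e.g. encoder output
#     >>> joint, _, _ = joiner(source_enc, torch.tensor([4, 4]), target_enc, target_lengths)
#     >>> joint.shape                                  # (B, T, U, num_symbols)
#     torch.Size([2, 4, 7, 100])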


class RNNT(torch.nn.Module):
    r"""torchaudio.models.RNNT()

    Recurrent neural network transducer (RNN-T) model.

    Note:
        To build the model, please use one of the factory functions.

    See Also:
        :class:`torchaudio.pipelines.RNNTBundle`: ASR pipeline with pre-trained models.

    Args:
        transcriber (torch.nn.Module): transcription network.
        predictor (torch.nn.Module): prediction network.
        joiner (torch.nn.Module): joint network.
    """

    def __init__(self, transcriber: _Transcriber, predictor: _Predictor, joiner: _Joiner) -> None:
        super().__init__()
        self.transcriber = transcriber
        self.predictor = predictor
        self.joiner = joiner

    def forward(
        self,
        sources: torch.Tensor,
        source_lengths: torch.Tensor,
        targets: torch.Tensor,
        target_lengths: torch.Tensor,
        predictor_state: Optional[List[List[torch.Tensor]]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        r"""Forward pass for training.

        B: batch size;
        T: maximum source sequence length in batch;
        U: maximum target sequence length in batch;
        D: feature dimension of each source sequence element.

        Args:
            sources (torch.Tensor): source frame sequences right-padded with right context, with
                shape `(B, T, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``sources``.
            targets (torch.Tensor): target sequences, with shape `(B, U)` and each element
                mapping to a target symbol.
            target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``targets``.
            predictor_state (List[List[torch.Tensor]] or None, optional): list of lists of tensors
                representing prediction network internal state generated in preceding invocation
                of ``forward``. (Default: ``None``)

        Returns:
            (torch.Tensor, torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    joint network output, with shape
                    `(B, max output source length, max output target length, output_dim (number of target symbols))`.
                torch.Tensor
                    output source lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 1 for i-th batch element in joint network output.
                torch.Tensor
                    output target lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 2 for i-th batch element in joint network output.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing prediction network internal state generated in current invocation
                    of ``forward``.
        """
        source_encodings, source_lengths = self.transcriber(
            input=sources,
            lengths=source_lengths,
        )
        target_encodings, target_lengths, predictor_state = self.predictor(
            input=targets,
            lengths=target_lengths,
            state=predictor_state,
        )
        output, source_lengths, target_lengths = self.joiner(
            source_encodings=source_encodings,
            source_lengths=source_lengths,
            target_encodings=target_encodings,
            target_lengths=target_lengths,
        )

        return (
            output,
            source_lengths,
            target_lengths,
            predictor_state,
        )

    @torch.jit.export
    def transcribe_streaming(
        self,
        sources: torch.Tensor,
        source_lengths: torch.Tensor,
        state: Optional[List[List[torch.Tensor]]],
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        r"""Applies transcription network to sources in streaming mode.

        B: batch size;
        T: maximum source sequence segment length in batch;
        D: feature dimension of each source sequence frame.

        Args:
            sources (torch.Tensor): source frame sequence segments right-padded with right context, with
                shape `(B, T + right context length, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``sources``.
            state (List[List[torch.Tensor]] or None): list of lists of tensors
                representing transcription network internal state generated in preceding invocation
                of ``transcribe_streaming``.

        Returns:
            (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    output frame sequences, with
                    shape `(B, T // time_reduction_stride, output_dim)`.
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing transcription network internal state generated in current invocation
                    of ``transcribe_streaming``.
        """
        return self.transcriber.infer(sources, source_lengths, state)

    @torch.jit.export
    def transcribe(
        self,
        sources: torch.Tensor,
        source_lengths: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        r"""Applies transcription network to sources in non-streaming mode.

        B: batch size;
        T: maximum source sequence length in batch;
        D: feature dimension of each source sequence frame.

        Args:
            sources (torch.Tensor): source frame sequences right-padded with right context, with
                shape `(B, T + right context length, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``sources``.

        Returns:
            (torch.Tensor, torch.Tensor):
                torch.Tensor
                    output frame sequences, with
                    shape `(B, T // time_reduction_stride, output_dim)`.
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output frame sequences.
        """
        return self.transcriber(sources, source_lengths)

    @torch.jit.export
    def predict(
        self,
        targets: torch.Tensor,
        target_lengths: torch.Tensor,
        state: Optional[List[List[torch.Tensor]]],
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        r"""Applies prediction network to targets.

        B: batch size;
        U: maximum target sequence length in batch;
        D: feature dimension of each target sequence frame.

        Args:
            targets (torch.Tensor): target sequences, with shape `(B, U)` and each element
                mapping to a target symbol, i.e. in range `[0, num_symbols)`.
            target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``targets``.
            state (List[List[torch.Tensor]] or None): list of lists of tensors
                representing internal state generated in preceding invocation
                of ``predict``.

        Returns:
            (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    output frame sequences, with shape `(B, U, output_dim)`.
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing internal state generated in current invocation of ``predict``.
        """
        return self.predictor(input=targets, lengths=target_lengths, state=state)

    @torch.jit.export
    def join(
        self,
        source_encodings: torch.Tensor,
        source_lengths: torch.Tensor,
        target_encodings: torch.Tensor,
        target_lengths: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        r"""Applies joint network to source and target encodings.

        B: batch size;
        T: maximum source sequence length in batch;
        U: maximum target sequence length in batch;
        D: dimension of each source and target sequence encoding.

        Args:
            source_encodings (torch.Tensor): source encoding sequences, with
                shape `(B, T, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                valid sequence length of i-th batch element in ``source_encodings``.
            target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`.
            target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                valid sequence length of i-th batch element in ``target_encodings``.

        Returns:
            (torch.Tensor, torch.Tensor, torch.Tensor):
                torch.Tensor
                    joint network output, with shape `(B, T, U, output_dim)`.
                torch.Tensor
                    output source lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 1 for i-th batch element in joint network output.
                torch.Tensor
                    output target lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 2 for i-th batch element in joint network output.
        """
        output, source_lengths, target_lengths = self.joiner(
            source_encodings=source_encodings,
            source_lengths=source_lengths,
            target_encodings=target_encodings,
            target_lengths=target_lengths,
        )
        return output, source_lengths, target_lengths


def emformer_rnnt_model(
    *,
    input_dim: int,
    encoding_dim: int,
    num_symbols: int,
    segment_length: int,
    right_context_length: int,
    time_reduction_input_dim: int,
    time_reduction_stride: int,
    transformer_num_heads: int,
    transformer_ffn_dim: int,
    transformer_num_layers: int,
    transformer_dropout: float,
    transformer_activation: str,
    transformer_left_context_length: int,
    transformer_max_memory_size: int,
    transformer_weight_init_scale_strategy: str,
    transformer_tanh_on_mem: bool,
    symbol_embedding_dim: int,
    num_lstm_layers: int,
    lstm_layer_norm: bool,
    lstm_layer_norm_epsilon: float,
    lstm_dropout: float,
) -> RNNT:
    r"""Builds Emformer-based :class:`~torchaudio.models.RNNT`.

    Note:
        For non-streaming inference, the expectation is for `transcribe` to be called on input
        sequences right-concatenated with `right_context_length` frames.

        For streaming inference, the expectation is for `transcribe_streaming` to be called
        on input chunks comprising `segment_length` frames right-concatenated with `right_context_length`
        frames.

    Args:
        input_dim (int): dimension of input sequence frames passed to transcription network.
        encoding_dim (int): dimension of transcription- and prediction-network-generated encodings
            passed to joint network.
        num_symbols (int): cardinality of set of target tokens.
        segment_length (int): length of input segment expressed as number of frames.
        right_context_length (int): length of right context expressed as number of frames.
        time_reduction_input_dim (int): dimension to scale each element in input sequences to
            prior to applying time reduction block.
        time_reduction_stride (int): factor by which to reduce length of input sequence.
        transformer_num_heads (int): number of attention heads in each Emformer layer.
        transformer_ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network.
        transformer_num_layers (int): number of Emformer layers to instantiate.
        transformer_left_context_length (int): length of left context considered by Emformer.
        transformer_dropout (float): Emformer dropout probability.
        transformer_activation (str): activation function to use in each Emformer layer's
            feedforward network. Must be one of ("relu", "gelu", "silu").
        transformer_max_memory_size (int): maximum number of memory elements to use.
        transformer_weight_init_scale_strategy (str): per-layer weight initialization scaling
            strategy. Must be one of ("depthwise", "constant", ``None``).
        transformer_tanh_on_mem (bool): if ``True``, applies tanh to memory elements.
        symbol_embedding_dim (int): dimension of each target token embedding.
        num_lstm_layers (int): number of LSTM layers to instantiate.
        lstm_layer_norm (bool): if ``True``, enables layer normalization for LSTM layers.
        lstm_layer_norm_epsilon (float): value of epsilon to use in LSTM layer normalization layers.
        lstm_dropout (float): LSTM dropout probability.

    Returns:
        RNNT:
            Emformer RNN-T model.
    """
    encoder = _EmformerEncoder(
        input_dim=input_dim,
        output_dim=encoding_dim,
        segment_length=segment_length,
        right_context_length=right_context_length,
        time_reduction_input_dim=time_reduction_input_dim,
        time_reduction_stride=time_reduction_stride,
        transformer_num_heads=transformer_num_heads,
        transformer_ffn_dim=transformer_ffn_dim,
        transformer_num_layers=transformer_num_layers,
        transformer_dropout=transformer_dropout,
        transformer_activation=transformer_activation,
        transformer_left_context_length=transformer_left_context_length,
        transformer_max_memory_size=transformer_max_memory_size,
        transformer_weight_init_scale_strategy=transformer_weight_init_scale_strategy,
        transformer_tanh_on_mem=transformer_tanh_on_mem,
    )
    predictor = _Predictor(
        num_symbols,
        encoding_dim,
        symbol_embedding_dim=symbol_embedding_dim,
        num_lstm_layers=num_lstm_layers,
        lstm_hidden_dim=symbol_embedding_dim,
        lstm_layer_norm=lstm_layer_norm,
        lstm_layer_norm_epsilon=lstm_layer_norm_epsilon,
        lstm_dropout=lstm_dropout,
    )
    joiner = _Joiner(encoding_dim, num_symbols)
    return RNNT(encoder, predictor, joiner)


def emformer_rnnt_base(num_symbols: int) -> RNNT:
    r"""Builds basic version of Emformer-based :class:`~torchaudio.models.RNNT`.

    Args:
        num_symbols (int): The size of target token lexicon.

    Returns:
        RNNT:
            Emformer RNN-T model.
    """
    return emformer_rnnt_model(
        input_dim=80,
        encoding_dim=1024,
        num_symbols=num_symbols,
        segment_length=16,
        right_context_length=4,
        time_reduction_input_dim=128,
        time_reduction_stride=4,
        transformer_num_heads=8,
        transformer_ffn_dim=2048,
        transformer_num_layers=20,
        transformer_dropout=0.1,
        transformer_activation="gelu",
        transformer_left_context_length=30,
        transformer_max_memory_size=0,
        transformer_weight_init_scale_strategy="depthwise",
        transformer_tanh_on_mem=True,
        symbol_embedding_dim=512,
        num_lstm_layers=3,
        lstm_layer_norm=True,
        lstm_layer_norm_epsilon=1e-3,
        lstm_dropout=0.3,
    )