
    h5                     j   S SK r S SKrS SKJrJr  S SKJr  S SKJr  S SK	J
r
JrJr  S SKrS SKrS SKJr  S SKJrJrJr  / rS\R,                  " \R.                  " \R0                  5      R2                  5      -  r\" S	S
\-  5      rS r " S S\R<                  R>                  5      r  " S S\R<                  R>                  5      r! " S S\5      r" " S S\5      r# " S S\R<                  R>                  \"5      r$ " S S\#5      r%\ " S S5      5       r&\&" S\" \SS9SSSS S!S"S#S$S%SS&9r'S'\'l(        g)(    N)ABCabstractmethod)	dataclass)partial)CallableListTuple)module_utils)emformer_rnnt_baseRNNTRNNTBeamSearch(   
   g?c                     [         R                  " X [        R                  :     5      X [        R                  :  '   X [        R                  :*     [        R                  -  X [        R                  :*  '   U $ N)torchlogmathexs    Z/var/www/auris/envauris/lib/python3.13/site-packages/torchaudio/pipelines/rnnt_pipeline.py_piecewise_linear_logr      sK    IIaDFF
m,A$&&jMDFF{^dff,A466kNH    c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )_FunctionalModule   c                 .   > [         TU ]  5         Xl        g r   )super__init__
functional)selfr!   	__class__s     r   r    _FunctionalModule.__init__   s    $r   c                 $    U R                  U5      $ r   r!   r"   inputs     r   forward_FunctionalModule.forward   s    u%%r   r&   __name__
__module____qualname____firstlineno__r    r)   __static_attributes____classcell__r#   s   @r   r   r      s    %& &r   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )_GlobalStatsNormalization#   c                 X  > [         TU ]  5         [        U5       n[        R                  " UR                  5       5      nS S S 5        U R                  S[        R                  " WS   5      5        U R                  S[        R                  " US   5      5        g ! , (       d  f       Na= f)Nmean	invstddev)	r   r    openjsonloadsreadregister_bufferr   tensor)r"   global_stats_pathfblobr#   s       r   r    "_GlobalStatsNormalization.__init__$   sx    #$::affh'D % 	VU\\$v,%?@[%,,tK7H*IJ	 %$s   %B
B)c                 8    XR                   -
  U R                  -  $ r   )r7   r8   r'   s     r   r)   !_GlobalStatsNormalization.forward-   s    		!T^^33r    r+   r2   s   @r   r4   r4   #   s    K4 4r   r4   c                   v    \ rS rSr\S\R                  S\\R                  \R                  4   4S j5       rSr	g)_FeatureExtractor1   r(   returnc                     g)  Generates features and length output from the given input tensor.

Args:
    input (torch.Tensor): input tensor.

Returns:
    (torch.Tensor, torch.Tensor):
    torch.Tensor:
        Features, with shape `(length, *)`.
    torch.Tensor:
        Length, with shape `(1,)`.
NrE   r'   s     r   __call___FeatureExtractor.__call__2       r   rE   N)
r,   r-   r.   r/   r   r   Tensorr	   rL   r0   rE   r   r   rG   rG   1   s8    ell uU\\5<<5O/P  r   rG   c                   6    \ rS rSr\S\\   S\4S j5       rSr	g)_TokenProcessorB   tokensrI   c                     g)zDecodes given list of tokens to text sequence.

Args:
    tokens (List[int]): list of tokens to decode.

Returns:
    str:
        Decoded text sequence.
NrE   )r"   rS   kwargss      r   rL   _TokenProcessor.__call__C   rN   r   rE   N)
r,   r-   r.   r/   r   r   intstrrL   r0   rE   r   r   rQ   rQ   B   s&    	tCy 	s 	 	r   rQ   c                      ^  \ rS rSrSrS\R                  R                  SS4U 4S jjrS\R                  S\
\R                  \R                  4   4S jrS	rU =r$ )
_ModuleFeatureExtractorP   z``torch.nn.Module``-based feature extraction pipeline.

Args:
    pipeline (torch.nn.Module): module that implements feature extraction logic.
pipelinerI   Nc                 .   > [         TU ]  5         Xl        g r   )r   r    r\   )r"   r\   r#   s     r   r     _ModuleFeatureExtractor.__init__W   s     r   r(   c                 r    U R                  U5      n[        R                  " UR                  S   /5      nX#4$ )rK   r   )r\   r   r>   shape)r"   r(   featureslengths       r   r)   _ModuleFeatureExtractor.forward[   s5     =='x~~a012r   )r\   )r,   r-   r.   r/   __doc__r   nnModuler    rO   r	   r)   r0   r1   r2   s   @r   rZ   rZ   P   sQ    ! !T ! U\\  eELL%,,4N.O    r   rZ   c                   J    \ rS rSrSrS\SS4S jrSS\\   S\	S\4S	 jjr
S
rg)_SentencePieceTokenProcessorm   zhSentencePiece-model-based token processor.

Args:
    sp_model_path (str): path to SentencePiece model.
sp_model_pathrI   Nc                 $   [         R                  " S5      (       d  [        S5      eSS KnUR	                  US9U l        U R
                  R                  5       U R
                  R                  5       U R
                  R                  5       1U l	        g )Nsentencepiecez2SentencePiece is not available. Please install it.r   )
model_file)
r
   is_module_availableRuntimeErrorrl   SentencePieceProcessorsp_modelunk_ideos_idpad_idpost_process_remove_list)r"   rj   spms      r   r    %_SentencePieceTokenProcessor.__init__t   sq    //@@STT#22m2LMM  "MM  "MM  ")
%r   rS   lstripc                     USS  Vs/ s H  o3U R                   ;  d  M  UPM     nnSR                  U R                  R                  U5      5      R	                  SS5      nU(       a  UR                  5       $ U$ s  snf )a  Decodes given list of tokens to text sequence.

Args:
    tokens (List[int]): list of tokens to decode.
    lstrip (bool, optional): if ``True``, returns text sequence with leading whitespace
        removed. (Default: ``True``).

Returns:
    str:
        Decoded text sequence.
   N u   ▁ )ru   joinrq   id_to_piecereplacerx   )r"   rS   rx   token_indexfiltered_hypo_tokensoutput_strings         r   rL   %_SentencePieceTokenProcessor.__call__   s}     ,2!": 
+5KDLiLi9iK: 	  
  9 9:N OPXXYacfg ''))   
s
   A:A:)ru   rq   )T)r,   r-   r.   r/   rd   rX   r    r   rW   boolrL   r0   rE   r   r   rh   rh   m   s=    
c 
d 
!tCy !$ !# ! !r   rh   c                      \ rS rSr% Sr " S S\5      r " S S\5      r\	\
S'   \/ \4   \
S'   \	\
S	'   \	\
S
'   \\
S'   \\
S'   \\
S'   \\
S'   \\
S'   \\
S'   \\
S'   \\
S'   S\4S jr\S\4S j5       r\S\4S j5       r\S\4S j5       r\S\4S j5       r\S\4S j5       r\S\4S j5       rS\4S jrS\4S jrS\4S jrS\4S jrSrg )!
RNNTBundle   u  Dataclass that bundles components for performing automatic speech recognition (ASR, speech-to-text)
inference with an RNN-T model.

More specifically, the class provides methods that produce the featurization pipeline,
decoder wrapping the specified RNN-T model, and output token post-processor that together
constitute a complete end-to-end ASR inference pipeline that produces a text sequence
given a raw waveform.

It can support non-streaming (full-context) inference as well as streaming inference.

Users should not directly instantiate objects of this class; rather, users should use the
instances (representing pre-trained models) that exist within the module,
e.g. :data:`torchaudio.pipelines.EMFORMER_RNNT_BASE_LIBRISPEECH`.

Example
    >>> import torchaudio
    >>> from torchaudio.pipelines import EMFORMER_RNNT_BASE_LIBRISPEECH
    >>> import torch
    >>>
    >>> # Non-streaming inference.
    >>> # Build feature extractor, decoder with RNN-T model, and token processor.
    >>> feature_extractor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_feature_extractor()
    100%|███████████████████████████████| 3.81k/3.81k [00:00<00:00, 4.22MB/s]
    >>> decoder = EMFORMER_RNNT_BASE_LIBRISPEECH.get_decoder()
    Downloading: "https://download.pytorch.org/torchaudio/models/emformer_rnnt_base_librispeech.pt"
    100%|███████████████████████████████| 293M/293M [00:07<00:00, 42.1MB/s]
    >>> token_processor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_token_processor()
    100%|███████████████████████████████| 295k/295k [00:00<00:00, 25.4MB/s]
    >>>
    >>> # Instantiate LibriSpeech dataset; retrieve waveform for first sample.
    >>> dataset = torchaudio.datasets.LIBRISPEECH("/home/librispeech", url="test-clean")
    >>> waveform = next(iter(dataset))[0].squeeze()
    >>>
    >>> with torch.no_grad():
    >>>     # Produce mel-scale spectrogram features.
    >>>     features, length = feature_extractor(waveform)
    >>>
    >>>     # Generate top-10 hypotheses.
    >>>     hypotheses = decoder(features, length, 10)
    >>>
    >>> # For top hypothesis, convert predicted tokens to text.
    >>> text = token_processor(hypotheses[0][0])
    >>> print(text)
    he hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to [...]
    >>>
    >>>
    >>> # Streaming inference.
    >>> hop_length = EMFORMER_RNNT_BASE_LIBRISPEECH.hop_length
    >>> num_samples_segment = EMFORMER_RNNT_BASE_LIBRISPEECH.segment_length * hop_length
    >>> num_samples_segment_right_context = (
    >>>     num_samples_segment + EMFORMER_RNNT_BASE_LIBRISPEECH.right_context_length * hop_length
    >>> )
    >>>
    >>> # Build streaming inference feature extractor.
    >>> streaming_feature_extractor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_streaming_feature_extractor()
    >>>
    >>> # Process same waveform as before, this time sequentially across overlapping segments
    >>> # to simulate streaming inference. Note the usage of ``streaming_feature_extractor`` and ``decoder.infer``.
    >>> state, hypothesis = None, None
    >>> for idx in range(0, len(waveform), num_samples_segment):
    >>>     segment = waveform[idx: idx + num_samples_segment_right_context]
    >>>     segment = torch.nn.functional.pad(segment, (0, num_samples_segment_right_context - len(segment)))
    >>>     with torch.no_grad():
    >>>         features, length = streaming_feature_extractor(segment)
    >>>         hypotheses, state = decoder.infer(features, length, 10, state=state, hypothesis=hypothesis)
    >>>     hypothesis = hypotheses[0]
    >>>     transcript = token_processor(hypothesis[0])
    >>>     if transcript:
    >>>         print(transcript, end=" ", flush=True)
    he hoped there would be stew for dinner turn ips and car rots and bru 'd oes and fat mut ton pieces to [...]
c                       \ rS rSrSrSrg)RNNTBundle.FeatureExtractor   z:Interface of the feature extraction part of RNN-T pipelinerE   Nr,   r-   r.   r/   rd   r0   rE   r   r   FeatureExtractorr      s    Hr   r   c                       \ rS rSrSrSrg)RNNTBundle.TokenProcessor   z7Interface of the token processor part of RNN-T pipelinerE   Nr   rE   r   r   TokenProcessorr      s    Er   r   
_rnnt_path_rnnt_factory_func_global_stats_path_sp_model_path_right_padding_blank_sample_rate_n_fft_n_mels_hop_length_segment_length_right_context_lengthrI   c                     U R                  5       n[        R                  R                  U R                  5      n[
        R                  " U5      nUR                  U5        UR                  5         U$ r   )	r   
torchaudioutilsdownload_assetr   r   loadload_state_dicteval)r"   modelpath
state_dicts       r   
_get_modelRNNTBundle._get_model   sT    '')..t?ZZ%
j)

r   c                     U R                   $ )zCSample rate (in cycles per second) of input waveforms.

:type: int
)r   r"   s    r   sample_rateRNNTBundle.sample_rate   s        r   c                     U R                   $ )z'Size of FFT window to use.

:type: int
)r   r   s    r   n_fftRNNTBundle.n_fft  s     {{r   c                     U R                   $ )zPNumber of mel spectrogram features to extract from input waveforms.

:type: int
)r   r   s    r   n_melsRNNTBundle.n_mels  s     ||r   c                     U R                   $ )zTNumber of samples between successive frames in input expected by model.

:type: int
)r   r   s    r   
hop_lengthRNNTBundle.hop_length  s     r   c                     U R                   $ )zDNumber of frames in segment in input expected by model.

:type: int
)r   r   s    r   segment_lengthRNNTBundle.segment_length  s     ###r   c                     U R                   $ )zSNumber of frames in right contextual block in input expected by model.

:type: int
)r   r   s    r   right_context_lengthRNNTBundle.right_context_length%  s     )))r   c                 L    U R                  5       n[        XR                  5      $ )z7Constructs RNN-T decoder.

Returns:
    RNNTBeamSearch
)r   r   r   )r"   r   s     r   get_decoderRNNTBundle.get_decoder-  s     !e[[11r   c                   ^  [         R                  R                  T R                  5      n[	        [
        R                  R                  [         R                  R                  T R                  T R                  T R                  T R                  S9[        S 5      [        S 5      [        U5      [        U 4S j5      5      5      $ )zbConstructs feature extractor for non-streaming (full-context) ASR.

Returns:
    FeatureExtractor
r   r   r   r   c                 &    U R                  SS5      $ Nrz   r   	transposer   s    r   <lambda>2RNNTBundle.get_feature_extractor.<locals>.<lambda>B      AKK1,=r   c                 &    [        U [        -  5      $ r   r   _gainr   s    r   r   r   C      ,A!e),Lr   c                 t   > [         R                  R                  R                  U SSSTR                  45      $ )Nr   )r   re   r!   padr   )r   r"   s    r   r   r   E  s,    EHH,?,?,C,CA1aQUQdQdGe,fr   r   r   r   r   rZ   r   re   
Sequential
transformsMelSpectrogramr   r   r   r   r   r4   r"   
local_paths   ` r   get_feature_extractor RNNTBundle.get_feature_extractor6  s      %%44T5L5LM
&HH%%44 $ 0 0

4;;cgcrcr 5  ""=>!"LM)*5!"fg

 
	
r   c                 p   [         R                  R                  U R                  5      n[	        [
        R                  R                  [         R                  R                  U R                  U R                  U R                  U R                  S9[        S 5      [        S 5      [        U5      5      5      $ )z^Constructs feature extractor for streaming (simultaneous) ASR.

Returns:
    FeatureExtractor
r   c                 &    U R                  SS5      $ r   r   r   s    r   r   <RNNTBundle.get_streaming_feature_extractor.<locals>.<lambda>U  r   r   c                 &    [        U [        -  5      $ r   r   r   s    r   r   r   V  r   r   r   r   s     r   get_streaming_feature_extractor*RNNTBundle.get_streaming_feature_extractorI  s      %%44T5L5LM
&HH%%44 $ 0 0

4;;cgcrcr 5  ""=>!"LM)*5	
 		
r   c                 j    [         R                  R                  U R                  5      n[	        U5      $ )z9Constructs token processor.

Returns:
    TokenProcessor
)r   r   r   r   rh   r   s     r   get_token_processorRNNTBundle.get_token_processor[  s+      %%44T5H5HI
+J77r   rE   N)r,   r-   r.   r/   rd   rG   r   rQ   r   rX   __annotations__r   r   rW   r   propertyr   r   r   r   r   r   r   r   r   r   r   r0   rE   r   r   r   r      sU   FPI, IF F O T**KKLD  !S ! ! s       C     $ $ $ *c * *2^ 2
'7 
&
1A 
$8^ 8r   r   z(models/emformer_rnnt_base_librispeech.pti  )num_symbolsz2pipeline-assets/global_stats_rnnt_librispeech.jsonz.pipeline-assets/spm_bpe_4096_librispeech.model   i   i>  i  r[         )r   r   r   r   r   r   r   r   r   r   r   r   a  ASR pipeline based on Emformer-RNNT,
pretrained on *LibriSpeech* dataset :cite:`7178964`,
capable of performing both streaming and non-streaming inference.

The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base`
and utilizes weights trained on LibriSpeech using training script ``train.py``
`here <https://github.com/pytorch/audio/tree/main/examples/asr/emformer_rnnt>`__ with default arguments.

Please refer to :py:class:`RNNTBundle` for usage instructions.
))r:   r   abcr   r   dataclassesr   	functoolsr   typingr   r   r	   r   r   torchaudio._internalr
   torchaudio.modelsr   r   r   __all__log10iinfoint16max_decibelpowr   r   re   rf   r   r4   rG   rQ   rZ   rh   r   EMFORMER_RNNT_BASE_LIBRISPEECHrd   rE   r   r   <module>r      s0     # !  ( (   - F F DJJu{{5;;7;;<<Bx & &4 4 "c  ehhoo/@  :(!? (!V I8 I8 I8X ",91tDKC" 	*  &r   