
    hS                         S SK r S SKJr  S SKJrJrJr  S SKrS SKJ	r	  S SK
Jr  S SKJr  S SKJr  S SKJr  S	rS
rSrS/S/SS/S.rS\S\\   S\S\\\\4      4S jr " S S\5      rg)    N)Path)ListTupleUnion)Tensor)Dataset)download_url_to_file)_get_librispeech_metadata)_extract_tarlibrispeech_finetuningzIhttps://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz@5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342afz1h/0z1h/*9h)10min1h10hpathfolders
_ext_audioreturnc           	      p   [        U 5      n / nU H  nU R                  U SU 35       Vs/ s H  oUR                  U 5      PM     nnX6 Vs/ s HA  n[        UR                  R                  R                  5      [        UR
                  5      4PMC     sn-  nM     UR                  S S9  U$ s  snf s  snf )a  Get the file names and the corresponding file paths without `speaker_id`
and `chapter_id` directories.
The format of path is like:
    {root}/{_ARCHIVE_NAME}/1h/[0-5]/[clean, other] or
    {root}/{_ARCHIVE_NAME}/9h/[clean, other]

Args:
    path (Path): Root path to the dataset.
    folders (List[str]): Folders that contain the desired audio files.
    _ext_audio (str): Extension of audio files.

Returns:
    List[Tuple[str, str]]:
        List of tuples where the first element is the relative path to the audio file.
        The format of relative path is like:
        1h/[0-5]/[clean, other] or 9h/[clean, other]
        The second element is the file name without audio extension.
z/*/*/*/*c                     U S   U S   -   $ )Nr       )xs    ^/var/www/auris/envauris/lib/python3.13/site-packages/torchaudio/datasets/librilight_limited.py<lambda>$_get_fileids_paths.<locals>.<lambda>,   s    1Q4!A$;    )key)r   globrelative_tostrparentstemsort)r   r   r   files_pathsfolderppathss          r   _get_fileids_pathsr*      s    ( :DK.2ii6((:,8W.XY.Xt$.XYEREqQXX__334c!&&kBERR  ./ ZRs   B.AB3c            
       ~    \ rS rSrSrSrSr  SS\\\	4   S\S\
SS	4S
 jjrS\S\\\\\\\4   4S jrS\4S jrSrg	)LibriLightLimited0   a  Subset of Libri-light :cite:`librilight` dataset,
which was used in HuBERT :cite:`hsu2021hubert` for supervised fine-tuning.

Args:
    root (str or Path): Path to the directory where the dataset is found or downloaded.
    subset (str, optional): The subset to use. Options: [``"10min"``, ``"1h"``, ``"10h"``]
        (Default: ``"10min"``).
    download (bool, optional):
        Whether to download the dataset if it is not found at root path. (default: ``False``).
z
.trans.txtz.flacrootsubsetdownloadr   Nc                    U[         ;  a#  [        S[         R                  5        SU 35      e[         U   n[        R                  " U5      n[        R
                  R                  U[        5      U l        [        R
                  R                  U[         S35      n[        R
                  R                  U R                  5      (       dT  U(       d  [        S5      e[        R
                  R                  U5      (       d  [        [        U[        S9  [        U5        [!        U R                  X@R"                  5      U l        g )Nz`subset` must be one of z	. Found: z.tgzz9Dataset not found. Please use `download=True` to download)hash_prefix)_SUBSET_MAP
ValueErrorkeysosfspathr   join_ARCHIVE_NAME_pathisdirRuntimeErrorisfiler	   _URL	_CHECKSUMr   r*   r   _fileids_paths)selfr.   r/   r0   r   archives         r   __init__LibriLightLimited.__init__?   s     $78H8H8J7K9U[T\]^^f%yyWW\\$6
'',,td%;<ww}}TZZ(("#^__77>>'**$T7	J!0WooVr   nc                    U R                   U   u  p#[        X0R                  X R                  U R                  5      n[
        R                  " [        R                  R                  U R                  US   5      5      u  pVU4USS -   $ )a8  Load the n-th sample from the dataset.

Args:
    n (int): The index of the sample to be loaded
Returns:
    Tuple of the following items;

    Tensor:
        Waveform
    int:
        Sample rate
    str:
        Transcript
    int:
        Speaker ID
    int:
        Chapter ID
    int:
        Utterance ID
r   r   N)
r@   r
   r:   r   _ext_txt
torchaudioloadr6   r   r8   )rA   rE   	file_pathfileidmetadatawaveform_s          r   __getitem__LibriLightLimited.__getitem__T   sn    * !//2	,VZZOO]a]j]jk oobggll4::x{&KL{Xab\))r   c                 ,    [        U R                  5      $ )N)lenr@   )rA   s    r   __len__LibriLightLimited.__len__n   s    4&&''r   )r@   r:   )r   F)__name__
__module____qualname____firstlineno____doc__rG   r   r   r"   r   boolrC   intr   r   rO   rS   __static_attributes__r   r   r   r,   r,   0   s    	 HJ
 	WCIW W 	W
 
W**S *U63S#s+J%K *4( (r   r,   )r6   pathlibr   typingr   r   r   rH   torchr   torch.utils.datar   torchaudio._internalr	   torchaudio.datasets.librispeechr
   torchaudio.datasets.utilsr   r9   r>   r?   r3   r"   r*   r,   r   r   r   <module>rd      s    	  % %   $ 5 E 2 )RN	&$HT DI 3 4PUVY[^V^P_K` :?( ?(r   