
    ~Vji.                         d dl Z d dlmZ d dlmZmZmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZ dZd	Zd
Zg dZddddddddZd Zdedededededee         deeeeeeef         fdZ G d de
          ZdS )    N)Path)ListTupleUnion)Tensor)Dataset)download_url_to_file)_extract_tar_load_waveformtrain-clean-100LibriSpeechi>  )z	dev-cleanz	dev-otherz
test-cleanz
test-otherr   ztrain-clean-360ztrain-other-500@76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3@12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365@39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23@d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29@d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2@146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf@ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2)z4http://www.openslr.org/resources/12/dev-clean.tar.gzz4http://www.openslr.org/resources/12/dev-other.tar.gzz5http://www.openslr.org/resources/12/test-clean.tar.gzz5http://www.openslr.org/resources/12/test-other.tar.gzz:http://www.openslr.org/resources/12/train-clean-100.tar.gzz:http://www.openslr.org/resources/12/train-clean-360.tar.gzz:http://www.openslr.org/resources/12/train-other-500.tar.gzc                 N   d}d}||z   }t           j                            | |          }t           j                            ||          }t           j                            |          s-t                              |d           }t          |||           t          |           d S )Nz$http://www.openslr.org/resources/12/z.tar.gz)hash_prefix)ospathjoinisfile
_CHECKSUMSgetr	   r
   )rooturlbase_urlext_archivefilenamearchivedownload_urlchecksums           a/root/voice-cloning/.venv/lib/python3.11/site-packages/torchaudio/datasets/librispeech_biasing.py_download_librispeechr&   !   s    5HK[ Hgll4**G7<<(33L7>>'"" J>>,55\7IIII    fileidr   folder	ext_audioext_txtblistreturnc                    |pg }|                      d          \  }}}| d| d| }	t          j                            ||||	 |           }
| d| | }t          j                            |||||          }g }t	          |          5 }|D ]i}|                                                     dd          \  }}|	|k    r6|                                 D ]}||v r||vr|                    |             njt          d|	           	 d d d            n# 1 swxY w Y   |
t          |t          |          t          |          t          |          |fS )N-    zTranslation not found for )
splitr   r   r   openstripappendFileNotFoundErrorSAMPLE_RATEint)r(   r   r)   r*   r+   r,   
speaker_id
chapter_idutterance_idfileid_audiofilepath	file_textuttblistftlinefileid_text
transcriptwords                     r%   _get_librispeech_metadatarE   .   s    KRE+1<<+<+<(J
L !>>:>>>>Lw||FJ
|<XY<X<XYYH 55
5G55IT6:z9MMIH	i QB 
	Q 
	QD&*jjll&8&8a&@&@#K{**&,,.. . .Du}}X)=)= --- + $$O$O$OPPP Q Q Q Q Q Q Q Q Q Q Q Q Q Q Q 	JJL s   A?DDDc                       e Zd ZdZdZdZeeddfdee	e
f         de	de	d	ed
ee	         ddfdZdedee	ee	eeef         fdZdedeeee	eeef         fdZdefdZdS )LibriSpeechBiasinga  *LibriSpeech* :cite:`7178964` dataset with prefix-tree construction and biasing support.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        url (str, optional): The URL to download the dataset from,
            or the type of the dataset to dowload.
            Allowed type values are ``"dev-clean"``, ``"dev-other"``, ``"test-clean"``,
            ``"test-other"``, ``"train-clean-100"``, ``"train-clean-360"`` and
            ``"train-other-500"``. (default: ``"train-clean-100"``)
        folder_in_archive (str, optional):
            The top-level directory of the dataset. (default: ``"LibriSpeech"``)
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (default: ``False``).
        blist (list, optional):
            The list of biasing words (default: ``[]``).
    z
.trans.txtz.flacFNr   r   folder_in_archivedownloadr,   r-   c                 \   || _         |t          vrt          d| dt           d          t          j        |          }t          j                            ||          | _        t          j                            |||          | _        t          j        	                    | j                  s+|rt          ||           nt          d| j         d          t          d t          | j                                      d| j        z             D                       | _        || _        d S )NzInvalid url 'z' given; please provide one of .zDataset not found at z5. Please set `download=True` to download the dataset.c              3   >   K   | ]}t          |j                  V  d S N)strstem).0ps     r%   	<genexpr>z.LibriSpeechBiasing.__init__.<locals>.<genexpr>   s*      ddac!&kkddddddr'   z*/*/*)_url_DATA_SUBSETS
ValueErrorr   fspathr   r   _archive_pathisdirr&   RuntimeErrorsortedr   glob
_ext_audio_walkerr,   )selfr   r   rH   rI   r,   s         r%   __init__zLibriSpeechBiasing.__init__i   s    	m##aSaaQ^aaabbbyT+<==W\\$(93??
w}}TZ(( 	 %dC0000"mDJmmm   dd4
3C3C3H3HSWSbIb3c3cddddd


r'   nc                 v    | j         |         }t          || j        | j        | j        | j        | j                  S )a  Get metadata for the n-th sample from the dataset. Returns filepath instead of waveform,
        but otherwise returns the same fields as :py:func:`__getitem__`.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            str:
                Path to audio
            int:
                Sample rate
            str:
                Transcript
            int:
                Speaker ID
            int:
                Chapter ID
            int:
                Utterance ID
            list:
                List of biasing words in the utterance
        )r^   rE   rW   rS   r]   _ext_txtr,   )r_   ra   r(   s      r%   get_metadatazLibriSpeechBiasing.get_metadata   s5    2 a(	4?\`\ikokuvvvr'   c                     |                      |          }t          | j        |d         |d                   }|f|dd         z   S )a
  Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            Tensor:
                Waveform
            int:
                Sample rate
            str:
                Transcript
            int:
                Speaker ID
            int:
                Chapter ID
            int:
                Utterance ID
            list:
                List of biasing words in the utterance
        r   r1   N)rd   r   rW   )r_   ra   metadatawaveforms       r%   __getitem__zLibriSpeechBiasing.__getitem__   sG    0 $$Q''!$-!hqkJJ{Xabb\))r'   c                 *    t          | j                  S rM   )lenr^   )r_   s    r%   __len__zLibriSpeechBiasing.__len__   s    4<   r'   )__name__
__module____qualname____doc__rc   r]   URLFOLDER_IN_ARCHIVEr   rN   r   boolr   r`   r8   r   rd   r   rh   rk    r'   r%   rG   rG   T   s!        " HJ
 !2 CI  	
  Cy 
   6wc weCc3S,H&I w w w w8*S *U63S#s+J%K * * * *8! ! ! ! ! ! !r'   rG   )r   pathlibr   typingr   r   r   torchr   torch.utils.datar   torchaudio._internalr	   torchaudio.datasets.utilsr
   r   rp   rq   r7   rT   r   r&   rN   r8   rE   rG   rs   r'   r%   <module>rz      s   				       % % % % % % % % % %       $ $ $ $ $ $ 5 5 5 5 5 5 B B B B B B B B!    =<~== CE CE CE 

 
 
###$'#47#BE#NRSVi#
3S#sC'(# # # #Li! i! i! i! i! i! i! i! i! i!r'   