
"""Tokenization class for model T5."""

import os
import re
import warnings
from shutil import copyfile
from typing import List, Optional, Tuple

from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging


if is_sentencepiece_available():
    from .tokenization_t5 import T5Tokenizer
else:
    T5Tokenizer = None

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}


class T5TokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" T5 tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 100):
            The number of extra ids added to the vocabulary for use as sentinels. These tokens are accessible as
            "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. These tokens can be retrieved by
            calling the `get_sentinel_tokens` method, and their ids by calling the `get_sentinel_token_ids` method.
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        add_prefix_space (`bool`, *optional*):
            Whether or not the tokenizer should automatically add a prefix space.
        from_slow (`bool`, *optional*, defaults to `False`):
            Whether or not the tokenizer should be converted from a slow one. If `add_prefix_space` is set, this will be set to `True`.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = T5Tokenizer

    prefix_tokens: List[int] = []

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=100,
        additional_special_tokens=None,
        add_prefix_space=None,
        **kwargs,
    ):
        # Add the extra_ids sentinel tokens to the special token list
        if additional_special_tokens is not None:
            extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
            if len(extra_tokens) < 1:
                additional_special_tokens += [f"<extra_id_{i}>" for i in range(extra_ids)]
            elif extra_ids > 0 and extra_ids != len(extra_tokens):
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
                    " provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
                    " tokens"
                )
        else:
            extra_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
            additional_special_tokens = extra_tokens

        if add_prefix_space is not None:
            logger.warning_once(
                "You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers"
            )
            kwargs["from_slow"] = True

        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            extra_ids=extra_ids,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

        self.vocab_file = vocab_file
        self._extra_ids = extra_ids

    @property
    def can_save_slow_tokenizer(self) -> bool:
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    @staticmethod
    def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
        if pretrained_model_name_or_path in T5TokenizerFast.max_model_input_sizes:
            deprecated_max_model_length = T5TokenizerFast.max_model_input_sizes[pretrained_model_name_or_path]
            if init_max_model_length is not None and init_max_model_length != max_model_length:
                return init_max_model_length
            elif init_max_model_length is None:
                warnings.warn(
                    "This tokenizer was incorrectly instantiated with a model max length of"
                    f" {deprecated_max_model_length} which will be corrected in Transformers v5.\n"
                    "For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding"
                    " with `truncation is True`.\n"
                    f"- Be aware that you SHOULD NOT rely on {pretrained_model_name_or_path} automatically truncating"
                    f" your input to {deprecated_max_model_length} when padding/encoding.\n"
                    f"- If you want to encode/pad to sequences longer than {deprecated_max_model_length} you can"
                    " either instantiate this tokenizer with `model_max_length` or pass `max_length` when"
                    " encoding/padding.\n"
                    "- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your"
                    " preferred value.",
                    FutureWarning,
                )

        return max_model_length
 4g g %@g g g "    r    save_directoryfilename_prefixc                 
   | j         st          d          t          j                            |          s t
                              d| d           d S t          j                            ||r|dz   ndt          d         z             }t          j        	                    | j
                  t          j        	                    |          k    r2t          | j
        |           t
                              d|            |fS )NzhYour fast tokenizer does not have the necessary information to save the vocabulary for a slow tokenizer.zVocabulary path (z) should be a directory- r   zCopy vocab file to )rA   r0   r=   r>   isdirr1   errorjoinVOCAB_FILES_NAMESabspathr   r   info)r6   rL   rM   out_vocab_files       r   save_vocabularyzT5TokenizerFast.save_vocabulary   s    + 	  
 w}}^,, 	LLT^TTTUUUFoM_s222QbcoQpp
 
 7??4?++rw~/N/NNNT_n555KK>n>>???  r    token_ids_0token_ids_1c                 `    || j         gz   }|
| j        |z   S || j         gz   }| j        |z   |z   S )a  
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
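
        Example (illustrative sketch; assumes the public `t5-small` checkpoint is available):

        ```python
        >>> from transformers import T5TokenizerFast

        >>> tok = T5TokenizerFast.from_pretrained("t5-small")  # doctest: +SKIP
        >>> ids = tok("Hello", add_special_tokens=False)["input_ids"]  # doctest: +SKIP
        >>> tok.build_inputs_with_special_tokens(ids) == ids + [tok.eos_token_id]  # doctest: +SKIP
        True
        ```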
        """
        token_ids_0 = token_ids_0 + [self.eos_token_id]
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0
        else:
            token_ids_1 = token_ids_1 + [self.eos_token_id]
            return self.prefix_tokens + token_ids_0 + token_ids_1

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
        use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
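
        Example (illustrative sketch; the result is all zeros, one per token plus one per appended `</s>`):

        ```python
        >>> from transformers import T5TokenizerFast

        >>> tok = T5TokenizerFast.from_pretrained("t5-small")  # doctest: +SKIP
        >>> tok.create_token_type_ids_from_sequences([5, 6], [7])  # doctest: +SKIP
        [0, 0, 0, 0, 0]
        ```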
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def get_sentinel_tokens(self):
        # Keep only the additional special tokens that match the "<extra_id_N>" sentinel pattern.
        return list(
            set(filter(lambda x: re.search(r"<extra_id_\d+>", x) is not None, self.additional_special_tokens))
        )

    def get_sentinel_token_ids(self):
        return [self.convert_tokens_to_ids(token) for token in self.get_sentinel_tokens()]