
    0;ji'                     T   d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlm	Z	 d dl
mZ dZdZdZd	Zeeeed
dddgZ e j        e          Z G d d          ZdZdadadadedefdZdedefdZdedefdZdedefdZdedefdZ G d d          Z dedefdZ! G d d          Z"dS )    N)Path)category	normalize)	Tokenizer)hf_hub_downloadz[START]z[STOP]z[UNK]z[SPACE]z[PAD]z[SEP]z[CLS]z[MASK]c                   8    e Zd Zd Zd ZdefdZdefdZd ZdS )	EnTokenizerc                 `    t          j        |          | _        |                                  d S N)r   	from_file	tokenizercheck_vocabset_sot_eot)selfvocab_file_paths     `/root/voice-cloning/.venv/lib/python3.11/site-packages/chatterbox/models/tokenizers/tokenizer.py__init__zEnTokenizer.__init__   s,    $-$7$H$H##%%%%%    c                 d    | j                                         }t          |v sJ t          |v sJ d S r   r   	get_vocabSOTEOTr   vocs     r   r   z"EnTokenizer.check_vocabset_sot_eot   3    n&&((czzzzczzzzzzr   textc                 ~    |                      |          }t          j        |                              d          }|S )Nr   encodetorch	IntTensor	unsqueeze)r   r   text_tokenss      r   text_to_tokenszEnTokenizer.text_to_tokens   s6    kk$''ok22<<Q??r   txtc                 ~    |                     dt                    }| j                            |          }|j        }|S )z_
        clean_text > (append `lang_id`) > replace SPACE > encode text using Tokenizer
         )replaceSPACEr   r   ids)r   r%   coder*   s       r   r   zEnTokenizer.encode#   s8     kk#u%%~$$S))h
r   c                    t          |t          j                  r&|                                                                }| j                            |d          }|                    dd          }|                    t          d          }|                    t          d          }|                    t          d          }|S NF)skip_special_tokensr'    
isinstancer    Tensorcpunumpyr   decoder(   r)   r   UNKr   seqr%   s      r   r5   zEnTokenizer.decode,   s    c5<(( 	$''))//##C>((%(HHkk#r""kk%%%kk#r""kk#r""
r   N)	__name__
__module____qualname__r   r   strr$   r   r5    r   r   r	   r	      sw        & & &  
3    
#    	 	 	 	 	r   r	   zResembleAI/chatterboxcreturnc                 <    dt          |           cxk    odk    nc S )zCheck if character is kanji.i N  i  ordr>   s    r   is_kanjirD   A   *    CFF####e#####r   c                 <    dt          |           cxk    odk    nc S )zCheck if character is katakana.i0  i0  rA   rC   s    r   is_katakanarG   F   rE   r   r   c                 z   	 t           ddl}|                                a t                               |           }g }|D ]}|d         }|d         }t	          d |D                       r'|r|d         dv rd|z   }|                    |           R|rt          d |D                       rnn|                    |d                    |                    |           d
                    |          }ddl}|	                    d|          }|S # t          $ r t                              d           | cY S w xY w)zSJapanese text normalization: converts kanji to hiragana; katakana remains the same.Nr   orighirac                 ,    g | ]}t          |          S r=   )rD   .0r>   s     r   
<listcomp>z&hiragana_normalize.<locals>.<listcomp>\   s    ---AHQKK---r   )u   はu   へr'   c                 ,    g | ]}t          |          S r=   )rG   rL   s     r   rN   z&hiragana_normalize.<locals>.<listcomp>b   s    222k!nn222r   Fr/   NFKDz9pykakasi not available - Japanese text processing skipped)_kakasipykakasikakasiconvertanyappendalljoinunicodedatar   ImportErrorloggerwarning)	r   rR   resultoutrinprJ   normalized_textrY   s	            r   hiragana_normalizerb   K   s}   #?OOOoo''G&& 	  	 AF)CV9D -----.. 
  &DG~55:D

4     8;  22c22233      

1V9%%%% 

3''#,, 	%//HH   RSSSs   DD &D:9D:c                     	 t           ddlm}  |            a t                               |           S # t          $ r t
                              d           | cY S t          $ r)}t
                              d|            | cY d}~S d}~ww xY w)z:Hebrew text normalization: adds diacritics to Hebrew text.Nr   )Dictaz9dicta_onnx not available - Hebrew text processing skippedzHebrew diacritization failed: )_dicta
dicta_onnxrd   add_diacriticsrZ   r[   r\   	Exception)r   rd   es      r   add_hebrew_diacriticsrj   u   s    >((((((UWWF$$T***   RSSS   ;;;<<<!   03 &B	B$BBBc                 t    d d                     fd| D                       }|                                S )zJKorean text normalization: decompose syllables into Jamo for tokenization.c                     d| cxk    rdk    sn | S t          |           dz
  }t          d|dz  z             }t          d|dz  dz  z             }|dz  dk    rt          d	|dz  z             nd
}||z   |z   S )z/Decompose Korean syllable into Jamo components.u   가u   힯i   i   iL  ia     r   i  r/   )rB   chr)charbaseinitialmedialfinals        r   decompose_hangulz*korean_normalize.<locals>.decompose_hangul   s    D,,,,H,,,,K 4yy6!ft0011Vtw/B6677+/"9q==FTBY&'''b%''r   r/   c              3   .   K   | ]} |          V  d S r   r=   )rM   rp   ru   s     r   	<genexpr>z#korean_normalize.<locals>.<genexpr>   s/      ==%%d++======r   )rX   strip)r   r]   ru   s     @r   korean_normalizery      sH    ( ( ( WW=========F<<>>r   c                   :    e Zd ZdZd	dZd	dZd ZdefdZd Z	dS )
ChineseCangjieConverterz>Converts Chinese characters to Cangjie codes for tokenization.Nc                     i | _         i | _        d | _        |                     |           |                                  d S r   )word2cjcj2word	segmenter_load_cangjie_mapping_init_segmenter)r   	model_dirs     r   r   z ChineseCangjieConverter.__init__   sC    ""9---r   c                    	 t          t          d|          }t          |dd          5 }t          j        |          }ddd           n# 1 swxY w Y   |D ]a}|                    d          dd         \  }}|| j        |<   || j        vr|g| j        |<   A| j        |                             |           bdS # t          $ r(}t                              d	|            Y d}~dS d}~ww xY w)
z7Load Cangjie mapping from HuggingFace model repository.zCangjie5_TC.json)repo_idfilename	cache_dirr_   zutf-8)encodingN	   z Could not load Cangjie mapping: )r   REPO_IDopenjsonloadsplitr}   r~   rV   rh   r[   r\   )	r   r   cangjie_filefpdataentrywordr+   ri   s	            r   r   z-ChineseCangjieConverter._load_cangjie_mapping   sn   	C*+#  L lC'::: %by}}% % % % % % % % % % % % % % %  4 4"[[..rr2
d%)T"t|++*.DL&&L&--d33334 4  	C 	C 	CNNAaAABBBBBBBBB	Cs;   )B= A B= AB= AA'B= =
C/C**C/c                     	 ddl m}  |            | _        dS # t          $ r% t                              d           d| _        Y dS w xY w)zInitialize pkuseg segmenter.r   )pkusegz;pkuseg not available - Chinese segmentation will be skippedN)spacy_pkusegr   r   rZ   r[   r\   )r   r   s     r   r   z'ChineseCangjieConverter._init_segmenter   se    	"++++++#VXXDNNN 	" 	" 	"NNXYYY!DNNNN	"s    +AAglyphc                     |}| j                             |d          }|dS | j        |                             |          }|dk    rt	          |          nd}|t	          |          z   S )z.Encode a single Chinese glyph to Cangjie code.Nr   r/   )r}   getr~   indexr<   )r   r   normed_glyphr+   r   s        r   _cangjie_encodez'ChineseCangjieConverter._cangjie_encode   sj    |d33<4T"((66#aiiE


Rc%jj  r   c                    g }| j         0| j                             |          }d                    |          }n|}|D ]}t          |          dk    r|                     |          }||                    |           Bg }|D ]}|                    d| d           |                    d           d                    |          }|                    |           |                    |           d                    |          S )z5Convert Chinese characters in text to Cangjie tokens.Nr'   Loz[cj_]z[cj_.]r/   )r   cutrX   r   r   rV   )	r   r   outputsegmented_words	full_texttcangjier+   r>   s	            r   __call__z ChineseCangjieConverter.__call__   s   >%"n0066O11III 	! 	!A{{d""..q11?MM!$$$  - -AKKq,,,,H%%%wwt}}d####a    wwvr   r   )
r9   r:   r;   __doc__r   r   r   r<   r   r   r=   r   r   r{   r{      s        HH   C C C C." " "!S ! ! ! !    r   r{   c                     	 t           ddlm}  |            a t                               |           S # t          $ r t
                              d           | cY S t          $ r)}t
                              d|            | cY d}~S d}~ww xY w)z>Russian text normalization: adds stress marks to Russian text.Nr   )RussianTextStresserzErussian_text_stresser not available - Russian stress labeling skippedz Russian stress labeling failed: )_russian_stresser#russian_text_stresser.text_stresserr   stress_textrZ   r[   r\   rh   )r   r   ri   s      r   add_russian_stressr      s    $OOOOOO 3 3 5 5 ,,T222   ^___   =!==>>>rk   c            	       n    e Zd Zd Zd Zddedededefd	Zdd
edededefdZddedededefdZ	d Z
dS )MTLTokenizerc                     t          j        |          | _        t          |          j        }t          |          | _        |                                  d S r   )r   r   r   r   parentr{   cangjie_converterr   )r   r   r   s      r   r   zMTLTokenizer.__init__  sL    $-$7$H$H))0	!8!C!C##%%%%%r   c                 d    | j                                         }t          |v sJ t          |v sJ d S r   r   r   s     r   r   z#MTLTokenizer.check_vocabset_sot_eot  r   r   NTraw_textlanguage_id	lowercasenfkd_normalizec                 Z    |}|r|                                 }|rt          d|          }|S )z]
        Text preprocessor that handles lowercase conversion and NFKD normalization.
        rP   )lowerr   )r   r   r   r   r   preprocessed_texts         r   preprocess_textzMTLTokenizer.preprocess_text  sD     % 	: 1 7 7 9 9 	E )&2C D D  r   r   c                     |                      ||||          }t          j        |                              d          }|S )Nr   r   r   r   r   )r   r   r   r   r   r#   s         r   r$   zMTLTokenizer.text_to_tokens  s?    kk$K9eskttok22<<Q??r   r%   c                    |                      ||||          }|dk    r|                     |          }nW|dk    rt          |          }nA|dk    rt          |          }n+|dk    rt	          |          }n|dk    rt          |          }|rd|                                 d| }|                    d	t                    }| j	        
                    |          j        S )
Nr   zhjahekoru[r   r'   )r   r   rb   rj   ry   r   r   r(   r)   r   r   r*   )r   r%   r   r   r   s        r   r   zMTLTokenizer.encode  s    ""3K9es"tt $((--CCD  $S))CCD  ',,CCD  "3''CCD  $S))C  	21k''))11C11Ckk#u%%~$$S))--r   c                    t          |t          j                  r&|                                                                }| j                            |d          }|                    dd                              t          d                              t          d                              t          d          }|S r-   r0   r7   s      r   r5   zMTLTokenizer.decode3  s    c5<(( 	$''))//##Cn##CU#CCkk#r""**5#66>>sBGGOOPSUWXX
r   )NTT)r9   r:   r;   r   r   r<   boolr   r$   r   r5   r=   r   r   r   r      s        & & &  

! 
! 
!# 
!QU 
!nr 
! 
! 
! 
! 3 S D im    
. .# .C .4 .`d . . . .,    r   r   )#loggingr   r    pathlibr   rY   r   r   
tokenizersr   huggingface_hubr   r   r   r6   r)   SPECIAL_TOKENS	getLoggerr9   r[   r	   r   rQ   re   r   r<   r   rD   rG   rb   rj   ry   r{   r   r   r=   r   r   <module>r      s            + + + + + + + +             + + + + + + sC'8L		8	$	$! ! ! ! ! ! ! !J " 	 $ $ $ $ $ $
$3 $4 $ $ $ $
'S 'S ' ' ' 'T     &3 3    *M M M M M M M M`S S    &9 9 9 9 9 9 9 9 9 9r   