
    0;ji"                        d dl mZ d dlmZ d dlZd dlZd dlZd dlmc m	Z
 d dlmZ d dlmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ dZdedefdZe G d d                      Z  G d d          Z!dS )    )	dataclass)PathN)hf_hub_download)	load_file   )T3)S3_SRdrop_invalid_tokens)S3GEN_SRS3Gen)EnTokenizer)VoiceEncoder)T3CondzResembleAI/chatterboxtextreturnc                     t                     dk    rdS  d                                         r% d                                          dd         z    d                                                                g d}|D ]\  }}                     ||                                d           h d}t           fd|D                       s d	z    S )
zt
        Quick cleanup func for punctuation from LLMs or
        containing chars not seen often in the dataset
    r   z)You need to add some text for me to talk.r   N ))z..., )u   …r   ):,)z - r   );r   )u   —-)u   –r   )z ,r   )u   “")u   ”r   )u   ‘')u   ’r   >   !r   r   .?c              3   B   K   | ]}                     |          V  d S N)endswith).0pr   s     H/root/voice-cloning/.venv/lib/python3.11/site-packages/chatterbox/tts.py	<genexpr>zpunc_norm.<locals>.<genexpr>:   s/      99At}}Q999999    r   )lenislowerupperjoinsplitreplacerstripany)r   punc_to_replaceold_char_sequencenew_charsentence_enderss   `    r#   	punc_normr2      s    
 4yyA~~:: Aw *Aw}}abb) 88DJJLL!!D  O (7 9 9#8||-x88 ;;sD///O999999999 Kr%   c                   R    e Zd ZU dZeed<   eed<   d ZdefdZ	e
d
d            Zd	S )ConditionalsaR  
    Conditionals for T3 and S3Gen
    - T3 conditionals:
        - speaker_emb
        - clap_emb
        - cond_prompt_speech_tokens
        - cond_prompt_speech_emb
        - emotion_adv
    - S3Gen conditionals:
        - prompt_token
        - prompt_token_len
        - prompt_feat
        - prompt_feat_len
        - embedding
    t3genc                     | j                             |          | _         | j                                        D ]7\  }}t	          j        |          r|                    |          | j        |<   8| S )Ndevice)r5   tor6   itemstorch	is_tensor)selfr9   kvs       r#   r:   zConditionals.toT   si    '**F*++HNN$$ 	2 	2DAqq!! 2dd&d11r%   fpathc                 p    t          | j        j        | j                  }t	          j        ||           d S )N)r5   r6   )dictr5   __dict__r6   r<   save)r>   rA   arg_dicts      r#   rE   zConditionals.save[   s=    w
 
 
 	
8U#####r%   cpuc                     t          |t                    rt          j        |          }t          j        ||d          } | t          di |d         |d                   S )NT)map_locationweights_onlyr5   r6    )
isinstancestrr<   r9   loadr   )clsrA   rI   kwargss       r#   rN   zConditionals.loadb   sa    lC(( 	6 <55LE4PPPs6))F4L))6%=999r%   N)rG   )__name__
__module____qualname____doc__r   __annotations__rC   r:   r   rE   classmethodrN   rK   r%   r#   r4   r4   @   s~           	JJJ	III  $$ $ $ $ $ : : : [: : :r%   r4   c                       e Zd Zdez  Zdez  Z	 ddedede	de
ded	efd
Zedd            Zedd            ZddZ	 	 	 	 	 	 	 ddZdS )ChatterboxTTS   
   Nr5   s3genve	tokenizerr9   condsc                     t           | _        || _        || _        || _        || _        || _        || _        t          j	                    | _
        d S r   )r   srr5   r[   r\   r]   r9   r^   perthPerthImplicitWatermarkerwatermarker)r>   r5   r[   r\   r]   r9   r^   s          r#   __init__zChatterboxTTS.__init__n   sL     
"
 9;;r%   r   c                    t          |          }|dv rt          j        d          }nd }t                      }|                    t          |dz                       |                    |                                           t                      }t          |dz            }d|	                                v r|d         d         }|                    |           |                    |                                           t                      }|                    t          |dz            d	           |                    |                                           t          t          |d
z                      }d }	|dz  x}
                                r/t                              |
|                              |          }	 | ||||||	          S )N)rG   mpsrG   ve.safetensorst3_cfg.safetensorsmodelr   s3gen.safetensorsF)stricttokenizer.jsonconds.pt)rI   )r^   )r   r<   r9   r   load_state_dictr   r:   evalr   keysr   r   rM   existsr4   rN   )rO   ckpt_dirr9   rI   r\   r5   t3_stater[   r]   r^   builtin_voices              r#   
from_localzChatterboxTTS.from_local   s   >> ^## <..LLL^^
h!1122	
 	
 	
 	fTTX(<<==hmmoo%%(+H
8$$$
fh!4455e 	 	
 	
 	
 	++,,
 
	 %
22M::<< 	[ %%m,%OORRSYZZEs2ub)V5AAAAr%   c                 b   |dk    rgt           j        j                                        sDt           j        j                                        st          d           nt          d           d}dD ]}t          t          |          }|                     t          |          j
        |          S )Nrf   zUMPS not available because the current PyTorch install was not built with MPS enabled.z~MPS not available because the current MacOS version is not 12.3+ and/or you do not have an MPS-enabled device on this machine.rG   )rg   rh   rj   rl   rm   )repo_idfilename)r<   backendsrf   is_availableis_builtprintr   REPO_IDru   r   parent)rO   r9   rA   
local_paths       r#   from_pretrainedzChatterboxTTS.from_pretrained   s     U??5>#5#B#B#D#D?>%..00 Xmnnnn  W  X  X  XFp 	J 	JE(5IIIJJ~~d:..5v>>>r%         ?c           
      J   t          j        |t                    \  }}t          j        |t          t                    }|d | j                 }| j                            |t          | j                  }| j	        j
        j        x}r`| j        j        }|                    |d | j                 g|          \  }	}
t          j        |	                              | j                  }	t          j        | j                            |gt                              }|                    dd                              | j                  }t-          ||	|t          j        d	d	d	          z  
                              | j                  }t1          ||          | _        d S )N)r`   )orig_sr	target_srr8   )max_lensample_rater   T)axiskeepdimr   speaker_embcond_prompt_speech_tokensemotion_adv)librosarN   r   resampler	   DEC_COND_LENr[   	embed_refr9   r5   hpspeech_cond_prompt_lenr]   forwardENC_COND_LENr<   
atleast_2dr:   
from_numpyr\   embeds_from_wavsmeanr   onesr4   r^   )r>   	wav_fpathexaggerations3gen_ref_wav_srref_16k_wavs3gen_ref_dictplens3_tokzrt3_cond_prompt_tokens_ve_embedt3_conds                r#   prepare_conditionalsz"ChatterboxTTS.prepare_conditionals   s   $\)AAAs&}hRWXXX%&8t'8&89--mXdk-ZZ 7:444 	\z+H'/'7'7EWdFWEW9X8Ycg'7'h'h$!1$)$45J$K$K$N$Nt{$[$[! #DG$<$<k]X]$<$^$^__==a=6699$+FF &;$uz!Q':'::
 
 
 "DK"
 
 	 	
 "'>::


r%   333333?皙?      ?皙?c	                 P   |r|                      ||           n| j        
J d            || j        j        j        d         k    rb| j        j        }	t	          |	j        |	j        |t          j        ddd          z            	                    | j
                  | j        _        t          |          }| j                            |          	                    | j
                  }
|dk    rt          j        |
|
gd	          }
| j        j        j        }| j        j        j        }t%          j        |
d
|          }
t%          j        |
d|          }
t          j                    5  | j                            | j        j        |
d|||||          }|d         }t-          |          }||dk              }|	                    | j
                  }| j                            || j        j                  \  }}|                    d                                                                                                          }| j                            || j                  }d d d            n# 1 swxY w Y   t          j         |          !                    d          S )N)r   zBPlease `prepare_conditionals` first or specify `audio_prompt_path`)r   r   r   r   r   r8   g        r   )dim)r   r   )value)r   r   i  )r   text_tokensmax_new_tokenstemperature
cfg_weightrepetition_penaltymin_ptop_pi  )speech_tokensref_dictr   )"r   r^   r5   r   r   r   r   r<   r   r:   r9   r2   r]   text_to_tokenscatr   start_text_tokenstop_text_tokenFpadinference_mode	inferencer
   r[   r6   squeezedetachrG   numpyrc   apply_watermarkr`   r   	unsqueeze)r>   r   r   r   r   audio_prompt_pathr   r   r   _condr   soteotr   wavr   watermarked_wavs                    r#   generatezChatterboxTTS.generate   s     	p%%&7l%SSSS:))+o))) 4:=4W=== JME"!-*/*I(5:aA+>+>>   bb$$	 JM n33D99<<T[II)[+$>AFFFKgj)gj(eKs;;;eKs;;;!## 	Y 	Y G--
'#'%#5 . 	 	M *!,M 0>>M)-$*>?M),,T[99MZ))+ *  FC ++a..''))--//5577C".>>sPTPW>XXO5	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y6 00::1===s   +C?I66I:=I:r   )r   rX   )r   )r   r   r   Nr   r   r   )rQ   rR   rS   r	   r   r   r   r   r   r   r   rM   r4   rd   rV   ru   r   r   r   rK   r%   r#   rX   rX   j   s       u9L=L #< << < 	<
 < < < < < <$ $B $B $B [$BL ? ? ? [?; ; ; ;: @> @> @> @> @> @>r%   rX   )"dataclassesr   pathlibr   r   r<   ra   torch.nn.functionalnn
functionalr   huggingface_hubr   safetensors.torchr   	models.t3r   models.s3tokenizerr	   r
   models.s3genr   r   models.tokenizersr   models.voice_encoderr   models.t3.modules.cond_encr   r}   rM   r2   r4   rX   rK   r%   r#   <module>r      s   ! ! ! ! ! !                   + + + + + + ' ' ' ' ' '       : : : : : : : : ) ) ) ) ) ) ) ) * * * * * * . . . . . . . . . . . . "'C 'C ' ' ' 'T &: &: &: &: &: &: &: &:Rf> f> f> f> f> f> f> f> f> f>r%   