
    0;ji3                     D   d dl Z d dlZd dlZd dlZd dlmZ d dlm	Z	 ddl
mZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddl m!Z! d Z" ed          d             Z# G d dej$        j%                  Z& G d de&          Z'dS )    N)	lru_cache)Optional   )S3_SRSPEECH_VOCAB_SIZES3Tokenizer   )S3GEN_SR)CausalMaskedDiffWithXvec)CAMPPlus)mel_spectrogram)ConvRNNF0Predictor)HiFTGenerator)UpsampleConformerEncoder)CausalConditionalCFM)ConditionalDecoder)
CFM_PARAMSc                     t          | j                  dk    r| j        d         dk    s
J d            | | t          k              S )Nr   r   r	   z&only batch size of one allowed for now)lenshaper   )xs    W/root/voice-cloning/.venv/lib/python3.11/site-packages/chatterbox/models/s3gen/s3gen.pydrop_invalid_tokensr   $   s@    qw<<1q2Z0Q""##    d   c                 h    t           j                            | |                              |          S N)ta
transformsResampleto)src_srdst_srdevices      r   get_resamplerr%   *   s(    =!!&&1144V<<<r   c                        e Zd ZdZd fd	Zed             Zed             Z	 	 ddej	        d	e
fd
Z	 	 	 	 	 ddej        deej	                 d	ee
         dee         def
dZ xZS )S3Token2Melzt
    S3Gen's CFM decoder maps S3 speech tokens to mel-spectrograms.

    TODO: make these modules configurable?
    Fc                    t                                                       t          d          | _        t          | _        t          d          | _        || _        t          dddddddd	d
ddddd          }t          ddd	dgdddddd| j                  }t          }t          d||          }t          ||          | _        i | _        d S )Nspeech_tokenizer_v2_25hzF)memory_efficienti      i      g?Tlinearrel_pos_espnetrel_selfattn)output_sizeattention_headslinear_units
num_blocksdropout_ratepositional_dropout_rateattention_dropout_ratenormalize_beforeinput_layerpos_enc_layer_typeselfattention_layer_type
input_sizeuse_cnn_modulemacaron_stylei@  P      g        @         gelu)in_channelsout_channelscausalchannelsdropoutattention_head_dimn_blocksnum_mid_blocks	num_headsact_fnmeanflow)spk_emb_dim
cfm_params	estimator)encoderdecoder)super__init__r   	tokenizerr   mel_extractorr   speaker_encoderrN   r   r   r   r   r   flow
resamplers)selfrN   rR   rQ   rP   rS   	__class__s         r   rU   zS3Token2Mel.__init__5   s   $%?@@,' # 
  
  

 !*$'#&! /%3 
 
 
" 'U!]
 
 
	  
&!
 
 
 -
 
 
	
 r   c                 \    | j                                         }t          |          j        S r   )rV   
parametersnextr$   r[   paramss     r   r$   zS3Token2Mel.devicel   s$    **,,F||""r   c                 \    | j                                         }t          |          j        S r   )rY   r^   r_   dtyper`   s     r   rc   zS3Token2Mel.dtypeq   s$    %%''F||!!r   autoTref_wavref_src                    |dk    r| j         n|}t          |t          j                  r&t	          j        |                                          }|j         |k    r|                    |          }t          |j	                  dk    r|
                    d          }|                    d          d|z  k    rt          d           |}|t          k    r t          |t          |          |          }|                    || j                  }|                     |                              dd                              | j                  }d }|}|t$          k    r t          |t$          |          |          }| j                            |                    | j                            }	|                     |                                          \  }
}|j	        d         d|
j	        d         z  k    r@t-          j        d	           |
d d d |j	        d         dz  f         }
|
j	        d         |d<   t1          |
                    |          ||||	
          S )Nrd   r	   r   
   z+WARNING: s3gen received ref longer than 10sr$   rc   r   rc   zAReference mel length is not equal to 2 * reference token length.
)prompt_tokenprompt_token_lenprompt_featprompt_feat_len	embedding)r$   
isinstancenpndarraytorch
from_numpyfloatr!   r   r   	unsqueezesizeprintr
   r%   rc   rW   	transposer   rX   	inferencerV   loggingwarningdict)r[   re   rf   r$   ref_fade_out
ref_wav_24ref_mels_24ref_mels_24_len
ref_wav_16ref_x_vectorref_speech_tokensref_speech_token_lenss               r   	embed_refzS3Token2Mel.embed_refv   sW    !'& 0 0fgrz** 	8&w//5577G>V##jj((Gw}""''**G<<??R&[((?@@@
X@vx@@IIJ]]&
]CC
((44>>q!DDGGdjGYY 
U??=vuf==gFFJ +55jmm$*m6U6UVV 48>>*BRBRBTBT3U3U00 Q1'8'>q'A#AAAOT   !2!!!5Ok6G6Ja6O5O2O P'8'>q'A!!$*--f552#+"
 
 
 	
r   Nspeech_tokensref_dictfinalizec	           
      l   |du |du z  sJ d| d| d            ||                      ||          }nt          |          D ]}	t          ||	         t          j                  rt          j        ||	                   ||	<   t          j        ||	                   r*||	                             | j	        | j
                  ||	<   t          j        |          }|6t          j        d |D                                           | j	                  } | j        j        d|||||| j        d|\  }
}|
S )	a  
        Generate waveforms from S3 speech tokens and a reference waveform, which the speaker timbre is inferred from.

        NOTE:
        - The speaker encoder accepts 16 kHz waveform.
        - S3TokenizerV2 accepts 16 kHz waveform.
        - The mel-spectrogram for the reference assumes 24 kHz input signal.
        - This function is designed for batch_size=1 only.

        Args
        ----
        - `speech_tokens`: S3 speech tokens [B=1, T]
        - `ref_wav`: reference waveform (`torch.Tensor` with shape=[B=1, T])
        - `ref_sr`: reference sample rate
        - `finalize`: whether streaming is finished or not. Note that if False, the last 3 tokens will be ignored.
        Nz5Must provide exactly one of ref_wav or ref_dict (got z and )ri   c                 8    g | ]}|                     d           S ))rw   ).0sts     r   
<listcomp>z'S3Token2Mel.forward.<locals>.<listcomp>   s"    1V1V1V""''"++1V1V1Vr   )token	token_lenr   noised_melsn_timestepsrN    )r   listrp   rq   rr   rs   rt   	is_tensorr!   r$   rc   
atleast_2d
LongTensorrY   rz   rN   )r[   r   re   rf   r   n_cfm_timestepsr   speech_token_lensr   rkoutput_mels_s               r   forwardzS3Token2Mel.forward   s   : 4H$45  	I  	I  8Iov  8I  8I  ~F  8I  8I  8I  	I  	I5~~gv66HH 8nn Y YhrlBJ77 B#(#3HRL#A#AHRL?8B<00 Y#+B<??$+TZ?#X#XHRL(77 $ % 01V1V1V1V1V W W Z Z[_[f g g,, 
'#']
 
 
 
Q r   F)rd   T)NNFNN)__name__
__module____qualname____doc__rU   propertyr$   rc   rs   Tensorintr   r   r   r}   boolr   __classcell__r\   s   @r   r'   r'   /   s         
5 5 5 5 5 5n # # X# " " X" 5
 5
5
 5
 5
 5
 5
| $(8 8'8 %,'	8
 8 4.8 8 8 8 8 8 8 8 8r   r'   c            
           e Zd ZdZdZd fd	Z	 	 	 	 	 	 ddeej                 dee	         dee
         d	ef fd
Z ej                    	 	 	 	 	 	 ddeej                 dee	         dee
         d	ef fd            Z ej                    ddej        fd            Z ej                    	 	 	 	 	 	 ddeej                 dee	         dee
         fd            Z xZS )S3Token2Wavz
    The decoder of S3Gen is a concat of token-to-mel (CFM) and a mel-to-waveform (HiFiGAN) modules.

    TODO: make these modules configurable?
    )ztokenizer._mel_filtersztokenizer.windowFc           
         t                                          |           t                      }t          t          g dg dg dg dg dg dg|          | _        t          dz  }t          j        d|z            }t          j        t          j	        t          j
        d|                    d	z   dz  ||d <   |                     d
|d           d| _        d S )N)r+         )         )r   r   r   )r	   r   r   )sampling_rateupsample_ratesupsample_kernel_sizessource_resblock_kernel_sizessource_resblock_dilation_sizesf0_predictor2   r   r   r	   	trim_fadeF)
persistentfp32)rT   rU   r   r   r
   mel2wavrs   zeroscoslinspacepiregister_bufferestimator_dtype)r[   rN   r   n_trimr   r\   s        r   rU   zS3Token2Wav.__init__   s    """)++$"$99"-++)3,5IIyyy)))+L%
 
 
 RKF
++	#iuxF(K(KLLqPTUU	&''[)FFF%r   Nre   rf   r   r   c
           
      b   t                                          ||||||||	          }
|r|
S t          j        ddd                              | j                  }| j                            |
|          ^}}| j        s-|dddt          | j
                  fxx         | j
        z  cc<   |S )z
        Generate waveforms from S3 speech tokens and a reference waveform, which the speaker timbre is inferred from.
        NOTE: used for sync synthesis only. Please use `S3GenStreamer` for streaming synthesis.
        )r   re   rf   r   r   r   r   r	   r   speech_featcache_sourceN)rT   r   rs   r   r!   r$   r   rz   trainingr   r   )r[   r   re   rf   r   r   r   skip_vocoderr   r   r   hift_cache_sourceoutput_wavsr   r\   s                 r   r   zS3Token2Wav.forward  s    & ggoo->Hx+ & 
 
  	 "K1a0033DK@@,00[Wh0iia} 	C/C////000DNB000r   c           
          |p
| j         rdnd}d }| j         r9t          j        dd|                    d          dz  | j        | j                  }t                                          ||||||||          }	|	S )Nr   rh   r	   r>   r   )rc   r$   )r   re   rf   r   r   r   r   )rN   rs   randnrw   rc   r$   rT   r   )r[   r   re   rf   r   r   r   r   noiser   r\   s             r   flow_inferencezS3Token2Wav.flow_inference,  s     *I4=.Haab= 	iK2}'9'9"'='='A\`\ghhhEggoo->X^iq+hE & 
 
 r   r   c                     |5t          j        ddd                              | j        | j                  }| j                            ||          S )Nr	   r   ri   r   )rs   r   r!   r$   rc   r   rz   )r[   r   r   s      r   hift_inferencezS3Token2Wav.hift_inferenceC  sM     ;q!Q//22$+TZ2XXL|%%+L%YYYr   Tc           	         |                      ||||||d          }|                    | j                  }|                     |d           \  }	}
|	d d d t	          | j                  fxx         | j        z  cc<   |	|
fS )NT)r   re   rf   r   r   r   rj   )r   r!   rc   r   r   r   )r[   r   re   rf   r   r   r   r   r   r   output_sourcess              r   rz   zS3Token2Wav.inferenceI  s    $ ))/+ * 
 
 "nn4:n66&*&9&9+t&L&L#^ 	AAA+DN++++,,,>,,,N**r   r   )NFNFNN)NNNNFNr   )NNNTNN)r   r   r   r   ignore_state_dict_missingrU   r   rs   r   r   r}   r   r   inference_moder   r   rz   r   r   s   @r   r   r      s         !O& & & & & &6 $(% % %,'	%
 % 4.% % % % % % %N U
 +/ $#'  %,'	
  4.      , UZ Z Z Z Z Z
 U
 +/ $#'  +  + %,'	 +
  + 4. +  +  +  +  +  +  +  +r   r   )(r{   numpyrq   rs   
torchaudior   	functoolsr   typingr   s3tokenizerr   r   r   constr
   rY   r   xvectorr   	utils.melr   r   r   hifiganr   transformer.upsample_encoderr   flow_matchingr   rS   r   configsr   r   r%   nnModuler'   r   r   r   r   <module>r      s                         ? ? ? ? ? ? ? ? ? ?       * * * * * *       & & & & & & , , , , , , " " " " " " B B B B B B / / / / / / ' ' ' ' ' '      $ $ $ 3= = =v v v v v%(/ v v vrB+ B+ B+ B+ B++ B+ B+ B+ B+ B+r   