
    0;ji#                         d dl mZmZ d dlZd dlZd dlZd dlmc m	Z
 d dlmZ d dlmZmZ dZdZdZdZd	Z G d
 de          ZdS )    )ListTupleN)padding)S3TokenizerV2ModelConfigi>     i     i  c                       e Zd ZdZdZd e            fdedef fdZdee	j
                 fdZd	 Z e	j                    	 	 dde	j
        dddedee	j
        e	j        f         fd            Z	 dde	j
        defdZ xZS )S3Tokenizerz
    s3tokenizer.S3TokenizerV2 with the following changes:
    - a more integrated `forward`
    - compute `log_mel_spectrogram` using `_mel_filters` and `window` in `register_buffers`
    )_mel_filterswindowspeech_tokenizer_v2_25hznameconfigc                 d   t                                          |           d| _        t          j                            t          | j        |j                  }|                     dt          j
        |                     |                     dt          j        | j                             d S )Ni  )srn_fftn_melsr   r   )super__init__r   librosafiltersmelS3_SRr   register_buffertorchFloatTensorhann_window)selfr   r   r   	__class__s       c/root/voice-cloning/.venv/lib/python3.11/site-packages/chatterbox/models/s3tokenizer/s3tokenizer.pyr   zS3Tokenizer.__init__   s    
 	
***= + 
 

 	l++	
 	
 	

 	dj))	
 	
 	
 	
 	
    returnc                    g }|D ]}t          |t          j                  rt          j        |          }|                                dk    r|                    d          }|j        d         |z  t          z  }t          j	        |          }||t          z  z  }t          |          }t          j        j                            |d||j        d         z
  fdd          }|                    |           |S )z
        Given a list of wavs with the same `sample_rate`, pad them so that the length is multiple of 40ms (S3 runs at 25 token/sec).
           r   constant)modevalue)
isinstancenpndarrayr   
from_numpydim	unsqueezeshapeS3_TOKEN_RATEceilintnn
functionalpadappend)r   wavsr   processed_wavswavn_tokensintended_wav_lens          r!   r6   zS3Tokenizer.pad6   s      	' 	'C#rz** ,&s++wwyyA~~mmA&&	!r)]:Hwx((H'2+=>"#344(%))$sy}45	 *  C !!#&&&&r"   c                     g }|D ]r}t          |t          j                  rt          j        |          }|                                dk    r|                    d          }|                    |           s|S )z4Prepare a list of audios for s3tokenizer processing.r%   r   )r*   r+   r,   r   r-   r.   r/   r7   )r   r8   r9   r:   s       r!   _prepare_audiozS3Tokenizer._prepare_audioN   sy     	' 	'C#rz** ,&s++wwyyA~~mmA&&!!#&&&&r"   Nr8   acceleratorAcceleratormax_lenc                 `   |                      |          }g g }}|D ]j}|                    | j                  }|                     |          }||dd|dz  f         }|                    |                    d                     kt          |          \  }}|| }	n|                    |           }	|	                    ||                    | j                            \  }
}|
	                                
                                |	                                
                                fS )a  
        NOTE: mel-spec has a hop size of 160 points (100 frame/sec).
        FIXME: this class inherits `nn.Module` but doesn't accept `torch.Tensor` and handles a list of wavs one by one, which is unexpected.

        Args
        ----
        - `wavs`: 16 kHz speech audio
        - `max_len` max length to truncate the output sequence to (25 token/sec).
        NOTE: please pad the waveform if longer sequence is needed.
        N.   r   )r>   todevicelog_mel_spectrogramr7   squeezer   unwrap_modelquantizelongdetach)r   r8   r?   rA   r9   melsmel_lensr:   r   	tokenizerspeech_tokensspeech_token_lenss               r!   forwardzS3Tokenizer.forwardZ   s+   " ,,T22Rh! 	( 	(C&&%%C**3//C"#|!|+,KKA'''' hII#0066I+4+=+=dHKKPTP[D\D\+]+]((  ''))""$$++--
 	
r"   r   audior   c                 ~   t          j        |          st          j        |          }|                    | j                  }|dk    rt          j        |d|f          }t          j        || j        t          | j
                            | j                  d          }|dddf                                         dz  }| j                            | j                  |z  }t          j        |d	                                          }t          j        ||                                d
z
            }|dz   dz  }|S )a  
        Compute the log-Mel spectrogram of

        Parameters
        ----------
        audio: torch.Tensor, shape = (*)
            The path to audio or either a NumPy array or Tensor containing the
            audio waveform in 16 kHz

        padding: int
            Number of zero samples to pad to the right

        Returns
        -------
        torch.Tensor, shape = (128, n_frames)
            A Tensor that contains the Mel spectrogram
        r   T)r   return_complex.Nr&      g|=)ming       @g      @)r   	is_tensorr-   rD   rE   Fr6   stftr   S3_HOPr   absr   clamplog10maximummax)r   rR   r   rY   
magnitudesmel_speclog_specs          r!   rF   zS3Tokenizer.log_mel_spectrogram   s   , u%% 	,$U++E%%Q;;E%!W..Ez4:v;>>$+..
 
 

 #ss(^''))1,
$''44zA;xU33399;;=8<<>>C+?@@sNc)r"   )NN)r   )__name__
__module____qualname____doc__ignore_state_dict_missingr   strr   r   r   Tensorr6   r>   no_gradr3   r   
LongTensorrQ   rF   __classcell__)r    s   @r!   r   r      sD         !; -)kmm
 

 
 
 
 
 
 
.tEL1    0
 
 
 U]__ $(	#
 #
l#
 ##
 	#

 
u|U--	.#
 #
 #
 _#
P ( (|( ( ( ( ( ( ( ( (r"   r   )typingr   r   numpyr+   r   r   torch.nn.functionalr4   r5   rX   s3tokenizer.utilsr   s3tokenizer.model_v2r   r   r   rZ   S3_TOKEN_HOPr1   SPEECH_VOCAB_SIZEr    r"   r!   <module>ru      s                           % % % % % %        		 R R R R R- R R R R Rr"   