
from typing import List, Union, Optional

import numpy as np
from numpy.lib.stride_tricks import as_strided
import librosa
import torch
import torch.nn.functional as F
from torch import nn, Tensor

from .config import VoiceEncConfig
from .melspec import melspectrogram


def pack(arrays, seq_len: int=None, pad_value=0):
    """
    Given a list of length B of array-like objects of shapes (Ti, ...), packs them in a single tensor of
    shape (B, T, ...) by padding each individual array on the right.

    :param arrays: a list of array-like objects of matching shapes except for the first axis.
    :param seq_len: the value of T. It must be the maximum of the lengths Ti of the arrays at
    minimum. Will default to that value if None.
    :param pad_value: the value to pad the arrays with.
    :return: a (B, T, ...) tensor
    """
    if seq_len is None:
        seq_len = max(len(array) for array in arrays)
    else:
        assert seq_len >= max(len(array) for array in arrays)

    # Convert lists to np.array
    if isinstance(arrays[0], list):
        arrays = [np.array(array) for array in arrays]

    # Convert to tensors and keep track of the source device
    device = None
    if isinstance(arrays[0], torch.Tensor):
        tensors = arrays
        device = tensors[0].device
    else:
        tensors = [torch.as_tensor(array) for array in arrays]

    # Fill the packed tensor with the array data
    packed_shape = (len(tensors), seq_len, *tensors[0].shape[1:])
    packed_tensor = torch.full(packed_shape, pad_value, dtype=tensors[0].dtype, device=device)
    for i, tensor in enumerate(tensors):
        packed_tensor[i, :tensor.size(0)] = tensor

    return packed_tensor


def get_num_wins(
    n_frames: int,
    step: int,
    min_coverage: float,
    hp: VoiceEncConfig,
):
    assert n_frames > 0
    win_size = hp.ve_partial_frames
    n_wins, remainder = divmod(max(n_frames - win_size + step, 0), step)
    if n_wins == 0 or (remainder + (win_size - step)) / win_size >= min_coverage:
        n_wins += 1
    target_n = win_size + step * (n_wins - 1)
    return n_wins, target_n


def get_frame_step(
    overlap: float,
    rate: float,
    hp: VoiceEncConfig,
):
    # Compute how many frames separate two consecutive partial utterances
    assert 0 <= overlap < 1
    if rate is None:
        frame_step = int(np.round(hp.ve_partial_frames * (1 - overlap)))
    else:
        frame_step = int(np.round((hp.sample_rate / rate) / hp.ve_partial_frames))
    assert 0 < frame_step <= hp.ve_partial_frames
    return frame_step
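
# A quick worked example of the two helpers above (illustrative only; the real value of
# hp.ve_partial_frames comes from VoiceEncConfig, 160 is assumed here for the arithmetic):
#   get_frame_step(0.5, None, hp)   -> round(160 * (1 - 0.5)) == 80 frames between partial starts
#   get_num_wins(400, 80, 0.8, hp)  -> divmod(400 - 160 + 80, 80) == (4, 0), so n_wins == 4
#                                      and target_n == 160 + 80 * 3 == 400 frames to cover them.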


def stride_as_partials(
    mel: np.ndarray,
    hp: VoiceEncConfig,
    overlap=0.5,
    rate: float=None,
    min_coverage=0.8,
):
    """
    Takes unscaled mels in (T, M) format and cuts them into partials of hp.ve_partial_frames
    frames each, returned as an overlapping strided view of shape (N, ve_partial_frames, M).
    """
    assert 0 < min_coverage <= 1
    frame_step = get_frame_step(overlap, rate, hp)

    # Compute how many partials fit in the mel and the length it must be brought to
    n_partials, target_len = get_num_wins(len(mel), frame_step, min_coverage, hp)

    # Trim or pad the mel spectrogram to match the number of partials
    if target_len > len(mel):
        mel = np.concatenate((mel, np.full((target_len - len(mel), hp.num_mels), 0)))
    elif target_len < len(mel):
        mel = mel[:target_len]

    # Ensure the numpy array data is float32 and contiguous in memory
    mel = mel.astype(np.float32, order="C")

    # Re-arrange the array in memory as overlapping windows without copying the data
    shape = (n_partials, hp.ve_partial_frames, hp.num_mels)
    strides = (mel.strides[0] * frame_step, mel.strides[0], mel.strides[1])
    partials = as_strided(mel, shape, strides)
    return partials


class VoiceEncoder(nn.Module):
    def __init__(self, hp=VoiceEncConfig()):
        super().__init__()

        self.hp = hp

        # Network definition
        self.lstm = nn.LSTM(self.hp.num_mels, self.hp.ve_hidden_size, num_layers=3, batch_first=True)
        if hp.flatten_lstm_params:
            self.lstm.flatten_parameters()
        self.proj = nn.Linear(self.hp.ve_hidden_size, self.hp.speaker_embed_size)

        # Cosine similarity scaling (fixed initial parameter values)
        self.similarity_weight = nn.Parameter(torch.tensor([10.]), requires_grad=True)
        self.similarity_bias = nn.Parameter(torch.tensor([-5.]), requires_grad=True)

    @property
    def device(self):
        return next(self.parameters()).device
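
    # Illustrative sketch (comments only, not executed): forward() below consumes fixed-length
    # partials such as those produced by stride_as_partials. "encoder" and "mel" are assumed
    # placeholder names, not part of this module:
    #   encoder = VoiceEncoder(hp)
    #   partials = stride_as_partials(mel, hp)                # (N, ve_partial_frames, num_mels)
    #   embeds = encoder(torch.from_numpy(partials.copy()))   # (N, speaker_embed_size), L2-normed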

    def forward(self, mels: torch.FloatTensor):
        """
        Computes the embeddings of a batch of partial utterances.

        :param mels: a batch of unscaled mel spectrograms of same duration as a float32 tensor
        of shape (B, T, M) where T is hp.ve_partial_frames
        :return: the embeddings as a float32 tensor of shape (B, E) where E is
        hp.speaker_embed_size. Embeddings are L2-normed and thus lie in the range [-1, 1].
        """
        if self.hp.normalized_mels and (mels.min() < 0 or mels.max() > 1):
            raise Exception(f"Mels outside [0, 1]. Min={mels.min()}, Max={mels.max()}")

        # Pass the input through the LSTM layers
        _, (hidden, _) = self.lstm(mels)

        # Project the final hidden state of the last layer
        raw_embeds = self.proj(hidden[-1])
        if self.hp.ve_final_relu:
            raw_embeds = F.relu(raw_embeds)

        # L2 normalize the embeddings
        return raw_embeds / torch.linalg.norm(raw_embeds, dim=1, keepdim=True)

    def inference(self, mels: torch.Tensor, mel_lens, overlap=0.5, rate: float=None, min_coverage=0.8, batch_size=None):
        """
        Computes the embeddings of a batch of full utterances with gradients.

        :param mels: (B, T, M) unscaled mels
        :return: (B, E) embeddings on CPU
        """
        mel_lens = mel_lens.tolist() if torch.is_tensor(mel_lens) else mel_lens

        # Compute where to split the utterances into partials
        frame_step = get_frame_step(overlap, rate, self.hp)
        n_partials, target_lens = zip(*(get_num_wins(l, frame_step, min_coverage, self.hp) for l in mel_lens))

        # Possibly pad the mels to reach the target lengths
        len_diff = max(target_lens) - mels.size(1)
        if len_diff > 0:
            pad = torch.full((mels.size(0), len_diff, self.hp.num_mels), 0, dtype=torch.float32)
            mels = torch.cat((mels, pad.to(mels.device)), dim=1)

        # Group all partials together so that we can batch them easily
        partials = [
            mel[i * frame_step: i * frame_step + self.hp.ve_partial_frames]
            for mel, n_partial in zip(mels, n_partials) for i in range(n_partial)
        ]
        assert all(partials[0].shape == partial.shape for partial in partials)
        partials = torch.stack(partials)

        # Forward the partials, in chunks of at most batch_size
        n_chunks = int(np.ceil(len(partials) / (batch_size or len(partials))))
        partial_embeds = torch.cat([self(batch) for batch in partials.chunk(n_chunks)], dim=0).cpu()

        # Reduce the partials into complete utterance embeddings and L2-normalize them
        slices = np.concatenate(([0], np.cumsum(n_partials)))
        raw_embeds = [torch.mean(partial_embeds[start:end], dim=0) for start, end in zip(slices[:-1], slices[1:])]
        raw_embeds = torch.stack(raw_embeds)
        embeds = raw_embeds / torch.linalg.norm(raw_embeds, dim=1, keepdim=True)

        return embeds
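
    # Illustrative sketch (comments only): the two static helpers below turn per-utterance
    # embeddings into a speaker embedding and compare voices. "embeds_a"/"embeds_b" are assumed
    # (B, E) numpy arrays, e.g. as returned by embeds_from_mels or embeds_from_wavs:
    #   spk_a = VoiceEncoder.utt_to_spk_embed(embeds_a)             # (E,), L2-normed
    #   score = VoiceEncoder.voice_similarity(embeds_a, embeds_b)   # cosine similarity in [-1, 1]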

    @staticmethod
    def utt_to_spk_embed(utt_embeds: np.ndarray):
        """
        Takes an array of L2-normalized utterance embeddings, computes the mean embedding and L2-normalizes it to get a
        speaker embedding.
        """
        assert utt_embeds.ndim == 2
        utt_embeds = np.mean(utt_embeds, axis=0)
        return utt_embeds / np.linalg.norm(utt_embeds, 2)

    @staticmethod
    def voice_similarity(embeds_x: np.ndarray, embeds_y: np.ndarray):
        """
        Cosine similarity for L2-normalized utterance embeddings or speaker embeddings
        """
        embeds_x = embeds_x if embeds_x.ndim == 1 else VoiceEncoder.utt_to_spk_embed(embeds_x)
        embeds_y = embeds_y if embeds_y.ndim == 1 else VoiceEncoder.utt_to_spk_embed(embeds_y)
        return embeds_x @ embeds_y

    def embeds_from_mels(
        self, mels: Union[Tensor, List[np.ndarray]], mel_lens=None, as_spk=False, batch_size=32, **kwargs
    ):
        """
        Convenience function for deriving utterance or speaker embeddings from mel spectrograms.

        :param mels: unscaled mels strictly within [0, 1] as either a (B, T, M) tensor or a list of (Ti, M) arrays.
        :param mel_lens: if passing mels as a tensor, individual mel lengths
        :param as_spk: whether to return utterance embeddings or a single speaker embedding
        :param kwargs: args for inference()

        :returns: embeds as a (B, E) float32 numpy array if <as_spk> is False, else as a (E,) array
        """
        # Load mels in memory and pack them
        if isinstance(mels, list):
            mels = [np.asarray(mel) for mel in mels]
            assert all(m.shape[1] == mels[0].shape[1] for m in mels), "Mels aren't in (B, T, M) format"
            mel_lens = [mel.shape[0] for mel in mels]
            mels = pack(mels)

        # Embed them
        with torch.inference_mode():
            utt_embeds = self.inference(mels.to(self.device), mel_lens, batch_size=batch_size, **kwargs).numpy()

        return self.utt_to_spk_embed(utt_embeds) if as_spk else utt_embeds

    def embeds_from_wavs(
        self,
        wavs: List[np.ndarray],
        sample_rate,
        as_spk=False,
        batch_size=32,
        trim_top_db: Optional[float]=20,
        **kwargs
    ):
        """
        Wrapper around embeds_from_mels

        :param trim_top_db: this argument was only added for the sake of compatibility with metavoice's implementation
        """
        if sample_rate != self.hp.sample_rate:
            wavs = [
                librosa.resample(wav, orig_sr=sample_rate, target_sr=self.hp.sample_rate, res_type="kaiser_fast")
                for wav in wavs
            ]

        if trim_top_db:
            wavs = [librosa.effects.trim(wav, top_db=trim_top_db)[0] for wav in wavs]

        if "rate" not in kwargs:
            kwargs["rate"] = 1.3

        mels = [melspectrogram(w, self.hp).T for w in wavs]

        return self.embeds_from_mels(mels, as_spk=as_spk, batch_size=batch_size, **kwargs)