
    0;ji+U              
       4   d dl mZ d dlmZmZ d dlZd dlmZ d dlm	Z	m
Z
mZmZ d dlmZmZmZmZ e G d d                      Z	 	 d!d
ededefdZdej        dej        dej        deej        ej        f         fdZdej        dej        fdZ G d dej        j                  Z G d dej        j                  Z G d de          Z G d dej        j                  Z G d dej        j                  Z G d d ej        j                  Z dS )"    )	dataclass)OptionalTupleN	rearrange)Conv1d	LayerNormLinearMultiHeadAttention)make_non_pad_maskmask_to_bias
onnx2torchmerge_tokenized_segmentsc                   r    e Zd ZU dZeed<   dZeed<   dZeed<   dZeed<   d	Z	eed
<   dZ
eed<   dZeed<   dS )ModelConfig   n_melsi  n_audio_ctxi   n_audio_state   n_audio_head   n_audio_layer  n_codebook_sizeFuse_sdpaN)__name__
__module____qualname__r   int__annotations__r   r   r   r   r   r   bool     N/root/voice-cloning/.venv/lib/python3.11/site-packages/s3tokenizer/model_v2.pyr   r      s         FCKM3L#M3OSHdr$   r        @dimendthetac                    d|t          j        d| d          d | dz                                           | z  z  z  }t          j        ||j                  }|||z  }t          j        ||                                          }t          j        t          j        |          |          }t          j        ||fd          S )N      ?r      devicer'   )torcharangefloatr.   outerpolar	ones_likecat)r'   r(   r)   scalingfreqst	freqs_ciss          r%   precompute_freqs_cisr<   %   s     55<322;SAX;?EEGG#MNOES...AKK5!!''))EEOE22E::I9i+4444r$   xqxkr;   returnc                    t          j        |          }|d d d d df         |d d d d df         }}|                    d                              d                              | j                  }|                    d                              d                              | j                  }| j        d         }| d d d d d d d |dz  f         | d d d d d d |dz  d f         }}t          j        | |fd          }	|j        d         }|d d d d d d d |dz  f         |d d d d d d |dz  d f         }}t          j        | |fd          }
| |z  |	|z  z   ||z  |
|z  z   fS )Nr      r,   r/   r0   )r1   view_as_real	unsqueezetodtypeshaper7   )r=   r>   r;   realcossinDhalf_lhalf_rxq_rxk_rs              r%   apply_rotary_embrO   3   s   
 i((DAAAqqq!G}d111aaa7mC
--


$
$Q
'
'
*
*28
4
4C
--


$
$Q
'
'
*
*28
4
4C
A111aaa!q&()2aaaAAAqAvww.>+?FF9vgv&B///D
A111aaa!q&()2aaaAAAqAvww.>+?FF9vgv&B///D8dSj "s(TCZ"777r$   xc                     |j         ddcxk    rk     sn J | j        |j        d         |j        d         fk    sJ fdt          |j                  D             } | j        | S )Nr   rA   r/   c                 <    g | ]\  }}|d k    s	|d z
  k    r|nd S )rA   r#   ).0idndims      r%   
<listcomp>z)reshape_for_broadcast.<locals>.<listcomp>M   sB       041Q!VVqD1H}}!  r$   )rV   rF   	enumerateview)r;   rP   rF   rV   s      @r%   reshape_for_broadcastrZ   I   s    6D====D======?qwqz172;77777   8A!'8J8J  E 9>5!!r$   c                       e Zd Zddedef fdZ ej                    dej        dej        fd            Z ej                    dej        dej        fd            Z	 ej                    d	ej        dej        fd
            Z
 xZS )FSQCodebook   r'   levelc                     t                                                       t          j                            |d          | _        || _        d | _        d S )N   )super__init__r1   nnr
   project_downr^   embed)selfr'   r^   	__class__s      r%   rb   zFSQCodebook.__init__U   sC    !HOOC33



r$   rP   r?   c                 &    t          |d          }|S )Nz... d -> (...) dr   rf   rP   s     r%   
preprocesszFSQCodebook.preprocess[   s    a+,,r$   c                 .   |j         }|                     |          }|                     |                                          }|                                }|dz  }|                                dz   }t          j        | j        t          j	        d| j        z  |j
        |j                            }t          j        ||                    d          z  d          }|                    |d         |d                                                   }|S )Ng   ?rA   r,   )r.   rE   r   r/   r0   )rF   rj   rd   r3   tanhroundr1   powr^   r2   r.   rE   sumrC   reshaper    )rf   rP   x_shapehpowersmuinds          r%   encodezFSQCodebook.encode`   s    'OOAa  &&((FFHH""GGIIMJLDJqxqwGGGI I Yq6++A...B777jjWQZ004466
r$   	embed_indc                      t          d          )Nz2There is no official up project component provided)NotImplementedError)rf   rw   s     r%   decodezFSQCodebook.decoder   s    !@B B 	Br$   )r]   )r   r   r   r    rb   r1   inference_modeTensorrj   rv   rz   __classcell__rg   s   @r%   r\   r\   S   s        C        UEL U\     U     " UB B B B B B B B B Br$   r\   c                        e Zd ZdZdedef fdZed             Z ej	                    dej
        dej
        fd            Z ej	                    d	ej
        dej
        fd
            Z xZS )FSQVectorQuantizationzVector quantization implementation (inference-only).
    Args:
        dim (int): Dimension
        codebook_size (int): Codebook size
    r'   codebook_sizec                     t                                                       d|k    sJ t          |d          | _        || _        d S )Nr   r]   )r'   r^   )ra   rb   r\   	_codebookr   )rf   r'   r   rg   s      r%   rb   zFSQVectorQuantization.__init__   sN    
 	}$$$$$A666*r$   c                     | j         j        S N)r   re   rf   s    r%   codebookzFSQVectorQuantization.codebook   s    ~##r$   rP   r?   c                 6    | j                             |          S r   )r   rv   ri   s     r%   rv   zFSQVectorQuantization.encode   s    ~$$Q'''r$   rw   c                 Z    | j                             |          }t          |d          }|S )Nzb n d -> b d n)r   rz   r   )rf   rw   quantizes      r%   rz   zFSQVectorQuantization.decode   s,    >((33X'788r$   )r   r   r   __doc__r    rb   propertyr   r1   r{   r|   rv   rz   r}   r~   s   @r%   r   r   x   s         ++ + + + + + + $ $ X$ U( ( ( ( ( ( U         r$   r   c                   ^    e Zd Z	 	 ddedededef fdZ	 dd	ej        d
eej                 fdZ		 	 	 ddej        dej        dej        d
eej                 deej                 deej                 fdZ
	 	 	 ddej        d
eej                 deej                 deej                 fdZ xZS )FSMNMultiHeadAttention   Fn_staten_headkernel_sizer   c           	      ~   t                                          ||           t          j                            |||dd|d          | _        |dz
  dz  | _        |dz
  | j        z
  | _        t          j                            | j        | j        fd          | _	        || _
        t          ||d          | _        d S )NrA   r   F)stridepaddinggroupsbiasr,           )r   )ra   rb   r1   rc   r   
fsmn_blockleft_paddingright_paddingConstantPad1dpad_fnr   r
   key)rf   r   r   r   r   rg   s        r%   rb   zFSMNMultiHeadAttention.__init__   s     	&)))(//'*1*5122318/4 * 6 6 )1_2(1_t/@@h,, 23S: : !'7777r$   Ninputsmaskc                 b   |                                 \  }}}}|                    ||d          }||                     d          dk    r||z  }|                    dd          }|                     |          }|                     |          }|                    dd          }||z  }||z  S )Nr/   r,   r   rA   )sizerY   	transposer   r   )rf   r   r   br:   _rP   s          r%   forward_fsmnz#FSMNMultiHeadAttention.forward_fsmn   s     [[]]
1aQ2&&		!q 0 0d]FQ""KKNNOOAKK1	V4xr$   qkvmask_padr;   c                    |j         \  }}}|| j        z  dz  }	 |j        g |j         d d         | j        dR  } |j        g |j         d d         | j        dR  } |j        g |j         d d         | j        dR  }|t          |||          \  }}|                     ||          }
|                    dddd          |	z  }|                    dddd          }| j        s|                    dddd          |	z  }||z  }|||z   }|                                }t          j	        j
                            |d                              |j                  }||z                      dddd                              d	          |                                |
fS |                    dddd          |	z  }|J t          j	        j
                            ||||d
d          }|                    dd                                                              |                    d          d|          }|d |
fS )Ng      пr,   r/   )r;   r   rA   r]   r0   )	start_dimr   r+   )	attn_mask	dropout_pscale)rF   r   rY   rO   r   permuter   r3   r1   rc   
functionalsoftmaxrD   rE   flattendetachscaled_dot_product_attentionr   
contiguousr   )rf   r   r   r   r   r   r;   r   rJ   r   
fsm_memoryqkwoutputs                 r%   qkv_attentionz$FSMNMultiHeadAttention.qkv_attention   sy    '1adk!E)AF1AGBQBK11b111AF1AGBQBK11b111AF1AGBQBK11b111 #AqI>>>DAq&&q(33
IIaAq!!E)IIaAq!!} 	,		!Q1%%-AQB$YB#++BB+77::17CCAE??1a #GaG00"))++zJ J 		!Q1%%-A###X(EE F  F &&q'(* **4*,,ttAFF1IIr17M7M  4++r$   rP   c                     |                      |          }|                     |          }|                     |          }|                     ||||||          \  }}	}
|                     |          |
z   |	fS r   )queryr   valuer   out)rf   rP   r   r   r;   r   r   r   wvr   r   s              r%   forwardzFSMNMultiHeadAttention.forward   st     JJqMMHHQKKJJqMM!//1ax09; ;B
xx||j(",,r$   r   Fr   NNN)r   r   r   r    r"   rb   r1   r|   r   r   r   r   r}   r~   s   @r%   r   r      s~        8 88 8 	8
 8 8 8 8 8 84 59 "\#EL1   $ 6:9=:>,, ,,,,,, ,, %U\2	,,
 !) 6,, "*%,!7,, ,, ,, ,,` 043748	- -<-u|,- #5<0- $EL1	- - - - - - - -r$   r   c            
            e Zd Z	 	 ddedededef fdZ	 	 	 dd	ej        d
eej                 deej                 deej                 fdZ	 xZ
S )ResidualAttentionBlockr   Fr   r   r   r   c                    t                                                       t          ||||          | _        t	          |d          | _        |dz  }t          j                            t          ||          t          j        
                                t          ||                    | _        t	          |          | _        d S )Nr   gh㈵>)eps   )ra   rb   r   attnr	   attn_lnr1   rc   
Sequentialr
   GELUmlpmlp_ln)rf   r   r   r   r   n_mlprg   s         r%   rb   zResidualAttentionBlock.__init__   s     	*7+1+64<> > >	 !d333!8&&vgu'='=ux}}'-eW'='=? ?((r$   NrP   r   r   r;   c                     ||                      |                     |          |||          d         z   }||                     |                     |                    z   }|S )N)r   r   r;   r   )r   r   r   r   )rf   rP   r   r   r;   s        r%   r   zResidualAttentionBlock.forward  sf     		LLOO$  ! !!"$ $ Q(((r$   r   r   )r   r   r   r    r"   rb   r1   r|   r   r   r}   r~   s   @r%   r   r      s         ) )) ) 	)
 ) ) ) ) ) )0 (,+/,0 < u|$ 5<(	
 EL)       r$   r   c                        e Zd Zdedededededef fdZdej        d	ej        d
eej        ej        f         fdZ	 xZ
S )AudioEncoderV2r   r   r   n_layerr   r   c                 h   t                                                       || _        t          |d|d          | _        t          ddd          | _        t          dd          | _        t          j	        
                    fdt          |          D                       | _        d S )Nr]   rA   )r   r   r   r,   @   i   c                 4    g | ]}t                     S )r   )r   )rS   r   r   r   r   s     r%   rW   z+AudioEncoderV2.__init__.<locals>.<listcomp><  s8     +
 +
 +
 #7FXFFF+
 +
 +
r$   )ra   rb   r   r   conv1conv2r<   r;   r1   rc   
ModuleListrangeblocks)rf   r   r   r   r   r   r   rg   s     ``  `r%   rb   zAudioEncoderV2.__init__%  s     	F#()#)$%	' ' '

 G#()#$$%	' ' '

 .b(;;h)) +
 +
 +
 +
 +
 +
7^^+
 +
 +
  r$   rP   x_lenr?   c                 6   |j         d         }t          ||                              d          }t          j        j                            |                     ||z                      }|dz   dz
  dz
  | j        z  dz   }|dz   dz
  dz
  | j        z  dz   }t          ||                              d          }t          j        j                            | 	                    ||z                      }|dz   dz
  dz
  dz  dz   }|dz   dz
  dz
  | j        z  dz   }t          ||                              d          }|
                    ddd          }| j                            |j                  }|                    dd          }t          ||j                  }t          j        |          }|dddddf         |dddddf         }
}	t          j        |	|	fd          }	t          j        |
|
fd          }
|	                    d                              d          }	|
                    d                              d          }
| j        D ]>} |||                    d          ||d|                    d                             }?||fS )z
        x : torch.Tensor, shape = (batch_size, n_mels, T)
            the mel spectrogram of the audio
        x_len: torch.Tensor, shape = (batch_size,)
            length of each audio in x
        r/   rA   r,   r   Nr0   )rF   r   rC   r1   rc   r   gelur   r   r   r   r;   rD   r.   r   r   rE   rB   r7   r   r   )rf   rP   r   Tr   x_slenr;   r   tmprH   rI   blocks               r%   r   zAudioEncoderV2.forwardA  s}    GBK **44Q77H$$TZZD%9%9::[(1,<q@a%+%)dk9A= //99!<<H$$TZZD%9%9::[(1,2Q61*{*Q.4;>B //99!<<IIaAN%%ah//	>>!Q''D!'** ++qqq!!!Qw<QQQ1WSic
+++ic
+++mmA((++mmA((++[ 	M 	MEa**Hi

6KLLAA%xr$   )r   r   r   r    r"   rb   r1   r|   r   r   r}   r~   s   @r%   r   r   #  s          	
        8" "|"(-elEL.H(I" " " " " " " "r$   r   c                       e Zd ZdZ e            fdedef fdZdej        dej        de	ej        ej        f         fdZ
 ej                    dej        dej        de	ej        ej        f         fd	            Z ej                    dej        dej        d
ej        dede	ej        ej        f         f
d            Zed             ZdefdZdefdZd Z xZS )S3TokenizerV2zdS3 tokenizer v2 implementation (inference-only).
    Args:
        config (ModelConfig): Config
    nameconfigc                 n   t                                                       || _        d|vrd|v sJ d|_        || _        t          | j        j        | j        j        | j        j        | j        j	        d| j        j
                  | _        t          | j        j        | j        j                  | _        d S )Nv1v2r   r,   )ra   rb   r   r   r   r   r   r   r   r   r   encoderr   	quantizer)rf   r   r   rg   s      r%   rb   zS3TokenizerV2.__init__l  s    	t4<<<<%)F"%KK%K$K%K 
 
 /K%K'
 
r$   melmel_lenr?   c                 .    |                      ||          S r   )r   )rf   r   r   s      r%   r   zS3TokenizerV2.forward  s    }}S'***r$   c                     d}||k    }|                                 r|                     ||||          S |                     ||          \  }}| j                            |          }||fS )ap  
        Quantize mel spectrogram to tokens, with automatic long audio handling.

        Args:
            mel: mel spectrogram tensor, shape (batch_size, n_mels, T)
            mel_len: mel length tensor, shape (batch_size,)

        Returns:
            code: quantized tokens, shape (batch_size, T')
            code_len: token length, shape (batch_size,)
        i  )any_quantize_mixed_batchr   r   rv   )rf   r   r   
max_frameslong_audio_maskhiddencode_lencodes           r%   r   zS3TokenizerV2.quantize  s      
 "J.   	"--c7O.8: : :  $||C99FH>((00D>!r$   r   r   c                 	   |                     d          }d}d}d}d}	||z  |z  }
|	|z  |z  }|
|z
  }g }g }g }t          |          D ]}||         }||         }||                                         }|s|ddd|f         }|                                }||
k     r,|
|z
  }t          j        j                            |d|f          }|                    |           |                    t          j        ||j	                             |                    |ddd	d
           d}d}||k     rt          ||
z   |          }|dd||f         }|                     d	          }||
k     r,|
|z
  }t          j        j                            |d|f          }|                    |           |                    t          j        ||j	                             |                    |d|dd
           |d	z  }||z  }||k     |}|D ]}|d         |k    r|d         r||d<   |sMt          j        |dt          j        |j	                  t          j        |t          j        |j	                  fS t          j        |          }t          j        |          }|                     ||          \  }}| j                            |          } i }!t#          |          D ]\  }"}|d         }|d         }|d         }| |"d||"                                         f                                                                                                         }#|s;t          j        |#t          j        |j	                  }$|$t+          |#          f|!|<   ||!vrg |!|<   |!|                             |#           t          |          D ]r}||                                         rV|!|         }%d}&t-          |%|	|&          }'t          j        |'t          j        |j	                  }(|(t+          |'          f|!|<   st/          d |!                                D                       })t          j        ||)t          j        |j	                  }*t          j        |t          j        |j	                  }+|!                                D ]\  }\  }$}|$|*|d|f<   ||+|<   |*|+fS )a  
        Handle mixed batch with both short and long audio using unified batch processing.

        Args:
            mel: mel spectrogram tensor, shape (batch_size, n_mels, T)
            mel_len: mel length tensor, shape (batch_size,)
            long_audio_mask: boolean mask for long audio, shape (batch_size,)
            max_frames: maximum frames for short audio

        Returns:
            code: quantized tokens, shape (batch_size, T')
            code_len: token length, shape (batch_size,)
        r   i>        r   Nr-   FrA   )	batch_idxis_long_audiosegment_idxtotal_segmentsTr   r   r   )rE   r.   r      )overlap
token_ratec              3   &   K   | ]}|d          V  dS )rA   Nr#   )rS   	code_infos     r%   	<genexpr>z6S3TokenizerV2._quantize_mixed_batch.<locals>.<genexpr>B  s&      JJI9Q<JJJJJJr$   )r   r   itemr1   rc   r   padappendtensorr.   minzeroslongstackr   r   rv   rX   cpunumpytolistlenr   maxvaluesitems),rf   r   r   r   r   
batch_sizesample_rate
hop_lengthwindow_sizer   frames_per_windowframes_per_overlapframes_per_strideall_segmentsall_segments_lensegment_infor   	audio_melaudio_mel_lenr   segmentseg_lenpad_sizestartr   r(   r   infounified_batch_melunified_batch_lensr   r   codesresultsseg_idxsegment_codecode_tensoraudio_codesr  merged_codesmerged_codes_tensormax_code_lenoutput_codesoutput_codes_lens,                                               r%   r   z#S3TokenizerV2._quantize_mixed_batch  s   $ XXa[[
 
 (+5C${2j@-0BB 
 z** 8	@ 8	@III#I.M+I6;;==M  3@#AAA~~$56',,.. ...07:H#h155g8}MMG##G,,, ''L<<<> > >##!*%*#$&'	% %     m++e&77GGC'595G%ll1ooG!222#4w#>"'("5"9"9#a]#4 #4 !''000$++WSZ@@@B B B ''%.)-'2*.	) )     1$K..E- m++2 "-( @ @DK(I55$:O51?-.@  	6;z %*Z&)j2 2 2 49;#-).*-*46 46 466 6 "K55"[)9::  <<(9;MNN%%f-- &|44 	8 	8MGT[)I 1M}-K !2(7+002222447CEE%%''&&((  ! 
8#l<1625*> > > '23|3D3D%E	"" G++)+GI&	")),7777 z** 	N 	NIy)..00 N%i0  
7@GCM O  O  O
 ',l<9>:=*'F 'F 'F# ':3|;L;L%M	" JJ9I9IJJJJJ{:#/).*-*6 6 6 !;z-2Z.1j: : : 3:--// 	3 	3.I.X1<LIXI-.*2Y''---r$   c                 N    t          |                                           j        S r   )next
parametersr.   r   s    r%   r.   zS3TokenizerV2.deviceR  s    DOO%%&&--r$   	onnx_pathc                 V    t          |d d          }|                     |d           d S )NFTstrict)r   load_state_dict)rf   r5  ckpts      r%   init_from_onnxzS3TokenizerV2.init_from_onnxV  s2    )T511T$/////r$   	ckpt_pathc                 b    t          j        |dd          }|                     |d           d S )Nr  T)map_locationmmapr7  )r1   loadr9  )rf   r<  r:  s      r%   init_from_ptzS3TokenizerV2.init_from_ptZ  s7    z)%dCCCT$/////r$   c                 H    |                                  D ]\  }}d|_        d S )NF)named_parametersrequires_grad)rf   r   params      r%   freezezS3TokenizerV2.freeze^  s6    --// 	( 	(HAu"'E	( 	(r$   )r   r   r   r   r   strrb   r1   r|   r   r   r{   r   r    r   r   r.   r;  rA  rF  r}   r~   s   @r%   r   r   f  s        
 9D 
 
S 
+ 
 
 
 
 
 
*+5< ++*/el0J*K+ + + + U"EL ","+0u|1K+L" " " "< Uk.|k..3lk."\k. k. !&elEL&@ Ak. k. k. k.Z . . X.0 0 0 0 00c 0 0 0 0( ( ( ( ( ( (r$   r   )r&   N)!dataclassesr   typingr   r   r1   einopsr   s3tokenizer.modelr   r	   r
   r   s3tokenizer.utilsr   r   r   r   r   r    r3   r<   r|   rO   rZ   rc   Moduler\   r   r   r   r   r   r#   r$   r%   <module>rN     s   " ! ! ! ! ! " " " " " " " "        K K K K K K K K K K K K c c c c c c c c c c c c         )0!%5 5c 5!5 %5 5 5 5888 |8 5<%&	8 8 8 8,"U\ "el " " " ""B "B "B "B "B%(/ "B "B "BJ    EHO   @b- b- b- b- b-/ b- b- b-J# # # # #UX_ # # #L@ @ @ @ @UX_ @ @ @Fz( z( z( z( z(EHO z( z( z( z( z(r$   