
    0;ji+M                        d Z ddlmZ ddlmZmZmZ ddlZddl	Z	ddl
mc mZ ddlmZ ddl	mZmZ ddlmZmZmZmZ e G d	 d
                      Z G d dej                  Z G d dej                  Z G d dej                  ZddZ G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z  G d dej                  Z!dS ) zyModified from https://github.com/openai/whisper/blob/main/whisper/model.py
   Add EuclideanCodebook & VectorQuantization
    )	dataclass)IterableOptionalTupleN	rearrange)Tensornn   )make_non_pad_maskmask_to_bias
onnx2torchmerge_tokenized_segmentsc                   r    e Zd ZU dZeed<   dZeed<   dZeed<   dZeed<   d	Z	eed
<   dZ
eed<   dZeed<   dS )ModelConfig   n_melsi  n_audio_ctxi   n_audio_state   n_audio_head   n_audio_layeri   n_codebook_sizeFuse_sdpaN)__name__
__module____qualname__r   int__annotations__r   r   r   r   r   r   bool     K/root/voice-cloning/.venv/lib/python3.11/site-packages/s3tokenizer/model.pyr   r      s         FCKM3L#M3OSHdr#   r   c                       e Zd ZdedefdZdS )	LayerNormxreturnc                    t          j        |                                | j        | j        | j                                        nd | j        | j                                        nd | j                                      |j                  S N)	F
layer_normfloatnormalized_shapeweightbiasepstypedtypeselfr'   s     r$   forwardzLayerNorm.forward-   ss    |GGII!#';#:DK!%!6DIOODH
 
 $qw--	r#   Nr   r   r   r	   r6   r"   r#   r$   r&   r&   +   s6         F      r#   r&   c                       e Zd ZdedefdZdS )Linearr'   r(   c                     t          j        || j                            |j                  | j        d n| j                            |j                            S r*   )r+   linearr/   tor3   r0   r4   s     r$   r6   zLinear.forward9   sJ    xKNN17##I%DD49<<+@+@
 
 	
r#   Nr7   r"   r#   r$   r9   r9   7   s6        
 
F 
 
 
 
 
 
r#   r9   c                   <     e Zd Zdededee         def fdZ xZS )Conv1dr'   r/   r0   r(   c                     t                                          ||                    |j                  |d n|                    |j                            S r*   )super_conv_forwardr<   r3   )r5   r'   r/   r0   	__class__s       r$   rA   zConv1d._conv_forwardC   sM    ww$$vyy!!4<44TWWQW=M=MO O 	Or#   )r   r   r   r	   r   rA   __classcell__rB   s   @r$   r>   r>   A   sn        Ov Ov O$V,O17O O O O O O O O O Or#   r>   '  c                    |dz  dk    sJ t          j        |          |dz  dz
  z  }t          j        | t          j        |dz            z            }t          j        |           ddt           j        f         |t           j        ddf         z  }t          j        t          j        |          t          j        |          gd          S )z*Returns sinusoids for positional embedding   r   r   Ndim)	nplogtorchexparangenewaxiscatsincos)lengthchannelsmax_timescalelog_timescale_incrementinv_timescalesscaled_times         r$   	sinusoidsrY   I   s    a<1 f]33x1}q7HIY 77$|HM:: ; < <N,v&&qqq"*}5

AAA9 K9ei,,ei.D.DE1MMMMr#   c            
       v     e Zd Zddededef fdZ	 ddedee         fd	Z	 dd
edededee         fdZ	 xZ
S )MultiHeadAttentionFn_staten_headr   c                    t                                                       || _        t          ||          | _        t          ||d          | _        t          ||          | _        t          ||          | _        || _        d S )NF)r0   )	r@   __init__r]   r9   querykeyvalueoutr   )r5   r\   r]   r   rB   s       r$   r_   zMultiHeadAttention.__init__V   st    GW--
'7777GW--
'7++ r#   Nr'   maskc                     |                      |          }|                     |          }|                     |          }|                     ||||          \  }}|                     |          |fS r*   )r`   ra   rb   qkv_attentionrc   )r5   r'   rd   qkvwvqks           r$   r6   zMultiHeadAttention.forward`   sc    
 JJqMMHHQKKJJqMM##Aq!T22Bxx||Rr#   rg   rh   ri   c                 "   |j         \  }}}|| j        z  dz  } |j        g |j         d d         | j        dR                      dddd          |z  } |j        g |j         d d         | j        dR  } |j        g |j         d d         | j        dR                      dddd          }| j        s|                    dddd          |z  }||z  }|||z   }|                                }t          j        j        	                    |d          
                    |j                  }	|	|z                      dddd                              d          |                                fS |                    dddd          |z  }|J t          j        j                            ||||d	d
          }
|
                    dd                                                              |                    d          d|          }
|
d fS )Ng      пrG   r   r      rH   )	start_dimg        g      ?)	attn_mask	dropout_pscale)shaper]   viewpermuter   r-   rL   r
   
functionalsoftmaxr<   r3   flattendetachscaled_dot_product_attention	transpose
contiguoussize)r5   rg   rh   ri   rd   _Drr   rk   woutputs              r$   rf   z MultiHeadAttention.qkv_attentionl   s;   
 '1adk!E)AF1AGBQBK11b11199!Q1EEMAF1AGBQBK11b111AF1AGBQBK11b11199!Q1EE} 	 		!Q1%%-AQB$YB#++BB+77::17CCAE??1a#$& &&-gg&:&:BIIKKH H 		!Q1%%-A###X(EE F  F &&q'(* **4*,,ttAFF1IIr17M7M  4<r#   )Fr*   )r   r   r   r   r!   r_   r	   r   r6   rf   rC   rD   s   @r$   r[   r[   T   s        ! ! !S !D ! ! ! ! ! ! "&
  
 
  v
  
  
  
   04	"  " " "   "  %V,	"  "  "  "  "  "  "  " r#   r[   c                   L     e Zd Zdededef fdZ	 d	dedee         fdZ xZ	S )
ResidualAttentionBlockr\   r]   r   c                 `   t                                                       t          |||          | _        t	          |          | _        |dz  }t          j        t          ||          t          j	                    t          ||                    | _
        t	          |          | _        d S )Nr      )r@   r_   r[   attnr&   attn_lnr
   
Sequentialr9   GELUmlpmlp_ln)r5   r\   r]   r   n_mlprB   s        r$   r_   zResidualAttentionBlock.__init__   s    &wJJJ	 ))!=!7!7!'w!7!79 9((r#   Nr'   rd   c                     ||                      |                     |          |          d         z   }||                     |                     |                    z   }|S )N)rd   r   )r   r   r   r   )r5   r'   rd   s      r$   r6   zResidualAttentionBlock.forward   sR    
 		$,,q//	55a88Q(((r#   r*   )
r   r   r   r   r!   r_   r	   r   r6   rC   rD   s   @r$   r   r      s        	) 	)S 	)D 	) 	) 	) 	) 	) 	) "&  v       r#   r   c                   `     e Zd Zdededededededef fdZd	ed
edeeef         fdZ xZ	S )AudioEncoderr   n_ctxr\   r]   n_layerstrider   c                 p   t                                                       || _        t          |d|d          | _        t          ddd          | _        |                     dt          |                     t          j	        fdt          |          D                       | _        d S )Nrn   r   )kernel_sizer   paddingrG   positional_embeddingc                 4    g | ]}t                     S )r   )r   ).0r~   r]   r\   r   s     r$   
<listcomp>z)AudioEncoder.__init__.<locals>.<listcomp>   s>     G
 G
 G
 #7FXFFFG
 G
 G
r#   )r@   r_   r   r>   conv1conv2register_bufferrY   r
   
ModuleListrangeblocks)	r5   r   r   r\   r]   r   r   r   rB   s	      ``  `r$   r_   zAudioEncoder.__init__   s     	F#()#)$%	' ' '

 G#()#$$%	' ' '

 	3Yug5N5NOOO8: G
 G
 G
 G
 G
 G
7^^G
 G
 G
 9 9r#   r'   x_lenr(   c                    t          |                              d          }t          j        |                     ||z                      }|dz   dz
  dz
  | j        z  dz   }t          |                              d          }t          j        |                     ||z                      }|dz   dz
  dz
  dz  dz   }t          |                              d          }|                    ddd          }t          ||j	                  }|| j
        d|j        d         ddf         z                       |j	                  }| j        D ]!} |||                    d                    }"||fS )z
        x : torch.Tensor, shape = (batch_size, n_mels, T)
            the mel spectrogram of the audio
        x_len: torch.Tensor, shape = (batch_size,)
            length of each audio in x
        r   rG   r   N)r   	unsqueezer+   gelur   r   r   ru   r   r3   r   rs   r<   r   )r5   r'   r   rd   blocks        r$   r6   zAudioEncoder.forward   sb    !''11!44F4::a$h''(([(1,<q@ ''11!44F4::a$h''(([(1,2Q6 ''11!44IIaAD!'***;AGAJ;>::>>qwGG[ 	, 	,Ea**++AA%xr#   )
r   r   r   r   r!   r_   r	   r   r6   rC   rD   s   @r$   r   r      s          	
         :  53H        r#   r   c                   ~    e Zd ZdZdedef fdZ ej                    dedefd            Z	 ej                    dedefd            Z
 ej                    d	             Z ej                    d
edefd            Z ej                    dedefd            Z ej                    d
edefd            Z xZS )EuclideanCodebookzCodebook with Euclidean distance (inference-only).
    Args:
        dim (int): Dimension.
        codebook_size (int): Codebook size.
    rI   codebook_sizec                     t                                                       t          j        ||          }|| _        |                     d|           d S )Nembed)r@   r_   rL   zerosr   r   )r5   rI   r   r   rB   s       r$   r_   zEuclideanCodebook.__init__   sN    M3//*We,,,,,r#   r'   r(   c                 &    t          |d          }|S )Nz... d -> (...) dr   r4   s     r$   
preprocesszEuclideanCodebook.preprocess   s    a+,,r#   c                 \   | j                                                             |j                  }|                    d                              dd          d|z  |z  z
  |                    d                              dd          z    }|                    d          j        }|S )NrG   r   T)keepdimr   rm   rH   )r   tr<   r3   powsummaxindices)r5   r'   r   dist	embed_inds        r$   quantizezEuclideanCodebook.quantize   s    
!!!'**qa..Q>1!!!T!223 4HHH$$,	r#   c                 &     |j         |d d          S )Nrm   )rt   )r5   r   rs   s      r$   postprocess_embz!EuclideanCodebook.postprocess_emb   s    y~uSbSz**r#   r   c                 :    t          j        || j                  }|S r*   )r+   	embeddingr   r5   r   r   s      r$   
dequantizezEuclideanCodebook.dequantize   s    ;y$*55r#   c                     |j         }|                     |          }|                     |          }|                     ||          }|S r*   )rs   r   r   r   )r5   r'   rs   r   s       r$   encodezEuclideanCodebook.encode  sF    OOAMM!$$	((E::	r#   c                 0    |                      |          }|S r*   )r   r   s      r$   decodezEuclideanCodebook.decode  s    ??9--r#   )r   r   r   __doc__r   r_   rL   inference_moder	   r   r   r   r   r   r   rC   rD   s   @r$   r   r      s        -C - - - - - - - UF v     U& V     U+ + + UF v     U 6     U 6        r#   r   c                        e Zd ZdZdedef fdZed             Z ej	                    de
de
fd            Z ej	                    d	e
de
fd
            Z xZS )VectorQuantizationzVector quantization implementation (inference-only).
    Args:
        dim (int): Dimension
        codebook_size (int): Codebook size
    rI   r   c                     t                                                       t          ||          | _        || _        d S )N)rI   r   )r@   r_   r   	_codebookr   )r5   rI   r   rB   s      r$   r_   zVectorQuantization.__init__  sE    *s9FH H H*r#   c                     | j         j        S r*   )r   r   r5   s    r$   codebookzVectorQuantization.codebook"  s    ~##r#   r'   r(   c                     t          j        |                                dd          }| j                            |          }|S )NrG   rm   )prI   )r+   	normalizer-   r   r   )r5   r'   embed_ins      r$   r   zVectorQuantization.encode&  s:    K		QB///>((++r#   r   c                 Z    | j                             |          }t          |d          }|S )Nzb n d -> b d n)r   r   r   r   s      r$   r   zVectorQuantization.decode,  s,    >((33X'788r#   )r   r   r   r   r   r_   propertyr   rL   r   r	   r   r   rC   rD   s   @r$   r   r     s         +C + + + + + + + $ $ X$ U 6    
 U 6        r#   r   c                   <    e Zd ZdZ e            fdedef fdZdededeeef         fdZ	 e
j                    dededeeef         fd	            Z e
j                    deded
ededeeef         f
d            Zed             ZdefdZdefdZd Z xZS )S3TokenizerzbS3 tokenizer implementation (inference-only).
    Args:
        config  (ModelConfig): Config
    nameconfigc           	      r   t                                                       || _        || _        t	          | j        j        | j        j        | j        j        | j        j        | j        j	        |dk    rdnd| j        j
                  | _        t          | j        j        | j        j                  | _        d S )Nspeech_tokenizer_v1_25hzrG   r   )r@   r_   r   r   r   r   r   r   r   r   r   encoderr   r   	quantizer)r5   r   r   rB   s      r$   r_   zS3Tokenizer.__init__9  s    	#KK#K%K$K%333AAK 
 
 ,DK,E,0K,GI Ir#   melmel_lenr(   c                 .    |                      ||          S r*   )r   )r5   r   r   s      r$   r6   zS3Tokenizer.forwardI  s    }}S'***r#   c                     d}||k    }|                                 r|                     ||||          S |                     ||          \  }}| j                            |          }||fS )ap  
        Quantize mel spectrogram to tokens, with automatic long audio handling.

        Args:
            mel: mel spectrogram tensor, shape (batch_size, n_mels, T)
            mel_len: mel length tensor, shape (batch_size,)

        Returns:
            code: quantized tokens, shape (batch_size, T')
            code_len: token length, shape (batch_size,)
        i  )any_quantize_mixed_batchr   r   r   )r5   r   r   
max_frameslong_audio_maskhiddencode_lencodes           r$   r   zS3Tokenizer.quantizeL  s     
 "J.   	"--c7O.8: : :  $||C99FH>((00D>!r#   r   r   c                 	   |                     d          }d}d}d}d}	||z  |z  }
|	|z  |z  }|
|z
  }g }g }g }t          |          D ]}||         }||         }||                                         }|s|ddd|f         }|                                }||
k     r|
|z
  }t          j        |d|f          }|                    |           |                    t          j        ||j                             |                    |ddd	d
           d}d}||k     rt          ||
z   |          }|dd||f         }|                     d	          }||
k     r|
|z
  }t          j        |d|f          }|                    |           |                    t          j        ||j                             |                    |d|dd
           |d	z  }||z  }||k     |}|D ]}|d         |k    r|d         r||d<   |sMt          j
        |dt          j        |j                  t          j
        |t          j        |j                  fS t          j        |          }t          j        |          }|                     ||          \  }}| j                            |          } i }!t!          |          D ]\  }"}|d         }|d         }|d         }| |"d||"                                         f                                                                                                         }#|s;t          j        |#t          j        |j                  }$|$t)          |#          f|!|<   ||!vrg |!|<   |!|                             |#           t          |          D ]}||                                         rt|!|         }%t+          | d          r| j        dk    rd}&nd}&t/          |%|	|&          }'t          j        |'t          j        |j                  }(|(t)          |'          f|!|<   t1          d |!                                D                       })t          j
        ||)t          j        |j                  }*t          j
        |t          j        |j                  }+|!                                D ]\  }\  }$}|$|*|d|f<   ||+|<   |*|+fS )a  
        Handle mixed batch with both short and long audio using unified batch processing.

        Args:
            mel: mel spectrogram tensor, shape (batch_size, n_mels, T)
            mel_len: mel length tensor, shape (batch_size,)
            long_audio_mask: boolean mask for long audio, shape (batch_size,)
            max_frames: maximum frames for short audio

        Returns:
            code: quantized tokens, shape (batch_size, T')
            code_len: token length, shape (batch_size,)
        r   i>        r   N)deviceFr   )	batch_idxis_long_audiosegment_idxtotal_segmentsTr   r   r   )r3   r   r   r   speech_tokenizer_v12      )overlap
token_ratec              3   &   K   | ]}|d          V  dS )r   Nr"   )r   	code_infos     r$   	<genexpr>z4S3Tokenizer._quantize_mixed_batch.<locals>.<genexpr>
  s&      JJI9Q<JJJJJJr#   )r}   r   itemr+   padappendrL   tensorr   minr   longstackr   r   r   	enumeratecpunumpytolistlenhasattrr   r   r   valuesitems),r5   r   r   r   r   
batch_sizesample_rate
hop_lengthwindow_sizer   frames_per_windowframes_per_overlapframes_per_strideall_segmentsall_segments_lensegment_infor   	audio_melaudio_mel_lenr   segmentseg_lenpad_sizestartr   endr   infounified_batch_melunified_batch_lensr   r   codesresultsseg_idxsegment_codecode_tensoraudio_codesr   merged_codesmerged_codes_tensormax_code_lenoutput_codesoutput_codes_lens,                                               r$   r   z!S3Tokenizer._quantize_mixed_batchj  s   " XXa[[
 
 (+5C${2j@-0BB 
 z** 7	@ 7	@III#I.M+I6;;==M  2@#AAA~~$56',,.. ...07:HeGa];;G##G,,, ''L<<<> > >##!*%*#$&'	% %     m++e&77GGC'595G%ll1ooG!222#4w#>"#%!X"?"? ''000$++WSZ@@@B B B ''%.)-'2*.	) )     1$K..E+ m++0 "-( @ @DK(I55$:O51?-.@  	6;z %*Z&)j2 2 2 49;#-).*-*46 46 466 6 "K55"[)9::  <<(9;MNN%%f-- &|44 	8 	8MGT[)I 1M}-K !2(7+002222447CEE%%''&&((  ! 
8#l<1625*> > > '23|3D3D%E	"" G++)+GI&	")),7777 z** 	N 	NIy)..00 N%i0 4!# # $'+y4I'I'I!#JJ!#J7@GCM O  O  O
 ',l<9>:=*'F 'F 'F# ':3|;L;L%M	" JJ9I9IJJJJJ{:#/).*-*6 6 6 !;z-2Z.1j: : : 3:--// 	3 	3.I.X1<LIXI-.*2Y''---r#   c                 N    t          |                                           j        S r*   )next
parametersr   r   s    r$   r   zS3Tokenizer.device  s    DOO%%&&--r#   	onnx_pathc                 V    t          |d d          }|                     |d           d S )NFTstrict)r   load_state_dict)r5   r%  ckpts      r$   init_from_onnxzS3Tokenizer.init_from_onnx  s2    )T511T$/////r#   	ckpt_pathc                 b    t          j        |dd          }|                     |d           d S )Nr   T)map_locationmmapr'  )rL   loadr)  )r5   r,  r*  s      r$   init_from_ptzS3Tokenizer.init_from_pt"  s7    z)%dCCCT$/////r#   c                 H    |                                  D ]\  }}d|_        d S )NF)named_parametersrequires_grad)r5   r~   params      r$   freezezS3Tokenizer.freeze&  s6    --// 	( 	(HAu"'E	( 	(r#   )r   r   r   r   r   strr_   r	   r   r6   rL   r   r   r   r   r   r   r+  r1  r6  rC   rD   s   @r$   r   r   3  s        
 9D I IS I+ I I I I I I +6 +F +uVV^7L + + + + U"F "V "ffn8M " " " ": Um. m.& m./5m.*-m.272Gm. m. m. m.^ . . X.0 0 0 0 00c 0 0 0 0( ( ( ( ( ( (r#   r   )rE   )"r   dataclassesr   typingr   r   r   r   rJ   rL   torch.nn.functionalr
   rv   r+   einopsr   r	   utilsr   r   r   r   r   r&   r9   r>   rY   Moduler[   r   r   r   r   r   r"   r#   r$   <module>r>     s    " ! ! ! ! ! , , , , , , , , , ,                             X X X X X X X X X X X X        	 	 	 	 	 	 	 	
 
 
 
 
RY 
 
 
O O O O ORY O O ON N N N:  :  :  :  :  :  :  : z    RY   .6 6 6 6 629 6 6 6r1 1 1 1 1	 1 1 1h       <u( u( u( u( u(") u( u( u( u( u(r#   