
    0;ji2                     @   d dl mZ d dlmZmZ d dlZd dlmZmZm	Z	 d dl
mZmZmZ d dlmZmZmZmZ e G d d                      Z G d	 d
e          Z G d dej        j                  Z G d dej        j                  Z G d dej        j                  ZdS )    )	dataclass)OptionalTupleN)Conv1d	LayerNormLinear)FSMNMultiHeadAttentionFSQVectorQuantizationprecompute_freqs_cis)make_non_pad_maskmask_to_biasmerge_tokenized_segmentsonnx2torch_v3c                   r    e Zd ZU dZeed<   dZeed<   dZeed<   dZeed<   d	Z	eed
<   dZ
eed<   dZeed<   dS )ModelConfigV3   n_melsi  n_audio_ctxi   n_audio_state   n_audio_head   n_audio_layeri  n_codebook_sizeFuse_sdpaN)__name__
__module____qualname__r   int__annotations__r   r   r   r   r   r   bool     N/root/voice-cloning/.venv/lib/python3.11/site-packages/s3tokenizer/model_v3.pyr   r      s         FCKM3L#M3OSHdr#   r   c            	       6     e Zd Z	 	 ddedededef fdZ xZS )	MultiHeadAttentionV3   Fn_staten_headkernel_sizer   c                     t                                          ||||           t          ||          | _        t          ||d          | _        t          ||          | _        t          ||          | _        d S )NF)bias)super__init__r   querykeyvalueout)selfr(   r)   r*   r   	__class__s        r$   r.   zMultiHeadAttentionV3.__init__*   sp    
 	&+x@@@GW--
'7777GW--
'7++r#   r'   F)r   r   r   r   r!   r.   __classcell__r4   s   @r$   r&   r&   (   sn        
 %'"'	
, 
,
,
, "
,  	
, 
, 
, 
, 
, 
, 
, 
, 
, 
,r#   r&   c            
            e Zd Z	 	 ddedededef fdZ	 	 	 dd	ej        d
eej                 deej                 deej                 fdZ	 xZ
S )ResidualAttentionBlockV3r'   Fr(   r)   r*   r   c                    t                                                       t          ||||          | _        t	          |d          | _        |dz  }t          j                            t          ||          t          j        
                                t          ||                    | _        t	          |d          | _        d S )Nr   gh㈵>)eps   )r-   r.   r&   attnr   attn_lntorchnn
Sequentialr   GELUmlpmlp_ln)r3   r(   r)   r*   r   n_mlpr4   s         r$   r.   z!ResidualAttentionBlockV3.__init__9   s    
 	()/)42:< < <	 !d333!8&&vgu'='=ux}}'-eW'='=? ?T222r#   Nxmaskmask_pad	freqs_cisc                     ||                      |                     |          |||          d         z   }||                     |                     |                    z   }|S )N)rH   rI   rJ   r   )r>   r?   rD   rE   )r3   rG   rH   rI   rJ   s        r$   forwardz ResidualAttentionBlockV3.forwardJ   sf    
 		LLOO$  ! !!"$ $ Q(((r#   r5   )NNN)r   r   r   r   r!   r.   r@   Tensorr   rL   r6   r7   s   @r$   r9   r9   7   s        
 %'"'	3 333 "3  	3 3 3 3 3 3& 043748		 	<	u|,	 #5<0	 $EL1		 	 	 	 	 	 	 	r#   r9   c                        e Zd Zdedededededef fdZdej        d	ej        d
eej        ej        f         fdZ	 xZ
S )AudioEncoderV3r   r(   r)   n_layerstrider   c                 h   t                                                       || _        t          |d|d          | _        t          ddd          | _        t          dd          | _        t          j	        
                    fdt          |          D                       | _        d S )N      )r*   rQ   padding   @   i   c                 4    g | ]}t                     S )r;   )r9   ).0_r)   r(   r   s     r$   
<listcomp>z+AudioEncoderV3.__init__.<locals>.<listcomp>p   s8     +
 +
 +
 %WfxHHH+
 +
 +
r#   )r-   r.   rQ   r   conv1conv2r   rJ   r@   rA   
ModuleListrangeblocks)r3   r   r(   r)   rP   rQ   r   r4   s     ``  `r$   r.   zAudioEncoderV3.__init__X   s     	F#()#)$%	' ' '

 G#()#$$%	' ' '

 .b(;;h)) +
 +
 +
 +
 +
 +
7^^+
 +
 +
  r#   rG   x_lenreturnc                    |j         d         }t          ||                              d          }t          j        j                            |                     ||z                      }|dz   dz
  dz
  | j        z  dz   }|dz   dz
  dz
  | j        z  dz   }t          ||                              d          }t          j        j                            | 	                    ||z                      }|dz   dz
  dz
  dz  dz   }|dz   dz
  dz
  dz  dz   }t          ||                              d          }|
                    ddd          }| j                            |j                  }|                    dd          }t          ||j                  }| j        D ]>} |||                    d          ||d|                    d                             }?||fS )z
        x : torch.Tensor, shape = (batch_size, n_mels, T)
            the mel spectrogram of the audio
        x_len: torch.Tensor, shape = (batch_size,)
            length of each audio in x
        rT   rV   r   N)shaper   	unsqueezer@   rA   
functionalgelur\   rQ   r]   permuterJ   todevice	transposer   dtyper`   size)	r3   rG   ra   TrH   x_slenrJ   rI   blocks	            r$   rL   zAudioEncoderV3.forwardu   s    GBK **44Q77H$$TZZD%9%9::[(1,<q@a%+%)dk9A= //99!<<H$$TZZD%9%9::[(1,2Q61*{*Q.14q8 //99!<<IIaAN%%ah//	>>!Q''D!'**[ 	M 	MEa**Hi

6KLLAA%xr#   )r   r   r   r   r!   r.   r@   rM   r   rL   r6   r7   s   @r$   rO   rO   V   s          	
        : |(-elEL.H(I       r#   rO   c                       e Zd ZdZ e            fdedef fdZdej        dej        de	ej        ej        f         fdZ
 ej                    dej        dej        de	ej        ej        f         fd	            Z ej                    dej        dej        d
ej        de	ej        ej        f         fd            Zed             ZdefdZdefdZd fd	Zd Z xZS )S3TokenizerV3zfS3 tokenizer v3 implementation (inference-only).
    Args:
        config (ModelConfigV3): Config
    nameconfigc                 L   t                                                       || _        || _        t	          | j        j        | j        j        | j        j        | j        j        d| j        j	                  | _
        t          | j        j        | j        j                  | _        d S )NrV   )r-   r.   rt   ru   rO   r   r   r   r   r   encoderr
   r   	quantizer)r3   rt   ru   r4   s      r$   r.   zS3TokenizerV3.__init__   s    	%KK%K$K%K 
 
 /K%K'
 
r#   melmel_lenrb   c                 .    |                      ||          S N)quantize)r3   ry   rz   s      r$   rL   zS3TokenizerV3.forward   s    }}S'***r#   c                     d}||k    }|                                 r|                     |||          S |                     ||          \  }}| j                            |          }||fS )Ni  )any_quantize_mixed_batchrw   rx   encode)r3   ry   rz   
max_frameslong_audio_maskhiddencode_lencodes           r$   r}   zS3TokenizerV3.quantize   sv    
 
!J.   	"--c7OLLL#||C99FH>((00D>!r#   r   c                 	   |                     d          }d}d}d}d}||z  |z  }	||z  |z  }
|	|
z
  }g }g }g }t          |          D ]}||         }||         }||                                         }|s|d d d |f         }|                                }||	k     r,|	|z
  }t          j        j                            |d|f          }|                    |           |                    t          j        ||j	                             |                    |dddd	           d}d}||k     rt          ||	z   |          }|d d ||f         }|                     d          }||	k     r,|	|z
  }t          j        j                            |d|f          }|                    |           |                    t          j        ||j	                             |                    |d
|d d	           |dz  }||z  }||k     |}|D ]}|d         |k    r|d         r||d<   |sMt          j        |dt          j        |j	                  t          j        |t          j        |j	                  fS t          j        |          }t          j        |          }|                     ||          \  }}| j                            |          }i } t#          |          D ]\  }!}|d         }|d         }||!d ||!                                         f                                                                                                         }"|s;t          j        |"t          j        |j	                  }#|#t+          |"          f| |<   || vrg | |<   | |                             |"           t          |          D ]r}||                                         rV| |         }$d}%t-          |$||%          }&t          j        |&t          j        |j	                  }'|'t+          |&          f| |<   st/          d |                                 D                       }(t          j        ||(t          j        |j	                  })t          j        |t          j        |j	                  }*|                                 D ]\  }\  }#}|#|)|d |f<   ||*|<   |)|*fS )Nr   i>        r=   )rk   FrT   )	batch_idxis_long_audiosegment_idxtotal_segmentsTr   r   r   )rm   rk      )overlap
token_ratec              3   &   K   | ]}|d          V  dS )rT   Nr"   )rY   	code_infos     r$   	<genexpr>z6S3TokenizerV3._quantize_mixed_batch.<locals>.<genexpr>-  s&      JJI9Q<JJJJJJr#   )rn   r_   itemr@   rA   rg   padappendtensorrk   minzeroslongstackrw   rx   r   	enumeratecpunumpytolistlenr   maxvaluesitems)+r3   ry   rz   r   
batch_sizesample_rate
hop_lengthwindow_sizer   frames_per_windowframes_per_overlapframes_per_strideall_segmentsall_segments_lensegment_infor   	audio_melaudio_mel_lenr   segmentseg_lenpad_sizestartr   endr   infounified_batch_melunified_batch_lensr   r   codesresultsseg_idxsegment_codecode_tensoraudio_codesr   merged_codesmerged_codes_tensormax_code_lenoutput_codesoutput_codes_lens+                                              r$   r   z#S3TokenizerV3._quantize_mixed_batch   s    XXa[[

'+5C${2j@-0BBz** -	@ -	@III#I.M+I6;;==M  (@#AAA~~$56',,.....07:H#h155g8}MMG##G,,, ''L<<<> > >##!*%*#$&'	% %     m++e&77GGC'595G%ll1ooG!222#4w#>"'("5"9"9#a]#4 #4 ''000$++WSZ@@@B B B ''%.)-'2*.	) )     1$K..E% m++& "-( @ @DK(I55$:O51?-.@  	6;z %*Z&)j2 2 2 49;#-).*-*46 46 466 6 "K55"[)9::<<(9;MNN%%f--&|44 	8 	8MGT[)I 1M 2(7+002222447CEE%%''&&((  ! 8#l<1625*> > > '23|3D3D%E	""G++)+GI&	")),7777z** 
	N 
	NIy)..00 	N%i0
7@GCM O  O  O ',l<9>:=*'F 'F 'F# ':3|;L;L%M	"JJ9I9IJJJJJ{:#/).*-*6 6 6 !;z-2Z.1j: : : 3:--// 	3 	3.I.X1<LIXI-.*2Y''---r#   c                 N    t          |                                           j        S r|   )next
parametersrk   )r3   s    r$   rk   zS3TokenizerV3.device<  s    DOO%%&&--r#   	onnx_pathc                 V    t          |d d          }|                     |d           d S )NFstrict)r   load_state_dict)r3   r   ckpts      r$   init_from_onnxzS3TokenizerV3.init_from_onnx@  s7    Y"$ $T%00000r#   	ckpt_pathc                 b    t          j        |dd          }|                     |d           d S )Nr   T)map_locationmmapr   )r@   loadr   )r3   r   r   s      r$   init_from_ptzS3TokenizerV3.init_from_ptE  s7    z)%dCCCT$/////r#   Tc                 J    t                                          ||          S )Nr   )r-   r   )r3   
state_dictr   r4   s      r$   r   zS3TokenizerV3.load_state_dictI  s"     ww&&z&&AAAr#   c                 H    |                                  D ]\  }}d|_        d S )NF)named_parametersrequires_grad)r3   rZ   params      r$   freezezS3TokenizerV3.freezeN  s6    --// 	( 	(HAu"'E	( 	(r#   )T)r   r   r   __doc__r   strr.   r@   rM   r   rL   inference_moder}   r   propertyrk   r   r   r   r   r6   r7   s   @r$   rs   rs      s        
 ;H-// 
 
S 
- 
 
 
 
 
 
"+5< ++*/el0J*K+ + + + U"EL ","+0u|1K+L" " " " U}.|}..3l}."\}. 
u|U\)	*}. }. }. }.~ . . X.1 1 1 1 1
0c 0 0 0 0B B B B B B
( ( ( ( ( ( (r#   rs   )dataclassesr   typingr   r   r@   s3tokenizer.modelr   r   r   s3tokenizer.model_v2r	   r
   r   s3tokenizer.utilsr   r   r   r   r   r&   rA   Moduler9   rO   rs   r"   r#   r$   <module>r      s   " ! ! ! ! ! " " " " " " " "  7 7 7 7 7 7 7 7 7 7O O O O O O O O O OH H H H H H H H H H H H        , , , , ,1 , , ,    ux   >9 9 9 9 9UX_ 9 9 9x~( ~( ~( ~( ~(EHO ~( ~( ~( ~( ~(r#   