
    0;jiP                        d Z ddlZddlmZ ddlmZmZmZ ddlZ	ddl
Z
ddlZddlmc mZ ddlZddlmZ defdZd$d	ed
edefdZ	 	 d$d	ed
edefdZd%dedefdZ ed          dedej        fd            Z	 	 	 d&deee	j        ej        f         dededeeeej        f                  fdZd'dej        dedej        fdZ dej        dej!        dej        fd Z"d!eej                 fd"Z#d# Z$dS )(a   Modified from https://github.com/openai/whisper/blob/main/whisper/audio.py
   Add rename_weights() & onnx2torch() & make_non_pad_mask() & mask_to_bias()
   Copy merge_tokenized_segments() from https://github.com/Mddct/s3tokenizer-long/blob/main/example.py
    N)	lru_cache)ListOptionalUnion)pad_sequenceweights_dictc                    i }|                                  D ]}d|v r"|dk    r| |         |d<   d|v r| |         ||<   (d|v r| |         ||<   8d|v r| |         ||<   Hd|v sJ |dd	                             d
d                              dd                              dd                              dd                              dd                              dd                              dd          }| |         |d| <   |S )z
    Rename onnx weights to pytorch format.

    Parameters
    ----------
    weight_dict: dict
        The dict containing weights in onnx format

    Returns
    -------
    A new weight dict containing the weights in pytorch format.
    	quantizerz,/quantizer/rq/model/layers.0/_codebook/Pow_1zquantizer._codebook.embedproject_downpositional_embeddingconvblocks   N/.MatMulweightAdd_1biasMulAddzmlp.mlpmlpzfsmn_block.Convfsmn_block.weightencoder.)keysreplace)r   new_weight_dictknew_ks       K/root/voice-cloning/.venv/lib/python3.11/site-packages/s3tokenizer/utils.py_rename_weightsr!       s\    O   B B!BBB?KA ;<<1$$%1!_"#q((!-aOAq[[!-aOAq====qrrU]]3,,44($ $$+GGV$<$<WW8>% >%%,WUF%;%;GG!5=* =*+273D3F,H ,H  3?q/O.u..//    F	onnx_path
torch_pathverbosec                 d   t          j        |           }i }d |j        j        D             }|j        j        D ]}|j        D ]}||v rd\  }}	||         }
|dv rd}nh|dv rd}na|dv rd}nZ|d	v rd
}nS|dk    rd}nJ|dk    rd}nA|dk    rd}n8|j        dk    r&|j                            dd          }|dz   }	|dz   }n|j        }|	||j        }|d         }|d         }||v r7t           j	        
                    ||                                                   nd}||v r7t           j	        
                    ||                                                   nd}d|j        _        d|j        _        t          j        |          }t          j        |          }|||<   |||	<   et           j	        
                    |
                                          }d|j        _        t          j        |          }t!          |j                  dk    s|dv r|||<   |                                ||<   t'          |          }|rM|                                D ]&\  }}t+          | d|j         d|j                    't+          d|            ~~|rt          j        ||           dS |S )a  
    Open an onnx file and convert to pytorch format.

    Parameters
    ----------
    onnx_path: str
        The onnx file to open, typically `speech_tokenizer_v1.onnx`

    torch_path: str
        The path to save the torch-formated checkpoint.

    verbose: bool
        Logging info or not.

    Returns
    -------
    A checkpoint dict containing the weights and their names, if torch_path is
    None. Otherwise save checkpoint dict to the desired path.
    c                     i | ]
}|j         |S  name.0initializers     r    
<dictcomp>zonnx2torch.<locals>.<dictcomp>Z   -        	+  r"   )NN)zonnx::Conv_1519zencoders.conv1.weightzonnx::Conv_2216encoder.conv1.weight)zonnx::Conv_1520zencoders.conv1.biaszonnx::Conv_2217encoder.conv1.bias)zonnx::Conv_1521zencoders.conv2.weightzonnx::Conv_2218encoder.conv2.weight)zonnx::Conv_1522zencoders.conv2.biaszonnx::Conv_2219encoder.conv2.biaszencoders.positional_embeddingencoder.positional_embeddingquantizer.project_in.bias%quantizer._codebook.project_down.biaszonnx::MatMul_2536'quantizer._codebook.project_down.weightLayerNormalizationz/LayerNormalization z.weightz.biasNr      T)r4    :  PyTorch weights saved to )onnxloadgraphr-   nodeinputop_typer*   r   numpy_helperto_arraycopyflags	writeabletorch
from_numpylenshapetr!   itemsprintdtypesave)r#   r$   r%   
onnx_modelr   initializer_maprA   
input_nameln_bias_nameln_weight_namer-   weight_nameln_name	ln_inputs
scale_name	bias_namescaler   weight_tensorbias_tensorweight_arraynew_weights_dictr   vs                           r    
onnx2torchrb   D   s   ( 9%%JL %+7  O  % EF EF* D	F D	FJ_,,/9,n-j9 "  
 #9KK $  
 #7KK $  
 #9KK $  
 #7KK#BBB"@KK#>>>"IKK#666"KKK|';;;"&)"3"34I2"N"N)09)<'.'8&*i!-,2J $
I!*1J )!I (?:: !-66'
35 559T 6 6 6@D 
 '/99  ,55'	24 448D 5 5 5?C  -1EK)+/DJ($)$4U$;$;M"'"24"8"8K1<L.3@L00#'#4#=#=#$% $%%)TVV !37L&0$)$4\$B$BM=.//!33{ G 8 8 5B[114AOO4E4E[1ID	FL '|44 8$**,, 	0 	0DAqQ..17..QW..////6*66777j  
#Z00000r"   c                 	   t          j        |           }i }d |j        j        D             i |j        j        D ]T}|j        dk    rG|j        D ]?}|j        dk    r2t           j        	                    |j
                  |j        d         <   @UdEfd	fd}|j        j        D ]}|j        }|j        }	|j        }
|dk    r= |
d	                   |d
<   t          |
          dk    r |
d                   |d<   n|dk    r= |
d	                   |d<   t          |
          dk    r |
d                   |d<   nD|                    d          r.|                    d          }|d	         }|                    d          d	         }d| }d|v r|	dk    r |
d	                   || d<   nd|v r5|	dk    r/ ||          }| |                                d	k    r||| d<   nd|v r|	dk    r |
d	                   || d<   nrd|v r5|	dk    r/ ||          }| |                                d	k    r||| d<   n9d|v r |
d	         d !          || d"<   nd#|v r ||          || d$<   nd%|v r |
d	         d !          || d&<   nd'|v r ||          || d(<   nd)|v r |
d	         d !          || d*<   nd+|v r ||          || d,<   nd-|v r |
d	         d !          || d.<   n}d/|v r ||          || d0<   ngd1|v r |
d	         d !          || d2<   nId3|v r ||          || d4<   n3d5|v r |
d	         d !          || d6<   nd7|v r ||          || d8<   d9|v r	 D ])}d:|v r |          |d;| <   d<|v r |          |d=<   *|j        j        D ]/}d>|j        v r$d?|j        v r |j        d	         d !          |d@<   0dA |                                D             }|rO|                                D ](\  }}|!t%          | dB|j         dC|j                    )t%          dD|            ~|rt+          j        ||           dS |S )Fz,
    Convert V3 ONNX to PyTorch format.
    c                     i | ]
}|j         |S r(   r)   r+   s     r    r.   z!onnx2torch_v3.<locals>.<dictcomp>   r/   r"   Constantvaluer   Fc                 ,   | v r8t           j                            |                                                    }n!| v r|                                          }nd S t	          j        |          }|r|j        dk    r|                                }|S )Nr:   )r>   rD   rE   rF   rI   rJ   ndimrM   )r*   	transposearrrM   constant_maprS   s       r    
get_tensorz!onnx2torch_v3.<locals>.get_tensor   s    ?""#,,_T-BCCHHJJCC\!!t$))++CC4S!! 	1Ar"   c                 >    | j         D ]} |          }||c S dS )zgHelper to find bias tensor for an Add node.
        Checks both inputs to see which one is a parameter.N)rB   )rA   inprM   rl   s      r    get_bias_tensorz&onnx2torch_v3.<locals>.get_bias_tensor   s;     : 	 	C
3A} tr"   z/conv1/Convr   r0   r:   r1   z/conv2/Convr2   r3   z/blocks.r   r   zencoder.blocks.zattn_ln/Mulr   z.attn_ln.weightzattn_ln/Addr   Nz.attn_ln.biasz
mlp_ln/Mulz.mlp_ln.weightz
mlp_ln/Addz.mlp_ln.biaszattn/query/MatMulT)ri   z.attn.query.weightzattn/query/Addz.attn.query.biaszattn/key/MatMulz.attn.key.weightzattn/key/Addz.attn.key.biaszattn/value/MatMulz.attn.value.weightzattn/value/Addz.attn.value.biaszattn/out/MatMulz.attn.out.weightzattn/out/Addz.attn.out.biaszmlp/mlp.0/MatMulz.mlp.0.weightzmlp/mlp.0/Addz.mlp.0.biaszmlp/mlp.2/MatMulz.mlp.2.weightzmlp/mlp.2/Addz.mlp.2.biaszfsmn_block/Convr   r   r5   r6   r
   r   r7   c                     i | ]
\  }}|||S )Nr(   )r,   r   ra   s      r    r.   z!onnx2torch_v3.<locals>.<dictcomp>G  s    KKKTQQ]Aq]]]r"   r;   r<   r=   )F)r>   r?   r@   r-   rA   rC   	attributer*   rD   rE   rM   outputrB   rK   
startswithsplitnumelrN   rO   rL   rP   rI   rQ   )r#   r$   r%   rR   r   rA   attrro   r*   opinputsparts
block_part	block_idxprefixrM   	init_namer   ra   rk   rl   rS   s                      @@@r    onnx2torch_v3r~      s    9%%JL %+7  O L %    <:%%    9''373D3M3M4  4 LQ0            % T Ty\ =  3=:fQi3H3HL/06{{Q5?Zq	5J5J12]""3=:fQi3H3HL/06{{Q5?Zq	5J5J12 __Z(( @	MJJsOOEqJ"((--a0I2y22F $$u;E:1I< <77788$&&2;;#OD))=QWWYY]]=>LF!9!9!9:%%"++:D*VAY:O:O66677%%"++#OD))=QWWYY]]<=LF!8!8!89 %,,>Hj1I?/ ?/ ?/:::;;!T))<KO= =88899 #d**<FJ1I=/ =/ =/888994'':I/$:O:O66677 %,,>Hj1I?/ ?/ ?/:::;;!T))<KO= =88899 #d**<FJ1I=/ =/ =/888994'':I/$:O:O66677 $t++9C1I:/ :/ :/55566 D((7Ft7L7L33344#t++9C1I:/ :/ :/55566 D((7Ft7L7L3334 $$ %  	)++3=:i3H3HL/I//0&)33DNJE EL@A  % 3 3$)##DL(@(@ >HZJqMT>3 >3 >3 9;
 LK\%7%7%9%9KKKL 8 &&(( 	4 	4DAq}22qw22223336*66777 
<,,,,,r"   >  filesrc                     t          j        |           \  }}||k    r(t          j                            ||          |          }|d         }|S )a@  
    Open an audio file and read as mono waveform, resampling as necessary

    Parameters
    ----------
    file: str
        The audio file to open

    sr: int
        The sample rate to resample the audio if necessary

    Returns
    -------
    A torch.Tensor containing the audio waveform, in float32 dtype.
    r   )
torchaudior?   
transformsResample)r   r   audiosample_rates       r    
load_audior   V  sQ      $..E;b%..{B??FF!HELr"   )maxsizen_melsreturnc                 j   |dv sJ d|             t           j                            t           j                            t                    dd          }t          j        |d          5 }t          j        |d|                    	                    |           cddd           S # 1 swxY w Y   dS )	ad  
    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
    Allows decoupling librosa dependency; saved using:

        np.savez_compressed(
            "mel_filters.npz",
            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
            mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
        )
    >   P      zUnsupported n_mels: assetszmel_filters.npzF)allow_picklemel_N)
ospathjoindirname__file__npr?   rI   rJ   to)devicer   filters_pathfs       r    _mel_filtersr   m  s     Y ?v ? ?7<< 9 98 13 3L	E	2	2	2 ?a/// 23366v>>? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?s   +0B((B,/B,r   r   paddingr   c                    t          j        |           s$t          | t                    rt	          |           } ||                     |          } |dk    rt          j        | d|f          } t          j        d                              | j	                  }t          j
        | dd|d          }|dddf                                         d	z  }t          | j	        |          }||z  }t          j        |d
                                          }	t          j        |	|	                                dz
            }	|	dz   dz  }	|	S )ay  
    Compute the log-Mel spectrogram of

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        The path to audio or either a NumPy array or Tensor containing the
        audio waveform in 16 kHz

    n_mels: int
        The number of Mel-frequency filters, only 80 is supported

    padding: int
        Number of zero samples to pad to the right

    device: Optional[Union[str, torch.device]]
        If given, the audio tensor is moved to this device before STFT

    Returns
    -------
    torch.Tensor, shape = (128, n_frames)
        A Tensor that contains the Mel spectrogram
    Nr   i     T)windowreturn_complex.r:   g|=)ming       @g      @)rI   	is_tensor
isinstancestrr   r   Fpadhann_windowr   stftabsr   clamplog10maximummax)
r   r   r   r   r   r   
magnitudesfiltersmel_speclog_specs
             r    log_mel_spectrogramr     s2   : ?5!! &eS!! 	&u%%E  {{ea\**s##&&u|44F:eS#fTJJJDc3B3h##%%q(J5<00G#H{8///5577H}Xx||~~';<<H3#%HOr"   lengthsmax_lenc                 d   |                      d          }|dk    r|n%|                                                                 }t          j        d|t          j        | j                  }|                    d                              ||          }|                     d          }||k    }| S )a\  Make mask tensor containing indices of non-padded part.

    The sequences in a batch may have different lengths. To enable
    batch computing, padding is need to make all sequence in same
    size. To avoid the padding part pass value to context dependent
    block such as attention or convolution , this padding part is
    masked.

    1 for non-padded part and 0 for padded part.

    Parameters
    ----------
        lengths (torch.Tensor): Batch of lengths (B,).

    Returns:
    -------
        torch.Tensor: Mask tensor containing indices of padded part (B, max_T).

    Examples:
        >>> import torch
        >>> import s3tokenizer
        >>> lengths = torch.tensor([5, 3, 2])
        >>> masks = s3tokenizer.make_non_pad_mask(lengths)
        masks = [[1, 1, 1, 1, 1],
                 [1, 1, 1, 0, 0],
                 [1, 1, 0, 0, 0]]
    r   )rP   r   r   )	sizer   itemrI   arangeint64r   	unsqueezeexpand)r   r   
batch_size	seq_rangeseq_range_expandseq_length_expandmasks          r    make_non_pad_maskr     s    8 aJ 1gg'++--*<*<*>*>GQ$#(;$+N4 4 4I !**1--44ZII))"--00D5Lr"   r   rP   c                     | j         t          j        k    sJ |t          j        t          j        t          j        fv sJ |                     |          } d| z
  dz  } | S )a\  Convert bool-tensor to float-tensor for flash attention.

    Parameters
    ----------
        lengths (torch.Tensor): Batch of lengths (B, ?).

    Returns:
    -------
        torch.Tensor: Mask tensor containing indices of padded part (B, ?).

    Examples:
        >>> import torch
        >>> import s3tokenizer
        >>> lengths = torch.tensor([5, 3, 2])
        >>> masks = s3tokenizer.make_non_pad_mask(lengths)
        masks = [[1, 1, 1, 1, 1],
                 [1, 1, 1, 0, 0],
                 [1, 1, 0, 0, 0]]
        >>> new_masks = s3tokenizer.mask_to_bias(masks, torch.float32)
        new_masks =
            [[-0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00],
             [-0.0000e+00, -0.0000e+00, -0.0000e+00, -1.0000e+10, -1.0000e+10],
             [-0.0000e+00, -0.0000e+00, -1.0000e+10, -1.0000e+10, -1.0000e+10]]
    g      ?g    _)rP   rI   boolfloat32bfloat16float16r   )r   rP   s     r    mask_to_biasr     s[    2 :####U]ENEMBBBBB775>>D
 $J("DKr"   datac                     | }t          |t                    sJ t          j        d |D             t          j                  }d |D             }t          |dd          }|                    dd          |fS )	z Padding the data into batch data

    Parameters
    ----------
        data: List[Tensor], shape of Tensor (128, T)

    Returns:
    -------
        feats [B, 128, T_max], feats lengths [B]
    c                 8    g | ]}|                     d           S )r   )r   r,   ss     r    
<listcomp>zpadding.<locals>.<listcomp>  s"    !<!<!<!&&))!<!<!<r"   )rP   c                 6    g | ]}|                                 S r(   )rM   r   s     r    r   zpadding.<locals>.<listcomp>  s     ###qQSSUU###r"   Tr   )batch_firstpadding_valuer   r:   )r   listrI   tensorint32r   ri   )r   samplefeats_lengthsfeatspadded_featss        r    r   r     s     Ffd#####L!<!<V!<!<!<',{4 4 4M##F###E4qIIIL!!!Q''66r"   c                     g }|dz  |z  }t          |           D ]T\  }}|dk    rdn|}|t          |           dz
  k    r| nt          |          }|                    |||                    U|S )ap  
    Merges tokenized outputs by keeping the middle and dropping half of the overlapped tokens.

    Args:
    - tokenized_segments (List[List[int]]): List of tokenized sequences.
    - overlap (int): Overlapping duration in seconds (default: 4s).
    - token_rate (int): Number of tokens per second.

    Returns:
    - List[int]: A single merged token sequence.
    r:   r   r   )	enumeraterK   extend)	tokenized_segmentsoverlap
token_ratemerged_tokensoverlap_tokensitokenslrs	            r    merge_tokenized_segmentsr     s     M		
N 122 * *	6aAA^ C(:$;$;a$???^OOSF F 	VAaC[))))r"   )NF)r   )r   r   N)r   )%__doc__r   	functoolsr   typingr   r   r   numpyr   r>   rI   torch.nn.functionalnn
functionalr   r   torch.nn.utils.rnnr   dictr!   r   r   rb   r~   intr   Tensorr   ndarrayr   r   r   rP   r   r   r   r(   r"   r    <module>r      s   
 
			       ( ( ( ( ( ( ( ( ( (                    + + + + + +!$ ! ! ! !Hj  j # j 3 j  j  j  j  j \ %)"'b bS b!bb b b bJ S c    . 4? ? ? ? ? ?* 15	/ /bj%,./// / U3,-.	/ / / /d% %u| %c %%, % % % %P!u| !EK !EL ! ! ! !H7$u|$ 7 7 7 7*    r"   