
    0;ji                         d dl mZ d dlZd dlmc mZ ddlmZ  G d dej        j	        e          Z
 G d de
          ZdS )	    )ABCN   )Decoderc                   d     e Zd Z	 	 d	 fd	Z ej                    d
d            Zd ZddZ xZ	S )BASECFMr      c                     t                                                       || _        || _        || _        |j        | _        t          |d          r|j        | _        nd| _        d | _        d S )N	sigma_ming-C6?)	super__init__n_featsn_spksspk_emb_dimsolverhasattrr
   	estimator)selfr   
cfm_paramsr   r   	__class__s        f/root/voice-cloning/.venv/lib/python3.11/site-packages/chatterbox/models/s3gen/matcha/flow_matching.pyr   zBASECFM.__init__
   sk     	& ':{++ 	"'1DNN!DN          ?Nc                     t          j        |          |z  }t          j        dd|dz   |j                  }|                     ||||||          S )a  Forward diffusion

        Args:
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): output_mask
                shape: (batch_size, 1, mel_timesteps)
            n_timesteps (int): number of diffusion steps
            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
            spks (torch.Tensor, optional): speaker ids. Defaults to None.
                shape: (batch_size, spk_emb_dim)
            cond: Not used but kept for future purposes

        Returns:
            sample: generated mel-spectrogram
                shape: (batch_size, n_feats, mel_timesteps)
        r   r   )device)t_spanmumaskspkscond)torch
randn_likelinspacer   solve_euler)	r   r   r   n_timestepstemperaturer   r   zr   s	            r   forwardzBASECFM.forward   sY    & R  ;.1kAobiHHH&RdTXYYYr   c           	      X   |d         |d         |d         |d         z
  }	}}g }
t          dt          |                    D ]b}|                     ||||||          }||	|z  z   }||	z   }|
                    |           |t          |          dz
  k     r||dz            |z
  }	c|
d         S )aP  
        Fixed euler solver for ODEs.
        Args:
            x (torch.Tensor): random noise
            t_span (torch.Tensor): n_timesteps interpolated
                shape: (n_timesteps + 1,)
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): output_mask
                shape: (batch_size, 1, mel_timesteps)
            spks (torch.Tensor, optional): speaker ids. Defaults to None.
                shape: (batch_size, spk_emb_dim)
            cond: Not used but kept for future purposes
        r   r   )rangelenr   append)r   xr   r   r   r   r   t_dtsolstepdphi_dts                r   r#   zBASECFM.solve_euler4   s     !9fRj&)fQi*?b1 !S[[)) 	* 	*DnnQb!T4@@GBL ABAJJqMMMc&kkAo%%D1H%)2wr   c           	         |j         \  }}}t          j        |ddg|j        |j                  }t          j        |          }	dd| j        z
  |z  z
  |	z  ||z  z   }
|d| j        z
  |	z  z
  }t          j        | 	                    |
|||
                                |          |d          t          j        |          |j         d         z  z  }||
fS )a  Computes diffusion loss

        Args:
            x1 (torch.Tensor): Target
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): target mask
                shape: (batch_size, 1, mel_timesteps)
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            spks (torch.Tensor, optional): speaker embedding. Defaults to None.
                shape: (batch_size, spk_emb_dim)

        Returns:
            loss: conditional flow matching loss
            y: conditional flow
                shape: (batch_size, n_feats, mel_timesteps)
        r   )r   dtypesum)	reduction)shaper    randr   r5   r!   r
   Fmse_lossr   squeezer6   )r   x1r   r   r   r   br/   r.   r&   yulosss                r   compute_losszBASECFM.compute_lossT   s    $ (1a J1ay"(CCCR  !dn$))Q.R7!dn$))z$..D"aiikk4HH!W\]]]IdOOagaj(
 Qwr   )r   r   )r   NN)NN)
__name__
__module____qualname__r   r    inference_moder'   r#   rB   __classcell__r   s   @r   r   r   	   s        
      & UZ Z Z Z,  @       r   r   c                         e Zd Zd fd	Z xZS )CFMr   @   c                     t                                          ||||           ||dk    r|ndz   }t          d||d|| _        d S )N)r   r   r   r   r   r   )in_channelsout_channels )r   r   r   r   )r   rM   out_channelr   decoder_paramsr   r   r   s          r   r   zCFM.__init__w   si    !#	 	 	
 	
 	
 "FQJJ[[AF e[{eeVdeer   )r   rK   )rC   rD   rE   r   rG   rH   s   @r   rJ   rJ   v   sG        
f 
f 
f 
f 
f 
f 
f 
f 
f 
fr   rJ   )abcr   r    torch.nn.functionalnn
functionalr:   decoderr   Moduler   rJ   rO   r   r   <module>rX      s                         j j j j jehos j j jZf f f f f' f f f f fr   