
    0;ji"*                         d dl Z d dlZd dlmc mZ ddlmZ ddlm	Z	 d dl
m
Z
 d Z G d de          Z G d	 d
e          ZdS )    N   )BASECFM)
CFM_PARAMS)tqdmc                        fd|D             S )Nc                 l    g | ]0}|j         j        r|j         k    r|n|                              1S  )dtypeis_floating_pointto).0ar
   s     _/root/voice-cloning/.venv/lib/python3.11/site-packages/chatterbox/models/s3gen/flow_matching.py
<listcomp>zcast_all.<locals>.<listcomp>   s>    dddZ[ag/UAGu4D4DAA!$$u++ddd    r	   )r
   argss   ` r   cast_allr      s    dddd_cddddr   c                        e Zd Zddej        j        f fdZ ej                    dddd ej        dddd	          fd
            Z	ddZ
ddZ xZS )ConditionalCFMr   @   N	estimatorc                     t                                          ||||           |j        | _        |j        | _        |j        | _        ||dk    r|ndz   }|| _        d S )N)n_feats
cfm_paramsn_spksspk_emb_dimr   )super__init__t_schedulertraining_cfg_rateinference_cfg_rater   selfin_channelsr   r   r   r   	__class__s         r   r   zConditionalCFM.__init__   st    !#	 	 	
 	
 	
 &1!+!=","?!FQJJ[[AF"r         ?r   P      c	                      t          d          )a  Forward diffusion

        Args:
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): output_mask
                shape: (batch_size, 1, mel_timesteps)
            n_timesteps (int): number of diffusion steps
            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
            spks (torch.Tensor, optional): speaker ids. Defaults to None.
                shape: (batch_size, spk_emb_dim)
            cond: Not used but kept for future purposes

        Returns:
            sample: generated mel-spectrogram
                shape: (batch_size, n_feats, mel_timesteps)
        z)unused, needs updating for meanflow model)NotImplementedErrortorch
randn_liker   devicer
   shapeconcatstacklinspacer   cospisolve_euler)r#   mumaskn_timestepstemperaturespkscond
prompt_len
flow_cachez
cache_sizez_cachemu_cachet_spans                 r   forwardzConditionalCFM.forward)   s    ( ""MNNNr   Fc           
         |j         }t          ||||||| j        j                   \  }}}}}}|                    d          |                    d          }
}	t	          j        d|	z  d|
g|j        |j                   }t	          j        d|	z  d|
g|j        |j                   }t	          j        d|	z  d|
g|j        |j                   }t	          j        d|	z  g|j        |j                   }t	          j        d|	z  dg|j        |j                   }t	          j        d|	z  d|
g|j        |j                   }t	          j        d|	z  g|j        |j                   }t          |dd         |dd                   D ]\  }}|                    d	          }|                    d	          }|x|d|	<   ||	d<   |x|d|	<   ||	d<   ||d|	<   |x|d|	<   ||	d<   ||d|	<   ||d|	<   |x|d|	<   ||	d<   | j        	                    |||||||r|nd
          }t	          j
        ||	|	gd	          \  }}d| j        z   |z  | j        |z  z
  }||z
  }|||z  z   }|                    |          S )at  
        Fixed euler solver for ODEs.
        Args:
            x (torch.Tensor): random noise
            t_span (torch.Tensor): n_timesteps interpolated
                shape: (n_timesteps + 1,)
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): output_mask
                shape: (batch_size, 1, mel_timesteps)
            spks (torch.Tensor, optional): speaker ids. Defaults to None.
                shape: (batch_size, spk_emb_dim)
            cond: Not used but kept for future purposes
            meanflow: meanflow mode
        r
   r   r(   r'   r-   r
   r   N)dim)xr6   r5   tr9   r:   rr&   )r
   r   r   sizer+   zerosr-   zip	unsqueezerB   splitr!   r   )r#   rH   rA   r5   r6   r9   r:   meanflowin_dtypeBTx_inmask_inmu_int_inspks_incond_inr_inrI   rJ   dxdtcfg_dxdtdts                          r   r4   zConditionalCFM.solve_eulerN   s     7*21fb$dZ^ZhZn*o*o*o'62tT4 wwqzz166!991+q1ub!nQXQWMMM+q1uq!nQXQWMMM+q1ub!nQXQWMMM+q1unQXQWMMM+q1ubnQXQWMMM+q1ub!nQXQWMMM+q1unQXQWMMMssVABBZ00 !	 !	DAq""A""A" #$#D!HtABBx(,,GBQBK'!""+E"1"I"##D!HtABBxGBQBKGBQBK"##D!HtABBx>))W$W7",$$ *  D #[1v1===ND(422d:T=TW_=__DQBBIAA ttH~~r   c                    |j         \  }}}t          j        |ddg|j        |j                  }| j        dk    r'dt          j        |dz  t          j        z            z
  }t          j        |          }	dd| j	        z
  |z  z
  |	z  ||z  z   }
|d| j	        z
  |	z  z
  }| j
        dk    rqt          j        ||j                  | j
        k    }||                    ddd          z  }||                    dd          z  }||                    ddd          z  }|                     |
|||                                ||          }t          j        ||z  ||z  d	          t          j        |          |j         d         z  z  }||
fS )
a  Computes diffusion loss

        Args:
            x1 (torch.Tensor): Target
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): target mask
                shape: (batch_size, 1, mel_timesteps)
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            spks (torch.Tensor, optional): speaker embedding. Defaults to None.
                shape: (batch_size, spk_emb_dim)

        Returns:
            loss: conditional flow matching loss
            y: conditional flow
                shape: (batch_size, n_feats, mel_timesteps)
        r   rE   cosine      ?r   )r-   rF   sum)	reduction)r.   r+   randr-   r
   r   r2   r3   r,   	sigma_minr    viewr   squeezeFmse_lossra   )r#   x1r6   r5   r9   r:   b_rI   r=   yucfg_maskpredlosss                  r   compute_losszConditionalCFM.compute_loss   s   $ (1a J1ay"(CCCx''EIa#g0111AR  !dn$))Q.R7!dn$)) !A%%z!BI6669OOHhmmB1---B(--A...D(--Aq111D~~ar199;;dCCz$+q4x5AAAUYt__WXW^_`WaEabQwr   )r   r   N)F)NN)__name__
__module____qualname__r+   nnModuler   inference_moderL   rB   r4   rq   __classcell__r%   s   @r   r   r      s        # #UZU]Ud # # # # # # U9<4d_`mxmrmxyz|~  AB  DE  nF  nF "f "f "f "fHC C C CJ' ' ' ' ' ' ' 'r   r   c                   b     e Zd Zdedddf fd	Z ej                    d
d            Zd	 Z xZ	S )CausalConditionalCFM   r   r'   Nc                 `    t                                          |||||           d | _        d S )N)r   r   
rand_noiser"   s         r   r   zCausalConditionalCFM.__init__   s.    j&+yQQQr   r&   Fc	           	         |                     d          }	t          j        |          }
|4|                     d          |                     d          z
  }||
d|df<   t          j        dd|dz   |j        |j                  }|s2| j        dk    r'dt          j        |dz  t          j        z            z
  }|r| 	                    |
|||||	          dfS | 
                    |
||||||
          dfS )a  Forward diffusion

        Args:
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): output_mask
                shape: (batch_size, 1, mel_timesteps)
            n_timesteps (int): number of diffusion steps
            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
            spks (torch.Tensor, optional): speaker ids. Defaults to None.
                shape: (batch_size, spk_emb_dim)
            cond: Not used but kept for future purposes
            noised_mels: gt mels noised a time t
        Returns:
            sample: generated mel-spectrogram
                shape: (batch_size, n_feats, mel_timesteps)
        r   Nr(   .r   rE   r_   r`   )rA   r5   r6   r9   r:   )rA   r5   r6   r9   r:   rP   )rK   r+   r,   r1   r-   r
   r   r2   r3   basic_eulerr4   )r#   r5   r6   r7   r8   r9   r:   noised_melsrP   rR   r=   r;   rA   s                r   rB   zCausalConditionalCFM.forward   s   ( GGAJJR  "k&6&6q&9&99J"-Ac:;; 1kAobirxXXX 	<t/8;;6C<%(#:;;;F
  	d##Af$TX\#]]_ccc&RdTXckllnrrrr   c           
         |j         }t          ||||||| j        j                   \  }}}}}}t          d           t	          t          |dd df         |ddd f                   |j        d         dz
            D ]C\  }}	|d          |	d          }	}| j                            |||||||	          }
|	|z
  }|||
z  z   }D|                    |          S )NrD   zS3 Token -> Mel Inference....rF   r   )total)r6   r5   rI   r9   r:   rJ   )	r
   r   r   printr   rM   r.   rB   r   )r#   rH   rA   r5   r6   r9   r:   rQ   rI   rJ   r[   r]   s               r   r   z CausalConditionalCFM.basic_euler   s    7*21fb$dZ^ZhZn*o*o*o'62tT4,---VC"H-vc122g??v|TVGWZ[G[\\\ 	 	DAqT7AdGqA>))!$2TX\])^^DQBBIAAttH~~r   )r&   NNNF)
rr   rs   rt   r   r   r+   rw   rB   r   rx   ry   s   @r   r{   r{      s        #&:aUWcg      
 U%s %s %s %sN      r   r{   )	threadingr+   torch.nn.functionalru   
functionalrg   matcha.flow_matchingr   configsr   r   r   r   r{   r	   r   r   <module>r      s                  ) ) ) ) ) )            e e e` ` ` ` `W ` ` `F9 9 9 9 9> 9 9 9 9 9r   