
    0;jic8                        d dl Z d dlmZ d dlZd dlmZ d dlmc mZ d dl	m
Z
 d dlmZ d dlmZmZmZ ddlmZ  G d d	ej        j                  Z G d
 dej        j                  Z G d dej        j                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d de
          Z G d dej                  ZdS )    N)Optional)ConformerBlock)get_activation)pack	rearrangerepeat   )BasicTransformerBlockc                   &     e Zd Z fdZddZ xZS )SinusoidalPosEmbc                     t                                                       || _        | j        dz  dk    s
J d            d S )N   r   z(SinusoidalPosEmb requires dim to be even)super__init__dimselfr   	__class__s     `/root/voice-cloning/.venv/lib/python3.11/site-packages/chatterbox/models/s3gen/matcha/decoder.pyr   zSinusoidalPosEmb.__init__   sF    x!|q   "L           c                    |j         dk     r|                    d          }|j        }| j        dz  }t	          j        d          |dz
  z  }t          j        t          j        ||          	                                | z            }||                    d          z  |                    d          z  }t          j
        |                                |                                fd          }|S )Nr	   r   r   i'  )device)r   )ndim	unsqueezer   r   mathlogtorchexparangefloatcatsincos)r   xscaler   half_dimembs         r   forwardzSinusoidalPosEmb.forward   s    6A::AA8q=huooA.iXf===CCEELMMakk!nn$s}}Q'7'77iCGGII.B777
r   )r   __name__
__module____qualname__r   r*   __classcell__r   s   @r   r   r      sQ        M M M M M
	 	 	 	 	 	 	 	r   r   c                   &     e Zd Zd fd	Zd Z xZS )Block1D   c                 4   t                                                       t          j                            t          j                            ||dd          t          j                            ||          t          j                              | _        d S )N   r	   padding)	r   r   r   nn
SequentialConv1d	GroupNormMishblock)r   r   dim_outgroupsr   s       r   r   zBlock1D.__init__!   sm    X((HOOC!QO77Hvw//GII
 



r   c                 <    |                      ||z            }||z  S N)r=   )r   r&   maskoutputs       r   r*   zBlock1D.forward)   s!    AH%%}r   r3   r+   r0   s   @r   r2   r2       sL        
 
 
 
 
 
      r   r2   c                   &     e Zd Zd fd	Zd Z xZS )ResnetBlock1Dr3   c                    t                                                       t          j                            t          j                    t          j                            ||                    | _        t          |||          | _	        t          |||          | _
        t          j                            ||d          | _        d S )N)r?   r	   )r   r   r   r8   r9   r<   Linearmlpr2   block1block2r:   res_conv)r   r   r>   time_emb_dimr?   r   s        r   r   zResnetBlock1D.__init__/   s    8&&rwyy%(//,PW2X2XYYc76:::gwv>>>Wa88r   c                     |                      ||          }||                     |                              d          z  }|                     ||          }||                     ||z            z   }|S )Nr   )rJ   rI   r   rK   rL   )r   r&   rB   time_embhrC   s         r   r*   zResnetBlock1D.forward8   sk    KK4  	TXXh))"---KK4  T]]1t8,,,r   rD   r+   r0   s   @r   rF   rF   .   sL        9 9 9 9 9 9      r   rF   c                   $     e Zd Z fdZd Z xZS )Downsample1Dc                     t                                                       t          j                            ||ddd          | _        d S )Nr5   r   r	   )r   r   r   r8   r:   convr   s     r   r   zDownsample1D.__init__A   s:    HOOCaA66			r   c                 ,    |                      |          S rA   )rT   )r   r&   s     r   r*   zDownsample1D.forwardE   s    yy||r   r+   r0   s   @r   rR   rR   @   sG        7 7 7 7 7      r   rR   c                   R     e Zd Z	 	 	 	 d
dededededee         f
 fdZdd	Z xZS )TimestepEmbeddingsiluNin_channelstime_embed_dimact_fnout_dimpost_act_fnc                 p   t                                                       t          j        ||          | _        |t          j        ||d          | _        nd | _        t          |          | _        ||}n|}t          j        ||          | _        |	d | _	        d S t          |          | _	        d S )NF)bias)
r   r   r8   rH   linear_1	cond_projr   actlinear_2post_act)	r   rY   rZ   r[   r\   r]   cond_proj_dimtime_embed_dim_outr   s	           r   r   zTimestepEmbedding.__init__J   s     		+~>>$Y}kNNNDNN!DN!&))!(!/	.2DEE DMMM*;77DMMMr   c                     |||                      |          z   }|                     |          }| j        |                     |          }|                     |          }| j        |                     |          }|S rA   )ra   r`   rb   rc   rd   )r   sample	conditions      r   r*   zTimestepEmbedding.forwardi   sw     dnnY777Fv&&8XXf%%Fv&&=$]]6**Fr   )rX   NNNrA   )	r,   r-   r.   intstrr   r   r*   r/   r0   s   @r   rW   rW   I   s        
 %)8 88 8 	8
 8 c]8 8 8 8 8 8>       r   rW   c                   *     e Zd ZdZd fd	Zd Z xZS )	
Upsample1Da  A 1D upsampling layer with an optional convolution.

    Parameters:
        channels (`int`):
            number of channels in the inputs and outputs.
        use_conv (`bool`, default `False`):
            option to use a convolution.
        use_conv_transpose (`bool`, default `False`):
            option to use a convolution transpose.
        out_channels (`int`, optional):
            number of output channels. Defaults to `channels`.
    FTNrT   c                 B   t                                                       || _        |p|| _        || _        || _        || _        d | _        |r$t          j	        || j        ddd          | _        d S |r)t          j
        | j        | j        dd          | _        d S d S )N   r   r	   r5   r6   )r   r   channelsout_channelsuse_convuse_conv_transposenamerT   r8   ConvTranspose1dr:   )r   rp   rr   rs   rq   rt   r   s         r   r   zUpsample1D.__init__   s     (4H "4		 	R*8T5F1aPPDIII 	R	$-1BAqQQQDIII	R 	Rr   c                     |j         d         | j        k    sJ | j        r|                     |          S t	          j        |dd          }| j        r|                     |          }|S )Nr	   g       @nearest)scale_factormode)shaperp   rs   rT   Finterpolaterr   )r   inputsoutputss      r   r*   zUpsample1D.forward   sn    |A$-////" 	%99V$$$-SyIII= 	)ii((Gr   )FTNrT   )r,   r-   r.   __doc__r   r*   r/   r0   s   @r   rm   rm   x   s\         R R R R R R
 
 
 
 
 
 
r   rm   c            
       F     e Zd Zdddddddddd	 fd	
Z	 	 	 d fd	Z xZS )ConformerWrapper@   r3   ro   r      r   F)	dim_headheadsff_multconv_expansion_factorconv_kernel_sizeattn_dropout
ff_dropoutconv_dropoutconv_causalc       
         ^    t                                          |||||||||	|

  
         d S )N)
r   r   r   r   r   r   r   r   r   r   )r   r   )r   r   r   r   r   r   r   r   r   r   r   r   s              r   r   zConformerWrapper.__init__   sN     	"7-%!%# 	 	
 	
 	
 	
 	
r   Nc                 n    t                                          ||                                          S )N)r&   rB   )r   r*   bool)r   hidden_statesattention_maskencoder_hidden_statesencoder_attention_masktimestepr   s         r   r*   zConformerWrapper.forward   s+     ww^5H5H5J5JKKKr   )NNNr+   r0   s   @r   r   r      s        
 
 
 
 
 
 
 
> ##L L L L L L L L L Lr   r   c                   X     e Zd Z	 	 	 	 	 	 	 	 	 	 d fd		Zed
             Zd ZddZ xZS )Decoder   r   皙?r   r	   r   ro   snaketransformerc           
      6   	
 t                                                       t          |          }| _        | _        t          |           _        |d         dz  }t          ||d           _        t          j
        g            _        t          j
        g            _        t          j
        g            _        |t          t          |                    D ]}}||         |t          |          dz
  k    }t!          ||          }t          j
        	
 fdt          |          D                       }|st#                    nt          j        dd	          } j                            t          j
        |||g                     t          |          D ]}|d
         }|d
         }t!          ||          }t          j
        	 fdt          |          D                       } j                            t          j
        ||g                     |d d d
         |d         fz   }t          t          |          dz
            D ]}||         }||dz            |t          |          dz
  k    }t!          d|z  |          }t          j
        	 fdt          |          D                       }|st)          d          nt          j        dd	          } j                            t          j
        |||g                     t+          |d
         |d
                    _        t          j        |d
          j        d           _                                          d S )Nr   ro   rX   )rY   rZ   r[   r	   )r   r>   rM   c           
      D    g | ]}                               S  	get_block)	.0_r[   attention_head_dimdown_block_typedropout	num_headsoutput_channelr   s	     r   
<listcomp>z$Decoder.__init__.<locals>.<listcomp>   sK     
 
 
  NN'&*! 
 
 
r   r5   r6   r   c           
      D    g | ]}                               S r   r   )	r   r   r[   r   r   mid_block_typer   r   r   s	     r   r   z$Decoder.__init__.<locals>.<listcomp>	  sK     
 
 
  NN&&*! 
 
 
r   r   c           
      D    g | ]}                               S r   r   )	r   r   r[   r   r   r   r   r   up_block_types	     r   r   z$Decoder.__init__.<locals>.<listcomp>$  sK     
 
 
  NN%&*! 
 
 
r   T)rs   )r   r   tuplerY   rq   r   time_embeddingsrW   time_mlpr8   
ModuleListdown_blocks
mid_blocks	up_blocksrangelenrF   rR   r:   appendrm   r2   final_block
final_projinitialize_weights)r   rY   rq   rp   r   r   n_blocksnum_mid_blocksr   r[   r   r   r   rZ   iinput_channelis_lastresnettransformer_blocks
downsampleupsampler   r   s   `   ``  `````        @r   r   zDecoder.__init__   s2    	??&(/<<!!q)#)
 
 
 =,,-++r**$s8}}%% 	] 	]A*M%a[N3x==1,,G"}n[ijjjF!#
 
 
 
 
 
 
 
 
 
 #8__
 
 
" " 5<x^,,,>[iklvwAxAxAx  ##BM6;Mz2Z$[$[\\\\~&& 	P 	PA$RLM#B<L"}n[ijjjF!#
 
 
 
 
 
 
 
 
 
 #8__
 
 
" " O""2=&:L1M#N#NOOOODDbD>Xa[N2s8}}q()) 	Y 	YA$QKM%a!e_N3x==1,,G"%&+  F
 "$
 
 
 
 
 
 
 
 
 
 #8__
 
 
" " M
>dCCCCY~~q!LLL  N!!"-9KX0V"W"WXXXX"8B<">>)HRL$2CQGG!!!!!r   c                     | dk    rt          |||dd|||d	  	        }n-| dk    rt          |||||          }nt          d|            |S )	N	conformerr	   r   r   )	r   r   r   r   r   r   r   r   r   r   )r   num_attention_headsr   r   activation_fnzUnknown block type )r   r
   
ValueError)
block_typer   r   r   r   r[   r=   s          r   r   zDecoder.get_block>  s    $$$+&'"$$!#
 
 
EE =(()$-#5$  EE ?:??@@@r   c                    |                                  D ]A}t          |t          j                  rSt          j                            |j        d           |j        %t          j                            |j        d           pt          |t          j	                  rKt          j                            |j        d           t          j                            |j        d           t          |t          j
                  rRt          j                            |j        d           |j        %t          j                            |j        d           Cd S )Nrelu)nonlinearityr   r	   )modules
isinstancer8   r:   initkaiming_normal_weightr_   	constant_r;   rH   )r   ms     r   r   zDecoder.initialize_weightsY  s    	1 	1A!RY'' 1''v'FFF6%G%%afa000Ar|,, 1!!!(A...!!!&!,,,,Ary)) 1''v'FFF6%G%%afa000	1 	1r   Nc           	      N   |                      |          }|                     |          }t          ||gd          d         }|5t          |d|j        d                   }t          ||gd          d         }g }|g}| j        D ]\  }	}
}|d         } |	|||          }t          |d          }t          |d          }|
D ]} ||||	          }t          |d
          }t          |d          }|                    |            |||z            }|                    |dddddddf                    |dd         }|d         }| j        D ]e\  }	}
 |	|||          }t          |d          }t          |d          }|
D ]} ||||	          }t          |d
          }t          |d          }f| j	        D ]\  }	}
}|
                                } |	t          ||
                                gd          d         ||          }t          |d          }t          |d          }|
D ]} ||||	          }t          |d
          }t          |d          } |||z            }|                     ||          }|                     ||z            }||z  S )a.  Forward pass of the UNet1DConditional model.

        Args:
            x (torch.Tensor): shape (batch_size, in_channels, time)
            mask (_type_): shape (batch_size, 1, time)
            t (_type_): shape (batch_size)
            spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None.
            cond (_type_, optional): placeholder for future use. Defaults to None.

        Raises:
            ValueError: _description_
            ValueError: _description_

        Returns:
            _type_: _description_
        zb * tr   Nzb c -> b c tr   )tzb c t -> b t czb 1 t -> b t)r   r   r   zb t c -> b c tzb t -> b 1 tr   )r   r   r   r   rz   r   r   r   r   r   popr   r   )r   r&   rB   mur   spkscondhiddensmasksr   r   r   	mask_downtransformer_blockmask_midr   mask_uprC   s                     r   r*   zDecoder.forwardk  sC   $   ##MM!!R'""1%$!'"+>>>DaY((+A6:6F 	/ 	/2F&
b	Iq)Q''A!-..A!)^<<I%7  !%%"##,  
 !-..A!)^<<INN1
1y=))ALL111aaa19-....crc
9*./ 	; 	;&F&q(A&&A!-..A >::H%7  !%%"##+  
 !-..A >::HH48N 	& 	&0F&iikkGtQ.88;WaHHA!-..A88G%7  !%%"##*  
 !-..A88GW%%AAQ((W--}r   )
r   r   r   r	   r   ro   r   r   r   r   )NN)	r,   r-   r.   r   staticmethodr   r   r*   r/   r0   s   @r   r   r      s        
 %$#r" r" r" r" r" r"j   \41 1 1$P P P P P P P Pr   r   )r   typingr   r   torch.nnr8   torch.nn.functional
functionalr{   r   r   diffusers.models.activationsr   einopsr   r   r   r   r
   Moduler   r2   rF   rR   rW   rm   r   r   r   r   r   <module>r      s-                          $ $ $ $ $ $ 7 7 7 7 7 7 * * * * * * * * * * . . . . . .    ux   $    eho       EHO   $    29   , , , , ,	 , , ,^& & & & & & & &R$L $L $L $L $L~ $L $L $LNs s s s sbi s s s s sr   