
    0;jiz1                        d Z ddlmZmZ ddlZ G d dej        j                  Z G d de          Z G d d	e          Z	 G d
 de          Z
 G d de          Z G d de          Z G d de          Z G d de          ZdS )zSubsampling layer definition.    )TupleUnionNc                   V     e Zd Z fdZdeeej        f         dedej        fdZ xZ	S )BaseSubsamplingc                 d    t                                                       d| _        d| _        d S )Nr      )super__init__right_contextsubsampling_rate)self	__class__s    i/root/voice-cloning/.venv/lib/python3.11/site-packages/chatterbox/models/s3gen/transformer/subsampling.pyr
   zBaseSubsampling.__init__   s/     !    offsetsizereturnc                 8    | j                             ||          S N)pos_encposition_encoding)r   r   r   s      r   r   z!BaseSubsampling.position_encoding   s    |--fd;;;r   )
__name__
__module____qualname__r
   r   inttorchTensorr   __classcell__r   s   @r   r   r      sq        " " " " "
<c5<.?(@ < #<(-< < < < < < < <r   r   c                        e Zd ZdZdedededej        j        f fdZ		 ddej
        d	ej
        d
eeej
        f         deej
        ej
        ej
        f         fdZ xZS )EmbedinigNoSubsamplingz(Embedding input without subsampling
    idimodimdropout_ratepos_enc_classc                     t                                                       t          j                            ||          | _        || _        d S r   )r	   r
   r   nn	Embeddingembedr   r   r"   r#   r$   r%   r   s        r   r
   zEmbedinigNoSubsampling.__init__'   s=    X''d33
$r   r   xx_maskr   r   c                 h    |                      |          }|                     ||          \  }}|||fS a  Input x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: linear input tensor (#batch, time', odim),
                where time' = time .
            torch.Tensor: linear input mask (#batch, 1, time'),
                where time' = time .

        )r)   r   r   r+   r,   r   pos_embs        r   forwardzEmbedinigNoSubsampling.forward-   s7    & JJqMM\\!V,,
7'6!!r   r   r   r   r   __doc__r   floatr   r'   Moduler
   r   r   r   r1   r   r   s   @r   r!   r!   #   s         %S % %5 % %% % % % % % ,-	" "<" " c5<'(	"
 
u|U\5<7	8" " " " " " " "r   r!   c                        e Zd ZdZdedededej        j        f fdZ		 ddej
        d	ej
        d
eeej
        f         deej
        ej
        ej
        f         fdZ xZS )LinearNoSubsamplingLinear transform the input without subsampling

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.

    r"   r#   r$   r%   c                 r   t                                                       t          j                            t          j                            ||          t          j                            |d          t          j                            |                    | _        || _	        d| _
        d| _        dS zConstruct an linear object.gh㈵>)epsr   r   N)r	   r
   r   r'   
SequentialLinear	LayerNormDropoutoutr   r   r   r*   s        r   r
   zLinearNoSubsampling.__init__O   s     	8&&HOOD$''Ht..H\**
 

 % !r   r   r+   r,   r   r   c                 h    |                      |          }|                     ||          \  }}|||fS r.   rA   r   r/   s        r   r1   zLinearNoSubsampling.forward\   7    & HHQKK\\!V,,
7'6!!r   r2   r3   r   s   @r   r8   r8   E   s         "S " "5 " %" " " " " "" ,-	" "<" " c5<'(	"
 
u|U\5<7	8" " " " " " " "r   r8   c                        e Zd ZdZdedededej        j        f fdZ		 ddej
        d	ej
        d
eeej
        f         deej
        ej
        ej
        f         fdZ xZS )Conv1dSubsampling2a!  Convolutional 1D subsampling (to 1/2 length).
       It is designed for Whisper, ref:
       https://github.com/openai/whisper/blob/main/whisper/model.py

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.

    r"   r#   r$   r%   c                    t                                                       t          j                            t          j                            ||dd          t          j                                        t          j                            ||ddd          t          j                                                  | _        || _        d| _	        d| _
        dS )z'Construct an Conv1dSubsampling2 object.   r   )kernel_sizepadding   )rI   striderJ      N)r	   r
   r   r'   r=   Conv1dGELUconvr   r   r   r*   s        r   r
   zConv1dSubsampling2.__init__   s     	H''HOOD$AqOAAHMMOOHOOD$AaOKKHMMOO	
 
	 % !"r   r   r+   r,   r   r   c                    |                     d          }|                    dd          }|                     |          }|                    dd          }|                     ||          \  }}|||dddd|dz   dz  ddf         fS )a  Subsample x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 2.
            torch.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 2.
            torch.Tensor: positional encoding

        r   rK   N)r   	transposerP   r   )r   r+   r,   r   timer0   s         r   r1   zConv1dSubsampling2.forward   s    ( vvayyKK1IIaLLKK1\\!V,,
7'6!!!QQQQ(9(9"9:::r   r2   r3   r   s   @r   rF   rF   t   s        	 	S  5  %     * ,-	; ;<; ; c5<'(	;
 
u|U\5<7	8; ; ; ; ; ; ; ;r   rF   c                        e Zd ZdZdedededej        j        f fdZ		 ddej
        d	ej
        d
eeej
        f         deej
        ej
        ej
        f         fdZ xZS )Conv2dSubsampling4zConvolutional 2D subsampling (to 1/4 length).

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.

    r"   r#   r$   r%   c           
      R   t                                                       t          j                            t          j                            d|dd          t          j                                        t          j                            ||dd          t          j                                                  | _        t          j                            t          j                            ||dz
  dz  dz
  dz  z  |                    | _	        || _
        d| _        d| _        dS )z'Construct an Conv2dSubsampling4 object.r   rH   rK   rM      N)r	   r
   r   r'   r=   Conv2dReLUrP   r>   rA   r   r   r   r*   s        r   r
   zConv2dSubsampling4.__init__   s     	H''HOOAtQ**HMMOOHOOD$1--HMMOO	
 
	 8&&HOODdQh1_q%8Q$>?FFH H$ !"r   r   r+   r,   r   r   c                    |                     d          }|                     |          }|                                \  }}}}|                     |                    dd                                                              ||||z                      }|                     ||          \  }}|||dddddddf         dddddddf         fS )a  Subsample x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 4.
            torch.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 4.
            torch.Tensor: positional encoding

        r   rK   N)	unsqueezerP   r   rA   rR   
contiguousviewr   	r   r+   r,   r   bctfr0   s	            r   r1   zConv2dSubsampling4.forward   s    ( KKNNIIaLLVVXX
1aHHQ[[A&&113388Aq1uEEFF\\!V,,
7'6!!!QQQ1*-aaaADqDj999r   r2   r3   r   s   @r   rU   rU      s         S  5  %     . ,-	: :<: : c5<'(	:
 
u|U\5<7	8: : : : : : : :r   rU   c                        e Zd ZdZdedededej        j        f fdZ		 ddej
        d	ej
        d
eeej
        f         deej
        ej
        ej
        f         fdZ xZS )Conv2dSubsampling6zConvolutional 2D subsampling (to 1/6 length).
    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
        pos_enc (torch.nn.Module): Custom position encoding layer.
    r"   r#   r$   r%   c           
         t                                                       t          j                            t          j                            d|dd          t          j                                        t          j                            ||dd          t          j                                                  | _        t          j                            ||dz
  dz  dz
  dz  z  |          | _	        || _
        d| _        d| _        dS )z'Construct an Conv2dSubsampling6 object.r   rH   rK      rW   
   Nr	   r
   r   r'   r=   rX   rY   rP   r>   linearr   r   r   r*   s        r   r
   zConv2dSubsampling6.__init__   s     	H''HOOAtQ**HMMOOHOOD$1--HMMOO	
 
	 hoodqQ/Bq.H&I&*, ,$ !r   r   r+   r,   r   r   c                    |                     d          }|                     |          }|                                \  }}}}|                     |                    dd                                                              ||||z                      }|                     ||          \  }}|||dddddddf         dddddddf         fS )a  Subsample x.
        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 6.
            torch.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 6.
            torch.Tensor: positional encoding
        r   rK   NrM   rH   r[   rP   r   ri   rR   r\   r]   r   r^   s	            r   r1   zConv2dSubsampling6.forward   s    $ KKNNIIaLLVVXX
1aKKAq))4466;;Aq!a%HHII\\!V,,
7'6!!!QQQ1*-aaaADqDj999r   r2   r3   r   s   @r   rd   rd      s          S    5   %           * ,-	: :<: : c5<'(	:
 
u|U\5<7	8: : : : : : : :r   rd   c                        e Zd ZdZdedededej        j        f fdZ		 ddej
        d	ej
        d
eeej
        f         deej
        ej
        ej
        f         fdZ xZS )Conv2dSubsampling8zConvolutional 2D subsampling (to 1/8 length).

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.

    r"   r#   r$   r%   c                    t                                                       t          j                            t          j                            d|dd          t          j                                        t          j                            ||dd          t          j                                        t          j                            ||dd          t          j                                                  | _        t          j                            ||dz
  dz  dz
  dz  dz
  dz  z  |          | _	        || _
        d| _        d| _        dS )z'Construct an Conv2dSubsampling8 object.r   rH   rK         Nrh   r*   s        r   r
   zConv2dSubsampling8.__init__$  s     	H''HOOAtQ**HMMOOHOOD$1--HMMOOHOOD$1--HMMOO
 
	 hootaxAo)a/!39:DB B$ !r   r   r+   r,   r   r   c                    |                     d          }|                     |          }|                                \  }}}}|                     |                    dd                                                              ||||z                      }|                     ||          \  }}|||dddddddf         dddddddf         dddddddf         fS )a  Subsample x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 8.
            torch.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 8.
            torch.Tensor: positional encoding
        r   rK   Nrk   r^   s	            r   r1   zConv2dSubsampling8.forward7  s    & KKNNIIaLLVVXX
1aKKAq))4466;;Aq!a%HHII\\!V,,
7'6!!!QQQ1*-aaaADqDj9!!!QQQ1*EEEr   r2   r3   r   s   @r   rm   rm     s          S    5   %           . ,-	F F<F F c5<'(	F
 
u|U\5<7	8F F F F F F F Fr   rm   c                        e Zd ZdZdedededej        j        f fdZ		 ddej
        d	ej
        d
eeej
        f         deej
        ej
        ej
        f         fdZ xZS )LegacyLinearNoSubsamplingr9   r"   r#   r$   r%   c                    t                                                       t          j                            t          j                            ||          t          j                            |d          t          j                            |          t          j                                                  | _	        || _
        d| _        d| _        dS r;   )r	   r
   r   r'   r=   r>   r?   r@   rY   rA   r   r   r   r*   s        r   r
   z"LegacyLinearNoSubsampling.__init__\  s     	8&&HOOD$''Ht..H\**HMMOO	
 
 % !r   r   r+   r,   r   r   c                 h    |                      |          }|                     ||          \  }}|||fS r.   rC   r/   s        r   r1   z!LegacyLinearNoSubsampling.forwardj  rD   r   r2   r3   r   s   @r   rs   rs   R  s         "S " "5 " %" " " " " "$ ,-	" "<" " c5<'(	"
 
u|U\5<7	8" " " " " " " "r   rs   )r4   typingr   r   r   r'   r6   r   r!   r8   rF   rU   rd   rm   rs    r   r   <module>rx      s    $ #         	< 	< 	< 	< 	<eho 	< 	< 	<" " " " "_ " " "D," ," ," ," ,"/ ," ," ,"^6; 6; 6; 6; 6; 6; 6; 6;r6: 6: 6: 6: 6: 6: 6: 6:r1: 1: 1: 1: 1: 1: 1: 1:h5F 5F 5F 5F 5F 5F 5F 5Fp-" -" -" -" -" -" -" -" -" -"r   