"""Encoder self-attention layer definition."""

from typing import Optional, Tuple

import torch
from torch import nn


class TransformerEncoderLayer(nn.Module):
    """Encoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
            instance can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward` instance can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool):
            True: use layer_norm before each sub-block.
            False: use layer_norm after each sub-block.
    """

    def __init__(
        self,
        size: int,
        self_attn: torch.nn.Module,
        feed_forward: torch.nn.Module,
        dropout_rate: float,
        normalize_before: bool = True,
    ):
        """Construct an EncoderLayer object."""
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.norm1 = nn.LayerNorm(size, eps=1e-12)
        self.norm2 = nn.LayerNorm(size, eps=1e-12)
        self.dropout = nn.Dropout(dropout_rate)
        self.size = size
        self.normalize_before = normalize_before

    def forward(
        self,
        x: torch.Tensor,
        mask: torch.Tensor,
        pos_emb: torch.Tensor,
        mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
        cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute encoded features.

        Args:
            x (torch.Tensor): (#batch, time, size)
            mask (torch.Tensor): Mask tensor for the input (#batch, time, time),
                (0, 0, 0) means fake mask.
            pos_emb (torch.Tensor): just for interface compatibility
                with ConformerEncoderLayer
            mask_pad (torch.Tensor): not used in the transformer layer,
                just for a unified API with the conformer.
            att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
                (#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
            cnn_cache (torch.Tensor): Convolution cache in conformer layer
                (#batch=1, size, cache_t2), not used here, it's for interface
                compatibility with ConformerEncoderLayer.
        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (#batch, time, time).
            torch.Tensor: att_cache tensor,
                (#batch=1, head, cache_t1 + time, d_k * 2).
            torch.Tensor: cnn_cache tensor (#batch=1, size, cache_t2).

        """
        # Self-attention sub-block (pre-norm or post-norm depending on
        # normalize_before).
        residual = x
        if self.normalize_before:
            x = self.norm1(x)
        x_att, new_att_cache = self.self_attn(x, x, x, mask,
                                              pos_emb=pos_emb,
                                              cache=att_cache)
        x = residual + self.dropout(x_att)
        if not self.normalize_before:
            x = self.norm1(x)

        # Feed-forward sub-block.
        residual = x
        if self.normalize_before:
            x = self.norm2(x)
        x = residual + self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm2(x)

        # No convolution here; return an empty cache for API compatibility
        # with ConformerEncoderLayer.
        fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
        return x, mask, new_att_cache, fake_cnn_cache
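

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original API): the helper below shows
# the forward() contract documented above, in particular how `att_cache` grows
# by `time` steps on every chunk.  `_ToySelfAttention` is a hypothetical
# stand-in that only mimics the (q, k, v, mask, pos_emb, cache) interface; a
# real model wires in `MultiHeadedAttention` / `PositionwiseFeedForward`
# instances instead.
# ---------------------------------------------------------------------------
def _transformer_layer_cache_sketch():  # pragma: no cover - illustrative only
    class _ToySelfAttention(nn.Module):
        """Interface-compatible stand-in: ignores mask/pos_emb, stacks a fake K/V cache."""

        def __init__(self, size: int):
            super().__init__()
            self.proj = nn.Linear(size, size)

        def forward(self, q, k, v, mask,
                    pos_emb=torch.empty(0),
                    cache=torch.zeros((0, 0, 0, 0))):
            # Fake cache layout: (#batch, head=1, cache_t1 + time, d_k * 2).
            kv = torch.cat([k, v], dim=-1).unsqueeze(1)
            new_cache = torch.cat([cache, kv], dim=2) if cache.size(0) > 0 else kv
            return self.proj(q), new_cache

    size = 8
    layer = TransformerEncoderLayer(
        size=size,
        self_attn=_ToySelfAttention(size),
        feed_forward=nn.Sequential(nn.Linear(size, size), nn.ReLU(),
                                   nn.Linear(size, size)),
        dropout_rate=0.1,
    ).eval()

    att_cache = torch.zeros((1, 1, 0, 2 * size))  # empty cache before the first chunk
    for _ in range(3):  # feed three chunks of 4 frames each
        chunk = torch.randn(1, 4, size)               # (#batch, time, size)
        mask = torch.ones(1, 4, 4, dtype=torch.bool)  # (#batch, time, time)
        out, mask, att_cache, _ = layer(chunk, mask,
                                        pos_emb=torch.empty(0),  # unused by this layer
                                        att_cache=att_cache)
    assert att_cache.shape == (1, 1, 12, 2 * size)  # cache now spans 3 * 4 frames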


class ConformerEncoderLayer(nn.Module):
    """Encoder layer module.
    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
            instance can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward` instance can be used as the argument.
        feed_forward_macaron (torch.nn.Module): Additional feed-forward module
            instance.
            `PositionwiseFeedForward` instance can be used as the argument.
        conv_module (torch.nn.Module): Convolution module instance.
            `ConvolutionModule` instance can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool):
            True: use layer_norm before each sub-block.
            False: use layer_norm after each sub-block.
    """

    def __init__(
        self,
        size: int,
        self_attn: torch.nn.Module,
        feed_forward: Optional[nn.Module] = None,
        feed_forward_macaron: Optional[nn.Module] = None,
        conv_module: Optional[nn.Module] = None,
        dropout_rate: float = 0.1,
        normalize_before: bool = True,
    ):
        """Construct an EncoderLayer object."""
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.feed_forward_macaron = feed_forward_macaron
        self.conv_module = conv_module
        self.norm_ff = nn.LayerNorm(size, eps=1e-12)  # for the FFN module
        self.norm_mha = nn.LayerNorm(size, eps=1e-12)  # for the MHA module
        if feed_forward_macaron is not None:
            self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-12)
            self.ff_scale = 0.5
        else:
            self.ff_scale = 1.0
        if self.conv_module is not None:
            self.norm_conv = nn.LayerNorm(size, eps=1e-12)  # for the CNN module
            self.norm_final = nn.LayerNorm(size, eps=1e-12)  # for the block output
        self.dropout = nn.Dropout(dropout_rate)
        self.size = size
        self.normalize_before = normalize_before

    def forward(
        self,
        x: torch.Tensor,
        mask: torch.Tensor,
        pos_emb: torch.Tensor,
        mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
        cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute encoded features.

        Args:
            x (torch.Tensor): (#batch, time, size)
            mask (torch.Tensor): Mask tensor for the input (#batch, time, time),
                (0, 0, 0) means fake mask.
            pos_emb (torch.Tensor): positional encoding, must not be None
                for ConformerEncoderLayer.
            mask_pad (torch.Tensor): batch padding mask used for conv module.
                (#batch, 1, time), (0, 0, 0) means fake mask.
            att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
                (#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
            cnn_cache (torch.Tensor): Convolution cache in conformer layer
                (#batch=1, size, cache_t2)
        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (#batch, time, time).
            torch.Tensor: att_cache tensor,
                (#batch=1, head, cache_t1 + time, d_k * 2).
            torch.Tensor: cnn_cache tensor (#batch, size, cache_t2).
        """
        # Macaron-style feed-forward module (only if configured).
        if self.feed_forward_macaron is not None:
            residual = x
            if self.normalize_before:
                x = self.norm_ff_macaron(x)
            x = residual + self.ff_scale * self.dropout(
                self.feed_forward_macaron(x))
            if not self.normalize_before:
                x = self.norm_ff_macaron(x)

        # Multi-headed self-attention module.
        residual = x
        if self.normalize_before:
            x = self.norm_mha(x)
        x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb,
                                              att_cache)
        x = residual + self.dropout(x_att)
        if not self.normalize_before:
            x = self.norm_mha(x)

        # Convolution module.
        # Fake new cnn cache here, and then change it in conv_module.
        new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
        if self.conv_module is not None:
            residual = x
            if self.normalize_before:
                x = self.norm_conv(x)
            x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache)
            x = residual + self.dropout(x)
            if not self.normalize_before:
                x = self.norm_conv(x)

        # Feed-forward module.
        residual = x
        if self.normalize_before:
            x = self.norm_ff(x)
        x = residual + self.ff_scale * self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm_ff(x)

        if self.conv_module is not None:
            x = self.norm_final(x)

        return x, mask, new_att_cache, new_cnn_cache
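

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original API): ConformerEncoderLayer
# treats `feed_forward_macaron` and `conv_module` as optional.  The helper
# below uses a hypothetical `_ToyRelAttention` stand-in (interface only) to
# show two consequences documented above: a macaron feed-forward halves both
# FFN branches (ff_scale == 0.5), and without a conv module the returned
# cnn_cache stays a fake empty tensor.
# ---------------------------------------------------------------------------
def _conformer_layer_sketch():  # pragma: no cover - illustrative only
    class _ToyRelAttention(nn.Module):
        """Interface-compatible stand-in for the relative-position attention."""

        def __init__(self, size: int):
            super().__init__()
            self.proj = nn.Linear(size, size)

        def forward(self, q, k, v, mask,
                    pos_emb=torch.empty(0),
                    cache=torch.zeros((0, 0, 0, 0))):
            return self.proj(q), cache

    size = 8
    layer = ConformerEncoderLayer(
        size,
        self_attn=_ToyRelAttention(size),
        feed_forward=nn.Linear(size, size),          # stand-in for PositionwiseFeedForward
        feed_forward_macaron=nn.Linear(size, size),  # enables the macaron branch
        conv_module=None,                            # no convolution branch
        dropout_rate=0.1,
    ).eval()
    assert layer.ff_scale == 0.5  # halved because the macaron FFN is present

    x = torch.randn(1, 4, size)                   # (#batch, time, size)
    mask = torch.ones(1, 4, 4, dtype=torch.bool)  # (#batch, time, time)
    pos_emb = torch.zeros(1, 4, size)             # real models pass relative positional encodings
    out, mask, att_cache, cnn_cache = layer(x, mask, pos_emb)
    assert out.shape == (1, 4, size)
    assert cnn_cache.numel() == 0  # fake cache because conv_module is None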