
    ~VjiT'                        d dl mZmZ d dlZdgZdej        dej        fdZ G d dej        j                  Z	 G d	 d
ej        j                  Z
 G d dej        j                  Z G d dej        j                  ZdS )    )OptionalTupleN	Conformerlengthsreturnc                    | j         d         }t          t          j        |                                                     }t          j        || j        | j                                      ||          | 	                    d          k    }|S )Nr   )devicedtype   )
shapeinttorchmaxitemaranger	   r
   expand	unsqueeze)r   
batch_size
max_lengthpadding_masks       U/root/voice-cloning/.venv/lib/python3.11/site-packages/torchaudio/models/conformer.py_lengths_to_padding_maskr   	   s~    q!JUYw'',,..//J<
7>WWW^^J 			1		L     c                   l     e Zd ZdZ	 	 	 ddededededed	ed
df fdZdej	        d
ej	        fdZ
 xZS )_ConvolutionModulea  Conformer convolution module.

    Args:
        input_dim (int): input dimension.
        num_channels (int): number of depthwise convolution layer input channels.
        depthwise_kernel_size (int): kernel size of depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        bias (bool, optional): indicates whether to add bias term to each convolution layer. (Default: ``False``)
        use_group_norm (bool, optional): use GroupNorm rather than BatchNorm. (Default: ``False``)
            F	input_dimnum_channelsdepthwise_kernel_sizedropoutbiasuse_group_normr   Nc                 .   t                                                       |dz
  dz  dk    rt          d          t          j                            |          | _        t          j                            t          j                            |d|z  ddd|          t          j        	                    d          t          j                            |||d|dz
  dz  ||          |r!t          j        
                    d|          nt          j                            |          t          j                                        t          j                            ||ddd|	          t          j                            |                    | _        d S )
Nr      r   z<depthwise_kernel_size must be odd to achieve 'SAME' padding.)stridepaddingr!   )dim)r%   r&   groupsr!   )
num_groupsr   )kernel_sizer%   r&   r!   )super__init__
ValueErrorr   nn	LayerNorm
layer_norm
SequentialConv1dGLU	GroupNormBatchNorm1dSiLUDropout
sequential)selfr   r   r   r    r!   r"   	__class__s          r   r,   z_ConvolutionModule.__init__   sq    	!A%*a//[\\\(,,Y77(--HOOL     HLLQLHOO%.2q8#    4EH!,GGG%%l33HMMOOHOO    HW%%? 
  
r   inputc                     |                      |          }|                    dd          }|                     |          }|                    dd          S )z
        Args:
            input (torch.Tensor): with shape `(B, T, D)`.

        Returns:
            torch.Tensor: output, with shape `(B, T, D)`.
        r   r$   )r0   	transposer8   )r9   r;   xs      r   forwardz_ConvolutionModule.forwardM   sM     OOE""KK1OOA{{1a   r   r   FF)__name__
__module____qualname____doc__r   floatboolr,   r   Tensorr?   __classcell__r:   s   @r   r   r      s        	 	  $-
 -
-
 -
  #	-

 -
 -
 -
 
-
 -
 -
 -
 -
 -
^!U\ !el ! ! ! ! ! ! ! !r   r   c            	       Z     e Zd ZdZddedededdf fdZd	ej        dej        fd
Z	 xZ
S )_FeedForwardModulezPositionwise feed forward layer.

    Args:
        input_dim (int): input dimension.
        hidden_dim (int): hidden dimension.
        dropout (float, optional): dropout probability. (Default: 0.0)
    r   r   
hidden_dimr    r   Nc                     t                                                       t          j                            t          j                            |          t          j                            ||d          t          j                                        t          j                            |          t          j                            ||d          t          j                            |                    | _	        d S )NT)r!   )
r+   r,   r   r.   r1   r/   Linearr6   r7   r8   )r9   r   rL   r    r:   s       r   r,   z_FeedForwardModule.__init__d   s    (--Hy))HOOIzO==HMMOOHW%%HOOJ	O==HW%%
 
r   r;   c                 ,    |                      |          S )z
        Args:
            input (torch.Tensor): with shape `(*, D)`.

        Returns:
            torch.Tensor: output, with shape `(*, D)`.
        )r8   )r9   r;   s     r   r?   z_FeedForwardModule.forwardo   s     u%%%r   )r   )rA   rB   rC   rD   r   rE   r,   r   rG   r?   rH   rI   s   @r   rK   rK   [   s         	
 	
# 	
3 	
 	
QU 	
 	
 	
 	
 	
 	
&U\ &el & & & & & & & &r   rK   c                        e Zd ZdZ	 	 	 ddededededed	ed
eddf fdZdej	        dej	        fdZ
dej	        deej	                 dej	        fdZ xZS )ConformerLayera  Conformer layer that constitutes Conformer.

    Args:
        input_dim (int): input dimension.
        ffn_dim (int): hidden layer dimension of feedforward network.
        num_attention_heads (int): number of attention heads.
        depthwise_conv_kernel_size (int): kernel size of depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d``
            in the convolution module. (Default: ``False``)
        convolution_first (bool, optional): apply the convolution module ahead of
            the attention module. (Default: ``False``)
    r   Fr   ffn_dimnum_attention_headsdepthwise_conv_kernel_sizer    r"   convolution_firstr   Nc                    t                                                       t          |||          | _        t          j                            |          | _        t          j                            |||          | _	        t          j        
                    |          | _        t          ||||d|          | _        t          |||          | _        t          j                            |          | _        || _        d S )N)r    T)r   r   r   r    r!   r"   )r+   r,   rK   ffn1r   r.   r/   self_attn_layer_normMultiheadAttention	self_attnr7   self_attn_dropoutr   conv_moduleffn2final_layer_normrU   )	r9   r   rR   rS   rT   r    r"   rU   r:   s	           r   r,   zConformerLayer.__init__   s     	&y'7KKK	$)H$6$6y$A$A!44Y@S]d4ee!&!1!1'!:!:-""<)
 
 
 'y'7KKK	 % 2 29 = =!2r   r;   c                     |}|                     dd          }|                     |          }|                     dd          }||z   }|S )Nr   r   )r=   r\   )r9   r;   residuals      r   _apply_convolutionz!ConformerLayer._apply_convolution   sO    1%%  ''1%%5 r   key_padding_maskc                    |}|                      |          }|dz  |z   }| j        r|                     |          }|}|                     |          }|                     ||||d          \  }}|                     |          }||z   }| j        s|                     |          }|}|                     |          }|dz  |z   }|                     |          }|S )a
  
        Args:
            input (torch.Tensor): input, with shape `(T, B, D)`.
            key_padding_mask (torch.Tensor or None): key padding mask to use in self attention layer.

        Returns:
            torch.Tensor: output, with shape `(T, B, D)`.
        g      ?F)querykeyvaluerb   need_weights)rW   rU   ra   rX   rZ   r[   r]   r^   )r9   r;   rb   r`   r>   _s         r   r?   zConformerLayer.forward   s     IIeGh! 	+''**A%%a((~~-  
 
1 ""1%%L% 	+''**AIIaLLGh!!!$$r   r@   )rA   rB   rC   rD   r   rE   rF   r,   r   rG   ra   r   r?   rH   rI   s   @r   rQ   rQ   z   s        ( $"'3 33 3 !	3
 %(3 3 3  3 
3 3 3 3 3 3>     $U\ $Xel=S $X]Xd $ $ $ $ $ $ $ $r   rQ   c                        e Zd ZdZ	 	 	 ddededededed	ed
edef fdZdej	        dej	        de
ej	        ej	        f         fdZ xZS )r   a(  Conformer architecture introduced in
    *Conformer: Convolution-augmented Transformer for Speech Recognition*
    :cite:`gulati2020conformer`.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads in each Conformer layer.
        ffn_dim (int): hidden layer dimension of feedforward networks.
        num_layers (int): number of Conformer layers to instantiate.
        depthwise_conv_kernel_size (int): kernel size of each Conformer layer's depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d``
            in the convolution module. (Default: ``False``)
        convolution_first (bool, optional): apply the convolution module ahead of
            the attention module. (Default: ``False``)

    Examples:
        >>> conformer = Conformer(
        >>>     input_dim=80,
        >>>     num_heads=4,
        >>>     ffn_dim=128,
        >>>     num_layers=4,
        >>>     depthwise_conv_kernel_size=31,
        >>> )
        >>> lengths = torch.randint(1, 400, (10,))  # (batch,)
        >>> input = torch.rand(10, int(lengths.max()), input_dim)  # (batch, num_frames, input_dim)
        >>> output = conformer(input, lengths)
    r   Fr   	num_headsrR   
num_layersrT   r    r"   rU   c	           	          t                                                       t          j                            fdt          |          D                       | _        d S )Nc                 <    g | ]}t                     S ))r    r"   rU   )rQ   )	.0rh   rU   rT   r    rR   r   rj   r"   s	     r   
<listcomp>z&Conformer.__init__.<locals>.<listcomp>  sO         .##1&7    r   )r+   r,   r   r.   
ModuleListrangeconformer_layers)
r9   r   rj   rR   rk   rT   r    r"   rU   r:   s
    ``` ````r   r,   zConformer.__init__   s     	 % 3 3          z**  !
 !
r   r;   r   r   c                     t          |          }|                    dd          }| j        D ]} |||          }|                    dd          |fS )aX  
        Args:
            input (torch.Tensor): with shape `(B, T, input_dim)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.

        Returns:
            (torch.Tensor, torch.Tensor)
                torch.Tensor
                    output frames, with shape `(B, T, input_dim)`
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output frames.
        r   r   )r   r=   rr   )r9   r;   r   encoder_padding_maskr>   layers         r   r?   zConformer.forward  sd      8@@OOAq!!* 	/ 	/Ea-..AA{{1a  '))r   r@   )rA   rB   rC   rD   r   rE   rF   r,   r   rG   r   r?   rH   rI   s   @r   r   r      s         H $"'
 

 
 	

 
 %(
 
 
  
 
 
 
 
 
8*U\ *EL *U5<Y^YeKeEf * * * * * * * *r   )typingr   r   r   __all__rG   r   r.   Moduler   rK   rQ   r    r   r   <module>rz      s3   " " " " " " " "  -el u|    F! F! F! F! F! F! F! F!R& & & & & & & &>Z Z Z Z ZUX_ Z Z ZzN* N* N* N* N* N* N* N* N* N*r   