
    ~VjiZ                        d dl Z d dlmZmZmZmZ d dlZd dlmZmZ d dl	m
Z dgZd%ded	ed
ededej        j        f
dZ	 	 	 	 	 	 d&dededededeeeeee         f                  ded
ededej        j        fdZdedefdZ G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d  d!ej                  Z G d" d#ej                  Z G d$ dej                  ZdS )'    N)ListOptionalTupleUnion)nnTensor)
functional	Tacotron2Tlinearin_dimout_dimbiasw_init_gainreturnc                     t           j                            | ||          }t           j        j                            |j        t           j        j                            |                     |S )a  Linear layer with xavier uniform initialization.

    Args:
        in_dim (int): Size of each input sample.
        out_dim (int): Size of each output sample.
        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias. (Default: ``True``)
        w_init_gain (str, optional): Parameter passed to ``torch.nn.init.calculate_gain``
            for setting the gain parameter of ``xavier_uniform_``. (Default: ``linear``)

    Returns:
        (torch.nn.Linear): The corresponding linear layer.
    r   gain)torchr   Linearinitxavier_uniform_weightcalculate_gain)r   r   r   r   r   s        U/root/voice-cloning/.venv/lib/python3.11/site-packages/torchaudio/models/tacotron2.py_get_linear_layerr   )   sT     X__VW4_88F	HM!!&-ehm6R6RS^6_6_!```M       in_channelsout_channelskernel_sizestridepaddingdilationc           	      P   |0|dz  dk    rt          d          t          ||dz
  z  dz            }t          j                            | ||||||          }t          j        j                            |j        t          j        j                            |                     |S )al  1D convolution with xavier uniform initialization.

    Args:
        in_channels (int): Number of channels in the input image.
        out_channels (int): Number of channels produced by the convolution.
        kernel_size (int, optional): Number of channels in the input image. (Default: ``1``)
        stride (int, optional): Number of channels in the input image. (Default: ``1``)
        padding (str, int or tuple, optional): Padding added to both sides of the input.
            (Default: dilation * (kernel_size - 1) / 2)
        dilation (int, optional): Number of channels in the input image. (Default: ``1``)
        w_init_gain (str, optional): Parameter passed to ``torch.nn.init.calculate_gain``
            for setting the gain parameter of ``xavier_uniform_``. (Default: ``linear``)

    Returns:
        (torch.nn.Conv1d): The corresponding Conv1D layer.
    N   r   zkernel_size must be odd)r!   r"   r#   r$   r   r   )	
ValueErrorintr   r   Conv1dr   r   r   r   )	r   r    r!   r"   r#   r$   r   r   conv1ds	            r   _get_conv1d_layerr+   ;   s    4 ?a6777h+/2Q677X__   F 
HM!!&-ehm6R6RS^6_6_!```Mr   lengthsc                    t          j        |                                           }t          j        d|| j        | j                  }||                     d          k                                     }t          j        |d          }|S )al  Returns a binary mask based on ``lengths``. The ``i``-th row and ``j``-th column of the mask
    is ``1`` if ``j`` is smaller than ``i``-th element of ``lengths.

    Args:
        lengths (Tensor): The length of each element in the batch, with shape (n_batch, ).

    Returns:
        mask (Tensor): The binary mask, with shape (n_batch, max of ``lengths``).
    r   )devicedtyper   )	r   maxitemaranger.   r/   	unsqueezebytele)r,   max_lenidsmasks       r   _get_mask_from_lengthsr9   i   su     i  %%''G
,q''.
N
N
NC'##A&&&,,..D8D!DKr   c                   @     e Zd ZdZdededef fdZdedefdZ xZS )	_LocationLayera  Location layer used in the Attention model.

    Args:
        attention_n_filter (int): Number of filters for attention model.
        attention_kernel_size (int): Kernel size for attention model.
        attention_hidden_dim (int): Dimension of attention hidden representation.
    attention_n_filterattention_kernel_sizeattention_hidden_dimc           	          t                                                       t          |dz
  dz            }t          d|||ddd          | _        t          ||dd          | _        d S )Nr   r&   F)r!   r#   r   r"   r$   tanhr   r   )super__init__r(   r+   location_convr   location_dense)selfr<   r=   r>   r#   	__class__s        r   rC   z_LocationLayer.__init__   s     	,q0A566.-
 
 
 0 45f
 
 
r   attention_weights_catr   c                     |                      |          }|                    dd          }|                     |          }|S )a  Location layer used in the Attention model.

        Args:
            attention_weights_cat (Tensor): Cumulative and previous attention weights
                with shape (n_batch, 2, max of ``text_lengths``).

        Returns:
            processed_attention (Tensor): Cumulative and previous attention weights
                with shape (n_batch, ``attention_hidden_dim``).
        r   r&   )rD   	transposerE   )rF   rH   processed_attentions      r   forwardz_LocationLayer.forward   sJ     #001FGG1;;AqAA"112EFF""r   	__name__
__module____qualname____doc__r(   rC   r   rL   __classcell__rG   s   @r   r;   r;   z   s         

  #
 "	
 
 
 
 
 
*#V # # # # # # # # #r   r;   c                        e Zd ZdZdedededededdf fd	Zd
edededefdZdedededededeeef         fdZ	 xZ
S )
_Attentiona  Locally sensitive attention model.

    Args:
        attention_rnn_dim (int): Number of hidden units for RNN.
        encoder_embedding_dim (int): Number of embedding dimensions in the Encoder.
        attention_hidden_dim (int): Dimension of attention hidden representation.
        attention_location_n_filter (int): Number of filters for Attention model.
        attention_location_kernel_size (int): Kernel size for Attention model.
    attention_rnn_dimencoder_embedding_dimr>   attention_location_n_filterattention_location_kernel_sizer   Nc                 ,   t                                                       t          ||dd          | _        t          ||dd          | _        t          |dd          | _        t          |||          | _        t          d           | _	        d S )NFr@   rA   r   r   inf)
rB   rC   r   query_layermemory_layervr;   location_layerfloatscore_mask_value)rF   rV   rW   r>   rX   rY   rG   s         r   rC   z_Attention.__init__   s     	,->@T[`ntuuu-!#7eQW
 
 
 ##7GGG,'* 
 

 "'ur   queryprocessed_memoryrH   c                    |                      |                    d                    }|                     |          }|                     t	          j        ||z   |z                       }|                    d          }|S )a=  Get the alignment vector.

        Args:
            query (Tensor): Decoder output with shape (n_batch, n_mels * n_frames_per_step).
            processed_memory (Tensor): Processed Encoder outputs
                with shape (n_batch, max of ``text_lengths``, attention_hidden_dim).
            attention_weights_cat (Tensor): Cumulative and previous attention weights
                with shape (n_batch, 2, max of ``text_lengths``).

        Returns:
            alignment (Tensor): attention weights, it is a tensor with shape (batch, max of ``text_lengths``).
        r   r&   )r\   r3   r_   r^   r   r@   squeeze)rF   rb   rc   rH   processed_queryprocessed_attention_weightsenergies	alignments           r   _get_alignment_energiesz"_Attention._get_alignment_energies   sx     **5??1+=+=>>&*&9&9:O&P&P#66%*_7R%RUe%effgg$$Q''	r   attention_hidden_statememoryr8   c                    |                      |||          }|                    || j                  }t          j        |d          }t          j        |                    d          |          }|                    d          }||fS )a  Pass the input through the Attention model.

        Args:
            attention_hidden_state (Tensor): Attention rnn last output with shape (n_batch, ``attention_rnn_dim``).
            memory (Tensor): Encoder outputs with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
            processed_memory (Tensor): Processed Encoder outputs
                with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``).
            attention_weights_cat (Tensor): Previous and cumulative attention weights
                with shape (n_batch, current_num_frames * 2, max of ``text_lengths``).
            mask (Tensor): Binary mask for padded data with shape (n_batch, current_num_frames).

        Returns:
            attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``).
            attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``).
        r   dim)	rj   masked_fillra   Fsoftmaxr   bmmr3   re   )	rF   rk   rl   rc   rH   r8   ri   attention_weightsattention_contexts	            r   rL   z_Attention.forward   s    . 001GIY[pqq	))$0EFF	IiQ777!I&7&A&A!&D&DfMM-55a88 "333r   )rN   rO   rP   rQ   r(   rC   r   rj   r   rL   rR   rS   s   @r   rU   rU      s
        ..  #. "	.
 &). ),. 
. . . . . .*V v fl qw    *4 &4 4 !	4
  &4 4 
vv~	4 4 4 4 4 4 4 4r   rU   c                   L     e Zd ZdZdedee         ddf fdZdedefdZ xZ	S )	_PrenetzPrenet Module. It is consists of ``len(output_size)`` linear layers.

    Args:
        in_dim (int): The size of each input sample.
        output_sizes (list): The output dimension of each linear layers.
    r   	out_sizesr   Nc                     t                                                       |g|d d         z   }t          j        d t	          ||          D                       | _        d S )Nc                 8    g | ]\  }}t          ||d           S )Fr   )r   ).0in_sizeout_sizes      r   
<listcomp>z$_Prenet.__init__.<locals>.<listcomp>  s-    rrrBU7Hwu===rrrr   )rB   rC   r   
ModuleListziplayers)rF   r   rx   in_sizesrG   s       r   rC   z_Prenet.__init__
  sb    8in,mrrY\]egpYqYqrrr
 
r   xc                 ~    | j         D ]4}t          j        t          j         ||                    dd          }5|S )zPass the input through Prenet.

        Args:
            x (Tensor): The input sequence to Prenet with shape (n_batch, in_dim).

        Return:
            x (Tensor): Tensor with shape (n_batch, sizes[-1])
              ?T)ptraining)r   rq   dropoutrelu)rF   r   r   s      r   rL   z_Prenet.forward  sF     k 	C 	CF	!&++sTBBBAAr   )
rN   rO   rP   rQ   r(   r   rC   r   rL   rR   rS   s   @r   rw   rw     s         
s 
tCy 
T 
 
 
 
 
 
 F        r   rw   c                   D     e Zd ZdZdedededef fdZdedefd	Z xZS )
_Postneta  Postnet Module.

    Args:
        n_mels (int): Number of mel bins.
        postnet_embedding_dim (int): Postnet embedding dimension.
        postnet_kernel_size (int): Postnet kernel size.
        postnet_n_convolution (int): Number of postnet convolutions.
    n_melspostnet_embedding_dimpostnet_kernel_sizepostnet_n_convolutionc                    t                                                       t          j                    | _        t          |          D ]}|dk    r|n|}||dz
  k    r|n|}||dz
  k    rdnd}||dz
  k    r|n|}	| j                            t          j        t          |||dt          |dz
  dz            d|          t          j
        |	                               t          | j                  | _        d S )Nr   r   r   r@   r&   r!   r"   r#   r$   r   )rB   rC   r   r   convolutionsrangeappend
Sequentialr+   r(   BatchNorm1dlenn_convs)rF   r   r   r   r   ir   r    	init_gainnum_featuresrG   s             r   rC   z_Postnet.__init__*  s0    	MOO,-- 	 	A$%FF&&0EK%&+@1+D%E%E66K`L$%*?!*C$D$D&I%&+@1+D%E%E66K`L$$%#$$7  #%81%<$A B B!"$-   N<00     4,--r   r   r   c                    t          | j                  D ]p\  }}|| j        dz
  k     r8t          j        t          j         ||                    d| j                  }Kt          j         ||          d| j                  }q|S )a  Pass the input through Postnet.

        Args:
            x (Tensor): The input sequence with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).

        Return:
            x (Tensor): Tensor with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).
        r   r   )r   )	enumerater   r   rq   r   r   r@   r   )rF   r   r   convs       r   rL   z_Postnet.forwardJ  s     !!233 	D 	DGAt4<!###Ieja113OOOIdd1ggsT]CCCr   rM   rS   s   @r   r   r      s         ..  #. !	.
  #. . . . . .@ F        r   r   c                   H     e Zd ZdZdedededdf fdZded	edefd
Z xZS )_Encodera  Encoder Module.

    Args:
        encoder_embedding_dim (int): Number of embedding dimensions in the encoder.
        encoder_n_convolution (int): Number of convolution layers in the encoder.
        encoder_kernel_size (int): The kernel size in the encoder.

    Examples
        >>> encoder = _Encoder(3, 512, 5)
        >>> input = torch.rand(10, 20, 30)
        >>> output = encoder(input)  # shape: (10, 30, 512)
    rW   encoder_n_convolutionencoder_kernel_sizer   Nc                    t                                                       t          j                    | _        t          |          D ]j}t          j        t          |||dt          |dz
  dz            dd          t          j	        |                    }| j        
                    |           kt          j        |t          |dz            ddd          | _        | j                                         d S )Nr   r&   r   r   T)batch_firstbidirectional)rB   rC   r   r   r   r   r   r+   r(   r   r   LSTMlstmflatten_parameters)rF   rW   r   r   _
conv_layerrG   s         r   rC   z_Encoder.__init__k  s	    	MOO,-- 	1 	1A!)) 3!4q!8A =>> &   455 J $$Z0000G!%)**
 
 
	 		$$&&&&&r   r   input_lengthsc                    | j         D ]8}t          j        t          j         ||                    d| j                  }9|                    dd          }|                                }t          j        j	        
                    ||d          }|                     |          \  }}t          j        j	                            |d          \  }}|S )a_  Pass the input through the Encoder.

        Args:
            x (Tensor): The input sequences with shape (n_batch, encoder_embedding_dim, n_seq).
            input_lengths (Tensor): The length of each input sequence with shape (n_batch, ).

        Return:
            x (Tensor): A tensor with shape (n_batch, n_seq, encoder_embedding_dim).
        r   r   r&   T)r   )r   rq   r   r   r   rJ   cpur   utilsrnnpack_padded_sequencer   pad_packed_sequence)rF   r   r   r   outputsr   s         r   rL   z_Encoder.forward  s     % 	? 	?D	!&a//3>>AAKK1%))++HL--aD-QQYYq\\
X\55g45PP
r   rM   rS   s   @r   r   r   ]  s         '"'  #' !	'
 
' ' ' ' ' 'B  6        r   r   c            !           e Zd ZdZdededededededed	ed
ededededededdf fdZdedefdZ	dede
eeeeeeeef         fdZdedefdZdededede
eeef         fdZdedededed ed!ed"ed#eded$ed%ede
eeeeeeeeef	         fd&Zded'ed(ede
eeef         fd)Zdedefd*Zej        j        ded(ede
eeeef         fd+            Z xZS ),_Decodera,  Decoder with Attention model.

    Args:
        n_mels (int): number of mel bins
        n_frames_per_step (int): number of frames processed per step, only 1 is supported
        encoder_embedding_dim (int): the number of embedding dimensions in the encoder.
        decoder_rnn_dim (int): number of units in decoder LSTM
        decoder_max_step (int): maximum number of output mel spectrograms
        decoder_dropout (float): dropout probability for decoder LSTM
        decoder_early_stopping (bool): stop decoding when all samples are finished
        attention_rnn_dim (int): number of units in attention LSTM
        attention_hidden_dim (int): dimension of attention hidden representation
        attention_location_n_filter (int): number of filters for attention model
        attention_location_kernel_size (int): kernel size for attention model
        attention_dropout (float): dropout probability for attention LSTM
        prenet_dim (int): number of ReLU units in prenet layers
        gate_threshold (float): probability threshold for stop token
    r   n_frames_per_steprW   decoder_rnn_dimdecoder_max_stepdecoder_dropoutdecoder_early_stoppingrV   r>   rX   rY   attention_dropout
prenet_dimgate_thresholdr   Nc                 (   t                                                       || _        || _        || _        || _        || _        || _        || _        || _	        || _
        || _        || _        t          ||z  ||g          | _        t          j        ||z   |          | _        t%          |||	|
|          | _        t          j        ||z   |d          | _        t+          ||z   ||z            | _        t+          ||z   ddd          | _        d S )NTr   sigmoidrA   )rB   rC   r   r   rW   rV   r   r   r   r   r   r   r   rw   prenetr   LSTMCellattention_rnnrU   attention_layerdecoder_rnnr   linear_projection
gate_layer)rF   r   r   rW   r   r   r   r   rV   r>   rX   rY   r   r   r   rG   s                  r   rC   z_Decoder.__init__  s2   $ 	!2%:"!2.$ 0,!2.&<#f'88:z:RSS[6K)KM^__)! '* 
  
 ;'8;P'PRacghh!2?EZ3Z\bev\v!w!w+33QTy
 
 
r   rl   c                     |                     d          }|j        }|j        }t          j        || j        | j        z  ||          }|S )am  Gets all zeros frames to use as the first decoder input.

        Args:
            memory (Tensor): Encoder outputs with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).

        Returns:
            decoder_input (Tensor): all zeros frames with shape
                (n_batch, max of ``text_lengths``, ``n_mels * n_frames_per_step``).
        r   r/   r.   sizer/   r.   r   zerosr   r   rF   rl   n_batchr/   r.   decoder_inputs         r   _get_initial_framez_Decoder._get_initial_frame  K     ++a..GT[4;Q-QY^gmnnnr   c                 <   |                     d          }|                     d          }|j        }|j        }t          j        || j        ||          }t          j        || j        ||          }t          j        || j        ||          }t          j        || j        ||          }	t          j        ||||          }
t          j        ||||          }t          j        || j        ||          }| j        	                    |          }||||	|
|||fS )a  Initializes attention rnn states, decoder rnn states, attention
        weights, attention cumulative weights, attention context, stores memory
        and stores processed memory.

        Args:
            memory (Tensor): Encoder outputs with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).

        Returns:
            attention_hidden (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
            attention_cell (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
            decoder_hidden (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
            decoder_cell (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
            attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``).
            attention_weights_cum (Tensor): Cumulated attention weights with shape (n_batch, max of ``text_lengths``).
            attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``).
            processed_memory (Tensor): Processed encoder outputs
                with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``).
        r   r   r   )
r   r/   r.   r   r   rV   r   rW   r   r]   )rF   rl   r   max_timer/   r.   attention_hiddenattention_celldecoder_hiddendecoder_cellrt   attention_weights_cumru   rc   s                 r   _initialize_decoder_statesz#_Decoder._initialize_decoder_states  s-   * ++a..;;q>> ;w0Fe\bcccWd.DEZ`aaaWd.B%X^___{7D,@V\]]]!KvVVV %GXUSY Z Z Z!K1KSXaghhh/<<VDD !	
 		
r   decoder_inputsc                    |                     dd          }|                    |                    d          t          |                    d          | j        z            d          }|                     dd          }|S )ak  Prepares decoder inputs.

        Args:
            decoder_inputs (Tensor): Inputs used for teacher-forced training, i.e. mel-specs,
                with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``)

        Returns:
            inputs (Tensor): Processed decoder inputs with shape (max of ``mel_specgram_lengths``, n_batch, ``n_mels``).
        r   r&   r   rz   )rJ   viewr   r(   r   )rF   r   s     r   _parse_decoder_inputsz_Decoder._parse_decoder_inputs.  s     (11!Q77',,""##A&&)??@@
 
 (11!Q77r   mel_specgramgate_outputs
alignmentsc                 f   |                     dd                                          }|                     dd                                          }|                     dd                                          }|j        d         d| j        f} |j        | }|                     dd          }|||fS )aq  Prepares decoder outputs for output

        Args:
            mel_specgram (Tensor): mel spectrogram with shape (max of ``mel_specgram_lengths``, n_batch, ``n_mels``)
            gate_outputs (Tensor): predicted stop token with shape (max of ``mel_specgram_lengths``, n_batch)
            alignments (Tensor): sequence of attention weights from the decoder
                with shape (max of ``mel_specgram_lengths``, n_batch, max of ``text_lengths``)

        Returns:
            mel_specgram (Tensor): mel spectrogram with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``)
            gate_outputs (Tensor): predicted stop token with shape (n_batch, max of ``mel_specgram_lengths``)
            alignments (Tensor): sequence of attention weights from the decoder
                with shape (n_batch, max of ``mel_specgram_lengths``, max of ``text_lengths``)
        r   r   rz   r&   )rJ   
contiguousshaper   r   )rF   r   r   r   r   s        r   _parse_decoder_outputsz_Decoder._parse_decoder_outputsC  s    &  ))!Q//::<<
#--a33>>@@#--a33>>@@#A&DK8(|(%0#--a33\:55r   r   r   r   r   r   rt   r   ru   rc   r8   c           	         t          j        ||fd          }|                     |||f          \  }}t          j        || j        | j                  }t          j        |                    d          |                    d          fd          }|                     ||	|
||          \  }}||z  }t          j        ||fd          }| 	                    |||f          \  }}t          j        || j
        | j                  }t          j        ||fd          }|                     |          }|                     |          }|||||||||f	S )a&	  Decoder step using stored states, attention and memory

        Args:
            decoder_input (Tensor): Output of the Prenet with shape (n_batch, ``prenet_dim``).
            attention_hidden (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
            attention_cell (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
            decoder_hidden (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
            decoder_cell (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
            attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``).
            attention_weights_cum (Tensor): Cumulated attention weights with shape (n_batch, max of ``text_lengths``).
            attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``).
            memory (Tensor): Encoder output with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
            processed_memory (Tensor): Processed Encoder outputs
                with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``).
            mask (Tensor): Binary mask for padded data with shape (n_batch, current_num_frames).

        Returns:
            decoder_output: Predicted mel spectrogram for the current frame with shape (n_batch, ``n_mels``).
            gate_prediction (Tensor): Prediction of the stop token with shape (n_batch, ``1``).
            attention_hidden (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
            attention_cell (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
            decoder_hidden (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
            decoder_cell (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
            attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``).
            attention_weights_cum (Tensor): Cumulated attention weights with shape (n_batch, max of ``text_lengths``).
            attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``).
        rz   r   rn   )r   catr   rq   r   r   r   r3   r   r   r   r   r   )rF   r   r   r   r   r   rt   r   ru   rl   rc   r8   
cell_inputrH    decoder_hidden_attention_contextdecoder_outputgate_predictions                    r   decodez_Decoder.decodec  s   R Y/@A2FF
+/+=+=jK[]kJl+m+m(.9%5t7Mt}]] %	+<+F+Fq+I+IK`KjKjklKmKm*ntu v v v/3/C/Cf&68Mt0
 0
,, 	!22	#35F"GLL'+'7'7XdGe'f'f$>43GWW+09nFW5X^_+`+`+`(//0PQQ//*JKK !

 
	
r   mel_specgram_truthmemory_lengthsc                 x   |                      |                              d          }|                     |          }t          j        ||fd          }|                     |          }t          |          }|                     |          \  }}}	}
}}}}g g g }}}t          |          |	                    d          dz
  k     r|t          |                   }| 
                    ||||	|
||||||          \	  }}}}}	}
}}}||                    d          gz  }||                    d          gz  }||gz  }t          |          |	                    d          dz
  k     |                     t          j        |          t          j        |          t          j        |                    \  }}}|||fS )a  Decoder forward pass for training.

        Args:
            memory (Tensor): Encoder outputs
                with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
            mel_specgram_truth (Tensor): Decoder ground-truth mel-specs for teacher forcing
                with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).
            memory_lengths (Tensor): Encoder output lengths for attention masking
                (the same as ``text_lengths``) with shape (n_batch, ).

        Returns:
            mel_specgram (Tensor): Predicted mel spectrogram
                with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).
            gate_outputs (Tensor): Predicted stop token for each timestep
                with shape (n_batch,  max of ``mel_specgram_lengths``).
            alignments (Tensor): Sequence of attention weights from the decoder
                with shape (n_batch,  max of ``mel_specgram_lengths``, max of ``text_lengths``).
        r   rn   r   )r   r3   r   r   r   r   r9   r   r   r   r   re   r   stack)rF   rl   r   r   r   r   r8   r   r   r   r   rt   r   ru   rc   mel_outputsr   r   
mel_outputgate_outputr   s                        r   rL   z_Decoder.forward  s   , //77AA!DD334FGGM>#BJJJ^44%n55 ++F33		
! 13B:\+!4!4Q!7!7!!;;;*3{+;+;<M  !%!  
 !%! J..q1122K[003344L,--J9 +!4!4Q!7!7!!;;;< 261L1LK$$ek,&?&?ZAXAX2
 2
.lJ \:55r   c                     |                     d          }|j        }|j        }t          j        || j        | j        z  ||          }|S )aU  Gets all zeros frames to use as the first decoder input

        args:
            memory (Tensor): Encoder outputs
                with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).

        returns:
            decoder_input (Tensor): All zeros frames with shape(n_batch, ``n_mels`` * ``n_frame_per_step``).
        r   r   r   r   s         r   _get_go_framez_Decoder._get_go_frame  r   r   c                    |                     d          |j        }}|                     |          }t          |          }|                     |          \  }}}	}
}}}}t          j        |gt
          j        |          }t          j        |gt
          j        |          }g }g }g }t          | j
                  D ]
}|                     |          }|                     ||||	|
||||||          \	  }}}}}	}
}}}|                    |                    d                     |                    |                    dd                     |                    |           || xx         dz  cc<   |t          j        |                    d                    | j        k    z  }| j        rt          j        |          r n|}t+          |          | j
        k    rt-          j        d           t          j        |d          }t          j        |d          }t          j        |d          }|                     |||          \  }}}||||fS )a  Decoder inference

        Args:
            memory (Tensor): Encoder outputs
                with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
            memory_lengths (Tensor): Encoder output lengths for attention masking
                (the same as ``text_lengths``) with shape (n_batch, ).

        Returns:
            mel_specgram (Tensor): Predicted mel spectrogram
                with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).
            mel_specgram_lengths (Tensor): the length of the predicted mel spectrogram (n_batch, ))
            gate_outputs (Tensor): Predicted stop token for each timestep
                with shape (n_batch,  max of ``mel_specgram_lengths``).
            alignments (Tensor): Sequence of attention weights from the decoder
                with shape (n_batch,  max of ``mel_specgram_lengths``, max of ``text_lengths``).
        r   r   r   zZReached max decoder steps. The generated spectrogram might not cover the whole transcript.rn   )r   r.   r   r9   r   r   r   int32boolr   r   r   r   r   r3   rJ   r   re   r   r   allr   warningswarnr   r   )rF   rl   r   
batch_sizer.   r   r8   r   r   r   r   rt   r   ru   rc   mel_specgram_lengthsfinishedmel_specgramsr   r   r   r   r   s                          r   inferz_Decoder.infer
  s   & $[[^^V]F
**622%n55 ++F33		
!  %{J<u{SYZZZ;
|5:fMMM&(%'#%
t,-- #	) #	)A KK66M  !%!  
 !%!   !7!7!:!:;;; 5 5a ; ;<<</000 (+++q0+++k&9&9!&<&<==@SSSH* uy/B/B (MM}!666Mo   	-Q777y1555Yzq111
262M2Mm]iku2v2v/|Z2L*LLr   )rN   rO   rP   rQ   r(   r`   r   rC   r   r   r   r   r   r   r   rL   r   r   jitexportr   rR   rS   s   @r   r   r     s        &1
1
 1
  #	1

 1
 1
 1
 !%1
 1
 "1
 &)1
 ),1
 !1
 1
 1
  
!1
 1
 1
 1
 1
 1
f F    "/
/
	vvvvvvvvM	N/
 /
 /
 /
bF v    *6"6286FL6	vvv%	&6 6 6 6@H
H
 !H
 	H

 H
 H
 "H
  &H
 "H
 H
 !H
 H
 
vvvvvvvvvU	VH
 H
 H
 H
TJ6J628J6JPJ6	vvv%	&J6 J6 J6 J6XF v    " YWMF WMF WMuVVU[]cEc?d WM WM WM WM WM WM WM WMr   r   c            /       6    e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d2dedededededededededededededed ed!ed"ed#ed$ed%ed&ed'ed(d)f. fd*Zd+ed,ed-ed.ed(e	eeeef         f
d/Z
ej        j        d3d+ed0ee         d(e	eeef         fd1            Z xZS )4r
   a	  Tacotron2 model from *Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions*
    :cite:`shen2018natural` based on the implementation from
    `Nvidia Deep Learning Examples <https://github.com/NVIDIA/DeepLearningExamples/>`_.

    See Also:
        * :class:`torchaudio.pipelines.Tacotron2TTSBundle`: TTS pipeline with pretrained model.

    Args:
        mask_padding (bool, optional): Use mask padding (Default: ``False``).
        n_mels (int, optional): Number of mel bins (Default: ``80``).
        n_symbol (int, optional): Number of symbols for the input text (Default: ``148``).
        n_frames_per_step (int, optional): Number of frames processed per step, only 1 is supported (Default: ``1``).
        symbol_embedding_dim (int, optional): Input embedding dimension (Default: ``512``).
        encoder_n_convolution (int, optional): Number of encoder convolutions (Default: ``3``).
        encoder_kernel_size (int, optional): Encoder kernel size (Default: ``5``).
        encoder_embedding_dim (int, optional): Encoder embedding dimension (Default: ``512``).
        decoder_rnn_dim (int, optional): Number of units in decoder LSTM (Default: ``1024``).
        decoder_max_step (int, optional): Maximum number of output mel spectrograms (Default: ``2000``).
        decoder_dropout (float, optional): Dropout probability for decoder LSTM (Default: ``0.1``).
        decoder_early_stopping (bool, optional): Continue decoding after all samples are finished (Default: ``True``).
        attention_rnn_dim (int, optional): Number of units in attention LSTM (Default: ``1024``).
        attention_hidden_dim (int, optional): Dimension of attention hidden representation (Default: ``128``).
        attention_location_n_filter (int, optional): Number of filters for attention model (Default: ``32``).
        attention_location_kernel_size (int, optional): Kernel size for attention model (Default: ``31``).
        attention_dropout (float, optional): Dropout probability for attention LSTM (Default: ``0.1``).
        prenet_dim (int, optional): Number of ReLU units in prenet layers (Default: ``256``).
        postnet_n_convolution (int, optional): Number of postnet convolutions (Default: ``5``).
        postnet_kernel_size (int, optional): Postnet kernel size (Default: ``5``).
        postnet_embedding_dim (int, optional): Postnet embedding dimension (Default: ``512``).
        gate_threshold (float, optional): Probability threshold for stop token (Default: ``0.5``).
    FP      r                 皙?T             r   mask_paddingr   n_symbolr   symbol_embedding_dimrW   r   r   r   r   r   r   rV   r>   rX   rY   r   r   r   r   r   r   r   Nc                    t                                                       || _        || _        || _        t          j        ||          | _        t          j        j	        
                    | j        j                   t          |||          | _        t          ||||	|
|||||||||          | _        t!          ||||          | _        d S N)rB   rC   r  r   r   r   	Embedding	embeddingr   r   r   r   r   encoderr   decoderr   postnet)rF   r  r   r  r   r  rW   r   r   r   r   r   r   rV   r>   rX   rY   r   r   r   r   r   r   rG   s                          r   rC   zTacotron2.__init__  s    2 	(!2h0DEE%%dn&;<<< 57LNabb!" '*
 
   (=?RTijjr   tokenstoken_lengthsr   r   c                 t   |                      |                              dd          }|                     ||          }|                     |||          \  }}}|                     |          }	||	z   }	| j        rt          |          }
|
                    | j        |
	                    d          |
	                    d                    }
|

                    ddd          }
|                    |
d           |	                    |
d           |                    |
dddddf         d           ||	||fS )a  Pass the input through the Tacotron2 model. This is in teacher
        forcing mode, which is generally used for training.

        The input ``tokens`` should be padded with zeros to length max of ``token_lengths``.
        The input ``mel_specgram`` should be padded with zeros to length max of ``mel_specgram_lengths``.

        Args:
            tokens (Tensor): The input tokens to Tacotron2 with shape `(n_batch, max of token_lengths)`.
            token_lengths (Tensor): The valid length of each sample in ``tokens`` with shape `(n_batch, )`.
            mel_specgram (Tensor): The target mel spectrogram
                with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
            mel_specgram_lengths (Tensor): The length of each mel spectrogram with shape `(n_batch, )`.

        Returns:
            [Tensor, Tensor, Tensor, Tensor]:
                Tensor
                    Mel spectrogram before Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
                Tensor
                    Mel spectrogram after Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
                Tensor
                    The output for stop token at each time step with shape `(n_batch, max of mel_specgram_lengths)`.
                Tensor
                    Sequence of attention weights from the decoder with
                    shape `(n_batch, max of mel_specgram_lengths, max of token_lengths)`.
        r   r&   )r   r   g        Ng     @@)r  rJ   r  r  r  r  r9   expandr   r   permutemasked_fill_)rF   r  r  r   r   embedded_inputsencoder_outputsr   r   mel_specgram_postnetr8   s              r   rL   zTacotron2.forward  s<   B ..00::1a@@,,FF15\- 2> 2
 2
.lJ  $||L99+.BB 	:)*>??D;;t{DIIaLL$))A,,GGD<<1a((D%%dC000 --dC888%%d111a7mS9991<KKr   r,   c                     |j         \  }}|Ft          j        |g                              |                              |j        |j                  }|J |                     |                              dd          }| 	                    ||          }| j
                            ||          \  }}}	}
|                     |          }||z   }|
                    d||                              dd          }
|||
fS )a  Using Tacotron2 for inference. The input is a batch of encoded
        sentences (``tokens``) and its corresponding lengths (``lengths``). The
        output is the generated mel spectrograms, its corresponding lengths, and
        the attention weights from the decoder.

        The input `tokens` should be padded with zeros to length max of ``lengths``.

        Args:
            tokens (Tensor): The input tokens to Tacotron2 with shape `(n_batch, max of lengths)`.
            lengths (Tensor or None, optional):
                The valid length of each sample in ``tokens`` with shape `(n_batch, )`.
                If ``None``, it is assumed that the all the tokens are valid. Default: ``None``

        Returns:
            (Tensor, Tensor, Tensor):
                Tensor
                    The predicted mel spectrogram with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
                Tensor
                    The length of the predicted mel spectrogram with shape `(n_batch, )`.
                Tensor
                    Sequence of attention weights from the decoder with shape
                    `(n_batch, max of mel_specgram_lengths, max of lengths)`.
        Nr   r&   r   )r   r   tensorr  tor.   r/   r  rJ   r  r  r   r  unfold)rF   r  r,   r   
max_lengthr  r  r   r   r   r   mel_outputs_postnets               r   r   zTacotron2.infer  s    2 %l?lJ<0077@@CCFMSYS_``G"""..00::1a@@,,@@<@L<N<N`g<h<h9*Az"ll<88*-@@&&q'7;;EEaKK
"$8*DDr   )Fr  r  r   r  r  r  r  r  r  r	  Tr  r
  r  r  r	  r  r  r  r  r   r  )rN   rO   rP   rQ   r   r(   r`   rC   r   r   rL   r   r   r   r   r   rR   rS   s   @r   r
   r
   e  sO        D #!"$'%(%&#$# $!$'+!%$'+-.0#&%&#$%( #/1k 1k1k 1k 	1k
 1k "1k  #1k  #1k !1k 1k 1k 1k !%1k 1k "1k  &)!1k" ),#1k$ !%1k& '1k(  #)1k* !+1k,  #-1k. /1k0 
11k 1k 1k 1k 1k 1kf4L4L 4L 	4L
 %4L 
vvvv-	.4L 4L 4L 4Ll Y&E &EF &EXf-= &EvW]_eOeIf &E &E &E &E &E &E &E &Er   )Tr   )r   r   Nr   Tr   )r   typingr   r   r   r   r   r   r   torch.nnr	   rq   __all__r(   r   strr   r   r)   r+   r9   Moduler;   rU   rw   r   r   r   r
    r   r   <module>r-     s  8  / / / / / / / / / / / /          $ $ $ $ $ $ 
 c C t QT didlds    * 59+ +++ + 	+
 eCeCj012+ + + + X_+ + + +\F v    ".# .# .# .# .#RY .# .# .#bT4 T4 T4 T4 T4 T4 T4 T4n    bi   <: : : : :ry : : :zE E E E Ery E E EP}M }M }M }M }Mry }M }M }M@qE qE qE qE qE	 qE qE qE qE qEr   