
    ~VjiV<                        d dl Z d dlmZmZmZ d dlZd dlmc mZ	 d dlmZm
Z
 g dZ G d dej                  Z G d dej                  Z G d	 d
ej                  Z G d dej                  Z G d dej                  ZdS )    N)ListOptionalTuple)nnTensor)ResBlock	MelResNet	Stretch2dUpsampleNetworkWaveRNNc                   >     e Zd ZdZd	deddf fdZdedefdZ xZS )
r   af  ResNet block based on *Efficient Neural Audio Synthesis* :cite:`kalchbrenner2018efficient`.

    Args:
        n_freq: the number of bins in a spectrogram. (Default: ``128``)

    Examples
        >>> resblock = ResBlock()
        >>> input = torch.rand(10, 128, 512)  # a random spectrogram
        >>> output = resblock(input)  # shape: (10, 128, 512)
       n_freqreturnNc                 H   t                                                       t          j        t          j        ||dd          t          j        |          t          j        d          t          j        ||dd          t          j        |                    | _        d S )N   Fin_channelsout_channelskernel_sizebiasTinplace)super__init__r   
SequentialConv1dBatchNorm1dReLUresblock_model)selfr   	__class__s     S/root/voice-cloning/.venv/lib/python3.11/site-packages/torchaudio/models/wavernn.pyr   zResBlock.__init__   s     mI&v1SXYYYN6""GD!!!I&v1SXYYYN6""
 
    specgramc                 2    |                      |          |z   S )zPass the input through the ResBlock layer.
        Args:
            specgram (Tensor): the input sequence to the ResBlock layer (n_batch, n_freq, n_time).

        Return:
            Tensor shape: (n_batch, n_freq, n_time)
        )r    r!   r%   s     r#   forwardzResBlock.forward(   s     ""8,,x77r$   )r   	__name__
__module____qualname____doc__intr   r   r(   __classcell__r"   s   @r#   r   r      s|        	 		
 	
s 	
T 	
 	
 	
 	
 	
 	
	8 	86 	8 	8 	8 	8 	8 	8 	8 	8r$   r   c                   P     e Zd ZdZ	 ddedededed	ed
df fdZded
efdZ xZS )r	   a  MelResNet layer uses a stack of ResBlocks on spectrogram.

    Args:
        n_res_block: the number of ResBlock in stack. (Default: ``10``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)
        kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``)

    Examples
        >>> melresnet = MelResNet()
        >>> input = torch.rand(10, 128, 512)  # a random spectrogram
        >>> output = melresnet(input)  # shape: (10, 128, 508)
    
   r      n_res_blockr   n_hiddenn_outputr   r   Nc                 V   t                                                       fdt          |          D             }t          j        t          j        ||d          t          j                  t          j        d          g|t          j        |d          R  | _        d S )Nc                 .    g | ]}t                    S  )r   ).0_r5   s     r#   
<listcomp>z&MelResNet.__init__.<locals>.<listcomp>I   s!    DDDAXh''DDDr$   Fr   Tr   r   )r   r   r   )	r   r   ranger   r   r   r   r   melresnet_model)r!   r4   r   r5   r6   r   	ResBlocksr"   s      `   r#   r   zMelResNet.__init__D   s     	DDDD{1C1CDDD	!}I&x[_deeeN8$$GD!!! 
 	 

 I(qQQQ 
  
  
r$   r%   c                 ,    |                      |          S )zPass the input through the MelResNet layer.
        Args:
            specgram (Tensor): the input sequence to the MelResNet layer (n_batch, n_freq, n_time).

        Return:
            Tensor shape: (n_batch, n_output, n_time - kernel_size + 1)
        )r>   r'   s     r#   r(   zMelResNet.forwardS   s     ##H---r$   r2   r   r   r   r3   r)   r0   s   @r#   r	   r	   4   s           vw
 

-0
BE
WZ
or
	
 
 
 
 
 
	. 	.6 	. 	. 	. 	. 	. 	. 	. 	.r$   r	   c                   @     e Zd ZdZdededdf fdZdedefdZ xZS )	r
   a  Upscale the frequency and time dimensions of a spectrogram.

    Args:
        time_scale: the scale factor in time dimension
        freq_scale: the scale factor in frequency dimension

    Examples
        >>> stretch2d = Stretch2d(time_scale=10, freq_scale=5)

        >>> input = torch.rand(10, 100, 512)  # a random spectrogram
        >>> output = stretch2d(input)  # shape: (10, 500, 5120)
    
time_scale
freq_scaler   Nc                 d    t                                                       || _        || _        d S N)r   r   rD   rC   )r!   rC   rD   r"   s      r#   r   zStretch2d.__init__m   s+    $$r$   r%   c                 j    |                     | j        d                               | j        d          S )zPass the input through the Stretch2d layer.

        Args:
            specgram (Tensor): the input sequence to the Stretch2d layer (..., n_freq, n_time).

        Return:
            Tensor shape: (..., n_freq * freq_scale, n_time * time_scale)
        )repeat_interleaverD   rC   r'   s     r#   r(   zStretch2d.forwards   s1     ))$/2>>PPQUQ`bdeeer$   r)   r0   s   @r#   r
   r
   _   s         %3 %C %D % % % % % %
f 
f6 
f 
f 
f 
f 
f 
f 
f 
fr$   r
   c                   x     e Zd ZdZ	 	 	 	 	 ddee         dededed	ed
eddf fdZdedeeef         fdZ	 xZ
S )r   a  Upscale the dimensions of a spectrogram.

    Args:
        upsample_scales: the list of upsample scales.
        n_res_block: the number of ResBlock in stack. (Default: ``10``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)
        kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``)

    Examples
        >>> upsamplenetwork = UpsampleNetwork(upsample_scales=[4, 4, 16])
        >>> input = torch.rand(10, 128, 10)  # a random spectrogram
        >>> output = upsamplenetwork(input)  # shape: (10, 128, 1536), (10, 128, 1536)
    r2   r   r3   upsample_scalesr4   r   r5   r6   r   r   Nc                 <   t                                                       d}|D ]}||z  }|| _        |dz
  dz  |z  | _        t	          |||||          | _        t          |d          | _        g }	|D ]}
t          |
d          }t          j	        ddd|
dz  dz   fd|
fd          }t          j        j                            |j        d|
dz  dz   z             |	                    |           |	                    |           t          j        |	 | _        d S )Nr      r   F)r   r   r   paddingr         ?)r   r   total_scaleindentr	   resnetr
   resnet_stretchr   Conv2dtorchinit	constant_weightappendr   upsample_layers)r!   rL   r4   r   r5   r6   r   rQ   upsample_scale	up_layersscalestretchconvr"   s                r#   r   zUpsampleNetwork.__init__   sE    	- 	* 	*N>)KK +"Q1,{:VXxUU'Q77	$ 	# 	#Eq))G9AAuqy1};MXY[`Wahm  D HM##DK	A1FGGGW%%%T""""!}i8r$   r%   c                 `   |                      |                              d          }|                     |          }|                    d          }|                    d          }|                     |          }|                    d          dddd| j        | j         f         }||fS )a  Pass the input through the UpsampleNetwork layer.

        Args:
            specgram (Tensor): the input sequence to the UpsampleNetwork layer (n_batch, n_freq, n_time)

        Return:
            Tensor shape: (n_batch, n_freq, (n_time - kernel_size + 1) * total_scale),
                          (n_batch, n_output, (n_time - kernel_size + 1) * total_scale)
        where total_scale is the product of all elements in upsample_scales.
        r   N)rS   	unsqueezerT   squeezer[   rR   )r!   r%   resnet_outputupsampling_outputs       r#   r(   zUpsampleNetwork.forward   s     H--77::++M::%--a00%%a(( 00::-55a88AAAt{dk\?Y9YZ -//r$   rA   )r*   r+   r,   r-   r   r.   r   r   r   r(   r/   r0   s   @r#   r   r      s         & 9 9c9 9 	9
 9 9 9 
9 9 9 9 9 9>0 05+@ 0 0 0 0 0 0 0 0r$   r   c                        e Zd ZdZ	 	 	 	 	 	 	 ddee         deded	ed
ededededededdf fdZdededefdZe	j
        j        ddedee         deeee         f         fd            Z xZS )r   aW  WaveRNN model from *Efficient Neural Audio Synthesis* :cite:`wavernn`
    based on the implementation from `fatchord/WaveRNN <https://github.com/fatchord/WaveRNN>`_.

    The original implementation was introduced in *Efficient Neural Audio Synthesis*
    :cite:`kalchbrenner2018efficient`. The input channels of waveform and spectrogram have to be 1.
    The product of `upsample_scales` must equal `hop_length`.

    See Also:
        * `Training example <https://github.com/pytorch/audio/tree/release/0.12/examples/pipeline_wavernn>`__
        * :class:`torchaudio.pipelines.Tacotron2TTSBundle`: TTS pipeline with pretrained model.

    Args:
        upsample_scales: the list of upsample scales.
        n_classes: the number of output classes.
        hop_length: the number of samples between the starts of consecutive frames.
        n_res_block: the number of ResBlock in stack. (Default: ``10``)
        n_rnn: the dimension of RNN layer. (Default: ``512``)
        n_fc: the dimension of fully connected layer. (Default: ``512``)
        kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)

    Example
        >>> wavernn = WaveRNN(upsample_scales=[5,5,8], n_classes=512, hop_length=200)
        >>> waveform, sample_rate = torchaudio.load(file)
        >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length)
        >>> specgram = MelSpectrogram(sample_rate)(waveform)  # shape: (n_batch, n_channel, n_freq, n_time)
        >>> output = wavernn(waveform, specgram)
        >>> # output shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, n_classes)
    r2      r3   r   rL   	n_classes
hop_lengthr4   n_rnnn_fcr   r   r5   r6   r   Nc                    t                                                       || _        |dz  r|dz
  n|dz  | _        || _        |
dz  | _        || _        || _        t          t          j
        | j                            | _        d}|D ]}||z  }|| j        k    rt          d| d|           t          ||||	|
|          | _        t          j        || j        z   dz   |          | _        t          j        ||d          | _        t          j        || j        z   |d          | _        t          j        d          | _        t          j        d          | _        t          j        || j        z   |          | _        t          j        || j        z   |          | _        t          j        || j                  | _        d S )	NrN   r      z/Expected: total_scale == hop_length, but found z != T)batch_firstr   )r   r   r   _padrj   n_auxri   rh   r.   mathlog2n_bits
ValueErrorr   upsampler   LinearfcGRUrnn1rnn2r   relu1relu2fc1fc2fc3)r!   rL   rh   ri   r4   rj   rk   r   r   r5   r6   rQ   r\   r"   s                r#   r   zWaveRNN.__init__   s    	&(3aH[1__[QN	
]
$"ty8899- 	* 	*N>)KK$/))l{ll`jllmmm'fhX`bmnn)FTZ/!3U;;F5%T:::	F54:-u$GGG	WT***
WT***
9UTZ/669TDJ.559T4>22r$   waveformr%   c                     |                     d          dk    rt          d          |                     d          dk    rt          d          |                    d          |                    d          }}|                     d          }t          j        d| j        |j        |j                  }t          j        d| j        |j        |j                  }                     |          \  }}|	                    dd          }|	                    dd          } fdt          d          D             }|d	d	d	d	|d         |d         f         }|d	d	d	d	|d         |d         f         }	|d	d	d	d	|d         |d
         f         }
|d	d	d	d	|d
         |d         f         }t          j        |                    d          ||gd          }                     |          }|}                     ||          \  }}||z   }|}t          j        ||	gd          }                     ||          \  }}||z   }t          j        ||
gd          }                     |          }                     |          }t          j        ||gd          }                     |          }                     |          }                     |          }|                    d          S )a  Pass the input through the WaveRNN model.

        Args:
            waveform: the input waveform to the WaveRNN layer (n_batch, 1, (n_time - kernel_size + 1) * hop_length)
            specgram: the input spectrogram to the WaveRNN layer (n_batch, 1, n_freq, n_time)

        Return:
            Tensor: shape (n_batch, 1, (n_time - kernel_size + 1) * hop_length, n_classes)
        r   z*Require the input channel of waveform is 1z*Require the input channel of specgram is 1r   )dtypedevicerN   c                 $    g | ]}j         |z  S r9   rp   )r:   ir!   s     r#   r<   z#WaveRNN.forward.<locals>.<listcomp>.  s    444a4:>444r$   r3   N   rm   rI   dim)sizert   rc   rV   zerosrj   r   r   ru   	transposer=   catrb   rw   ry   rz   r}   r{   r~   r|   r   )r!   r   r%   
batch_sizeh1h2auxaux_idxa1a2a3a4xresr;   s   `              r#   r(   zWaveRNN.forward  s    ==q  IJJJ==q  IJJJ%--a00(2B2B12E2E(]]1%%
[J
(.QYQ`aaa[J
(.QYQ`aaa h//#%%a++mmAq!!4444588444AAAwqzGAJ../AAAwqzGAJ../AAAwqzGAJ../AAAwqzGAJ../Ix))"--x<"EEEGGAJJyyB1GIq"g2&&&yyB1GIq"g2&&&HHQKKJJqMMIq"g2&&&HHQKKJJqMMHHQKK {{1~~r$   lengthsc                     |j         }|j        }t          j        j                            | j         j        f          }                     |          \  }|| j        j        z  }g }|	                                \  }}}t          j
        d| j        f||          }	t          j
        d| j        f||          }
t          j
        |df||          } fdt          d          D             }t          |          D ]Ԋ|ddddf         }fd|D             \  }}}}t          j        |||gd          }                     |          }                     |                    d          |	          \  }}	||	d         z   }t          j        ||gd          }                     |                    d          |
          \  }}
||
d         z   }t          j        ||gd          }t%          j                             |                    }t          j        ||gd          }t%          j                             |                    }                     |          }t%          j        |d          }t          j        |d                                          }d	|z  d	 j        z  d
z
  z  d
z
  }|                    |           t          j        |                              dd	d          |fS )a  Inference method of WaveRNN.

        This function currently only supports multinomial sampling, which assumes the
        network is trained on cross entropy loss.

        Args:
            specgram (Tensor):
                Batch of spectrograms. Shape: `(n_batch, n_freq, n_time)`.
            lengths (Tensor or None, optional):
                Indicates the valid length of each audio in the batch.
                Shape: `(batch, )`.
                When the ``specgram`` contains spectrograms with different durations,
                by providing ``lengths`` argument, the model will compute
                the corresponding valid output lengths.
                If ``None``, it is assumed that all the audio in ``waveforms``
                have valid length. Default: ``None``.

        Returns:
            (Tensor, Optional[Tensor]):
            Tensor
                The inferred waveform of size `(n_batch, 1, n_time)`.
                1 stands for a single channel.
            Tensor or None
                If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
                is returned.
                It indicates the valid length in time axis of the output Tensor.
        Nr   )r   r   c                 X    g | ]&}d d j         |z  j         |dz   z  d d f         'S )Nr   r   )r:   r   r   r!   s     r#   r<   z!WaveRNN.infer.<locals>.<listcomp>x  sC    XXX!SDJNTZ1q5-AA111DEXXXr$   rm   c                 .    g | ]}|d d d d f         S rF   r9   )r:   ar   s     r#   r<   z!WaveRNN.infer.<locals>.<listcomp>~  s+    %D%D%DQa111aj%D%D%Dr$   r   r   rN   rP   )r   r   rV   r   
functionalpadro   ru   rQ   r   r   rj   r=   r   rw   ry   rb   rz   Frelur}   r~   r   softmaxmultinomialfloatrs   rZ   stackpermute)r!   r%   r   r   r   outputb_sizer;   seq_lenr   r   r   	aux_splitm_ta1_ta2_ta3_ta4_tinplogits	posteriorr   r   s   `                    @@r#   inferzWaveRNN.inferK  s   < 8&**8di5KLLh//# 99G!%]]__7[!VTZ0uMMM[!VTZ0uMMMKF%@@@XXXXXuUVxxXXX	w 	 	A111aaa7#C%D%D%D%D)%D%D%D"D$d	1c4.a000A

AIIakk!nnb11EArBqE	A)QI1---CIIcmmA..33EArBqE	A	1d)+++Atxx{{##A	1d)+++Atxx{{##AXXa[[F	&a000I!)Q//5577AADK#-.4AMM!{6""**1a33W<<r$   )r2   rg   rg   r3   r   r   r   rF   )r*   r+   r,   r-   r   r.   r   r   r(   rV   jitexportr   r   r   r/   r0   s   @r#   r   r      sb        J (3 (3c(3 (3 	(3
 (3 (3 (3 (3 (3 (3 (3 
(3 (3 (3 (3 (3 (3T7 7& 7V 7 7 7 7r YM= M=f M=x/? M=5QWYabhYiQiKj M= M= M= M= M= M= M= M=r$   r   )rq   typingr   r   r   rV   torch.nn.functionalr   r   r   r   __all__Moduler   r	   r
   r   r   r9   r$   r#   <module>r      s}    ( ( ( ( ( ( ( ( ( (                     8  8  8  8  8ry  8  8  8F(. (. (. (. (.	 (. (. (.Vf f f f f	 f f fBD0 D0 D0 D0 D0bi D0 D0 D0NR= R= R= R= R=bi R= R= R= R= R=r$   