
    ~Vjib                        d dl Z d dlZd dlmZmZmZmZ d dlZd dlmZ d dl	m
Z  G d dej        j                  Z G d dej        j                  Z G d	 d
ej        j                  Z G d dej        j                  Z G d dej        j                  Z G d dej        j                  Z G d dej                  Z G d dej                  Zdej        dededej        fdZd Zd(dej        dedededej        f
d Zd)d!ej        ded"ededej        f
d#Zd$ee         defd%Zd$ee         defd&Zd$ee         defd'ZdS )*    N)AnyDictListOptional)nn)
functionalc            	            e Zd ZdZddedededef fdZed	e	j
        fd
            Zde	j
        d	e	j
        fdZ xZS )_ScaledEmbeddingaF  Make continuous embeddings and boost learning rate

    Args:
        num_embeddings (int): number of embeddings
        embedding_dim (int): embedding dimensions
        scale (float, optional): amount to scale learning rate (Default: 10.0)
        smooth (bool, optional): choose to apply smoothing (Default: ``False``)
          $@Fnum_embeddingsembedding_dimscalesmoothc                    t                                                       t          j        ||          | _        |rrt          j        | j        j        j        d          }|t          j	        d|dz             
                                d d d f         z  }|| j        j        j        d d <   | j        j        xj        |z  c_        || _        d S )Nr   dim   )super__init__r   	Embedding	embeddingtorchcumsumweightdataarangesqrtr   )selfr   r   r   r   r   	__class__s         T/root/voice-cloning/.venv/lib/python3.11/site-packages/torchaudio/models/_hdemucs.pyr   z_ScaledEmbedding.__init__-   s    nmDD 	3\$."7"<!DDDFel1nq.@AAFFHHDQQF,2DN!&qqq)""e+""


    returnc                 *    | j         j        | j        z  S N)r   r   r   )r   s    r    r   z_ScaledEmbedding.weight8   s    ~$tz11r!   xc                 @    |                      |          | j        z  }|S )zForward pass for embedding with scale.
        Args:
            x (torch.Tensor): input tensor of shape `(num_embeddings)`

        Returns:
            (Tensor):
                Embedding output of shape `(num_embeddings, embedding_dim)`
        )r   r   )r   r%   outs      r    forwardz_ScaledEmbedding.forward<   s!     nnQ$*,
r!   )r   F)__name__
__module____qualname____doc__intfloatboolr   propertyr   Tensorr   r(   __classcell__r   s   @r    r
   r
   #   s         	 	s 	3 	u 	]a 	 	 	 	 	 	 2 2 2 2 X2
 
%, 
 
 
 
 
 
 
 
r!   r
   c                        e Zd ZdZ	 	 	 	 	 	 	 	 	 dd	ed
ededededededededeeee	f                  def fdZ
ddej        deej                 dej        fdZ xZS )
_HEncLayerat  Encoder layer. This used both by the time and the frequency branch.
    Args:
        chin (int): number of input channels.
        chout (int): number of output channels.
        kernel_size (int, optional): Kernel size for encoder (Default: 8)
        stride (int, optional): Stride for encoder layer (Default: 4)
        norm_groups (int, optional): number of groups for group norm. (Default: 4)
        empty (bool, optional): used to make a layer with just the first conv. this is used
            before merging the time and freq. branches. (Default: ``False``)
        freq (bool, optional): boolean for whether conv layer is for frequency domain (Default: ``True``)
        norm_type (string, optional): Norm type, either ``group_norm `` or ``none`` (Default: ``group_norm``)
        context (int, optional): context size for the 1x1 conv. (Default: 0)
        dconv_kw (Dict[str, Any] or None, optional): dictionary of kwargs for the DConv class. (Default: ``None``)
        pad (bool, optional): true to pad the input. Padding is done so that the output size is
            always the input size / stride. (Default: ``True``)
          FT
group_normr   Nchinchoutkernel_sizestridenorm_groupsemptyfreq	norm_typecontextdconv_kwpadc                    t                                                       |
i }
d }|dk    rfd}|r|dz  nd}t          j        }|| _        || _        || _        || _        || _        |r|dg}|dg}|dg}t          j	        } ||||||          | _
         ||          | _        | j        rJt          j                    | _        t          j                    | _        t          j                    | _        d S  ||d|z  dd|	z  z   d|	          | _         |d|z            | _        t!          |fi |
| _        d S )Nc                 (    t          j                    S r$   r   Identityds    r    <lambda>z%_HEncLayer.__init__.<locals>.<lambda>m       BKMM r!   r8   c                 .    t          j        |           S r$   r   	GroupNormrI   r=   s    r    rJ   z%_HEncLayer.__init__.<locals>.<lambda>o       [! < < r!   r7   r   r      )r   r   r   Conv1dr?   r;   r<   r>   rC   Conv2dconvnorm1rG   rewritenorm2dconv_DConv)r   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   norm_fnpad_valklassr   s        `         r    r   z_HEncLayer.__init__\   sd    	H))$$<<<<G&)0+""q		&
 	&*Ka[FlGIEE${FGDD	WU^^
: 	3;==DLDJDJJJ 5E	1q7{?AwOODL U++DJ2222DJJJr!   r%   injectr"   c                 6   | j         s;|                                dk    r#|j        \  }}}}|                    |d|          }| j         sB|j        d         }|| j        z  dk    s't          j        |d| j        || j        z  z
  f          }|                     |          }| j        r|S |o|j        d         |j        d         k    rt          d          |                                dk    r'|                                dk    r|dddddf         }||z   }t          j
        |                     |                    }| j         r}|j        \  }}}}|                    dddd                              d||          }|                     |          }|                    ||||                              dddd          }n|                     |          }|                     |                     |                    }	t          j        |	d	          }	|	S )
a]  Forward pass for encoding layer.

        Size depends on whether frequency or time

        Args:
            x (torch.Tensor): tensor input of shape `(B, C, F, T)` for frequency and shape
                `(B, C, T)` for time
            inject (torch.Tensor, optional): on last layer, combine frequency and time branches through inject param,
                same shape as x (default: ``None``)

        Returns:
            Tensor
                output tensor after encoder layer of shape `(B, C, F / stride, T)` for frequency
                    and shape `(B, C, ceil(T / stride))` for time
        r7   r   NzInjection shapes do not align   rQ   r   r   )r?   r   shapeviewr<   FrC   rT   r>   
ValueErrorgelurU   permutereshaperX   rW   rV   glu)
r   r%   r]   BCFrTleyzs
             r    r(   z_HEncLayer.forward   s   " y 	!QUUWW\\'KAq"aq"a  Ay 	DB#q((E!aT[0@!ABCCIIaLL: 	H|B172;.. !@AAAzz||q  QUUWW\\111d
+F
AF4::a==!!9 	'KAq"a		!Q1%%--b!Q77A

1Aq"a##++Aq!Q77AA

1AJJt||A''E!OOOr!   )	r6   r7   r7   FTr8   r   NTr$   r)   r*   r+   r,   r-   r/   strr   r   r   r   r   r1   r(   r2   r3   s   @r    r5   r5   I   s        * %-1*3 *3*3 *3 	*3
 *3 *3 *3 *3 *3 *3 4S>**3 *3 *3 *3 *3 *3 *3X, , ,x/E ,QVQ] , , , , , , , ,r!   r5   c                        e Zd ZdZ	 	 	 	 	 	 	 	 	 	 dd	ed
edededededededededeeee	f                  def fdZ
dej        deej                 fdZ xZS )
_HDecLayera  Decoder layer. This used both by the time and the frequency branches.
    Args:
        chin (int): number of input channels.
        chout (int): number of output channels.
        last (bool, optional): whether current layer is final layer (Default: ``False``)
        kernel_size (int, optional): Kernel size for encoder (Default: 8)
        stride (int): Stride for encoder layer (Default: 4)
        norm_groups (int, optional): number of groups for group norm. (Default: 1)
        empty (bool, optional): used to make a layer with just the first conv. this is used
            before merging the time and freq. branches. (Default: ``False``)
        freq (bool, optional): boolean for whether conv layer is for frequency (Default: ``True``)
        norm_type (str, optional): Norm type, either ``group_norm `` or ``none`` (Default: ``group_norm``)
        context (int, optional): context size for the 1x1 conv. (Default: 1)
        dconv_kw (Dict[str, Any] or None, optional): dictionary of kwargs for the DConv class. (Default: ``None``)
        pad (bool, optional): true to pad the input. Padding is done so that the output size is
            always the input size / stride. (Default: ``True``)
    Fr6   r7   r   Tr8   Nr9   r:   lastr;   r<   r=   r>   r?   r@   rA   rB   rC   c                    t                                                       |i }d }|	dk    rfd}|r$||z
  dz  dk    rt          d          ||z
  dz  }nd}|| _        || _        || _        || _        || _        || _        || _	        t          j        }t          j        }|r |dg}|dg}t          j        }t          j        } |||||          | _         ||          | _        | j        r2t          j                    | _        t          j                    | _        d S  ||d|z  dd|
z  z   d|
          | _         |d|z            | _        d S )Nc                 (    t          j                    S r$   rF   rH   s    r    rJ   z%_HDecLayer.__init__.<locals>.<lambda>   rK   r!   r8   c                 .    t          j        |           S r$   rM   rO   s    r    rJ   z%_HDecLayer.__init__.<locals>.<lambda>   rP   r!   rQ   r   z#Kernel size and stride do not alignr   )r   r   rd   rC   rt   r?   r9   r>   r<   r;   r   rR   ConvTranspose1drS   ConvTranspose2dconv_trrW   rG   rV   rU   )r   r9   r:   rt   r;   r<   r=   r>   r?   r@   rA   rB   rC   rZ   r\   klass_trr   s         `         r    r   z_HDecLayer.__init__   s    	H))$$<<<<G 	f$)Q.. !FGGG'A-CCC			
&	% 	*&*Ka[FIE)Hxe[&AAWU^^
: 	+;==DLDJJJ 5q4xQ[!WMMDL T**DJJJr!   r%   skipc                    | j         r@|                                dk    r(|j        \  }}}|                    || j        d|          }| j        sB||z   }t          j        |                     | 	                    |                    d          }n|}|t          d          |                     |                     |                    }| j         r"| j        r|d| j        | j         ddf         }n9|d| j        | j        |z   f         }|j        d         |k    rt          d          | j        st          j        |          }||fS )	a,  Forward pass for decoding layer.

        Size depends on whether frequency or time

        Args:
            x (torch.Tensor): tensor input of shape `(B, C, F, T)` for frequency and shape
                `(B, C, T)` for time
            skip (torch.Tensor, optional): on first layer, separate frequency and time branches using param
                (default: ``None``)
            length (int): Size of tensor for output

        Returns:
            (Tensor, Tensor):
                Tensor
                    output tensor after decoder layer of shape `(B, C, F * stride, T)` for frequency domain except last
                        frequency layer shape is `(B, C, kernel_size, T)`. Shape is `(B, C, stride * T)`
                        for time domain.
                Tensor
                    contains the output just before final transposed convolution, which is used when the
                        freq. and time branch separate. Otherwise, does not matter. Shape is
                        `(B, C, F, T)` for frequency and `(B, C, T)` for time.
        r`   r_   r   r   Nz%Skip must be none when empty is true..z'Last index of z must be equal to length)r?   r   ra   rb   r9   r>   rc   rh   rU   rV   rd   rW   rz   rC   rt   re   )	r   r%   r|   lengthri   rj   rl   rn   ro   s	            r    r(   z_HDecLayer.forward   sL   . 9 	,AgGAq!q$)R++Az 	JDAdjja11q999AAA !HIIIJJt||A''9 	Lx 4c48txi/23#tx$(V"3334Awr{f$$ !JKKKy 	q		A!tr!   )
Fr6   r7   r   FTr8   r   NTrp   r3   s   @r    rs   rs      s        , %-10+ 0+0+ 0+ 	0+
 0+ 0+ 0+ 0+ 0+ 0+ 0+ 4S>*0+ 0+ 0+ 0+ 0+ 0+ 0+d. .Xel-C . . . . . . . .r!   rs   c            +           e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d3dee         dededededededededededededededededed ed!ed"ef* fd#Z	d$ Z
d4d&Zd5d)ej        d*ed+ed,ed-ef
d.Zd/ Zd0 Zd1ej        fd2Z xZS )6HDemucsa#
  Hybrid Demucs model from
    *Hybrid Spectrogram and Waveform Source Separation* :cite:`defossez2021hybrid`.

    See Also:
        * :class:`torchaudio.pipelines.SourceSeparationBundle`: Source separation pipeline with pre-trained models.

    Args:
        sources (List[str]): list of source names. List can contain the following source
            options: [``"bass"``, ``"drums"``, ``"other"``, ``"mixture"``, ``"vocals"``].
        audio_channels (int, optional): input/output audio channels. (Default: 2)
        channels (int, optional): initial number of hidden channels. (Default: 48)
        growth (int, optional): increase the number of hidden channels by this factor at each layer. (Default: 2)
        nfft (int, optional): number of fft bins. Note that changing this requires careful computation of
            various shape parameters and will not work out of the box for hybrid models. (Default: 4096)
        depth (int, optional): number of layers in encoder and decoder (Default: 6)
        freq_emb (float, optional): add frequency embedding after the first frequency layer if > 0,
            the actual value controls the weight of the embedding. (Default: 0.2)
        emb_scale (int, optional): equivalent to scaling the embedding learning rate (Default: 10)
        emb_smooth (bool, optional): initialize the embedding with a smooth one (with respect to frequencies).
            (Default: ``True``)
        kernel_size (int, optional): kernel_size for encoder and decoder layers. (Default: 8)
        time_stride (int, optional): stride for the final time layer, after the merge. (Default: 2)
        stride (int, optional): stride for encoder and decoder layers. (Default: 4)
        context (int, optional): context for 1x1 conv in the decoder. (Default: 4)
        context_enc (int, optional): context for 1x1 conv in the encoder. (Default: 0)
        norm_starts (int, optional): layer at which group norm starts being used.
            decoder layers are numbered in reverse order. (Default: 4)
        norm_groups (int, optional): number of groups for group norm. (Default: 4)
        dconv_depth (int, optional): depth of residual DConv branch. (Default: 2)
        dconv_comp (int, optional): compression of DConv branch. (Default: 4)
        dconv_attn (int, optional): adds attention layers in DConv branch starting at this layer. (Default: 4)
        dconv_lstm (int, optional): adds a LSTM layer in DConv branch starting at this layer. (Default: 4)
        dconv_init (float, optional): initial scale for the DConv branch LayerScale. (Default: 1e-4)
    rQ   0         皙?
   Tr6   r7   r   r   -C6?sourcesaudio_channelschannelsgrowthnfftdepthfreq_emb	emb_scale
emb_smoothr;   time_strider<   rA   context_encnorm_startsr=   dconv_depth
dconv_comp
dconv_attn
dconv_lstm
dconv_initc                    t                                                       || _        || _        || _        || _        |
| _        || _        || _        || _	        | j        dz  | _
        d | _        t          j                    | _        t          j                    | _        t          j                    | _        t          j                    | _        |}|dz  }|}|}| j        dz  }t%          | j                  D ]}||k    }||k    }||k    rdnd}|dk    }|} |
}!|s|dk    rt'          d          |dz  }!|} d}"d}#|r||
k    r|}!d}"d}#|!| ||"|||||||d	d
}$t)          |$          }%d|%d<   |
|%d<   ||%d<   d|%d<   t)          |$          }&|#rt+          ||          }|}t-          ||fd|i|$}'|r?|#du r|dk    r
d|%d<   d|%d<   t-          ||f||#d|%}(| j                            |(           | j                            |'           |dk    r!| j        t1          | j                  z  }|dz  }t3          ||f|dk    |d|&})|r1t3          ||f|#|dk    |d|%}*| j                            d|*           | j                            d|)           |}|}t7          ||z            }t7          ||z            }|r||
k    rd}n||z  }|dk    r!|rt9          |||	|          | _        || _        t=          |            d S )Nr7   rQ   r8   noner   z$When freq is false, freqs must be 1.TF)lstmattnr   compressinit)r;   r<   r?   rC   r@   r=   rB   r   r?   r;   r<   rC   rA      )rA   r>   )rt   rA   )r>   rt   rA   )r   r   )r   r   r   r   r   r   r;   rA   r<   r   
hop_lengthr   r   
ModuleListfreq_encoderfreq_decodertime_encodertime_decoderrangerd   dictmaxr5   appendlenrs   insertr-   r
   freq_emb_scale_rescale_module),r   r   r   r   r   r   r   r   r   r   r;   r   r<   rA   r   r   r=   r   r   r   r   r   r9   chin_zr:   chout_zfreqsindexr   r   r@   r?   strikerrC   	last_freqkwkwtkw_decenctencdectdecr   s,                                              r    r   zHDemucs.__init__Q  s   0 	
	,& )q.MOOMOOMOOMOO	Q4:&& K	/ K	/EJ&DJ&D(-(<(<&I19DDC #A::$%KLLL!Ao"CI !,, 	  #&*  ( *&  B r((CCK!,C"CMCJ"XXF  eW--VWHHkHRHHC /$$$%CM)*C&!$[{)[[WZ[[!((...$$S)))zz*S->->>WfY5A:wYYRXYYC 2!%hYUaZY`hhdghh!((D111$$Q,,,DF''E&7*++G %K''EEf$Ezzhz 0zYb c c c&.#r!   c                    | j         }| j        }|}||dz  k    rt          d          t          t	          j        |j        d         |z                      }|dz  dz  }|                     |||||z  z   |j        d         z
  d          }t          |||          dd dd d f         }|j        d         |dz   k    rt          d	          |ddd|z   f         }|S )
Nr7   zHop length must be nfft // 4r_   rQ   r`   reflect)mode.zESpectrogram's last dimension must be 4 + input size divided by stride)	r   r   rd   r-   mathceilra   _pad1d_spectro)r   r%   hlr   x0rm   rC   ro   s           r    _speczHDemucs._spec  s    _y ??;<<<172;+,,--AgkKK3b2g ;)KLLQb!!#ssAAA+.72;"q&  deeec1q2v:or!   Nc                    | j         }t          j        |g d          }t          j        |ddg          }|dz  dz  }|t          t	          j        ||z                      z  d|z  z   }t          |||          }|d|||z   f         }|S )N)r   r   r   r   rQ   r`   )r~   .)r   rc   rC   r-   r   r   	_ispectro)r   ro   r~   r   rC   rm   r%   s          r    _ispeczHDemucs._ispec  s    _E!\\\""E!aVAgk#di,,---C7aB'''c3v%%&r!   zero        r%   padding_leftpadding_rightr   valuec                     |j         d         }|dk    r3t          ||          }||k    rt          j        |d||z
  dz   f          }t          j        |||f||          S )zWrapper around F.pad, in order for reflect padding when num_frames is shorter than max_pad.
        Add extra zero padding around in order for padding to not break.r_   r   r   r   )ra   r   rc   rC   )r   r%   r   r   r   r   r~   max_pads           r    r   zHDemucs._pad1d  so     9,66G  E!a6!1A!5677uQ}5tUCCCr!   c                     |j         \  }}}}t          j        |                              ddddd          }|                    ||dz  ||          }|S )Nr   r   r7   rQ   r`   )ra   r   view_as_realrf   rg   )r   ro   ri   rj   rk   rl   ms          r    
_magnitudezHDemucs._magnitude  sW    g1b!q!!))!Q1a88IIaQA&&r!   c                     |j         \  }}}}}|                    ||dd||                              dddddd          }t          j        |                                          }|S )Nr_   rQ   r   r   r7      r`   )ra   rb   rf   r   view_as_complex
contiguous)r   r   ri   Srj   rk   rl   r'   s           r    _maskzHDemucs._mask  se    1aQffQ2q"a((00Aq!QBB#CNN$4$455
r!   inputc                 t	   |j         dk    rt          d|j                   |j        d         | j        k    rt          d|j        d          d          |}|j        d         }|                     |          }|                     |          }|}|j        \  }}}}	|                    dd	          }
|                    dd	          }||
z
  d
|z   z  }|}|                    dd	          }|                    dd	          }||z
  d
|z   z  }g }g }g }g }t          | j	                  D ]@\  }}|
                    |j        d                    d}|t          | j                  k     rW|
                    |j        d                    | j        |         } ||          }|j        s|
                    |           n|} |||          }|dk    r| j        {t          j        |j        d         |j                  }|                     |                                          ddddddf                             |          }|| j        |z  z   }|
                    |           Bt          j        |          }t          j        |          }t          | j                  D ]\  }}|                    d          } ||||                    d                    \  }}| j        t          | j                  z
  }||k    r| j        ||z
           }|                    d          }|j        rH|j        d         dk    rt          d|j                   |dddddf         } ||d|          \  }}|                    d          } ||||          \  }}t          |          dk    rt5          d          t          |          dk    rt5          d          t          |          dk    rt5          d          t          | j                  } |                    || d||	          }||dddf         z  |
dddf         z   }|                     |          }!|                     |!|          }|                    || d|          }||dddf         z  |dddf         z   }||z   }|S )a  HDemucs forward call

        Args:
            input (torch.Tensor): input mixed tensor of shape `(batch_size, channel, num_frames)`

        Returns:
            Tensor
                output tensor split into sources of shape `(batch_size, num_sources, channel, num_frames)`
        r`   zDExpected 3D tensor with dimensions (batch, channel, frames). Found: r   zZThe channel dimension of input Tensor must match `audio_channels` of HDemucs model. Found:.r_   )r   rQ   r`   T)r   keepdimgh㈵>)r   rQ   Nr   )devicerQ   z0If tdec empty is True, pre shape does not match zsaved is not emptyzlengths_t is not emptyzsaved_t is not empty)ndimrd   ra   r   r   r   meanstd	enumerater   r   r   r   r>   r   r   r   r   t	expand_asr   
zeros_liker   popr   r   AssertionErrorr   rb   r   r   )"r   r   r%   r~   ro   magri   rj   Fqrl   r   r   xtmeantstdtsavedsaved_tlengths	lengths_tidxencoder]   r   frsembdecoder|   preoffsetr   length_t_r   zouts"                                     r    r(   zHDemucs.forward  s    :??qdidoqqrrr;q>T000+Q+ + +  
 JJuooa  g1b! vv)Tv22ee	4e00X$*% FD11vv&$v//5jTD[)!	$T%677 	 	KCNN172;'''FS*++++  "...(-T"XXz  NN2&&&&  Fq&!!AaxxDM5 l172;qx@@@mmC((**,,T111aaa-=>HHKK+c11LLOOOOQa   %T%677 	5 	5KC99R==DVAtW[[__55FAs Z#d&7"8"88Ff}}(v6$==,,: 5y|q((()g\_\e)g)ghhhaaaAg,C DdH55EB";;r??D DT844EBu::?? !5666y>>Q !9:::w<<1 !7888FF1aR##AAAtGtAAAtG},zz!}}KKf%%WWQ2v&&$qqq$w-%4.0Fr!   )rQ   r   rQ   r   r   r   r   Tr6   rQ   r7   r   r   r7   r7   rQ   r7   r7   r7   r   r$   )r   r   )r)   r*   r+   r,   r   rq   r-   r.   r/   r   r   r   r   r1   r   r   r   r(   r2   r3   s   @r    r   r   -  s-       ! !L   -~ ~c~ ~ 	~
 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~  !~" #~$ %~& '~( )~* +~, -~ ~ ~ ~ ~ ~@  0   D D DC D DSV Dhm D D D D    oU\ o o o o o o o or!   r   c                   f     e Zd ZdZ	 	 	 	 	 	 	 	 	 dded	ed
edededededededef fdZd Z	 xZ
S )rY   a  
    New residual branches in each encoder layer.
    This alternates dilated convolutions, potentially with LSTMs and attention.
    Also before entering each residual branch, dimension is projected on a smaller subspace,
    e.g. of dim `channels // compress`.

    Args:
        channels (int): input/output channels for residual branch.
        compress (float, optional): amount of channel compression inside the branch. (default: 4)
        depth (int, optional): number of layers in the residual branch. Each layer has its own
            projection, and potentially LSTM and attention.(default: 2)
        init (float, optional): initial scale for LayerNorm. (default: 1e-4)
        norm_type (bool, optional): Norm type, either ``group_norm `` or ``none`` (Default: ``group_norm``)
        attn (bool, optional): use LocalAttention. (Default: ``False``)
        heads (int, optional): number of heads for the LocalAttention.  (default: 4)
        ndecay (int, optional): number of decay controls in the LocalAttention. (default: 4)
        lstm (bool, optional): use LSTM. (Default: ``False``)
        kernel_size (int, optional): kernel size for the (dilated) convolutions. (default: 3)
    r7   rQ   r   r8   Fr`   r   r   r   r   r@   r   headsndecayr   r;   c                    t                                                       |
dz  dk    rt          d          || _        || _        t          |          | _        |dk    }d }|dk    rd }t          ||z            }t          j	        }t          j
        g           | _        t          | j                  D ]}|rt          d|          nd}||
dz  z  }t          j        |||
||           ||           |            t          j        |d|z  d           |d|z            t          j        d          t!          ||          g}|r&|                    d	t%          |||
                     |	r&|                    d	t'          |dd                     t          j        | }| j                            |           d S )NrQ   r   z(Kernel size should not be divisible by 2c                 (    t          j                    S r$   rF   rH   s    r    rJ   z!_DConv.__init__.<locals>.<lambda>  rK   r!   r8   c                 ,    t          j        d|           S )Nr   rM   rH   s    r    rJ   z!_DConv.__init__.<locals>.<lambda>  s    Q 2 2 r!   r   )dilationpaddingr`   )r   r   T)layersr|   )r   r   rd   r   r   absr   r-   r   GELUr   r  r   powrR   GLU_LayerScaler   _LocalState_BLSTM
Sequentialr   )r   r   r   r   r   r@   r   r   r   r   r;   dilaterZ   hiddenactrI   r  r  modslayerr   s                       r    r   z_DConv.__init__  s    	?aGHHH  ZZ
 *)$$22GX())gmB''tz"" 	& 	&A$*1s1ayyyH+"23G	(FK(T[\\\	&!h,22H%%q		Hd++D  PA{6vNNNOOO DAvfQTBBBCCCM4(EKu%%%%#	& 	&r!   c                 6    | j         D ]}| ||          z   }|S )zDConv forward call

        Args:
            x (torch.Tensor): input tensor for convolution

        Returns:
            Tensor
                Output after being run through layers.
        )r  )r   r%   r  s      r    r(   z_DConv.forward  s-     [ 	 	EEE!HHAAr!   )	r7   rQ   r   r8   Fr7   r7   Fr`   )r)   r*   r+   r,   r-   r.   rq   r/   r   r(   r2   r3   s   @r    rY   rY   }  s         . %1& 1&1& 1& 	1&
 1& 1& 1& 1& 1& 1& 1& 1& 1& 1& 1& 1&f      r!   rY   c                   R     e Zd ZdZd
dedef fdZdej        dej        fd	Z	 xZ
S )r
  ae  
    BiLSTM with same hidden units as input dim.
    If `max_steps` is not None, input will be splitting in overlapping
    chunks and the LSTM applied separately on each chunk.
    Args:
        dim (int): dimensions at LSTM layer.
        layers (int, optional): number of LSTM layers. (default: 1)
        skip (bool, optional): (default: ``False``)
    r   Fr  r|   c                     t                                                       d| _        t          j        d|||          | _        t          j        d|z  |          | _        || _        d S )N   T)bidirectional
num_layershidden_size
input_sizerQ   )	r   r   	max_stepsr   LSTMr   Linearlinearr|   )r   r   r  r|   r   s       r    r   z_BLSTM.__init__  s]    G$6s_bccc	iC--			r!   r%   r"   c           	         |j         \  }}}|}d}d}d}d}	| j        d|| j        k    rY| j        }|dz  }t          |||          }
|
j         d         }	d}|
                    dddd                              d||          }|                    ddd          }|                     |          d         }|                     |          }|                    ddd          }|rg }|                    |d||          }
|dz  }t          |	          D ]}|dk    r'|                    |
dd|ddd| f                    /||	dz
  k    r&|                    |
dd|dd|df                    ^|                    |
dd|dd|| f                    t          j
        |d          }|d	d|f         }|}| j        r||z   }|S )
a  BLSTM forward call

        Args:
            x (torch.Tensor): input tensor for BLSTM shape is `(batch_size, dim, time_steps)`

        Returns:
            Tensor
                Output after being run through bidirectional LSTM. Shape is `(batch_size, dim, time_steps)`
        Fr   NrQ   Tr   r`   r_   .)ra   r  _unfoldrf   rg   r   r  r   r   r   catr|   )r   r%   ri   rj   rl   rn   framedwidthr<   nframesframesr'   limitks                 r    r(   z_BLSTM.forward  s    '1a>%!dn*<*<NEaZFQv..Fl1oGFq!Q**222q%@@AIIaAIIaLLOKKNNIIaA 	CYYq"a//FaKE7^^ > >66JJvaaaAAAww&678888'A+%%JJvaaaAAAuvvo67777JJvaaaAAAueV|&;<====)C$$Cc2A2g,CA9 	AAr!   )r   F)r)   r*   r+   r,   r-   r/   r   r   r1   r(   r2   r3   s   @r    r
  r
    s          C 4      . .%, . . . . . . . .r!   r
  c                   V     e Zd ZdZd
dededef fdZdej        dej        fd	Z xZ	S )r	  a   Local state allows to have attention based only on data (no positional embedding),
    but while setting a constraint on the time window (e.g. decaying penalty term).
    Also a failed experiments with trying to provide some frequency based attention.
    r7   r   r   r   c                 \   t          t          |                                            ||z  dk    rt          d          || _        || _        t          j        ||d          | _        t          j        ||d          | _	        t          j        ||d          | _
        t          j        |||z  d          | _        |rK| j        j        xj        dz  c_        | j        j        t          d          d| j        j        j        dd<   t          j        ||dz  z   |d          | _        dS )z
        Args:
            channels (int): Size of Conv1d layers.
            heads (int, optional):  (default: 4)
            ndecay (int, optional): (default: 4)
        r   z$Channels must be divisible by heads.r   g{Gz?Nzbias must not be None.r   )r   r	  r   rd   r   r   r   rR   contentquerykeyquery_decayr   r   biasproj)r   r   r   r   r   s       r    r   z_LocalState.__init__  s    	k4  ))+++eq  CDDD
y8Q77Yx155
9Xx339Xuv~qAA 	/#((D0(($, !9:::,.D!&qqq)Ih2Ha@@			r!   r%   r"   c                    |j         \  }}}| j        }t          j        ||j        |j                  }|dddf         |dddf         z
  }|                     |                              ||d|          }|                     |                              ||d|          }	t          j	        d|	|          }
|
t          j        |	j         d                   z  }
| j        rt          j        d| j        dz   |j        |j                  }|                     |                              ||d|          }t          j        |          dz  }|                    ddd           |                                z  t          j        | j                  z  }|
t          j	        d||          z  }
|
                    t          j        ||
j        t          j                  d           t          j        |
d	          }|                     |                              ||d|          }t          j	        d
||          }|                    |d|          }||                     |          z   S )zLocalState forward call

        Args:
            x (torch.Tensor): input tensor for LocalState

        Returns:
            Tensor
                Output after being run through LocalState layer.
        )r   dtypeNr_   zbhct,bhcs->bhtsrQ   r   zfts,bhfs->bhtsir   zbhts,bhct->bhcs)ra   r   r   r   r   r/  r)  rb   r*  einsumr   r   r   r+  sigmoidr  masked_fill_eyer/   softmaxr(  rg   r-  )r   r%   ri   rj   rl   r   indexesdeltaquerieskeysdotsdecaysdecay_qdecay_kernelweightsr(  results                    r    r(   z_LocalState.forward6  s%    '1a
,qAAA4 747#33**Q--$$Qr155xx{{5"a00|-tW==	$*Q-(((; 	J\!T[1_QXQWUUUF&&q))..q%Q??GmG,,q0G"KKAq111EIIKK?$)DKBXBXXLEL!1<IIID 	%)AdkLLLdSSS-!,,,,,q//&&q%Q77/'BB2q))499V$$$$r!   )r7   r7   )
r)   r*   r+   r,   r-   r   r   r1   r(   r2   r3   s   @r    r	  r	    s         
A A AS Ac A A A A A A2#% #%%, #% #% #% #% #% #% #% #%r!   r	  c                   R     e Zd ZdZd	dedef fdZdej        dej        fdZ	 xZ
S )
r  zLayer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
    This rescales diagonally residual outputs close to 0 initially, then learnt.
    r   r   r   c                     t                                                       t          j        t	          j        |d                    | _        || j        j        dd<   dS )z
        Args:
            channels (int): Size of  rescaling
            init (float, optional): Scale to default to (default: 0)
        T)requires_gradN)r   r   r   	Parameterr   zerosr   r   )r   r   r   r   s      r    r   z_LayerScale.__init__a  sS     	\%+hd"K"K"KLL
!
r!   r%   r"   c                 *    | j         dddf         |z  S )zLayerScale forward call

        Args:
            x (torch.Tensor): input tensor for LayerScale

        Returns:
            Tensor
                Output after rescaling tensor.
        N)r   )r   r%   s     r    r(   z_LayerScale.forwardk  s     z!!!T'"Q&&r!   )r   )r)   r*   r+   r,   r-   r.   r   r   r1   r(   r2   r3   s   @r    r  r  \  s         " " "E " " " " " "
' 
'%, 
' 
' 
' 
' 
' 
' 
' 
'r!   r  ar;   r<   r"   c                     t           j        dd                   }t           j        d                   }t          j        ||z            }|dz
  |z  |z   }t          j         d||z
  g            fdt                                                     D             }|d         dk    rt          d          |dd         |dgz   }|
                    |           |
                    |                                ||          S )zGiven input of size [*OT, T], output Tensor of size [*OT, F, K]
    with K the kernel size, by extracting frames with the given stride.
    This will pad the input so that `F = ceil(T / K)`.
    see https://github.com/pytorch/pytorch/issues/60466
    Nr_   r   r   )r   rC   c                 :    g | ]}                     |          S  )r<   ).0r   rE  s     r    
<listcomp>z_unfold.<locals>.<listcomp>  s#    777qxx}}777r!   zData should be contiguous.)listra   r-   r   r   rc   rC   r   r   rd   r   
as_strided)rE  r;   r<   ra   r~   n_frames
tgt_lengthstridess   `       r    r  r  x  s    "EFy&))HQ,&(;6J	AAzF23444A7777aeegg777Gr{a5666crclfa[(G	LL	LL<<w'''r!   c                 t   |                                  D ]}t          |t          j        t          j        t          j        t          j        f          rd|j                                        	                                }|dz  dz  }|j        xj
        |z  c_
        |j        |j        xj
        |z  c_
        dS )zI
    Rescales initial weight scale for all models within the module.
    g?g      ?N)modules
isinstancer   rR   rx   rS   ry   r   r   detachr   r,  )modulesubr   r   s       r    r   r     s     ~~ ' 'cBIr'929bFXYZZ 	'*..""))++C3Y3&EJOOu$OOx#&' 'r!      r%   n_fftr   rC   c                    t          | j        d d                   }t          | j        d                   }|                     d|          } t	          j        | |d|z   z  |t	          j        |                              |           |dddd	  	        }|j        \  }}}	|                    ||	g           |	                    |          S )Nr_   r   Tr   )window
win_length
normalizedcenterreturn_complexpad_mode)
rK  ra   r-   rg   r   stfthann_windowtoextendrb   )
r%   rW  r   rC   otherr~   ro   r   r   frames
             r    r   r     s    "EF			"fA
	S ''**1--
	 
	 
	A gOAue	LL%   66%==r!   ro   r~   c           
         t          | j        d d                   }t          | j        d                   }t          | j        d                   }d|z  dz
  }|                     d||          } |d|z   z  }t	          j        | ||t	          j        |                              | j                  |d|d          }	|	j        \  }
}|	                    |           |	                    |          S )Nr   r_   rQ   r   T)rY  rZ  r[  r~   r\  )
rK  ra   r-   rb   r   istftr`  ra  realr   )ro   r   r~   rC   rc  r   r#  rW  rZ  r%   r   s              r    r   r     s    "EEFIME	r5&!!A1s7#J	 ,,//77		 		 		A IAv	LL66%==r!   r   c                 &    t          | dd          S )zBuilds low nfft (1024) version of :class:`HDemucs`, suitable for sample rates around 8 kHz.

    Args:
        sources (List[str]): See :py:func:`HDemucs`.

    Returns:
        HDemucs:
            HDemucs model.
    i   r   r   r   r   r   r   s    r    hdemucs_lowrl         7Q7777r!   c                 &    t          | dd          S )a  Builds medium nfft (2048) version of :class:`HDemucs`, suitable for sample rates of 16-32 kHz.

    .. note::

        Medium HDemucs has not been tested against the original Hybrid Demucs as this nfft and depth configuration is
        not compatible with the original implementation in https://github.com/facebookresearch/demucs

    Args:
        sources (List[str]): See :py:func:`HDemucs`.

    Returns:
        HDemucs:
            HDemucs model.
    r   r   ri  rj  rk  s    r    hdemucs_mediumro    s      7Q7777r!   c                 &    t          | dd          S )zBuilds medium nfft (4096) version of :class:`HDemucs`, suitable for sample rates of 44.1-48 kHz.

    Args:
        sources (List[str]): See :py:func:`HDemucs`.

    Returns:
        HDemucs:
            HDemucs model.
    r   r   ri  rj  rk  s    r    hdemucs_highrq    rm  r!   )rV  r   r   )r   r   r   )r   typingtpr   r   r   r   r   r   torch.nnr   rc   Moduler
   r5   rs   r   rY   r
  r	  r  r1   r-   r  r   r   r   rq   rl  ro  rq  rH  r!   r    <module>rv     sS  4      , , , , , , , , , , , ,        $ $ $ $ $ $# # # # #ux # # #Lk k k k k k k k\s s s s s s s slM M M M Meho M M M`
T T T T TUX_ T T Tn@ @ @ @ @UX_ @ @ @FB% B% B% B% B%") B% B% B%J' ' ' ' '") ' ' '8(u| (# (s (u| ( ( ( ((
' 
' 
'  S C # V[Vb    (  3 C # V[Vb    .8c 8w 8 8 8 88DI 8' 8 8 8 8&8$s) 8 8 8 8 8 8 8r!   