
    0;ji,                     (   d Z ddlZddlmZmZ ddlZddlmc mZ	 ddl
Z G d dej        j                  Z G d de          Z G d d	e          Z G d
 de          Z G d dej        j                  Z G d dej        j                  ZdS )zPositonal Encoding Module.    N)TupleUnionc            	            e Zd ZdZ	 	 ddedededef fdZ	 dd
ej	        de
eej	        f         deej	        ej	        f         fdZ	 dde
eej	        f         dededej	        fdZ xZS )PositionalEncodingzPositional encoding.

    :param int d_model: embedding dim
    :param float dropout_rate: dropout rate
    :param int max_len: maximum input length

    PE(pos, 2i)   = sin(pos/(10000^(2i/dmodel)))
    PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
      Fd_modeldropout_ratemax_lenreversec                 (   t                                                       || _        t          j        | j                  | _        t          j                            |          | _	        || _
        t          j        | j
        | j                  | _        t          j        d| j
        t          j                                      d          }t          j        t          j        d| j        dt          j                  t          j        d          | j        z   z            }t          j        ||z            | j        dddddf<   t          j        ||z            | j        dddddf<   | j                            d          | _        dS )'Construct an PositionalEncoding object.pr   dtype           @N)super__init__r   mathsqrtxscaletorchnnDropoutdropoutr
   zerospearangefloat32	unsqueezeexplogsincos)selfr   r	   r
   r   positiondiv_term	__class__s          g/root/voice-cloning/.venv/lib/python3.11/site-packages/chatterbox/models/s3gen/transformer/embedding.pyr   zPositionalEncoding.__init__%   sR    	i--x'','77+dlDL99<4<&+m5 5 55>Yq\\ 	9LDL!5=AAAhw$,./01 1 !9X%89914a4 9X%89914a4'##A&&    r   xoffsetreturnc                    | j                             |j                  | _         |                     ||                    d          d          }|| j        z  |z   }|                     |          |                     |          fS )aN  Add positional encoding.

        Args:
            x (torch.Tensor): Input. Its shape is (batch, time, ...)
            offset (int, torch.tensor): position offset

        Returns:
            torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
            torch.Tensor: for compatibility to RelPositionalEncoding
        r   F)r   todeviceposition_encodingsizer   r   r'   r-   r.   pos_embs       r+   forwardzPositionalEncoding.forward;   sm     '**QX&&((EBBOg%||AW 5 555r,   Tr4   apply_dropoutc                 ~   t          |t                    r'||z   | j        k    sJ | j        dd|||z   f         }nt          |t          j                  r?|                                dk    r'||z   | j        k    sJ | j        dd|||z   f         }nt	          j        |          |z   | j        k    sJ |                    d          t	          j	        d|          
                    |j                  z   }|dk    }||z  }t          j        || j        d                   }|r|                     |          }|S )   For getting encoding in a streaming fashion

        Attention!!!!!
        we apply dropout only once at the whole utterance level in a none
        streaming way, but will call this function several times with
        increasing input size in a streaming scenario, so the dropout will
        be applied several times.

        Args:
            offset (int or torch.tensor): start offset
            size (int): required size of position encoding

        Returns:
            torch.Tensor: Corresponding encoding
        Nr   r   )
isinstanceintr
   r   r   Tensordimmaxr"   r    r1   r2   F	embeddingr   )r'   r.   r4   r8   r6   indexflags          r+   r3   z$PositionalEncoding.position_encodingO   sJ   * fc"" 	5D=DL0000gaaa!556GG-- 
	5&**,,!2C2CD=DL0000gaaa!556GG9V$$t+t|;;;;$$Q''Q%%((778E19DDLEk%44G 	,ll7++Gr,   )r   Fr   )T)__name__
__module____qualname____doc__r<   floatboolr   r   r=   r   r   r7   r3   __classcell__r*   s   @r+   r   r      s         !%!&	' ''$' ' 	' ' ' ' ' '0 456 6<6c5</06 U\5</06 6 6 6. 15& &"'U\(9":& #& *.& :?& & & & & & & &r,   r   c            	            e Zd ZdZddededef fdZ	 ddej        d	e	eej        f         d
e
ej        ej        f         fdZ xZS )RelPositionalEncodingzRelative positional encoding module.
    See : Appendix B in https://arxiv.org/abs/1901.02860
    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.
    r   r   r	   r
   c                 R    t                                          |||d           dS )zInitialize class.T)r   N)r   r   r'   r   r	   r
   r*   s       r+   r   zRelPositionalEncoding.__init__   s*    ,FFFFFr,   r   r-   r.   r/   c                    | j                             |j                  | _         || j        z  }|                     ||                    d          d          }|                     |          |                     |          fS )a  Compute positional encoding.
        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).
        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`).
            torch.Tensor: Positional embedding tensor (1, time, `*`).
        r   F)r   r1   r2   r   r3   r4   r   r5   s       r+   r7   zRelPositionalEncoding.forward   sh     '**QX&&O((EBB||AW 5 555r,   r   rD   )rE   rF   rG   rH   r<   rI   r   r   r=   r   r   r7   rK   rL   s   @r+   rN   rN   x   s         G G G5 G3 G G G G G G 456 6<6c5</06 U\5</06 6 6 6 6 6 6 6r,   rN   c                   2     e Zd ZdZddededef fdZ xZS )WhisperPositionalEncodingz@ Sinusoids position encoding used in openai-whisper.encoder
      r   r	   r
   c                 X   t                                          |||           d| _        t          j        d          |dz  dz
  z  }t          j        | t          j        |dz            z            }t          j        |          d d t          j        f         |t          j        d d f         z  }t          j	        t          j
        |          t          j        |          gd          }t          | d           |                     d|                    d                     d S )N      ?i'  r   r   r>   r   r   )r   r   r   npr$   r   r#   r    newaxiscatr%   r&   delattrregister_bufferr"   )	r'   r   r	   r
   log_timescale_incrementinv_timescalesscaled_timer   r*   s	           r+   r   z"WhisperPositionalEncoding.__init__   s   ,888"$&--7a<!3C"D$;#;#(<1#=#=$> ? ?l7++AAArzM:2:qqq=)*Y	+..	+0F0FGQOOOdT2<<??33333r,   )rU   rE   rF   rG   rH   r<   rI   r   rK   rL   s   @r+   rT   rT      sa         
4 
4 
45 
43 
4 
4 
4 
4 
4 
4 
4 
4 
4 
4r,   rT   c                   2     e Zd ZdZddededef fdZ xZS )LearnablePositionalEncodingz@ Learnable position encoding used in openai-whisper.decoder
      r   r	   r
   c                     t                                          |||           t          j                            t          j        d||                    | _        d| _        d S )Nr   rW   )r   r   r   r   	Parameteremptyr   r   rP   s       r+   r   z$LearnablePositionalEncoding.__init__   sO    ,888($$U[GW%E%EFFr,   )rd   ra   rL   s   @r+   rc   rc      sa           5 3          r,   rc   c            	            e Zd ZdZdedef fdZ	 ddej        de	eej        f         de
ej        ej        f         fd	Zde	eej        f         d
edej        fdZ xZS )NoPositionalEncodingz No position encoding
    r   r	   c                     t                                                       || _        t          j                            |          | _        d S )Nr   )r   r   r   r   r   r   r   )r'   r   r	   r*   s      r+   r   zNoPositionalEncoding.__init__   s>    x'','77r,   r   r-   r.   r/   c                     t          j        d|                    d          | j                                      |j                  }|                     |          |fS )z= Just return zero vector for interface compatibility
        r   )r   r   r4   r   r1   r2   r   r5   s       r+   r7   zNoPositionalEncoding.forward   sH     +aDL99<<QXFF||A''r,   r4   c                 8    t          j        d|| j                  S )Nr   )r   r   r   )r'   r.   r4   s      r+   r3   z&NoPositionalEncoding.position_encoding   s    {1dDL111r,   rD   )rE   rF   rG   rH   r<   rI   r   r   r=   r   r   r7   r3   rK   rL   s   @r+   ri   ri      s         8 85 8 8 8 8 8 8 45( (<(c5</0( U\5</0( ( ( (2c5<.?(@ 2 #2(-2 2 2 2 2 2 2 2r,   ri   c            	            e Zd ZdZddededef fdZdej        fdZ	ddej        d
e
eej        f         deej        ej        f         fdZd
e
eej        f         dedej        fdZ xZS )EspnetRelPositionalEncodingaR  Relative positional encoding module (new implementation).

    Details can be found in https://github.com/espnet/espnet/pull/2816.

    See : Appendix B in https://arxiv.org/abs/1901.02860

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.

    r   r   r	   r
   c                 n   t          t          |                                            || _        t	          j        | j                  | _        t          j        	                    |          | _
        d| _        |                     t          j        d                              d|                     dS )r   r   Ng        r   )r   rn   r   r   r   r   r   r   r   r   r   r   	extend_petensorexpandrP   s       r+   r   z$EspnetRelPositionalEncoding.__init__   s    )40099;;;i--x'','77u|C((//7;;<<<<<r,   r-   c                 <   | j         | j                             d          |                    d          dz  dz
  k    rW| j         j        |j        k    s| j         j        |j        k    r+| j                             |j        |j                  | _         dS t          j        |                    d          | j                  }t          j        |                    d          | j                  }t          j        d|                    d          t
          j	                  
                    d          }t          j        t          j        d| j        dt
          j	                  t          j        d          | j        z   z            }t          j        ||z            |dddddf<   t          j        ||z            |dddddf<   t          j        d|z  |z            |dddddf<   t          j        d|z  |z            |dddddf<   t          j        |dg          
                    d          }|dd         
                    d          }t          j        ||gd	          }|                    |j        |j        
          | _         dS )zReset the positional encodings.Nr   r   )r   r2   r   r   r   rX   )r2   r   )r   r4   r   r2   r1   r   r   r   r    r!   r"   r#   r   r$   r%   r&   flipr[   )r'   r-   pe_positivepe_negativer(   r)   r   s          r+   rp   z%EspnetRelPositionalEncoding.extend_pe   sH   7 w||A!&&))a-!"3337=AG++tw~/I/I"gjjqwqxjHHDG k!&&))T\::k!&&))T\::<166!99EMBBBLLQOO9LDL!5=AAA!!DL012
 
  %yH)<==AAAqt!tG$yH)<==AAAqt!tG$yh)ABBAAAqt!tG$yh)ABBAAAqt!tG
 jqc22<<Q??!!""o//22Y[1q999%%qxqw%77r,   r   r.   r/   c                     |                      |           || j        z  }|                     |                    d          |          }|                     |          |                     |          fS )zAdd positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).

        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`).

        r   )r4   r.   )rp   r   r3   r4   r   r5   s       r+   r7   z#EspnetRelPositionalEncoding.forward   sd     	qO((affQii(GG||AW 5 555r,   r4   c                     | j         dd| j                             d          dz  |z
  dz   | j                             d          dz  |z   f         }|S )r:   Nr   r   )r   r4   )r'   r.   r4   r6   s       r+   r3   z-EspnetRelPositionalEncoding.position_encoding  s]    $ 'AAGLLOOq 4'!+TW\\!__-AD-HHJ
 r,   rR   rD   )rE   rF   rG   rH   r<   rI   r   r   r=   rp   r   r   r7   r3   rK   rL   s   @r+   rn   rn      s        = = =5 =3 = = = = = =85< 8 8 8 8@6 6 6uS%,5F/G 6U\5</06 6 6 6 "'U\(9": #(-       r,   rn   )rH   r   typingr   r   r   torch.nn.functionalr   
functionalr@   numpyrY   Moduler   rN   rT   rc   ri   rn    r,   r+   <module>r      s    !                         [ [ [ [ [ [ [ [|6 6 6 6 6. 6 6 6<4 4 4 4 4 2 4 4 4"    "4   2 2 2 2 258? 2 2 2.] ] ] ] ]%(/ ] ] ] ] ]r,   