
"""Multi-Head Attention layer definition."""

import math
from typing import Tuple

import torch
from torch import nn


class MultiHeadedAttention(nn.Module):
    """Multi-Head Attention layer.

    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.

    """

    def __init__(self,
                 n_head: int,
                 n_feat: int,
                 dropout_rate: float,
                 key_bias: bool = True):
        """Construct an MultiHeadedAttention object."""
        super().__init__()
        assert n_feat % n_head == 0
        # We assume d_v always equals d_k.
        self.d_k = n_feat // n_head
        self.h = n_head
        self.linear_q = nn.Linear(n_feat, n_feat)
        self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias)
        self.linear_v = nn.Linear(n_feat, n_feat)
        self.linear_out = nn.Linear(n_feat, n_feat)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward_qkv(
        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Transform query, key and value.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).

        Returns:
            torch.Tensor: Transformed query tensor, size
                (#batch, n_head, time1, d_k).
            torch.Tensor: Transformed key tensor, size
                (#batch, n_head, time2, d_k).
            torch.Tensor: Transformed value tensor, size
                (#batch, n_head, time2, d_k).

        """
        n_batch = query.size(0)
        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
        q = q.transpose(1, 2)  # (batch, head, time1, d_k)
        k = k.transpose(1, 2)  # (batch, head, time2, d_k)
        v = v.transpose(1, 2)  # (batch, head, time2, d_k)

        return q, k, v

    def forward_attention(
        self,
        value: torch.Tensor,
        scores: torch.Tensor,
        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool)
    ) -> torch.Tensor:
        """Compute attention context vector.

        Args:
            value (torch.Tensor): Transformed value, size
                (#batch, n_head, time2, d_k).
            scores (torch.Tensor): Attention score, size
                (#batch, n_head, time1, time2).
            mask (torch.Tensor): Mask, size (#batch, 1, time2) or
                (#batch, time1, time2), (0, 0, 0) means fake mask.

        Returns:
            torch.Tensor: Transformed value (#batch, time1, d_model)
                weighted by the attention score (#batch, time1, time2).

        """
        n_batch = value.size(0)
        if mask.size(2) > 0:  # time2 > 0, i.e. a real (non-fake) mask
            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
            # For the last chunk, time2 might be larger than scores.size(-1).
            mask = mask[:, :, :, :scores.size(-1)]
            scores = scores.masked_fill(mask, -float('inf'))
            attn = torch.softmax(scores, dim=-1).masked_fill(
                mask, 0.0)  # (batch, head, time1, time2)
        else:
            attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)

        p_attn = self.dropout(attn)
        x = torch.matmul(p_attn, value)  # (batch, head, time1, d_k)
        x = x.transpose(1, 2).contiguous().view(
            n_batch, -1, self.h * self.d_k)  # (batch, time1, d_model)

        return self.linear_out(x)  # (batch, time1, d_model)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        pos_emb: torch.Tensor = torch.empty(0),
        cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute scaled dot product attention.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).
                1. When applying cross attention between decoder and encoder,
                   the batch padding mask for input is in (#batch, 1, T) shape.
                2. When applying self attention of encoder,
                   the mask is in (#batch, T, T) shape.
                3. When applying self attention of decoder,
                   the mask is in (#batch, L, L) shape.
                4. If the different position in decoder see different block
                   of the encoder, such as Mocha, the passed in mask could be
                   in (#batch, L, T) shape. But there is no such case in
                   current CosyVoice.
            cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`.

        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).
            torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`.

        """
        q, k, v = self.forward_qkv(query, key, value)

        if cache.size(0) > 0:
            # The cache stores key and value concatenated along the last dim,
            # so split it in half and prepend the halves to k and v.
            key_cache, value_cache = torch.split(cache,
                                                 cache.size(-1) // 2,
                                                 dim=-1)
            k = torch.cat([key_cache, k], dim=2)
            v = torch.cat([value_cache, v], dim=2)
        new_cache = torch.cat((k, v), dim=-1)

        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        return self.forward_attention(v, scores, mask), new_cache


class RelPositionMultiHeadedAttention(MultiHeadedAttention):
    """Multi-Head Attention layer with relative position encoding.
    Paper: https://arxiv.org/abs/1901.02860
    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self,
                 n_head: int,
                 n_feat: int,
                 dropout_rate: float,
                 key_bias: bool = True):
        """Construct an RelPositionMultiHeadedAttention object."""
        super().__init__(n_head, n_feat, dropout_rate, key_bias)
        # Linear transformation for the positional encoding.
        self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
        # These two learnable biases are used in matrix c and matrix d
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3.
        self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
        self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
        torch.nn.init.xavier_uniform_(self.pos_bias_u)
        torch.nn.init.xavier_uniform_(self.pos_bias_v)

    def rel_shift(self, x: torch.Tensor) -> torch.Tensor:
        """Compute relative positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
                time1 means the length of query vector.

        Returns:
            torch.Tensor: Output tensor.

        """
        zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1),
                               device=x.device,
                               dtype=x.dtype)
        x_padded = torch.cat([zero_pad, x], dim=-1)

        x_padded = x_padded.view(x.size()[0],
                                 x.size()[1],
                                 x.size(3) + 1,
                                 x.size(2))
        # Drop the leading pad column and keep only the relative positions
        # from 0 to time2.
        x = x_padded[:, :, 1:].view_as(x)[:, :, :, :x.size(-1) // 2 + 1]
        return x

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        pos_emb: torch.Tensor = torch.empty(0),
        cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2), (0, 0, 0) means fake mask.
            pos_emb (torch.Tensor): Positional embedding tensor
                (#batch, time2, size).
            cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`.
        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).
            torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`.
        """
        q, k, v = self.forward_qkv(query, key, value)
        q = q.transpose(1, 2)  # (batch, time1, head, d_k)

        if cache.size(0) > 0:
            key_cache, value_cache = torch.split(cache,
                                                 cache.size(-1) // 2,
                                                 dim=-1)
            k = torch.cat([key_cache, k], dim=2)
            v = torch.cat([value_cache, v], dim=2)
        new_cache = torch.cat((k, v), dim=-1)

        n_batch_pos = pos_emb.size(0)
        p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
        p = p.transpose(1, 2)  # (batch, head, time2, d_k)

        # (batch, head, time1, d_k)
        q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
        # (batch, head, time1, d_k)
        q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)

        # Compute matrix a + matrix c (content-based term),
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3.
        # (batch, head, time1, time2)
        matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))

        # Compute matrix b + matrix d (position-based term).
        # (batch, head, time1, time2)
        matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
        # If pos_emb covers 2*time-1 relative positions (espnet-style
        # relative positional encoding), fold it back with rel_shift.
        if matrix_ac.shape != matrix_bd.shape:
            matrix_bd = self.rel_shift(matrix_bd)

        scores = (matrix_ac + matrix_bd) / math.sqrt(
            self.d_k)  # (batch, head, time1, time2)

        return self.forward_attention(v, scores, mask), new_cache
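

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): a minimal
# smoke test showing the expected tensor shapes and the streaming KV cache.
# All dimensions below (batch=2, n_head=4, n_feat=256, chunk lengths) are
# arbitrary assumptions chosen for demonstration, and the pos_emb of length
# 2*time1 - 1 assumes an espnet-style relative positional encoding as
# handled by rel_shift above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    torch.manual_seed(0)
    n_head, n_feat = 4, 256
    mha = MultiHeadedAttention(n_head, n_feat, dropout_rate=0.0)

    # Self-attention over a first chunk of 10 frames, starting without a
    # cache (the default (0, 0, 0, 0) tensor means "no cache yet").
    chunk1 = torch.randn(2, 10, n_feat)
    out1, cache = mha(chunk1, chunk1, chunk1,
                      mask=torch.ones(2, 10, 10, dtype=torch.bool))
    print(out1.shape, cache.shape)  # (2, 10, 256), (2, 4, 10, 128)

    # A second chunk of 5 frames attends to itself plus the 10 cached frames,
    # so the mask covers time2 = 15 key positions.
    chunk2 = torch.randn(2, 5, n_feat)
    out2, cache = mha(chunk2, chunk2, chunk2,
                      mask=torch.ones(2, 5, 15, dtype=torch.bool),
                      cache=cache)
    print(out2.shape, cache.shape)  # (2, 5, 256), (2, 4, 15, 128)

    # Relative-position variant: matrix_bd starts as (2, 4, 10, 19) and is
    # folded to (2, 4, 10, 10) by rel_shift before being added to matrix_ac.
    rel_mha = RelPositionMultiHeadedAttention(n_head, n_feat, dropout_rate=0.0)
    x = torch.randn(2, 10, n_feat)
    pos_emb = torch.randn(1, 2 * 10 - 1, n_feat)
    out3, _ = rel_mha(x, x, x,
                      mask=torch.ones(2, 10, 10, dtype=torch.bool),
                      pos_emb=pos_emb)
    print(out3.shape)  # (2, 10, 256)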