
    0;ji~5                        d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZ d
dlmZmZmZmZ d
dlmZ d
dlmZ  G d dej                  Z G d dej                  Z G d dej        j                  ZdS )zEncoder definition.    )TupleN)nn)
functional   )ConvolutionModule)ConformerEncoderLayer)PositionwiseFeedForward   )COSYVOICE_EMB_CLASSESCOSYVOICE_SUBSAMPLE_CLASSESCOSYVOICE_ATTENTION_CLASSESCOSYVOICE_ACTIVATION_CLASSES)make_pad_mask)add_optional_chunk_maskc                   V     e Zd ZdZd
dededef fdZdej        dej        fd	Z xZ	S )
Upsample1Da  A 1D upsampling layer with an optional convolution.

    Parameters:
        channels (`int`):
            number of channels in the inputs and outputs.
        use_conv (`bool`, default `False`):
            option to use a convolution.
        use_conv_transpose (`bool`, default `False`):
            option to use a convolution transpose.
        out_channels (`int`, optional):
            number of output channels. Defaults to `channels`.
    r
   channelsout_channelsstridec                     t                                                       || _        || _        || _        t          j        | j        | j        |dz  dz   dd          | _        d S )Nr
   r   r   )r   padding)super__init__r   r   r   r   Conv1dconv)selfr   r   r   	__class__s       n/root/voice-cloning/.venv/lib/python3.11/site-packages/chatterbox/models/s3gen/transformer/upsample_encoder.pyr   zUpsample1D.__init__3   s^     (IdmT->
QWXbcddd			    inputsinput_lengthsc                     t          j        |t          | j                  d          }t          j        || j        dz  dfd          }|                     |          }||| j        z  fS )Nnearest)scale_factormoder
   r           )value)Finterpolatefloatr   padr   )r   r    r!   outputss       r   forwardzUpsample1D.forward;   sf    -U4;5G5GiXXX%$+/1!5SAAA))G$$333r   )r
   )
__name__
__module____qualname____doc__intr   torchTensorr-   __classcell__r   s   @r   r   r   %   s         e e eC e e e e e e e4el 45< 4 4 4 4 4 4 4 4r   r   c                   N     e Zd Zddedef fdZdej        dej        fdZ xZS )	PreLookaheadLayerr   r   pre_lookahead_lenc                     t                                                       || _        || _        t	          j        |||dz   dd          | _        t	          j        ||ddd          | _        d S )Nr   r   )kernel_sizer   r      )r   r   r   r9   r   r   conv1conv2)r   r   r9   r   s      r   r   zPreLookaheadLayer.__init__C   s|     !2Yh)A-a
 
 


 Yh!Q
 
 



r   r    returnc                    |                     dd                                          }t          j        |d| j        fdd          }t          j        |                     |                    }t          j        |ddd          }|                     |          }|                     dd                                          }||z   }|S )z9
        inputs: (batch_size, seq_len, channels)
        r   r
   r   constantr&   )r%   r'   )r
   r   )	transpose
contiguousr(   r+   r9   
leaky_relur=   r>   )r   r    r,   s      r   r-   zPreLookaheadLayer.forwardQ   s     ""1a((3355%!T%;!<:UXYYY,tzz'2233%jDDD**W%%##Aq))4466 F"r   )r   )	r.   r/   r0   r2   r   r3   r4   r-   r5   r6   s   @r   r8   r8   B   sv        
 
 
 
 
 
 
 
 
el u|        r   r8   c            3           e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d8dedededededededededededededej        j	        ded ed!ed"ed#ed$ed%ed&ed'ed(ed)ef2 fd*Z
d+efd,Z	 	 d9d.ej        d/ej        d0ed1ed+eej        ej        f         f
d2Zd.ej        d3ej        d4ej        d5ej        d+ej        f
d6Zd.ej        d3ej        d4ej        d5ej        d+ej        f
d7Z xZS ):UpsampleConformerEncoder            皙?linearrel_pos_espnetTr   FNr   rel_selfattnswish   
batch_norm
input_sizeoutput_sizeattention_headslinear_units
num_blocksdropout_ratepositional_dropout_rateattention_dropout_rateinput_layerpos_enc_layer_typenormalize_beforestatic_chunk_sizeuse_dynamic_chunkglobal_cmvnuse_dynamic_left_chunkpositionwise_conv_kernel_sizemacaron_styleselfattention_layer_typeactivation_typeuse_cnn_modulecnn_module_kernelcausalcnn_module_normkey_biasgradient_checkpointingc                    t                                                       | _        || _        t	          |	         |t          |
         |                    | _        | _        t          j	        
                    d          | _        || _        || _        || _        || _        t!          |                     }|||f||f||||ft#          dd          | _        t          j	                            f	dt)          |          D                       | _        t-          ddd          | _        t	          |	         |t          |
         |                    | _        t          j	                            f	d	t)          d
          D                       | _        dS )a  
        Args:
            input_size (int): input dim
            output_size (int): dimension of attention
            attention_heads (int): the number of heads of multi head attention
            linear_units (int): the hidden units number of position-wise feed
                forward
            num_blocks (int): the number of decoder blocks
            dropout_rate (float): dropout rate
            attention_dropout_rate (float): dropout rate in attention
            positional_dropout_rate (float): dropout rate after adding
                positional encoding
            input_layer (str): input layer type.
                optional [linear, conv2d, conv2d6, conv2d8]
            pos_enc_layer_type (str): Encoder positional encoding layer type.
                opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos]
            normalize_before (bool):
                True: use layer_norm before each sub-block of a layer.
                False: use layer_norm after each sub-block of a layer.
            static_chunk_size (int): chunk size for static chunk training and
                decoding
            use_dynamic_chunk (bool): whether use dynamic chunk size for
                training or not, You can only use fixed chunk(chunk_size > 0)
                or dyanmic chunk size(use_dynamic_chunk = True)
            global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module
            use_dynamic_left_chunk (bool): whether use dynamic left chunk in
                dynamic chunk training
            key_bias: whether use bias in attention.linear_k, False for whisper models.
            gradient_checkpointing: rerunning a forward-pass segment for each
                checkpointed segment during backward.
        gh㈵>)epsrG   r<   )r   r9   c                    	 g | ]A}t          t          	          t           r	t           nd 
r	t           nd           BS Nr   r   r	   r   .0_convolution_layer_argsrW   encoder_selfattn_layer_argsrb   r\   rS   positionwise_layer_argsrc   re   s     r   
<listcomp>z5UpsampleConformerEncoder.__init__.<locals>.<listcomp>   s     -
 -
 -
  "+,DE02')@A1>I',. .DH0>I!+- -DH  -
 -
 -
r   r
   )r   r   r   c                    	 g | ]A}t          t          	          t           r	t           nd 
r	t           nd           BS rn   ro   rp   s     r   rv   z5UpsampleConformerEncoder.__init__.<locals>.<listcomp>   s     0
 0
 0
  "+,DE02')@A1>I',. .DH0>I!+- -DH  0
 0
 0
r      N)r   r   _output_sizer_   r   r   embedr\   r3   r   	LayerNorm
after_normr]   r^   r`   rj   r   r8   pre_lookahead_layer
ModuleListrangeencodersr   up_layerup_embedup_encoders)r   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   
activationrs   rt   ru   r   s     `   `    `     `` `      @@@r   r   z!UpsampleConformerEncoder.__init__e   sW   v 	'&0=!"45k6MO O	
 

 !1(,,[d,CC!2!2&<#&<#1/BDD
 "	'
# 	#
 #./@*"16";#4cUV#W#W#W ++ -
 -
 -
 -
 -
 -
 -
 -
 -
 -
 -
 -
 Z((-
 -
 -
   #Cc!LLL3K@!"45k6MO O	
 
 !8.. 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 Qxx0
 0
 0
  r   r?   c                     | j         S rn   )ry   )r   s    r   rS   z$UpsampleConformerEncoder.output_size   s      r   xsxs_lensdecoding_chunk_sizenum_decoding_left_chunksc           	         |                     d          }t          ||                              d           }| j        |                     |          }|                     ||          \  }}}|}t          ||| j        | j        || j        |          }	| 	                    |          }| 
                    ||	||          }|                    dd                                          }|                     ||          \  }}|                    dd                                          }|                     d          }t          ||                              d           }|                     ||          \  }}}|}t          ||| j        | j        || j        | j        j        z  |          }	|                     ||	||          }| j        r|                     |          }||fS )a  Embed positions in tensor.

        Args:
            xs: padded input tensor (B, T, D)
            xs_lens: input length (B)
            decoding_chunk_size: decoding chunk size for dynamic chunk
                0: default for training, use random dynamic chunk.
                <0: for decoding, use full chunk.
                >0: for decoding, use fixed chunk size as set.
            num_decoding_left_chunks: number of left chunks, this is for decoding,
            the chunk size is decoding_chunk_size.
                >=0: use num_decoding_left_chunks
                <0: use all left chunks
        Returns:
            encoder output tensor xs, and subsampled masks
            xs: padded output tensor (B, T' ~= T/subsample_rate, D)
            masks: torch.Tensor batch padding mask after subsample
                (B, 1, T' ~= T/subsample_rate)
        NOTE(xcsong):
            We pass the `__call__` method of the modules instead of `forward` to the
            checkpointing API because `__call__` attaches all the hooks of the module.
            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
        r   Nr
   )sizer   	unsqueezer_   rz   r   r^   r`   r]   r}   forward_layersrB   rC   r   r   r   forward_up_layersr\   r|   )
r   r   r   r   r   Tmaskspos_embmask_padchunk_maskss
             r   r-   z UpsampleConformerEncoder.forward   s   < GGAJJw**44Q777'!!"%%B!ZZE22GU-b%.2.D.2.I.A.2.D.FH H %%b))  ['8DD \\!Q**,,mmB00G\\!Q**,,GGAJJw**44Q777!]]2u55GU-b%.2.D.2.I.A.2.Dt}G[.[.FH H ##BWhGG  	%$$B 5yr   r   r   r   c                 @    | j         D ]} |||||          \  }}}}|S rn   )r   r   r   r   r   r   layerrr   s          r   r   z'UpsampleConformerEncoder.forward_layers2  s<     ] 	N 	NE$)E"k7H$M$M!BQ	r   c                 @    | j         D ]} |||||          \  }}}}|S rn   )r   r   s          r   r   z*UpsampleConformerEncoder.forward_up_layers9  s=     % 	N 	NE$)E"k7H$M$M!BQ	r   )rG   rG   rH   rI   rJ   rK   rK   rK   rL   rM   Tr   FNFr   FrN   rO   FrP   FrQ   TF)r   r   )r.   r/   r0   r2   r*   strboolr3   r   Moduler   rS   r4   r   r-   r   r   r5   r6   s   @r   rF   rF   c   s          !),(+#"2!%!""''+',-.#(6&$!#+',5C CC C 	C
 C C C "'C !&C C  C C C  C X_C  !%!C" (+#C$ %C& #&'C( )C* +C, -C. /C0 1C2 3C4 !%5C C C C C CJ!S ! ! ! ! $%(*C CLC C !	C
 #&C 
u|U\)	*C C C CJ EL  %!&27,   EL u| #(<$)L5:\       r   rF   )r1   typingr   r3   r   torch.nnr   r(   convolutionr   encoder_layerr   positionwise_feed_forwardr	   utils.class_utilsr   r   r   r   
utils.maskr   r   r   r   r8   rF    r   r   <module>r      s  "                $ $ $ $ $ $ * * * * * * 0 0 0 0 0 0 > > > > > >            ' & & & & & 0 0 0 0 0 04 4 4 4 4 4 4 4:    	   B[ [ [ [ [ux [ [ [ [ [r   