
    ~VjiՊ              ,          d dl mZmZ d dlmZmZmZ d dlZd dlm	Z	 g dZ
 G d dej        j                  Z G d d	ej        j                  Z G d
 de          Z G d dej        j        e          Z G d dej        j                  Z G d dej        j                  Z G d dej        j                  Zdedededededededededededed ed!ed"ed#ed$ed%ed&ed'ed(ed)ef,d*Zded)efd+ZdS ),    )ABCabstractmethod)ListOptionalTupleN)Emformer)RNNTemformer_rnnt_baseemformer_rnnt_modelc                   x     e Zd ZdZdeddf fdZdej        dej        deej        ej        f         fdZ	 xZ
S )	_TimeReductionzCoalesces frames along time dimension into a
    fewer number of frames with higher feature dimensionality.

    Args:
        stride (int): number of frames to merge for each output frame.
    stridereturnNc                 V    t                                                       || _        d S N)super__init__r   )selfr   	__class__s     P/root/voice-cloning/.venv/lib/python3.11/site-packages/torchaudio/models/rnnt.pyr   z_TimeReduction.__init__   s$        inputlengthsc                    |j         \  }}}||| j        z  z
  }|ddd|ddf         }|                    | j        d          }|| j        z  }|                    |||| j        z            }|                                }||fS )a  Forward pass.

        B: batch size;
        T: maximum input sequence length in batch;
        D: feature dimension of each input sequence frame.

        Args:
            input (torch.Tensor): input sequences, with shape `(B, T, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.

        Returns:
            (torch.Tensor, torch.Tensor):
                torch.Tensor
                    output sequences, with shape
                    `(B, T  // stride, D * stride)`
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output sequences.
        Ntrunc)rounding_mode)shaper   divreshape
contiguous)	r   r   r   BTD
num_framesT_maxoutputs	            r   forwardz_TimeReduction.forward   s    * +1a!dk/*
aaa*aaa'(++dk+AAdk)q%T[99""$$wr   )__name__
__module____qualname____doc__intr   torchTensorr   r'   __classcell__r   s   @r   r   r      s         s t      U\ EL U5<Y^YeKeEf        r   r   c                        e Zd ZdZ	 	 ddededededd	f
 fd
Zdej	        de
eej	                          deej	        eej	                 f         fdZ xZS )_CustomLSTMa  Custom long-short-term memory (LSTM) block that applies layer normalization
    to internal nodes.

    Args:
        input_dim (int): input dimension.
        hidden_dim (int): hidden dimension.
        layer_norm (bool, optional): if ``True``, enables layer normalization. (Default: ``False``)
        layer_norm_epsilon (float, optional):  value of epsilon to use in
            layer normalization layers (Default: 1e-5)
    Fh㈵>	input_dim
hidden_dim
layer_normlayer_norm_epsilonr   Nc                 0   t                                                       t          j                            |d|z  |           | _        t          j                            |d|z  d          | _        |rPt          j                            ||          | _        t          j                            d|z  |          | _	        nFt          j        
                                | _        t          j        
                                | _	        || _        d S )N   biasF)eps)r   r   r-   nnLinearx2gp2g	LayerNormc_normg_normIdentityr5   )r   r4   r5   r6   r7   r   s        r   r   z_CustomLSTM.__init__C   s     	8??9a*n
N?TT8??:q:~E?JJ 	.(,,Z=O,PPDK(,,Q^AS,TTDKK(++--DK(++--DK$r   r   statec                 :   |d|                     d          }t          j        || j        |j        |j                  }t          j        || j        |j        |j                  }n|\  }}|                     |          }g }|                    d          D ]}||                     |          z   }| 	                    |          }|
                    dd          \  }	}
}}|	                                }	|
                                }
|                                }|                                }|
|z  |	|z  z   }|                     |          }||                                z  }|                    |           t          j        |d          }||g}||fS )a  Forward pass.

        B: batch size;
        T: maximum sequence length in batch;
        D: feature dimension of each input sequence element.

        Args:
            input (torch.Tensor): with shape `(T, B, D)`.
            state (List[torch.Tensor] or None): list of tensors
                representing internal state generated in preceding invocation
                of ``forward``.

        Returns:
            (torch.Tensor, List[torch.Tensor]):
                torch.Tensor
                    output, with shape `(T, B, hidden_dim)`.
                List[torch.Tensor]
                    list of tensors representing internal state generated
                    in current invocation of ``forward``.
        N   )devicedtyper   r9   )dim)sizer-   zerosr5   rH   rI   r?   unbindr@   rC   chunksigmoidtanhrB   appendstack)r   r   rE   r!   hcgated_inputoutputsgates
input_gateforget_gate	cell_gateoutput_gater&   s                 r   r'   z_CustomLSTM.forwardV   s   . =

1AAtu|5;WWWAAtu|5;WWWAADAqhhuoo ''** 	 	EDHHQKK'EKK&&E>Ckk!Q>O>O;JY#++--J%--//K!((I%--//Ka*y"88AAAaffhh&ANN1W!,,,Au}r   )Fr3   r(   r)   r*   r+   r,   boolfloatr   r-   r.   r   r   r   r'   r/   r0   s   @r   r2   r2   7   s        	 	 !$(% %% % 	%
 "% 
% % % % % %&0\0*243E*F0	u|T%,//	00 0 0 0 0 0 0 0r   r2   c                      e Zd Zedej        dej        deej        ej        f         fd            Zedej        dej        dee	e	ej                                   deej        ej        e	e	ej                          f         fd            Z
dS )_Transcriberr   r   r   c                     d S r    )r   r   r   s      r   r'   z_Transcriber.forward   s    r   statesc                     d S r   rb   )r   r   r   rc   s       r   inferz_Transcriber.infer   s	     	r   N)r(   r)   r*   r   r-   r.   r   r'   r   r   re   rb   r   r   r`   r`      s        U\ EL U5<Y^YeKeEf    ^ |  d5<012	
 
u|U\4U\0B+CC	D   ^  r   r`   c            !           e Zd ZdZddddddded	ed
edededededededededededededdf  fdZde	j
        de	j
        dee	j
        e	j
        f         fdZe	j        j        de	j
        de	j
        deeee	j
                                   dee	j
        e	j
        eee	j
                          f         fd            Z xZS )_EmformerEncodera  Emformer-based recurrent neural network transducer (RNN-T) encoder (transcription network).

    Args:
        input_dim (int): feature dimension of each input sequence element.
        output_dim (int): feature dimension of each output sequence element.
        segment_length (int): length of input segment expressed as number of frames.
        right_context_length (int): length of right context expressed as number of frames.
        time_reduction_input_dim (int): dimension to scale each element in input sequences to
            prior to applying time reduction block.
        time_reduction_stride (int): factor by which to reduce length of input sequence.
        transformer_num_heads (int): number of attention heads in each Emformer layer.
        transformer_ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network.
        transformer_num_layers (int): number of Emformer layers to instantiate.
        transformer_left_context_length (int): length of left context.
        transformer_dropout (float, optional): transformer dropout probability. (Default: 0.0)
        transformer_activation (str, optional): activation function to use in each Emformer layer's
            feedforward network. Must be one of ("relu", "gelu", "silu"). (Default: "relu")
        transformer_max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
        transformer_weight_init_scale_strategy (str, optional): per-layer weight initialization scaling
            strategy. Must be one of ("depthwise", "constant", ``None``). (Default: "depthwise")
        transformer_tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
            relur   	depthwiseF)transformer_dropouttransformer_activationtransformer_max_memory_size&transformer_weight_init_scale_strategytransformer_tanh_on_memr4   
output_dimsegment_lengthright_context_lengthtime_reduction_input_dimtime_reduction_stridetransformer_num_headstransformer_ffn_dimtransformer_num_layerstransformer_left_context_lengthrk   rl   rm   rn   ro   r   Nc                   t                                                       t          j                            ||d          | _        t          |          | _        ||z  }t          ||||	||z  |||
||z  |||          | _	        t          j                            ||          | _
        t          j                            |          | _        d S )NFr:   )dropout
activationleft_context_lengthrr   max_memory_sizeweight_init_scale_strategytanh_on_mem)r   r   r-   r=   r>   input_linearr   time_reductionr   transformeroutput_linearrA   r6   )r   r4   rp   rq   rr   rs   rt   ru   rv   rw   rx   rk   rl   rm   rn   ro   transformer_input_dimr   s                    r   r   z_EmformerEncoder.__init__   s    & 	!HOO$ , 
 

 --BCC 8;P P#!!"33'- ?!59N!N7'M/
 
 
 #X__-BJOO(,,Z88r   r   r   c                     |                      |          }|                     ||          \  }}|                     ||          \  }}|                     |          }|                     |          }	|	|fS )a  Forward pass for training.

        B: batch size;
        T: maximum input sequence length in batch;
        D: feature dimension of each input sequence frame (input_dim).

        Args:
            input (torch.Tensor): input frame sequences right-padded with right context, with
                shape `(B, T + right context length, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.

        Returns:
            (torch.Tensor, torch.Tensor):
                torch.Tensor
                    output frame sequences, with
                    shape `(B, T // time_reduction_stride, output_dim)`.
                torch.Tensor
                    output input lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output frame sequences.
        )r   r   r   r   r6   )
r   r   r   input_linear_outtime_reduction_outtime_reduction_lengthstransformer_outtransformer_lengthsoutput_linear_outlayer_norm_outs
             r   r'   z_EmformerEncoder.forward   s    ,  ,,U33595H5HIY[b5c5c22/3/?/?@RTj/k/k,, ..??):;;222r   rc   c                     |                      |          }|                     ||          \  }}| j                            |||          \  }}}	|                     |          }
|                     |
          }|||	fS )aR  Forward pass for inference.

        B: batch size;
        T: maximum input sequence segment length in batch;
        D: feature dimension of each input sequence frame (input_dim).

        Args:
            input (torch.Tensor): input frame sequence segments right-padded with right context, with
                shape `(B, T + right context length, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.
            state (List[List[torch.Tensor]] or None): list of lists of tensors
                representing internal state generated in preceding invocation
                of ``infer``.

        Returns:
            (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    output frame sequences, with
                    shape `(B, T // time_reduction_stride, output_dim)`.
                torch.Tensor
                    output input lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing internal state generated in current invocation
                    of ``infer``.
        )r   r   r   re   r   r6   )r   r   r   rc   r   r   r   r   r   transformer_statesr   r   s               r   re   z_EmformerEncoder.infer   s    F  ,,U33595H5HIY[b5c5c22
 ""#57MvVV		
 ..??):;;24FFFr   )r(   r)   r*   r+   r,   r^   strr]   r   r-   r.   r   r'   jitexportr   r   re   r/   r0   s   @r   rg   rg      s        H &)&,+,6A(-#*9 *9 *9 *9 	*9
 *9 "*9 #&*9  #*9  #*9 !*9 !$*9 *-*9 #*9 !$*9 &)*9  14!*9" "&#*9$ 
%*9 *9 *9 *9 *9 *9X3U\ 3EL 3U5<Y^YeKeEf 3 3 3 3: Y+G|+G +G d5<012	+G
 
u|U\4U\0B+CC	D+G +G +G +G +G +G +G +Gr   rg   c                        e Zd ZdZ	 	 	 ddedededed	ed
edededdf fdZ	 ddej	        dej	        de
eeej	                                   deej	        ej	        eeej	                          f         fdZ xZS )
_Predictora  Recurrent neural network transducer (RNN-T) prediction network.

    Args:
        num_symbols (int): size of target token lexicon.
        output_dim (int): feature dimension of each output sequence element.
        symbol_embedding_dim (int): dimension of each target token embedding.
        num_lstm_layers (int): number of LSTM layers to instantiate.
        lstm_hidden_dim (int): output dimension of each LSTM layer.
        lstm_layer_norm (bool, optional): if ``True``, enables layer normalization
            for LSTM layers. (Default: ``False``)
        lstm_layer_norm_epsilon (float, optional): value of epsilon to use in
            LSTM layer normalization layers. (Default: 1e-5)
        lstm_dropout (float, optional): LSTM dropout probability. (Default: 0.0)

    Fr3   rh   num_symbolsrp   symbol_embedding_dimnum_lstm_layerslstm_hidden_dimlstm_layer_normlstm_layer_norm_epsilonlstm_dropoutr   Nc	                 L   t                                                       t          j                            |          | _        t          j                                      | _        t          j                            fdt          |          D                       | _
        t          j                            |          | _        t          j                            |          | _        t          j                            |          | _        || _        d S )Nc                 F    g | ]}t          |d k    rn          S )r   )r6   r7   )r2   ).0idxr   r   r   r   s     r   
<listcomp>z'_Predictor.__init__.<locals>.<listcomp>H  sT         ,/1HH((/#.'>	    r   )p)r   r   r-   r=   	Embedding	embeddingrA   input_layer_norm
ModuleListrangelstm_layersDropoutrz   r>   linearoutput_layer_normr   )
r   r   rp   r   r   r   r   r   r   r   s
      ` ``` r   r   z_Predictor.__init__9  s    	++K9MNN % 2 23G H H 8..       !11  

 

 x'','77hooozBB!&!3!3J!?!?(r   r   r   rE   c                    |                     dd          }|                     |          }|                     |          }|}g }t          | j                  D ]H\  }	}
 |
||dn||	                   \  }}|                     |          }|                    |           I|                     |          }|                     |          }|                     ddd          ||fS )a#  Forward pass.

        B: batch size;
        U: maximum sequence length in batch;
        D: feature dimension of each input sequence element.

        Args:
            input (torch.Tensor): target sequences, with shape `(B, U)` and each element
                mapping to a target symbol, i.e. in range `[0, num_symbols)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.
            state (List[List[torch.Tensor]] or None, optional): list of lists of tensors
                representing internal state generated in preceding invocation
                of ``forward``. (Default: ``None``)

        Returns:
            (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    output encoding sequences, with shape `(B, U, output_dim)`
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output encoding sequences.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing internal state generated in current invocation of ``forward``.
        rG   r   N   )	permuter   r   	enumerater   rz   rQ   r   r   )r   r   r   rE   input_tbembedding_outinput_layer_norm_outlstm_out	state_out	layer_idxlstmlstm_state_out
linear_outoutput_layer_norm_outs                 r   r'   z_Predictor.forwardX  s    @ ==A&&x00#44]CC'.0	()9:: 	- 	-OIt'+tHemddQVW`Qa'b'b$Hn||H--H^,,,,[[**
 $ 6 6z B B$,,Q155w	IIr   )Fr3   rh   r   r\   r0   s   @r   r   r   (  s2        . !&)-!) )) ) "	)
 ) ) ) "') ) 
) ) ) ) ) )F 59	-J -J|-J -J T%,/01	-J
 
u|U\4U\0B+CC	D-J -J -J -J -J -J -J -Jr   r   c                        e Zd ZdZddedededdf fdZd	ej        d
ej        dej        dej        de	ej        ej        ej        f         f
dZ
 xZS )_Joinera@  Recurrent neural network transducer (RNN-T) joint network.

    Args:
        input_dim (int): source and target input dimension.
        output_dim (int): output dimension.
        activation (str, optional): activation function to use in the joiner.
            Must be one of ("relu", "tanh"). (Default: "relu")

    ri   r4   rp   r{   r   Nc                 b   t                                                       t          j                            ||d          | _        |dk    r%t          j                                        | _        d S |dk    r%t          j                                        | _        d S t          d|           )NTr:   ri   rP   zUnsupported activation )
r   r   r-   r=   r>   r   ReLUr{   Tanh
ValueError)r   r4   rp   r{   r   s       r   r   z_Joiner.__init__  s    hooi$oGG#hmmooDOOO6!!#hmmooDOOOCzCCDDDr   source_encodingssource_lengthstarget_encodingstarget_lengthsc                     |                     d                                          |                     d                                          z   }|                     |          }|                     |          }|||fS )a  Forward pass for training.

        B: batch size;
        T: maximum source sequence length in batch;
        U: maximum target sequence length in batch;
        D: dimension of each source and target sequence encoding.

        Args:
            source_encodings (torch.Tensor): source encoding sequences, with
                shape `(B, T, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                valid sequence length of i-th batch element in ``source_encodings``.
            target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`.
            target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                valid sequence length of i-th batch element in ``target_encodings``.

        Returns:
            (torch.Tensor, torch.Tensor, torch.Tensor):
                torch.Tensor
                    joint network output, with shape `(B, T, U, output_dim)`.
                torch.Tensor
                    output source lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 1 for i-th batch element in joint network output.
                torch.Tensor
                    output target lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 2 for i-th batch element in joint network output.
        r   rG   )	unsqueezer    r{   r   )r   r   r   r   r   joint_encodingsactivation_outr&   s           r   r'   z_Joiner.forward  st    D +44Q77BBDDGWGaGabcGdGdGoGoGqGqq99^,,~~55r   )ri   )r(   r)   r*   r+   r,   r   r   r-   r.   r   r'   r/   r0   s   @r   r   r     s         E E# E3 EC EUY E E E E E E%6,%6 %6  ,	%6
 %6 
u|U\5<7	8%6 %6 %6 %6 %6 %6 %6 %6r   r   c                       e Zd ZdZdedededdf fdZ	 ddej	        d	ej	        d
ej	        dej	        de
eeej	                                   deej	        ej	        ej	        eeej	                          f         fdZej        j        dej	        d	ej	        de
eeej	                                   deej	        ej	        eeej	                          f         fd            Zej        j        dej	        d	ej	        deej	        ej	        f         fd            Zej        j        d
ej	        dej	        de
eeej	                                   deej	        ej	        eeej	                          f         fd            Zej        j        dej	        d	ej	        dej	        dej	        deej	        ej	        ej	        f         f
d            Z xZS )r	   a  torchaudio.models.RNNT()

    Recurrent neural network transducer (RNN-T) model.

    Note:
        To build the model, please use one of the factory functions.

    See Also:
        :class:`torchaudio.pipelines.RNNTBundle`: ASR pipeline with pre-trained models.

    Args:
        transcriber (torch.nn.Module): transcription network.
        predictor (torch.nn.Module): prediction network.
        joiner (torch.nn.Module): joint network.
    transcriber	predictorjoinerr   Nc                 r    t                                                       || _        || _        || _        d S r   )r   r   r   r   r   )r   r   r   r   r   s       r   r   zRNNT.__init__  s3    &"r   sourcesr   targetsr   predictor_statec                     |                      ||          \  }}|                     |||          \  }}}|                     ||||          \  }}}||||fS )a  Forward pass for training.

        B: batch size;
        T: maximum source sequence length in batch;
        U: maximum target sequence length in batch;
        D: feature dimension of each source sequence element.

        Args:
            sources (torch.Tensor): source frame sequences right-padded with right context, with
                shape `(B, T, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``sources``.
            targets (torch.Tensor): target sequences, with shape `(B, U)` and each element
                mapping to a target symbol.
            target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``targets``.
            predictor_state (List[List[torch.Tensor]] or None, optional): list of lists of tensors
                representing prediction network internal state generated in preceding invocation
                of ``forward``. (Default: ``None``)

        Returns:
            (torch.Tensor, torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    joint network output, with shape
                    `(B, max output source length, max output target length, output_dim (number of target symbols))`.
                torch.Tensor
                    output source lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 1 for i-th batch element in joint network output.
                torch.Tensor
                    output target lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 2 for i-th batch element in joint network output.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing prediction network internal state generated in current invocation
                    of ``forward``.
        )r   r   r   r   rE   r   r   r   r   )r   r   r   )	r   r   r   r   r   r   r   r   r&   s	            r   r'   zRNNT.forward  s    X ,0+;+;" ,< ,
 ,
(. =ANN"! =K =
 =
9./
 26-)-)	 2= 2
 2
. 	
 	
r   rE   c                 :    | j                             |||          S )a  Applies transcription network to sources in streaming mode.

        B: batch size;
        T: maximum source sequence segment length in batch;
        D: feature dimension of each source sequence frame.

        Args:
            sources (torch.Tensor): source frame sequence segments right-padded with right context, with
                shape `(B, T + right context length, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``sources``.
            state (List[List[torch.Tensor]] or None): list of lists of tensors
                representing transcription network internal state generated in preceding invocation
                of ``transcribe_streaming``.

        Returns:
            (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    output frame sequences, with
                    shape `(B, T // time_reduction_stride, output_dim)`.
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing transcription network internal state generated in current invocation
                    of ``transcribe_streaming``.
        )r   re   )r   r   r   rE   s       r   transcribe_streamingzRNNT.transcribe_streaming  s     F %%g~uEEEr   c                 .    |                      ||          S )a  Applies transcription network to sources in non-streaming mode.

        B: batch size;
        T: maximum source sequence length in batch;
        D: feature dimension of each source sequence frame.

        Args:
            sources (torch.Tensor): source frame sequences right-padded with right context, with
                shape `(B, T + right context length, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``sources``.

        Returns:
            (torch.Tensor, torch.Tensor):
                torch.Tensor
                    output frame sequences, with
                    shape `(B, T // time_reduction_stride, output_dim)`.
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output frame sequences.
        )r   )r   r   r   s      r   
transcribezRNNT.transcribeD  s    6 888r   c                 2    |                      |||          S )a  Applies prediction network to targets.

        B: batch size;
        U: maximum target sequence length in batch;
        D: feature dimension of each target sequence frame.

        Args:
            targets (torch.Tensor): target sequences, with shape `(B, U)` and each element
                mapping to a target symbol, i.e. in range `[0, num_symbols)`.
            target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``targets``.
            state (List[List[torch.Tensor]] or None): list of lists of tensors
                representing internal state generated in preceding invocation
                of ``predict``.

        Returns:
            (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    output frame sequences, with shape `(B, U, output_dim)`.
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing internal state generated in current invocation of ``predict``.
        r   )r   )r   r   r   rE   s       r   predictzRNNT.predicta  s    B ~~G^5~QQQr   r   r   c                 F    |                      ||||          \  }}}|||fS )a  Applies joint network to source and target encodings.

        B: batch size;
        T: maximum source sequence length in batch;
        U: maximum target sequence length in batch;
        D: dimension of each source and target sequence encoding.

        Args:
            source_encodings (torch.Tensor): source encoding sequences, with
                shape `(B, T, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                valid sequence length of i-th batch element in ``source_encodings``.
            target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`.
            target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                valid sequence length of i-th batch element in ``target_encodings``.

        Returns:
            (torch.Tensor, torch.Tensor, torch.Tensor):
                torch.Tensor
                    joint network output, with shape `(B, T, U, output_dim)`.
                torch.Tensor
                    output source lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 1 for i-th batch element in joint network output.
                torch.Tensor
                    output target lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 2 for i-th batch element in joint network output.
        r   )r   )r   r   r   r   r   r&   s         r   joinz	RNNT.join  s?    F 26-)-)	 2= 2
 2
. ~~55r   r   )r(   r)   r*   r+   r`   r   r   r   r-   r.   r   r   r   r'   r   r   r   r   r   r   r/   r0   s   @r   r	   r	     s         L Z QX ]a       ?CA
 A
A
 A
 	A

 A
 "$tEL'9":;A
 
u|U\5<d5<>P9QQ	RA
 A
 A
 A
F Y"F"F "F T%,/01	"F
 
u|U\4U\0B+CC	D"F "F "F "FH Y99 9 
u|U\)	*	9 9 9 98 Y R R  R T%,/01	 R
 
u|U\4U\0B+CC	D R  R  R  RD Y(6,(6 (6  ,	(6
 (6 
u|U\5<7	8(6 (6 (6 (6 (6 (6 (6 (6r   r	   r4   encoding_dimr   rq   rr   rs   rt   ru   rv   rw   rk   rl   rx   rm   rn   ro   r   r   r   r   r   r   c                     t          | ||||||||	|
|||||          }t          ||||||||          }t          ||          }t          |||          S )a 
  Builds Emformer-based :class:`~torchaudio.models.RNNT`.

    Note:
        For non-streaming inference, the expectation is for `transcribe` to be called on input
        sequences right-concatenated with `right_context_length` frames.

        For streaming inference, the expectation is for `transcribe_streaming` to be called
        on input chunks comprising `segment_length` frames right-concatenated with `right_context_length`
        frames.

    Args:
        input_dim (int): dimension of input sequence frames passed to transcription network.
        encoding_dim (int): dimension of transcription- and prediction-network-generated encodings
            passed to joint network.
        num_symbols (int): cardinality of set of target tokens.
        segment_length (int): length of input segment expressed as number of frames.
        right_context_length (int): length of right context expressed as number of frames.
        time_reduction_input_dim (int): dimension to scale each element in input sequences to
            prior to applying time reduction block.
        time_reduction_stride (int): factor by which to reduce length of input sequence.
        transformer_num_heads (int): number of attention heads in each Emformer layer.
        transformer_ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network.
        transformer_num_layers (int): number of Emformer layers to instantiate.
        transformer_left_context_length (int): length of left context considered by Emformer.
        transformer_dropout (float): Emformer dropout probability.
        transformer_activation (str): activation function to use in each Emformer layer's
            feedforward network. Must be one of ("relu", "gelu", "silu").
        transformer_max_memory_size (int): maximum number of memory elements to use.
        transformer_weight_init_scale_strategy (str): per-layer weight initialization scaling
            strategy. Must be one of ("depthwise", "constant", ``None``).
        transformer_tanh_on_mem (bool): if ``True``, applies tanh to memory elements.
        symbol_embedding_dim (int): dimension of each target token embedding.
        num_lstm_layers (int): number of LSTM layers to instantiate.
        lstm_layer_norm (bool): if ``True``, enables layer normalization for LSTM layers.
        lstm_layer_norm_epsilon (float): value of epsilon to use in LSTM layer normalization layers.
        lstm_dropout (float): LSTM dropout probability.

    Returns:
        RNNT:
            Emformer RNN-T model.
    )r4   rp   rq   rr   rs   rt   ru   rv   rw   rk   rl   rx   rm   rn   ro   )r   r   r   r   r   r   )rg   r   r   r	   )r4   r   r   rq   rr   rs   rt   ru   rv   rw   rk   rl   rx   rm   rn   ro   r   r   r   r   r   encoderr   r   s                           r   r   r     s    B %1!933/5/5(G$?/U 7  G" 1',' 7!	 	 	I \;//FF+++r   c                     t          d(i ddddd| dddd	d
ddd	dddddddddddddddddddd d!d"d#dd$d%d&d'S ))zBuilds basic version of Emformer-based :class:`~torchaudio.models.RNNT`.

    Args:
        num_symbols (int): The size of target token lexicon.

    Returns:
        RNNT:
            Emformer RNN-T model.
    r4   P   r   i   r   rq      rr   r9   rs      rt   ru      rv   i   rw      rk   g?rl   gelurx      rm   r   rn   rj   ro   Tr   i   r      r   r   gMbP?r   g333333?rb   )r   )r   s    r   r
   r
     s       "T  K r	
 Q "%  a  a !D  "r  C  &v )+ %&A 0;{  !%!" !S#$ %& '( !%)* S+ r   )abcr   r   typingr   r   r   r-   torchaudio.modelsr   __all__r=   Moduler   r2   r`   rg   r   r   r	   r,   r^   r   r]   r   r
   rb   r   r   <module>r      s   # # # # # # # # ( ( ( ( ( ( ( ( ( (  & & & & & & @
?
?) ) ) ) )UX_ ) ) )XO O O O O%(/ O O Od    3   MG MG MG MG MGux MG MG MG`]J ]J ]J ]J ]J ]J ]J ]J@:6 :6 :6 :6 :6eho :6 :6 :6zh6 h6 h6 h6 h658? h6 h6 h6V],], ], 	],
 ], ], "], ], ], ],  ], ],  ], &)], "%],  -0!]," "#],$ %],& '],( )],* #+],, -],. 
/], ], ], ],@ C  D            r   