
    ~Vji                     *   d dl Z d dlmZmZmZ d dlZdgZdej        dej        fdZ	 ddej        dej        d	ej        dej        d
ej        deej                 deej                 fdZ	de
dej        j        fdZdee
         dedeee                  fdZdee         dee         dedej        dej        f
dZ G d dej        j                  Z G d dej        j                  Z G d dej        j                  Z G d de          ZdS )    N)ListOptionalTupleEmformerlengthsreturnc                    | j         d         }t          t          j        |                                                     }t          j        || j        | j                                      ||          | 	                    d          k    }|S )Nr   )devicedtype   )
shapeinttorchmaxitemaranger
   r   expand	unsqueeze)r   
batch_size
max_lengthpadding_masks       T/root/voice-cloning/.venv/lib/python3.11/site-packages/torchaudio/models/emformer.py_lengths_to_padding_maskr   
   s~    q!JUYw'',,..//J<
7>WWW^^J 			1		L     	utteranceright_contextsummarymemsleft_context_keyc                    |                     d          |                      d          z   |                     d          z   }|                     d          }|dk    rd }n|t          j        |                                          z
  |                     d          z
  }	||                     d          nd}
||                     d          z   |	z   |
z   }t	          |          }|S )Nr   r   )r   )sizer   r   r   r   )r   r   r   r   r   r   TBr   right_context_blocks_lengthleft_context_blocks_lengthklengthss               r   _gen_padding_maskr'      s     	1	q 1 11GLLOOCA1AAvv&'%)G*<*<*@*@*B*B&BW\\RS__&T#AQA]%5%:%:1%=%=%=cd"TYYq\\),GGJdd/AAAr   
activationc                     | dk    rt           j                                        S | dk    rt           j                                        S | dk    rt           j                                        S t          d|            )NrelugelusiluzUnsupported activation )r   nnReLUGELUSiLU
ValueError)r(   s    r   _get_activation_moduler2   '   sj    Vx}}	v		x}}	v		x}}?:??@@@r   weight_init_scale_strategy
num_layersc                     | d t          |          D             S | dk    rd t          |          D             S | dk    rd t          |          D             S t          d|            )Nc                     g | ]}d S N ).0_s     r   
<listcomp>z*_get_weight_init_gains.<locals>.<listcomp>4   s    000000r   	depthwisec                 B    g | ]}d t          j        |dz             z  S )      ?r   mathsqrtr9   	layer_idxs     r   r;   z*_get_weight_init_gains.<locals>.<listcomp>6   s+    RRR9di	A...RRRr   constantc                 <    g | ]}d t          j        d          z  S )r>      r?   rB   s     r   r;   z*_get_weight_init_gains.<locals>.<listcomp>8   s%    FFFydill"FFFr   z-Unsupported weight_init_scale_strategy value )ranger1   )r3   r4   s     r   _get_weight_init_gainsrH   2   s    !)00eJ//0000	#{	2	2RRj@Q@QRRRR	#z	1	1FFE*4E4EFFFFeIceefffr   
col_widthscol_masknum_rowsr
   c                     t          |           t          |          k    rt          d          fdt          | |          D             }t          j        |d          S )Nz0Length of col_widths must match that of col_maskc                 v    g | ]5\  }}|rt          j        |           nt          j        |           6S )r
   )r   oneszeros)r9   	col_widthis_ones_colr
   rK   s      r   r;   z-_gen_attention_mask_block.<locals>.<listcomp>C   s^        #I{ 	=
8Yv6666[9V<<<  r   r   dim)lenr1   zipr   cat)rI   rJ   rK   r
   
mask_blocks     `` r   _gen_attention_mask_blockrY   =   s|     :#h--''KLLL     '**h&?&?	  J 9ZQ''''r   c                       e Zd ZdZ	 	 	 	 ddededed	ee         d
edef fdZde	j
        de	j
        dee	j
        e	j
        f         fdZde	j
        de	j
        dee	j
                 de	j
        fdZ	 	 dde	j
        de	j
        de	j
        de	j
        de	j
        de	j
        dee	j
                 dee	j
                 dee	j
        e	j
        e	j
        e	j
        f         fdZde	j
        de	j
        de	j
        de	j
        de	j
        de	j
        dee	j
        e	j
        f         fdZe	j        j        de	j
        de	j
        de	j
        de	j
        de	j
        de	j
        de	j
        dee	j
        e	j
        e	j
        e	j
        f         fd            Z xZS ) _EmformerAttentiona_  Emformer layer attention module.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads in each Emformer layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        weight_init_gain (float or None, optional): scale factor to apply when initializing
            attention module parameters. (Default: ``None``)
        tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
        negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)
            NF    ח	input_dim	num_headsdropoutweight_init_gaintanh_on_memnegative_infc                    t                                                       ||z  dk    rt          d| d| d          || _        || _        || _        || _        || _        | j        | j        z  dz  | _        t          j
                            |d|z  d          | _        t          j
                            ||d          | _        t          j
                            ||d          | _        |rbt          j
        j                            | j        j        |	           t          j
        j                            | j        j        |	           d S d S )
Nr   zinput_dim (z") is not a multiple of num_heads (z).g      rF   T)bias)gain)super__init__r1   r^   r_   r`   rb   rc   scalingr   r-   Linearemb_to_key_valueemb_to_queryout_projinitxavier_uniform_weight)selfr^   r_   r`   ra   rb   rc   	__class__s          r   rh   z_EmformerAttention.__init__Y   sB    	y A%%e9eeXaeeefff""&($.8TA %	1y=t T T!HOOIytOLL	94HH 	[HM))$*?*FM])^^^HM))$*;*BIY)ZZZZZ	[ 	[r   inputr   r   c                     |j         \  }}}|                    d          dz   }|d ||z
           }t          j        ||g          }|                     |                              dd          \  }}	||	fS )Nr   r   rF   chunksrT   )r   r!   r   rW   rk   chunk)
rq   rs   r   r"   r:   summary_lengthright_ctx_utterance_blockmems_right_ctx_utterance_blockkeyvalues
             r   _gen_key_valuez!_EmformerAttention._gen_key_valuew   s    +1a1)$)*>A,>*>$?!).D:S3T)U)U&**+IJJPPXY_`Paa
UEzr   attention_weightsattention_maskr   c                    |                                 }|                    |                    d          | j                  }|                    d          }|                    d          | j        z  }||                    || j        |d          }|                    |                    d                              d                              t          j	                  | j                  }|                    || j        z  |d          }t          j
        j                            |d                              |          }t          j
        j                            |t          | j                  | j                  S )Nr   r   rF   rS   )ptraining)floatmasked_fillr   rc   r!   r_   viewtor   boolr-   
functionalsoftmaxtype_asr`   r   )rq   r~   r   r   attention_weights_floatr"   r#   attention_probss           r   _gen_attention_probsz'_EmformerAttention._gen_attention_probs   sY    #4"9"9";";"9"E"EnF^F^_`FaFacgct"u"u""1%%""1%%7#&=&B&B1dnVWY[&\&\#&=&I&I&&q))33A6699%*EEtGX' '# '>&B&B1t~CUWXZ\&]&]#(-556MSU5VV^^_pqqx"**?eDL>Q>Q\`\i*jjjr   r   r   r   r   r   left_context_valc	                 L    |                     d          |                     d          |                     d          z   |                     d          z   }	                     t          j        |||g                    }
                     t          j        |||g                                        dd          \  }}|||	t          j        |                                          z
  |                     d          z
  }t          j        |d |                     d          |z            |||                     d          |z   d          g          }t          j        |d |                     d          |z            |||                     d          |z   d          g          } fd|
||fD             \  }}}t          j        | j	        z  |
                    dd                    }t          ||||||          }                     |||          }t          j        ||          }|j         j        z  |	 j         j        z  fk    rt!          d          |
                    dd                                                              |	 j                  }                     |          }|                     d          }|d |	|z
           }||	|z
  d          } j        rt          j        |          }nt          j        |dd	          }||||fS )
Nr   r   rF   ru   c                     g | ]T}|                                                     d j        z  j        j        z                                dd          US )r   r   r   )
contiguousr   r_   r^   	transpose)r9   tensorr#   rq   s     r   r;   z4_EmformerAttention._forward_impl.<locals>.<listcomp>   si     8
 8
 8
 $$RT^);T^t~=]^^hhijlmnn8
 8
 8
r   z+Computed attention has incorrect dimensionsi
   )minr   )r!   rl   r   rW   rk   rw   r   r   bmmri   r   r'   r   r   r_   r^   AssertionErrorr   r   rm   rb   tanhclamp)rq   r   r   r   r   r   r   r   r   r"   queryr{   r|   r$   reshaped_queryreshaped_keyreshaped_valuer~   r   r   	attentionoutput_right_context_memsrx   output_right_contextoutput_memsr#   s   `                        @r   _forward_implz _EmformerAttention._forward_impl   sL    NN1q!!INN1$5$55QG !!%)]Iw,O"P"PQQ **59dM95U+V+VWW]]eflm]nn
U',<,H*+ei.@.@.D.D.F.F*FVW*X')D$))A,,)DDDE$		!'BBDDE C IFDIIaLL+FFFG$$))A,,)DDFFG E8
 8
 8
 8
 8
 #u-8
 8
 8
4n "Int|&C\E[E[\]_`EaEabb )M7GUY[kll 334E~Wcdd Io~>>	?Ndn,
 
 

 !!NOOO''1--88::??1dnUU	 %)MM)$<$<! a89M1~;M9MN/N0B0D0DE 	D*[11KK+ksCCCK#[#u<<r   c                 X    |                      ||||||          \  }}}	}	||dd         fS )ac  Forward pass for training.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        S: number of summary elements;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            summary (torch.Tensor): summary elements, with shape `(S, B, D)`.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
            attention_mask (torch.Tensor): attention mask for underlying attention module.

        Returns:
            (Tensor, Tensor):
                Tensor
                    output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
        Nr   )r   )
rq   r   r   r   r   r   r   outputr   r:   s
             r   forwardz_EmformerAttention.forward   sB    D %)$6$6y'=Zacgiw$x$x!Q{3B3'''r   c           
         |                     d          |                     d          z   |                     d          z   }|                     d          |                     d          z   |                     d          z   |                     d          z   }	t          j        ||	                              t          j        |j                  }
d|
dd|                     d          f<   |                     ||||||
||          \  }}}}||||                     d          |                     d          z   d         ||                     d          |                     d          z   d         fS )a  Forward pass for inference.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        S: number of summary elements;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            summary (torch.Tensor): summary elements, with shape `(S, B, D)`.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
            left_context_key (torch.Tensor): left context attention key computed from preceding invocation.
            left_context_val (torch.Tensor): left context attention value computed from preceding invocation.

        Returns:
            (Tensor, Tensor, Tensor, and Tensor):
                Tensor
                    output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
                Tensor
                    attention key computed for left context and utterance.
                Tensor
                    attention value computed for left context and utterance.
        r   r   r
   Tr   N)r   r   )r!   r   rP   r   r   r
   r   )rq   r   r   r   r   r   r   r   	query_dimkey_dimr   r   r   r{   r|   s                  r   inferz_EmformerAttention.infer   sj   R "&&q))INN1,=,==QO	$$Q'')..*;*;;diillJM]MbMbcdMeMeeY88;;%*U^Ue;ff-1r>TYYq\\>)**.*<*<-- += 	+
 	+
'S% 		!}11!444667$))A,,!3!3A!6!66889	
 	
r   )r\   NFr]   )NN)__name__
__module____qualname____doc__r   r   r   r   rh   r   Tensorr   r}   r   r   r   jitexportr   __classcell__rr   s   @r   r[   r[   L   s       
 
  ,0!"[ [[ [ 	[
 #5/[ [ [ [ [ [ [ [<EL  u|]b]iOiIj    k <k k u|,	k
 
k k k k6 4837G= G=<G= G= |	G=
 G= lG= G= #5<0G= #5<0G= 
u|U\5<E	FG= G= G= G=R#(<#( #( |	#(
 #( l#( #( 
u|U\)	*#( #( #( #(J Y;
<;
 ;
 |	;

 ;
 l;
  ,;
  ,;
 
u|U\5<E	F;
 ;
 ;
 ;
 ;
 ;
 ;
 ;
r   r[   c                       e Zd ZdZ	 	 	 	 	 	 	 d+ded	ed
ededededededee         dedef fdZ	dedee
j                 dee
j                 fdZdee
j                 dee
j        e
j        e
j        f         fdZde
j        de
j        dede
j        dee
j                 dee
j                 fdZde
j        d e
j        d!e
j        de
j        fd"Zd e
j        d!e
j        dee
j        e
j        f         fd#Zde
j        d e
j        d!e
j        dee
j        e
j        f         fd$Zd e
j        d%e
j        d!e
j        de
j        d&ee
j                 dee
j        e
j        f         fd'Zd e
j        d%e
j        d!e
j        de
j        deee
j                          dee
j        e
j        ee
j                 f         fd(Zd e
j        d%e
j        d!e
j        de
j        d&e
j        dee
j        e
j        e
j        f         fd)Ze
j        j        d e
j        d%e
j        d!e
j        deee
j                          de
j        dee
j        e
j        ee
j                 e
j        f         fd*            Z xZS ),_EmformerLayera$  Emformer layer that constitutes Emformer.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads.
        ffn_dim: (int): hidden layer dimension of feedforward network.
        segment_length (int): length of each input segment.
        dropout (float, optional): dropout probability. (Default: 0.0)
        activation (str, optional): activation function to use in feedforward network.
            Must be one of ("relu", "gelu", "silu"). (Default: "relu")
        left_context_length (int, optional): length of left context. (Default: 0)
        max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
        weight_init_gain (float or None, optional): scale factor to apply when initializing
            attention module parameters. (Default: ``None``)
        tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
        negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)
    r\   r*   r   NFr]   r^   r_   ffn_dimsegment_lengthr`   r(   left_context_lengthmax_memory_sizera   rb   rc   c           
         t                                                       t          ||||	|
|          | _        t          j                            |          | _        t          j                            ||d          | _	        t          |          }t          j                            t          j                            |          t          j                            ||          |t          j                            |          t          j                            ||          t          j                            |                    | _        t          j                            |          | _        t          j                            |          | _        || _        || _        || _        || _        |dk    | _        d S )N)r^   r_   r`   ra   rb   rc   Tkernel_sizestride	ceil_moder   )rg   rh   r[   r   r   r-   Dropoutr`   	AvgPool1d	memory_opr2   
Sequential	LayerNormrj   pos_fflayer_norm_inputlayer_norm_outputr   r   r   r^   use_mem)rq   r^   r_   r   r   r`   r(   r   r   ra   rb   rc   activation_modulerr   s                r   rh   z_EmformerLayer.__init__R  se    	+-#%
 
 
 x''00++~im+nn2:>>h))Hy))HOOIw//HW%%HOOGY//HW%%
 
 !& 2 29 = =!&!3!3I!>!>#6 ,."&*r   r   r
   r   c                    t          j        | j        || j        |          }t          j        | j        || j        |          }t          j        | j        || j        |          }t          j        d|t           j        |          }||||gS )NrN   r   r   )r   rP   r   r^   r   int32)rq   r   r
   empty_memoryr   r   past_lengths          r   _init_statez_EmformerLayer._init_state  s    {4#7T^\bccc ;t'?T^djkkk ;t'?T^djkkkk!Zu{6RRR.0@+NNr   statec                 p   |d         d         d                                          }t          | j        |          }t          | j        t	          j        || j        z                      }|d         | j        |z
  d          }|d         | j        |z
  d          }|d         | j        |z
  d          }|||fS )N   r   r   rF   )r   r   r   r   r@   ceilr   )rq   r   r   past_left_context_lengthpast_mem_lengthpre_memslc_keylc_vals           r   _unpack_statez_EmformerLayer._unpack_state  s    Ahqk!n))++#&t'?#M#M d2DIkDL_>_4`4`aa8D0?BDDEq$25MMOOPq$25MMOOP''r   next_knext_vupdate_lengthr   c                 l   t          j        |d         |g          }t          j        |d         |g          }t          j        |d         |g          | j         d          |d<   ||j        d         | j        z
  d          |d<   ||j        d         | j        z
  d          |d<   |d         |z   |d<   |S )Nr   rF   r   r   )r   rW   r   r   r   )rq   r   r   r   r   r   new_knew_vs           r   _pack_statez_EmformerLayer._pack_state  s     	58V,--	58V,--9eAh-..0D/D/F/FGaQ$*BBDDEaQ$*BBDDEa8m+ar   	rc_outputr   r   c                     |                      |          t          j        ||g          z   }|                     |          |z   }|                     |          }|S r7   )r`   r   rW   r   r   )rq   r   r   r   results        r   _process_attention_outputz(_EmformerLayer._process_attention_output  sW     i((59mY5O+P+PPV$$v-''//r   c                     |                      t          j        ||g                    }||                    d          d          |d |                    d                   fS Nr   )r   r   rW   r!   )rq   r   r   r   s       r   _apply_pre_attention_layer_normz._EmformerLayer._apply_pre_attention_layer_norm  si      00M9;U1V1VWW]//224454}11!4445
 	
r   c                     |                      |||          }||                    d          d          |d |                    d                   fS r   )r   r!   )rq   r   r   r   s       r   _apply_post_attention_ffnz(_EmformerLayer._apply_post_attention_ffn  sY     229iWW	++A..0019=T}?Q?QRS?T?T=T3UUUr   r   r   c                 \   |t          d          | j        r@|                     |                    ddd                                        ddd          }n3t	          j        d                              |j        |j                  }| 	                    ||||||          \  }}||fS )Nz;attention_mask must be not None when for_inference is Falser   rF   r   r   )r   r   r   r   r   r   )
r1   r   r   permuter   emptyr   r   r
   r   )	rq   r   r   r   r   r   r   r   next_ms	            r   _apply_attention_forwardz'_EmformerLayer._apply_attention_forward  s     !Z[[[< 	XnnY%6%6q!Q%?%?@@HHAqQQGGk!nn''ioiFV'WWG NN') + 
 
	6 &  r   c           	      L   |/|                      |                    d          |j                  }|                     |          \  }}}| j        rJ|                     |                    ddd                                        ddd          }	|	d d         }	n3t          j        d          	                    |j
        |j                  }	| j                            ||||	|||          \  }
}}}|                     |||                    d          ||          }|
||fS )Nr   rN   rF   r   r   )r   r   r   r   r   r   r   )r   r!   r
   r   r   r   r   r   r   r   r   r   r   r   )rq   r   r   r   r   r   r   r   r   r   r   r   r   r   s                 r   _apply_attention_inferz%_EmformerLayer._apply_attention_infer  s/    =$$Y^^A%6%6y?O$PPE#'#5#5e#<#< &&< 	XnnY%6%6q!Q%?%?@@HHAqQQGbqbkGGk!nn''ioiFV'WWG,0N,@,@'## -A -
 -
)	666   1B1BD%PP&%''r   c                     |                      ||          \  }}|                     |||||          \  }}	|                     |||          \  }
}|
||	fS )a1  Forward pass for training.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
            attention_mask (torch.Tensor): attention mask for underlying attention module.

        Returns:
            (Tensor, Tensor, Tensor):
                Tensor
                    encoded utterance frames, with shape `(T, B, D)`.
                Tensor
                    updated right context frames, with shape `(R, B, D)`.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
        )r   r   r   )rq   r   r   r   r   r   layer_norm_utterancelayer_norm_right_contextr   r   output_utterancer   s               r   r   z_EmformerLayer.forward  s}    H 00MJJ	
 $!%!>!> $"
 "
	; 261O1OPY[dfs1t1t..!5{BBr   c                     |                      ||          \  }}|                     |||||          \  }}	}
|                     |||          \  }}|||
|	fS )a2  Forward pass for inference.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            state (List[torch.Tensor] or None): list of tensors representing layer internal state
                generated in preceding invocation of ``infer``.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.

        Returns:
            (Tensor, Tensor, List[torch.Tensor], Tensor):
                Tensor
                    encoded utterance frames, with shape `(T, B, D)`.
                Tensor
                    updated right context frames, with shape `(R, B, D)`.
                List[Tensor]
                    list of tensors representing layer internal state
                    generated in current invocation of ``infer``.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
        )r   r   r   )rq   r   r   r   r   r   r   r   r   r   output_stater   r   s                r   r   z_EmformerLayer.infer  s~    R 00MJJ	
 $/3/J/J '+CT50
 0
,	; 261O1OPY[dfs1t1t..!5|[PPr   )r\   r*   r   r   NFr]   )r   r   r   r   r   r   strr   r   rh   r   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   s   @r   r   r   ?  sy        0  #$ ,0!",+ ,+,+ ,+ 	,+
 ,+ ,+ ,+ !,+ ,+ #5/,+ ,+ ,+ ,+ ,+ ,+ ,+ ,+\Oc O8EL3I OdSXS_N` O O O O(4#5 (%el\a\h@h:i ( ( ( (  	
 l EL! 
el	    	<	 <	 |		
 
	 	 	 	

6;l
	u|U\)	*
 
 
 
VV27,VOT|V	u|U\)	*V V V V!<! ! |	!
 l! !.! 
u|U\)	*! ! ! !2(<( ( |	(
 l( U\*+( 
u|U\4+==	>( ( ( (8-C<-C -C |	-C
 l-C -C 
u|U\5<7	8-C -C -C -C^ Y-Q<-Q -Q |	-Q
 U\*+-Q l-Q 
u|U\4+=u|K	L-Q -Q -Q -Q -Q -Q -Q -Qr   r   c                       e Zd Z	 	 	 ddej        j        dedededef
 fdZdej        d	ej        fd
Z	deded	e
e         fdZdej        d	ej        fdZdej        dej        d	eej        ej        f         fdZej        j        	 ddej        dej        dee
e
ej                                   d	eej        ej        e
e
ej                          f         fd            Z xZS )_EmformerImplr   emformer_layersr   r   right_context_lengthr   c                     t                                                       |dk    | _        t          j                            ||d          | _        || _        || _        || _	        || _
        || _        d S )Nr   Tr   )rg   rh   r   r   r-   r   r   r   r   r   r   r   )rq   r   r   r   r   r   rr   s         r   rh   z_EmformerImpl.__init__P  s~     	&*++&! , 
 

  /#6 $8!,.r   rs   r   c                 l   |j         d         }t          j        || j        z
  | j        z            }g }t          |dz
            D ]6}|dz   | j        z  }|| j        z   }|                    |||                    7|                    ||| j        z
  d                     t          j        |          S Nr   r   )	r   r@   r   r   r   rG   appendr   rW   )rq   rs   r"   num_segsright_context_blocksseg_idxstartends           r   _gen_right_contextz _EmformerImpl._gen_right_contextf  s    KN9a$";;t?RRSS!X\** 	: 	:Gq[D$77E$33C ''eCi(89999##E!d.G*G*I*I$JKKKy-...r   r  utterance_lengthc           
         t          j        || j        z            }| j        }| j        }||z  }||z   }t          || j        z  |z
  d          }t          |dz   | j        z  |          }	| j        |z  }
| j        r8t          || j        z
  d          }|dz
  }|||z
  ||z
  |||
|z
  ||	|z
  ||	z
  g	}n|||
|z
  ||	|z
  ||	z
  g}|S r   )	r@   r   r   r   r   r   r   r   r   )rq   r  r  r   rclcrc_startrc_end	seg_startseg_end	rc_lengthm_start
mem_lengthrI   s                 r   _gen_attention_mask_col_widthsz,_EmformerImpl._gen_attention_mask_col_widthsq  s   9-0CCDD&%R<B$"55:A>>	w{d&99;KLL-8	< 	'D$88!<<G!AJ'!W$F")# 7*
JJ F")# 7*J r   c                 V   |                     d          }t          j        || j        z            }g }g }g }| j        r:d}d t          |          D             }d t          |          D             }	|||g}
n!d}d t          |          D             }d }	||g}
t          |          D ]}|                     ||          }t          ||| j        |j	                  }|
                    |           t          ||t          | j        ||| j        z  z
            |j	                  }|
                    |           |	,t          ||	d|j	                  }|
                    |           dt          j        d |
D                       z
                      t          j                  }|S )	Nr   	   c                     g | ]}|d v S ))r         r8   r9   idxs     r   r;   z5_EmformerImpl._gen_attention_mask.<locals>.<listcomp>  s    JJJ3cY.JJJr   c                     g | ]}|d v S ))r  r  r8   r  s     r   r;   z5_EmformerImpl._gen_attention_mask.<locals>.<listcomp>  s    DDDS3&=DDDr      c                     g | ]}|d v S ))r   r  r8   r  s     r   r;   z5_EmformerImpl._gen_attention_mask.<locals>.<listcomp>  s    GGGcVmGGGr   r   c                 6    g | ]}t          j        |          S r8   )r   rW   )r9   masks     r   r;   z5_EmformerImpl._gen_attention_mask.<locals>.<listcomp>  s     (U(U(UT4(U(U(Ur   )r!   r@   r   r   r   rG   r  rY   r   r
   r   r   r   rW   r   r   )rq   rs   r  r   rc_mask
query_masksummary_masknum_colsrc_q_cols_masks_cols_maskmasks_to_concatr  rI   rc_mask_blockquery_mask_blocksummary_mask_blockr   s                    r   _gen_attention_maskz!_EmformerImpl._gen_attention_mask  s    ::a==9-0CCDD
< 	4HJJ%//JJJNDDE(OODDDK&
LAOOHGGuXGGGNK&
3OX 	8 	8G<<WFVWWJ5ND,Eu| M NN=)))8'$w1D'DD      .///&%>z;XY[`[g%h%h"##$6777ei(U(U_(U(U(UVVVZZ[`[effr   r   c                 @   |                     ddd          }|                     |          }|d|                    d          | j        z
           }|                     |          }| j        rG|                     |                     ddd                                         ddd          dd         n2t          j        d          	                    |j
        |j                  }|}| j        D ]} ||||||          \  }}}|                     ddd          |fS )aG  Forward pass for training and non-streaming inference.

        B: batch size;
        T: max number of input frames in batch;
        D: feature dimension of each frame.

        Args:
            input (torch.Tensor): utterance frames right-padded with right context frames, with
                shape `(B, T + right_context_length, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid utterance frames for i-th batch element in ``input``.

        Returns:
            (Tensor, Tensor):
                Tensor
                    output frames, with shape `(B, T, D)`.
                Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output frames.
        r   r   rF   Nr   r   )r   r  r!   r   r'  r   r   r   r   r   r   r
   r   )	rq   rs   r   r   r   r   r   r   layers	            r   r   z_EmformerImpl.forward  s+   * aA&&//66EEJJqMMD,EEEF	11)<< |KDNN9,,Q15566>>q!QGGLLQ""U\"JJ 	
 ) 	f 	fE*/%PTVd*e*e'FM44~~aA&&//r   Nstatesc                 b   |                     d          | j        | j        z   k    r6t          d| j        | j        z    d|                     d           d          |                    ddd          }|                     d          | j        z
  }||d         }|d|         }t          j        || j        z
  d          }| j        r?|                     |                    ddd                                        ddd          n2t          j	        d          
                    |j        |j        	          }|}	g }
t          | j                  D ]B\  }}|                    |	|||dn||         |          \  }	}}}|
                    |           C|	                    ddd          ||
fS )
a  Forward pass for streaming inference.

        B: batch size;
        D: feature dimension of each frame.

        Args:
            input (torch.Tensor): utterance frames right-padded with right context frames, with
                shape `(B, segment_length + right_context_length, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.
            states (List[List[torch.Tensor]] or None, optional): list of lists of tensors
                representing internal state generated in preceding invocation of ``infer``. (Default: ``None``)

        Returns:
            (Tensor, Tensor, List[List[Tensor]]):
                Tensor
                    output frames, with shape `(B, segment_length, D)`.
                Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output frames.
                List[List[Tensor]]
                    output states; list of lists of tensors representing internal state
                    generated in current invocation of ``infer``.
        r   zIPer configured segment_length and right_context_length, expected size of z# for dimension 1 of input, but got .r   rF   N)r   r   )r!   r   r   r1   r   r   r   r   r   r   r   r   r
   	enumerater   r   r   )rq   rs   r   r*  right_context_start_idxr   r   output_lengthsr   r   output_statesrC   r)  r   s                 r   r   z_EmformerImpl.infer  s   > ::a==D/$2KKKK.&*&9D<U&U. ."ZZ]]. . .  
 aA&&"'**Q--$2K"K56672223	Wt/H%HaPPP |KDNN9,,Q15566>>q!QGGGQ""U\"JJ 	
 24 )$*> ? ? 	/ 	/Iu8=F9,=9 95FM<   ....~~aA&&EEr   )r   r   r   r7   )r   r   r   r   r-   
ModuleListr   rh   r   r  r   r  r'  r   r   r   r   r   r   r   r   s   @r   r   r   O  s       
 $%$% / /,/ / !	/
 "/ / / / / / /,	/ 	/ 	/ 	/ 	/ 	/"c "S "UYZ]U^ " " " "H. .%, . . . .`!0U\ !0EL !0U5<Y^YeKeEf !0 !0 !0 !0F Y
 6:	:F :F|:F :F d5<012	:F
 
u|U\4U\0B+CC	D:F :F :F :F :F :F :F :Fr   r   c                   v     e Zd ZdZ	 	 	 	 	 	 	 	 dded	ed
ededededededededee         dedef fdZ	 xZ
S )r   a_  Emformer architecture introduced in
    *Emformer: Efficient Memory Transformer Based Acoustic Model for Low Latency Streaming Speech Recognition*
    :cite:`shi2021emformer`.

    See Also:
        * :func:`~torchaudio.models.emformer_rnnt_model`,
          :func:`~torchaudio.models.emformer_rnnt_base`: factory functions.
        * :class:`torchaudio.pipelines.RNNTBundle`: ASR pipelines with pretrained model.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads in each Emformer layer.
        ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network.
        num_layers (int): number of Emformer layers to instantiate.
        segment_length (int): length of each input segment.
        dropout (float, optional): dropout probability. (Default: 0.0)
        activation (str, optional): activation function to use in each Emformer layer's
            feedforward network. Must be one of ("relu", "gelu", "silu"). (Default: "relu")
        left_context_length (int, optional): length of left context. (Default: 0)
        right_context_length (int, optional): length of right context. (Default: 0)
        max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
        weight_init_scale_strategy (str or None, optional): per-layer weight initialization scaling
            strategy. Must be one of ("depthwise", "constant", ``None``). (Default: "depthwise")
        tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
        negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)

    Examples:
        >>> emformer = Emformer(512, 8, 2048, 20, 4, right_context_length=1)
        >>> input = torch.rand(128, 400, 512)  # batch, num_frames, feature_dim
        >>> lengths = torch.randint(1, 200, (128,))  # batch
        >>> output, lengths = emformer(input, lengths)
        >>> input = torch.rand(128, 5, 512)
        >>> lengths = torch.ones(128) * 5
        >>> output, lengths, states = emformer.infer(input, lengths, None)
    r\   r*   r   r<   Fr]   r^   r_   r   r4   r   r`   r(   r   r   r   r3   rb   rc   c                   
 t          ||          t          j                            
fdt	          |          D                       }t                                          ||	
           d S )Nc                 P    g | ]"}t          	
|                    #S ))r`   r(   r   r   ra   rb   rc   )r   )r9   rC   r(   r`   r   r^   r   r   rc   r_   r   rb   weight_init_gainss     r   r;   z%Emformer.__init__.<locals>.<listcomp>]  s`         "#)(;$3%6y%A +!-    r   )r   r   r   )rH   r   r-   r1  rG   rg   rh   )rq   r^   r_   r   r4   r   r`   r(   r   r   r   r3   rb   rc   r   r5  rr   s    ``` ```` ` `` @r   rh   zEmformer.__init__K  s      33MzZZ(--              "'z!2!2  
 
$ 	 3!5+ 	 	
 	
 	
 	
 	
r   )r\   r*   r   r   r   r<   Fr]   )r   r   r   r   r   r   r   r   r   rh   r   r   s   @r   r   r   &  s        " "V  #$$% 4?!")
 )
)
 )
 	)

 )
 )
 )
 )
 !)
 ")
 )
 %-SM)
 )
 )
 )
 )
 )
 )
 )
 )
 )
 )
 )
r   r7   )r@   typingr   r   r   r   __all__r   r   r'   r   r-   Moduler2   r   r   rH   r   r
   rY   r[   r   r   r   r8   r   r   <module>r9     s    ( ( ( ( ( ( ( ( ( (  ,el u|     04 |< \ \	
 , u|, el   (As Aux A A A Agx} gRU gZ^_ghm_nZo g g g g(S	(%)$Z(;>(HM(
\( ( ( (p
 p
 p
 p
 p
 p
 p
 p
fMQ MQ MQ MQ MQUX_ MQ MQ MQ`TF TF TF TF TFEHO TF TF TFnN
 N
 N
 N
 N
} N
 N
 N
 N
 N
r   