
    ~Vji'2                        d dl mZmZmZmZmZ d dlZd dlmZ ddgZ	eee
         ej        eeej                          ef         Zde_        dedee
         fd	Zdedej        fd
Zdedeeej                          fdZdedefdZdedefdZdee         deeej                          fdZdeeej                          de
dej        deeej                          fdZdedefdZdee         dej        de
deej        ej        ej        f         fdZdedee         ddfdZ G d dej        j                  ZdS )    )CallableDictListOptionalTupleN)RNNT
HypothesisRNNTBeamSearchzHypothesis generated by RNN-T beam search decoder,
    represented as tuple of (tokens, prediction network output, prediction network state, score).
    hyporeturnc                     | d         S Nr    r   s    X/root/voice-cloning/.venv/lib/python3.11/site-packages/torchaudio/models/rnnt_decoder.py_get_hypo_tokensr          7N    c                     | d         S N   r   r   s    r   _get_hypo_predictor_outr      r   r   c                     | d         S )N   r   r   s    r   _get_hypo_stater      r   r   c                     | d         S )N   r   r   s    r   _get_hypo_scorer      r   r   c                 ,    t          | d                   S r   )strr   s    r   _get_hypo_keyr!       s    tAw<<r   hyposc           	      x   g }t          t          t          | d                                       D ]g }t          t          t          | d                                                D ]6|                    t	          j        fd| D                                  7|                    |           |S )Nr   c                 F    g | ]}t          |                            S r   )r   ).0r   ijs     r   
<listcomp>z _batch_state.<locals>.<listcomp>)   s-    6e6e6eW[t7L7LQ7OPQ7R6e6e6er   )rangelenr   appendtorchcat)r"   statesbatched_state_componentsr&   r'   s      @@r   _batch_stater0   $   s    ')F3uQx001122 0 079 s?5844Q78899 	h 	hA$++EI6e6e6e6e6e_d6e6e6e,f,fgggg.////Mr   r.   idxdevicec                 N    t          j        |g|          fd| D             S )Nr2   c                 ,    g | ]}fd |D             S )c                 <    g | ]}|                     d           S )r   )index_select)r%   state
idx_tensors     r   r(   z+_slice_state.<locals>.<listcomp>.<listcomp>0   s)    HHH5U:..HHHr   r   )r%   state_tupler9   s     r   r(   z _slice_state.<locals>.<listcomp>0   s.    ccc[HHHHKHHHcccr   )r,   tensor)r.   r1   r2   r9   s      @r   _slice_stater<   .   s4    seF333Jcccc\bccccr   c                 `    t          |           t          t          |                     dz   z  S r   )r   r*   r   r   s    r   _default_hypo_sort_keyr>   3   s+    4  C(8(>(>$?$?!$CDDr   next_token_probs
beam_widthc                 J   t          j        d | D                                           d          }||d d d df         z   }|                    d                              |          \  }}|                    |j        d         d          }||j        d         z  }|||fS )Nc                 ,    g | ]}t          |          S r   r   r%   hs     r   r(   z+_compute_updated_scores.<locals>.<listcomp><   s     BBBq 2 2BBBr   r   trunc)rounding_mode)r,   r;   	unsqueezereshapetopkdivshape)	r"   r?   r@   hypo_scoresnonblank_scoresnonblank_nbest_scoresnonblank_nbest_idxnonblank_nbest_hypo_idxnonblank_nbest_tokens	            r   _compute_updated_scoresrT   7   s    
 ,BBEBBBCCMMaPPK!$4QQQV$<<O0?0G0G0K0K0P0PQ[0\0\--044_5J15M]d4ee-0Ea0HH "9;OOOr   	hypo_listc                 |    t          |          D ]+\  }}t          |           t          |          k    r||=  d S ,d S N)	enumerater!   )r   rU   r&   elems       r   _remove_hyporZ   D   sU    Y''  4-"5"555!EE 6 r   c                   :    e Zd ZdZ	 	 	 d%dedededeee	gef                  d	ed
df fdZ
dej        d
ee	         fdZdej        dee	         dej        d
ej        fdZdee	         dee	         dej        deee	f         d
ee	         f
dZdee	         dee	         dej        dededej        d
ee	         fdZdee	         dee         dee         dedej        d
ee	         fdZdej        deee	                  ded
ee	         fdZdej        d ej        ded
ee	         fd!Zej        j        	 	 d&dej        d ej        ded"eeeej                                   d#eee	                  d
eee	         eeej                          f         fd$            Z xZS )'r
   a)  Beam search decoder for RNN-T model.

    See Also:
        * :class:`torchaudio.pipelines.RNNTBundle`: ASR pipeline with pretrained model.

    Args:
        model (RNNT): RNN-T model to use.
        blank (int): index of blank token in vocabulary.
        temperature (float, optional): temperature to apply to joint network output.
            Larger values yield more uniform samples. (Default: 1.0)
        hypo_sort_key (Callable[[Hypothesis], float] or None, optional): callable that computes a score
            for a given hypothesis to rank hypotheses by. If ``None``, defaults to callable that returns
            hypothesis score normalized by token sequence length. (Default: None)
        step_max_tokens (int, optional): maximum number of tokens to emit per input time step. (Default: 100)
          ?Nd   modelblanktemperaturehypo_sort_keystep_max_tokensr   c                     t                                                       || _        || _        || _        |t
          | _        n|| _        || _        d S rW   )super__init__r^   r_   r`   r>   ra   rb   )selfr^   r_   r`   ra   rb   	__class__s         r   re   zRNNTBeamSearch.__init__\   sX     	

& !7D!.D.r   r2   c                     | j         }d }t          j        dg|          }| j                            t          j        |gg|          ||          \  }}}|g|d                                         |df}|gS )Nr   r4   r   g        )r_   r,   r;   r^   predictdetach)	rf   r2   tokenr8   
one_tensorpred_out_
pred_state	init_hypos	            r   _init_b_hyposzRNNTBeamSearch._init_b_hyposp   s    
\1#f555
"&*"4"4U\E7)TZ5[5[5[]gin"o"o!ZGQK  	
	 {r   enc_outr"   c                 r   t          j        dg|          }t          j        d |D             d          }| j                            |||t          j        dgt          |          z  |                    \  }}}t           j        j                            || j	        z  d          }|d d ddf         S )Nr   r4   c                 ,    g | ]}t          |          S r   )r   rD   s     r   r(   z8RNNTBeamSearch._gen_next_token_probs.<locals>.<listcomp>   s!    $O$O$OA%<Q%?%?$O$O$Or   r   )dimr   )
r,   r;   stackr^   joinr*   nn
functionallog_softmaxr`   )rf   rr   r"   r2   rl   predictor_out
joined_outrn   s           r   _gen_next_token_probsz$RNNTBeamSearch._gen_next_token_probs~   s     \1#f555
$O$O$O$O$OUVWWW:??L!s5zz)&999	
 

Aq X(44Z$BR5RXY4ZZ
!!!Q'""r   b_hyposa_hyposr?   key_to_b_hypoc                    t          t          |                    D ] }||         }t          |          ||df         z   }t          |          |v rg|t          |                   }t	          |           t          t          j        t          |                                        |                    }	nt          |          }	t          |          t          |          t          |          |	f}                    |           ||t          |          <   t          j        d D                                                       \  }
}fd|D             S )NrF   c                 ,    g | ]}t          |          S r   rC   )r%   r   s     r   r(   z/RNNTBeamSearch._gen_b_hypos.<locals>.<listcomp>   s     %P%P%Pod&;&;%P%P%Pr   c                      g | ]
}|         S r   r   r%   r1   r~   s     r   r(   z/RNNTBeamSearch._gen_b_hypos.<locals>.<listcomp>   s    333333r   )r)   r*   r   r!   rZ   floatr,   r;   	logaddexpr   r   r   r+   sort)rf   r~   r   r?   r   r&   h_aappend_blank_scoreh_bscorern   
sorted_idxs    `          r   _gen_b_hyposzRNNTBeamSearch._gen_b_hypos   sW    s7||$$ 	4 	4A!*C!0!5!58HB8O!OS!!]22#M#$6$67S'***el?3+?+?@@JJK]^^__011 %%',,$$	C NN303M-,,--%P%P%P%P%PQQVVXX:3333
3333r   tr@   c                     t          |||          \  }}}	t          |          |k     rt          d           }
nt          ||                    }
g }g }g }t	          |          D ]}t          ||                   }||
k    rmt          ||                   }|                    ||                    |                    t          |	|                              |                    |           |r|                     |||||          }ng }|S )Ninf)rT   r*   r   r   r)   intr+   _gen_new_hypos)rf   r   r~   r?   r   r@   r2   rP   rR   rS   b_nbest_score
base_hypos
new_tokens
new_scoresr&   r   
a_hypo_idx	new_hyposs                     r   _gen_a_hyposzRNNTBeamSearch._gen_a_hypos   s1    $G-=zJJ		
!#  w<<*$$"5\\MMM+GZK,@AAM')
 "
"$
z"" 	) 	)A/233E}$$ !8!;<<
!!'*"5666!!#&:1&=">">???!!%((( 	-++J
JPQSYZZII*,Ir   r   tokensscoresc           
         t          j        d |D             |          }t          |          }| j                            |t          j        dgt          |          z  |          |          \  }}	}
g }t          |          D ]d\  }}t          |          ||         gz   }|                    |||         	                                t          |
||          ||         f           e|S )Nc                     g | ]}|gS r   r   )r%   rk   s     r   r(   z1RNNTBeamSearch._gen_new_hypos.<locals>.<listcomp>   s    "?"?"?uE7"?"?"?r   r4   r   )r,   r;   r0   r^   ri   r*   rX   r   r+   rj   r<   )rf   r   r   r   r   r2   
tgt_tokensr.   rm   rn   pred_statesr   r&   r   r   s                  r   r   zRNNTBeamSearch._gen_new_hypos   s     \"?"?"?"?"?OOO
j))#':#5#5L!s:.v>>>$
 $
 ![
 ')	
++ 	r 	rFAs)#..&)<Jj(1+*<*<*>*>[Z[]c@d@dflmnfopqqqqr   r   c           	          |j         d         }|j        }g }|                     |          n|t          |          D ] }}t          j                            t          t                   g           i }d}	|r 	                    |d d ||dz   f         ||          }
|

                                }
                     ||
|          |	 j        k    rn#                     ||
|||          }|r|	dz  }	|t	          j         fdD                                           |          \  }}fd|D             S )Nr   r   c                 :    g | ]}                     |          S r   )ra   )r%   hyprf   s     r   r(   z*RNNTBeamSearch._search.<locals>.<listcomp>  s'    )U)U)Uc$*<*<S*A*A)U)U)Ur   c                      g | ]
}|         S r   r   r   s     r   r(   z*RNNTBeamSearch._search.<locals>.<listcomp>  s    :::ws|:::r   )rM   r2   rq   r)   r,   jitannotater   r	   r}   cpur   rb   r   r;   rK   )rf   rr   r   r@   n_time_stepsr2   r   r   r   symbols_current_tr?   rn   r   r~   s   `            @r   _searchzRNNTBeamSearch._search   s    }Q'$&04$$$V,,,$|$$ 	; 	;AGi((j)92>>G35M ! +#'#=#=gaaaQQRUl>SU\^d#e#e #3#7#7#9#9 ++GW>NP]^^$(<<<++$   +%*%#  +& "L)U)U)U)UW)U)U)UVV[[\fggMAz::::z:::GGr   inputlengthc                 
   |                                 dk    r8|                                 dk    r|j        d         dk    st          d          |                                 dk    r|                    d          }|j        dk    r|j        dk    rt          d          |                                 dk    r|                    d          }| j                            ||          \  }}|                     |d	|          S )
a  Performs beam search for the given input sequence.

        T: number of frames;
        D: feature dimension of each frame.

        Args:
            input (torch.Tensor): sequence of input frames, with shape (T, D) or (1, T, D).
            length (torch.Tensor): number of valid frames in input
                sequence, with shape () or (1,).
            beam_width (int): beam size to use during search.

        Returns:
            List[Hypothesis]: top-``beam_width`` hypotheses found by beam search.
        r   r   r   r   *input must be of shape (T, D) or (1, T, D)r   r   "length must be of shape () or (1,)N)ru   rM   
ValueErrorrI   r^   
transcriber   )rf   r   r   r@   rr   rn   s         r   forwardzRNNTBeamSearch.forward  s     99;;!UYY[[A%5%5%+a.A:M:MIJJJ99;;!OOA&&E<2&,$"6"6ABBB::<<1%%a((FZ**5&99
||GT:666r   r8   
hypothesisc                    |                                 dk    r8|                                 dk    r|j        d         dk    st          d          |                                 dk    r|                    d          }|j        dk    r|j        dk    rt          d          |                                 dk    r|                    d          }| j                            |||          \  }}}|                     |||          |fS )	a  Performs beam search for the given input sequence in streaming mode.

        T: number of frames;
        D: feature dimension of each frame.

        Args:
            input (torch.Tensor): sequence of input frames, with shape (T, D) or (1, T, D).
            length (torch.Tensor): number of valid frames in input
                sequence, with shape () or (1,).
            beam_width (int): beam size to use during search.
            state (List[List[torch.Tensor]] or None, optional): list of lists of tensors
                representing transcription network internal state generated in preceding
                invocation. (Default: ``None``)
            hypothesis (List[Hypothesis] or None): hypotheses from preceding invocation to seed
                search with. (Default: ``None``)

        Returns:
            (List[Hypothesis], List[List[torch.Tensor]]):
                List[Hypothesis]
                    top-``beam_width`` hypotheses found by beam search.
                List[List[torch.Tensor]]
                    list of lists of tensors representing transcription network
                    internal state generated in current invocation.
        r   r   r   r   r   r   r   r   )ru   rM   r   rI   r^   transcribe_streamingr   )rf   r   r   r@   r8   r   rr   rn   s           r   inferzRNNTBeamSearch.infer'  s    B 99;;!UYY[[A%5%5%+a.A:M:MIJJJ99;;!OOA&&E<2&,$"6"6ABBB::<<1%%a((F J;;E65QQE||GZ<<eCCr   )r\   Nr]   )NN)__name__
__module____qualname____doc__r   r   r   r   r   r	   re   r,   r2   r   rq   Tensorr}   r   r    r   r   r   r   r   r   exportr   r   __classcell__)rg   s   @r   r
   r
   K   ss        ( !AE"/ // / 	/
  *u)< =>/ / 
/ / / / / /(EL T*5E    #|#,0,<#FKl#	# # # #4j!4 j!4  ,	4
 CO,4 
j	4 4 4 46$j!$ j!$  ,	$
 $ $ $ 
j	$ $ $ $L$ S	 U	
   
j	   *'' tJ'(' 	'
 
j	' ' ' 'R7U\ 75< 7S 7UYZdUe 7 7 7 78 Y 5915+D +D|+D +D 	+D
 T%,/01+D T*-.+D 
tJd5<&8!99	:+D +D +D +D +D +D +D +Dr   )typingr   r   r   r   r   r,   torchaudio.modelsr   __all__r   r   r   r	   r   r   r   r   r   r    r!   r0   r2   r<   r>   rT   rZ   rx   Moduler
   r   r   r   <module>r      s   8 8 8 8 8 8 8 8 8 8 8 8 8 8  " " " " " " )
* 49elDel1C,DeKL

 
: $s)    *     * d5<.@)A    *     
 s    Z( T$u|2D-E    dd5<01 d dU\ dVZ[_`e`l[mVn d d d d
E E E E E E
P

Pl
P 
P 5<u|34	
P 
P 
P 
Pz d:.> 4    HD HD HD HD HDUX_ HD HD HD HD HDr   