a
    h'2                     @   sn  d dl mZmZmZmZmZ d dlZd dlmZ ddgZ	eee
 ejeeej  ef Zde_eee
 ddd	Zeejdd
dZeeeej  dddZeedddZeedddZee eeej  dddZeeej  e
ejeeej  dddZeedddZee eje
eejejejf dddZeee ddddZG d d dejjZdS )!    )CallableDictListOptionalTupleN)RNNT
HypothesisRNNTBeamSearchzHypothesis generated by RNN-T beam search decoder,
    represented as tuple of (tokens, prediction network output, prediction network state, score).
    )hyporeturnc                 C   s   | d S Nr    r
   r   r   L/var/www/auris/lib/python3.9/site-packages/torchaudio/models/rnnt_decoder.py_get_hypo_tokens   s    r   c                 C   s   | d S N   r   r   r   r   r   _get_hypo_predictor_out   s    r   c                 C   s   | d S )N   r   r   r   r   r   _get_hypo_state   s    r   c                 C   s   | d S )N   r   r   r   r   r   _get_hypo_score   s    r   c                 C   s   t | d S r   )strr   r   r   r   _get_hypo_key    s    r   )hyposr   c              	      sn   g }t tt| d D ]P g }t tt| d   D ]$|t fdd| D  q8|| q|S )Nr   c                    s   g | ]}t |   qS r   )r   .0r
   ijr   r   
<listcomp>)       z _batch_state.<locals>.<listcomp>)rangelenr   appendtorchcat)r   statesZbatched_state_componentsr   r   r   _batch_state$   s    "r(   )r'   idxdevicer   c                    s"   t j|g|d  fdd| D S )Nr*   c                    s   g | ]} fd d|D qS )c                    s   g | ]}| d  qS )r   )Zindex_select)r   stateZ
idx_tensorr   r   r    0   r!   z+_slice_state.<locals>.<listcomp>.<listcomp>r   )r   Zstate_tupler-   r   r   r    0   r!   z _slice_state.<locals>.<listcomp>)r%   tensor)r'   r)   r*   r   r-   r   _slice_state.   s    r/   c                 C   s   t | tt| d  S r   )r   r#   r   r   r   r   r   _default_hypo_sort_key3   s    r0   )r   next_token_probs
beam_widthr   c           	      C   sr   t dd | D d}||d d d df  }|d|\}}|j|jd dd}||jd  }|||fS )Nc                 S   s   g | ]}t |qS r   r   r   hr   r   r   r    <   r!   z+_compute_updated_scores.<locals>.<listcomp>r   trunc)Zrounding_mode)r%   r.   	unsqueezeZreshapetopkdivshape)	r   r1   r2   Zhypo_scoresZnonblank_scoresnonblank_nbest_scoresZnonblank_nbest_idxnonblank_nbest_hypo_idxnonblank_nbest_tokenr   r   r   _compute_updated_scores7   s    r?   )r
   	hypo_listr   c                 C   s0   t |D ]"\}}t| t|kr||=  q,qd S N)	enumerater   )r
   r@   r   elemr   r   r   _remove_hypoD   s    rD   c                       s  e Zd ZdZd eeeeee	gef  edd fddZ
ejee	 dd	d
Zejee	 ejejdddZee	 ee	 ejeee	f ee	 dddZee	 ee	 ejeeejee	 dddZee	 ee ee eejee	 dddZejeee	  eee	 dddZejejeee	 dddZejjd!ejejeeeeej   eee	  eee	 eeej  f dddZ  ZS )"r	   a)  Beam search decoder for RNN-T model.

    See Also:
        * :class:`torchaudio.pipelines.RNNTBundle`: ASR pipeline with pretrained model.

    Args:
        model (RNNT): RNN-T model to use.
        blank (int): index of blank token in vocabulary.
        temperature (float, optional): temperature to apply to joint network output.
            Larger values yield more uniform samples. (Default: 1.0)
        hypo_sort_key (Callable[[Hypothesis], float] or None, optional): callable that computes a score
            for a given hypothesis to rank hypotheses by. If ``None``, defaults to callable that returns
            hypothesis score normalized by token sequence length. (Default: None)
        step_max_tokens (int, optional): maximum number of tokens to emit per input time step. (Default: 100)
          ?Nd   )modelblanktemperaturehypo_sort_keystep_max_tokensr   c                    s<   t    || _|| _|| _|d u r,t| _n|| _|| _d S rA   )super__init__rG   rH   rI   r0   rJ   rK   )selfrG   rH   rI   rJ   rK   	__class__r   r   rM   \   s    
zRNNTBeamSearch.__init__)r*   r   c           	      C   sZ   | j }d }tjdg|d}| jtj|gg|d||\}}}|g|d  |df}|gS )Nr   r+   r   g        )rH   r%   r.   rG   predictdetach)	rN   r*   tokenr,   
one_tensorpred_out_Z
pred_stateZ	init_hypor   r   r   _init_b_hyposp   s    $
zRNNTBeamSearch._init_b_hypos)enc_outr   r*   r   c              	   C   s~   t jdg|d}t jdd |D dd}| j|||t jdgt| |d\}}}t jjj|| j	 dd}|d d ddf S )Nr   r+   c                 S   s   g | ]}t |qS r   )r   r4   r   r   r   r       r!   z8RNNTBeamSearch._gen_next_token_probs.<locals>.<listcomp>r   )dimr   )
r%   r.   stackrG   joinr#   nnZ
functionalZlog_softmaxrI   )rN   rX   r   r*   rT   Zpredictor_outZ
joined_outrV   r   r   r   _gen_next_token_probs~   s    
z$RNNTBeamSearch._gen_next_token_probs)b_hyposa_hyposr1   key_to_b_hypor   c                    s   t t|D ]}|| }t|||df  }t||v rh|t| }t|  ttt||}	nt|}	t	|t
|t||	f} | ||t|< qtdd  D  \}
} fdd|D S )Nr6   c                 S   s   g | ]}t |qS r   r3   r   r   r   r   r       r!   z/RNNTBeamSearch._gen_b_hypos.<locals>.<listcomp>c                    s   g | ]} | qS r   r   r   r)   r^   r   r   r       r!   )r"   r#   r   r   rD   floatr%   r.   Z	logaddexpr   r   r   r$   sort)rN   r^   r_   r1   r`   r   h_aZappend_blank_scoreZh_bscorerV   
sorted_idxr   rb   r   _gen_b_hypos   s"    

zRNNTBeamSearch._gen_b_hypos)r_   r^   r1   tr2   r*   r   c                 C   s   t |||\}}}	t||k r*td }
nt||  }
g }g }g }t|D ]N}t|| }||
krLt|| }|||  |t|	|  || qL|r| |||||}ng }|S )Ninf)r?   r#   rc   r   r"   intr$   _gen_new_hypos)rN   r_   r^   r1   ri   r2   r*   r<   r=   r>   Zb_nbest_score
base_hypos
new_tokensZ
new_scoresr   rf   Z
a_hypo_idx	new_hyposr   r   r   _gen_a_hypos   s,    
zRNNTBeamSearch._gen_a_hypos)rm   tokensscoresri   r*   r   c              	   C   s   t jdd |D |d}t|}| j|t jdgt| |d|\}}	}
g }t|D ]@\}}t||| g }|||| 	 t
|
|||| f qV|S )Nc                 S   s   g | ]
}|gqS r   r   )r   rS   r   r   r   r       r!   z1RNNTBeamSearch._gen_new_hypos.<locals>.<listcomp>r+   r   )r%   r.   r(   rG   rQ   r#   rB   r   r$   rR   r/   )rN   rm   rq   rr   ri   r*   Z
tgt_tokensr'   rU   rV   Zpred_statesro   r   re   rn   r   r   r   rl      s    
(zRNNTBeamSearch._gen_new_hypos)rX   r
   r2   r   c              	      s   |j d }|j}g }|d u r&|n| t|D ]} }tjtt g  i }d}	|r	|d d ||d f ||}
|

 }
 ||
| |	jkrq| |
|||}|rT|	d7 }	qTtfdd D |\}} fdd|D  q2 S )Nr   r   c                    s   g | ]}  |qS r   )rJ   )r   Zhyp)rN   r   r   r      r!   z*RNNTBeamSearch._search.<locals>.<listcomp>c                    s   g | ]} | qS r   r   ra   rb   r   r   r      r!   )r;   r*   rW   r"   r%   jitZannotater   r   r]   cpurh   rK   rp   r.   r9   )rN   rX   r
   r2   Zn_time_stepsr*   r_   ri   r`   Zsymbols_current_tr1   rV   rg   r   )r^   rN   r   _search   s8    
"

"zRNNTBeamSearch._search)inputlengthr2   r   c                 C   s   |  dkr.|  dkr&|jd dks.td|  dkrD|d}|jdkr`|jdkr`td|  dkrv|d}| j||\}}| |d	|S )
a  Performs beam search for the given input sequence.

        T: number of frames;
        D: feature dimension of each frame.

        Args:
            input (torch.Tensor): sequence of input frames, with shape (T, D) or (1, T, D).
            length (torch.Tensor): number of valid frames in input
                sequence, with shape () or (1,).
            beam_width (int): beam size to use during search.

        Returns:
            List[Hypothesis]: top-``beam_width`` hypotheses found by beam search.
        r   r   r   r   *input must be of shape (T, D) or (1, T, D)r   r   "length must be of shape () or (1,)N)rY   r;   
ValueErrorr8   rG   Z
transcriberu   )rN   rv   rw   r2   rX   rV   r   r   r   forward  s    &

zRNNTBeamSearch.forward)rv   rw   r2   r,   
hypothesisr   c                 C   s   |  dkr.|  dkr&|jd dks.td|  dkrD|d}|jdkr`|jdkr`td|  dkrv|d}| j|||\}}}| ||||fS )	a  Performs beam search for the given input sequence in streaming mode.

        T: number of frames;
        D: feature dimension of each frame.

        Args:
            input (torch.Tensor): sequence of input frames, with shape (T, D) or (1, T, D).
            length (torch.Tensor): number of valid frames in input
                sequence, with shape () or (1,).
            beam_width (int): beam size to use during search.
            state (List[List[torch.Tensor]] or None, optional): list of lists of tensors
                representing transcription network internal state generated in preceding
                invocation. (Default: ``None``)
            hypothesis (List[Hypothesis] or None): hypotheses from preceding invocation to seed
                search with. (Default: ``None``)

        Returns:
            (List[Hypothesis], List[List[torch.Tensor]]):
                List[Hypothesis]
                    top-``beam_width`` hypotheses found by beam search.
                List[List[torch.Tensor]]
                    list of lists of tensors representing transcription network
                    internal state generated in current invocation.
        r   r   r   r   rx   r   ry   rz   )rY   r;   r{   r8   rG   Ztranscribe_streamingru   )rN   rv   rw   r2   r,   r}   rX   rV   r   r   r   infer'  s    !&

zRNNTBeamSearch.infer)rE   NrF   )NN)__name__
__module____qualname____doc__r   rk   rc   r   r   r   rM   r%   r*   r   rW   Tensorr]   r   r   rh   rp   rl   ru   r|   rs   exportr   r~   __classcell__r   r   rO   r   r	   K   sl      
(
)  
)typingr   r   r   r   r   r%   Ztorchaudio.modelsr   __all__rk   r   rc   r   r   r   r   r   r   r   r   r(   r*   r/   r0   r?   rD   r\   Moduler	   r   r   r   r   <module>   s(    
*