from typing import Tuple

import torch
import torch.nn as nn
import torchaudio


class AttPool(nn.Module):
    """Attention-Pooling module that estimates the attention score.

    Args:
        input_dim (int): Input feature dimension.
        att_dim (int): Attention Tensor dimension.
    """

    def __init__(self, input_dim: int, att_dim: int):
        super(AttPool, self).__init__()

        self.linear1 = nn.Linear(input_dim, 1)
        self.linear2 = nn.Linear(input_dim, att_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply attention and pooling.

        Args:
            x (torch.Tensor): Input Tensor with dimensions `(batch, time, feature_dim)`.

        Returns:
            (torch.Tensor): Attention score with dimensions `(batch, att_dim)`.
        """
        # One attention logit per time step, normalized over the time axis.
        att = self.linear1(x)
        att = att.transpose(2, 1)
        att = nn.functional.softmax(att, dim=2)
        # Attention-weighted sum over time, then projection to `att_dim`.
        x = torch.matmul(att, x).squeeze(1)
        x = self.linear2(x)
        return x


class Predictor(nn.Module):
    """Prediction module that applies attention pooling, then predicts subjective metric scores.

    Args:
        input_dim (int): Input feature dimension.
        att_dim (int): Attention Tensor dimension.
    """

    def __init__(self, input_dim: int, att_dim: int):
        super(Predictor, self).__init__()
        self.att_pool_layer = AttPool(input_dim, att_dim)
        self.att_dim = att_dim

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Predict subjective evaluation metric score.

        Args:
            x (torch.Tensor): Input Tensor with dimensions `(batch, time, feature_dim)`.

        Returns:
            (torch.Tensor): Subjective metric score. Tensor with dimensions `(batch,)`.
        """
        x = self.att_pool_layer(x)
        # Treat the pooled vector as a distribution over `att_dim` score bins
        # and return its expectation over an evenly spaced score grid.
        x = nn.functional.softmax(x, dim=1)
        B = torch.linspace(0, 4, steps=self.att_dim, device=x.device)
        x = (x * B).sum(dim=1)
        return x
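

# Illustrative sketch (`_demo_predictor` is not part of the torchaudio API):
# because the output is an expectation over `linspace(0, 4, att_dim)`, the
# predicted score always lies in `[0, 4]`.
def _demo_predictor() -> None:
    predictor = Predictor(input_dim=64, att_dim=5)
    scores = predictor(torch.randn(2, 100, 64))
    assert scores.shape == (2,)
    assert bool(((scores >= 0) & (scores <= 4)).all())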
 %,,  r   r2   c                   "  ^  \ rS rSrSrS\R                  S\R                  S\R                  4U 4S jjrS\R                  S\R                  S	\
\R                  \R                  4   4S
 jrS\R                  S\R                  4S jrSrU =r$ )SquimSubjectiveE   a4  Speech Quality and Intelligibility Measures (SQUIM) model that predicts **subjective** metric scores
for speech enhancement (e.g., Mean Opinion Score (MOS)). The model is adopted from *NORESQA-MOS*
:cite:`manocha2022speech` which predicts MOS scores given the input speech and a non-matching reference.

Args:
    ssl_model (torch.nn.Module): The self-supervised learning model for feature extraction.
    projector (torch.nn.Module): Projection layer that projects SSL feature to a lower dimension.
    predictor (torch.nn.Module): Predict the subjective scores.

    def __init__(self, ssl_model: nn.Module, projector: nn.Module, predictor: nn.Module):
        super(SquimSubjective, self).__init__()
        self.ssl_model = ssl_model
        self.projector = projector
        self.predictor = predictor

    def _align_shapes(self, waveform: torch.Tensor, reference: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Cut or pad the reference Tensor to align it with the waveform Tensor.

        Args:
            waveform (torch.Tensor): Input waveform for evaluation. Tensor with dimensions `(batch, time)`.
            reference (torch.Tensor): Non-matching clean reference. Tensor with dimensions `(batch, time_ref)`.

        Returns:
            (torch.Tensor, torch.Tensor): The aligned waveform and reference Tensors
                with same dimensions `(batch, time)`.
        """
        T_waveform = waveform.shape[-1]
        T_reference = reference.shape[-1]
        if T_reference < T_waveform:
            # Tile the reference until it is at least as long as the waveform.
            num_padding = T_waveform // T_reference + 1
            reference = torch.cat([reference for _ in range(num_padding)], dim=1)
        # Crop the (possibly tiled) reference to the exact waveform length.
        return waveform, reference[:, :T_waveform]

    def forward(self, waveform: torch.Tensor, reference: torch.Tensor) -> torch.Tensor:
        """Predict subjective evaluation metric score.

        Args:
            waveform (torch.Tensor): Input waveform for evaluation. Tensor with dimensions `(batch, time)`.
            reference (torch.Tensor): Non-matching clean reference. Tensor with dimensions `(batch, time_ref)`.

        Returns:
            (torch.Tensor): Subjective metric score. Tensor with dimensions `(batch,)`.
        """
        waveform, reference = self._align_shapes(waveform, reference)
        # Extract the last-layer SSL features for both signals and project them down.
        waveform = self.projector(self.ssl_model.extract_features(waveform)[0][-1])
        reference = self.projector(self.ssl_model.extract_features(reference)[0][-1])
        concat = torch.cat((reference, waveform), dim=2)
        score_diff = self.predictor(concat)  # Predicted deviation from the clean reference.
        return 5 - score_diff
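

# Illustrative wiring sketch. `_ToySSL` and `_demo_squim_subjective` are
# hypothetical helpers, not torchaudio API; `_ToySSL` only mimics the interface
# this model relies on: an `extract_features` method whose result is indexed as
# `[0][-1]` to obtain a `(batch, time, feat_dim)` Tensor, as torchaudio's
# wav2vec2 models provide.
class _ToySSL(nn.Module):
    def __init__(self, feat_dim: int = 16):
        super().__init__()
        self.feat_dim = feat_dim

    def extract_features(self, waveform: torch.Tensor):
        # One random feature frame per 160 samples, as a single-layer list.
        frames = waveform.shape[-1] // 160
        return [torch.randn(waveform.shape[0], frames, self.feat_dim)], None


def _demo_squim_subjective() -> None:
    # The predictor input is 2 * proj_dim because reference and waveform
    # features are concatenated along the feature axis.
    model = SquimSubjective(_ToySSL(16), nn.Linear(16, 8), Predictor(16, 5))
    waveform = torch.randn(2, 16000)
    reference = torch.randn(2, 8000)  # shorter reference is tiled, then cropped
    assert model(waveform, reference).shape == (2,)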
:~r   )rE   rD   rC   )r'   r(   r)   r*   r+   r   Moduler   r    r-   r   rS   r$   r.   r/   r0   s   @r   rA   rA   E   s    #")) #		 #bii #3ell 3u|| 3PUV[VbVbdidpdpVpPq 3$   r   rA   ssl_typefeat_dimproj_dimr   r   c                     [        [        R                  U 5      " 5       n[        R                  " X5      n[        US-  U5      n[        XEU5      $ )a  Build a custome :class:`torchaudio.prototype.models.SquimSubjective` model.

Args:
    ssl_type (str): Type of self-supervised learning (SSL) models.
        Must be one of ["wav2vec2_base", "wav2vec2_large"].
    feat_dim (int): Feature dimension of the SSL feature representation.
    proj_dim (int): Output dimension of projection layer.
    att_dim (int): Dimension of attention scores.
r   )getattr
torchaudiomodelsr   r   r2   rA   )r\   r]   r^   r   rC   rD   rE   s          r   squim_subjective_modelrc   z   sD     
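

# Builder usage sketch (illustrative; `_demo_builder` is not torchaudio API).
# `wav2vec2_base` yields 768-dimensional features, so `feat_dim` must be 768
# for that SSL type. The model built here is randomly initialized; pretrained
# weights are distributed separately.
def _demo_builder() -> None:
    model = squim_subjective_model(
        ssl_type="wav2vec2_base",
        feat_dim=768,
        proj_dim=32,
        att_dim=5,
    )
    assert isinstance(model, SquimSubjective)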
))846I		(-I(Q,0I9;;r   c                      [        SSSSS9$ )zXBuild :class:`torchaudio.prototype.models.SquimSubjective` model with default arguments.wav2vec2_basei       rV   )r\   r]   r^   r   )rc    r   r   squim_subjective_baserh      s    ! 	 r   )typingr   r    torch.nnr   ra   r[   r   r2   rA   strr,   rc   rh   rg   r   r   <module>rl      s       bii @		 :2bii 2j<<< < 	<
 <* r   