
    ~Vji              
           d dl mZ d dlZd dlmZ d dlZ G d dej                  Z G d dej                  Z G d dej                  Z	d	e
d
ededede	f
dZde	fdZdS )    )TupleNc                   P     e Zd ZdZdedef fdZdej        dej        fdZ xZ	S )AttPoolzAttention-Pooling module that estimates the attention score.

    Args:
        input_dim (int): Input feature dimension.
        att_dim (int): Attention Tensor dimension.
    	input_dimatt_dimc                     t          t          |                                            t          j        |d          | _        t          j        ||          | _        d S )N   )superr   __init__nnLinearlinear1linear2selfr   r   	__class__s      \/root/voice-cloning/.venv/lib/python3.11/site-packages/torchaudio/models/squim/subjective.pyr   zAttPool.__init__   sI    gt%%'''yA..yG44    xreturnc                    |                      |          }|                    dd          }t          j                            |d          }t          j        ||                              d          }|                     |          }|S )zApply attention and pooling.

        Args:
            x (torch.Tensor): Input Tensor with dimensions `(batch, time, feature_dim)`.

        Returns:
            (torch.Tensor): Attention score with dimensions `(batch, att_dim)`.
           r	   dim)	r   	transposer   
functionalsoftmaxtorchmatmulsqueezer   )r   r   atts      r   forwardzAttPool.forward   ss     ll1oommAq!!m##CQ#//La  ((++LLOOr   
__name__
__module____qualname____doc__intr   r   Tensorr"   __classcell__r   s   @r   r   r      s{         5# 5 5 5 5 5 5 5 %,        r   r   c                   P     e Zd ZdZdedef fdZdej        dej        fdZ xZ	S )	PredictorzPrediction module that apply pooling and attention, then predict subjective metric scores.

    Args:
        input_dim (int): Input feature dimension.
        att_dim (int): Attention Tensor dimension.
    r   r   c                     t          t          |                                            t          ||          | _        || _        d S N)r
   r-   r   r   att_pool_layerr   r   s      r   r   zPredictor.__init__0   s<    i'')))%i99r   r   r   c                     |                      |          }t          j                            |d          }t	          j        dd| j        |j                  }||z                      d          }|S )a  Predict subjective evaluation metric score.

        Args:
            x (torch.Tensor): Input Tensor with dimensions `(batch, time, feature_dim)`.

        Returns:
            (torch.Tensor): Subjective metric score. Tensor with dimensions `(batch,)`.
        r	   r   r      )stepsdevice)	r0   r   r   r   r   linspacer   r4   sum)r   r   Bs      r   r"   zPredictor.forward5   si     ""M!!!!++N1at|AHEEEUKKAKr   r#   r+   s   @r   r-   r-   (   s{         #       
 %,        r   r-   c                        e Zd ZdZdej        dej        dej        f fdZdej        dej        de	ej        ej        f         fd	Z
dej        dej        fd
Z xZS )SquimSubjectiveaP  Speech Quality and Intelligibility Measures (SQUIM) model that predicts **subjective** metric scores
    for speech enhancement (e.g., Mean Opinion Score (MOS)). The model is adopted from *NORESQA-MOS*
    :cite:`manocha2022speech` which predicts MOS scores given the input speech and a non-matching reference.

    Args:
        ssl_model (torch.nn.Module): The self-supervised learning model for feature extraction.
        projector (torch.nn.Module): Projection layer that projects SSL feature to a lower dimension.
        predictor (torch.nn.Module): Predict the subjective scores.
    	ssl_model	projector	predictorc                     t          t          |                                            || _        || _        || _        d S r/   )r
   r9   r   r:   r;   r<   )r   r:   r;   r<   r   s       r   r   zSquimSubjective.__init__P   s8    ot$$--///"""r   waveform	referencer   c                     |j         d         }j         d         }||k     r7||z  dz   }t          j        fdt          |          D             d          |ddd|f         fS )a  Cut or pad the reference Tensor to make it aligned with waveform Tensor.

        Args:
            waveform (torch.Tensor): Input waveform for evaluation. Tensor with dimensions `(batch, time)`.
            reference (torch.Tensor): Non-matching clean reference. Tensor with dimensions `(batch, time_ref)`.

        Returns:
            (torch.Tensor, torch.Tensor): The aligned waveform and reference Tensors
                with same dimensions `(batch, time)`.
        r	   c                     g | ]}S  rC   ).0_r?   s     r   
<listcomp>z1SquimSubjective._align_shapes.<locals>.<listcomp>e   s    "I"I"I9"I"I"Ir   r   N)shaper   catrange)r   r>   r?   
T_waveformT_referencenum_paddings     `   r   _align_shapeszSquimSubjective._align_shapesV   s     ^B'
ob)##$3a7K	"I"I"I"IeK6H6H"I"I"IqQQQI111kzk>222r   c                 |   |                      ||          \  }}|                     | j                            |          d         d                   }|                     | j                            |          d         d                   }t	          j        ||fd          }|                     |          }d|z
  S )a  Predict subjective evaluation metric score.

        Args:
            waveform (torch.Tensor): Input waveform for evaluation. Tensor with dimensions `(batch, time)`.
            reference (torch.Tensor): Non-matching clean reference. Tensor with dimensions `(batch, time_ref)`.

        Returns:
            (torch.Tensor): Subjective metric score. Tensor with dimensions `(batch,)`.
        r   rA   r   r      )rM   r;   r:   extract_featuresr   rH   r<   )r   r>   r?   concat
score_diffs        r   r"   zSquimSubjective.forwardh   s     #009EE)>>$."A"A("K"KA"Nr"RSSNN4>#B#B9#M#Ma#PQS#TUU	Ix0a888^^F++
:~r   )r$   r%   r&   r'   r   Moduler   r   r)   r   rM   r"   r*   r+   s   @r   r9   r9   E   s         #") #	 #bi # # # # # #3el 3u| 3PUV[VbdidpVpPq 3 3 3 3$         r   r9   ssl_typefeat_dimproj_dimr   r   c                      t          t          j        |                       }t          j        ||          }t          |dz  |          }t          |||          S )a  Build a custome :class:`torchaudio.prototype.models.SquimSubjective` model.

    Args:
        ssl_type (str): Type of self-supervised learning (SSL) models.
            Must be one of ["wav2vec2_base", "wav2vec2_large"].
        feat_dim (int): Feature dimension of the SSL feature representation.
        proj_dim (int): Output dimension of projection layer.
        att_dim (int): Dimension of attention scores.
    r   )getattr
torchaudiomodelsr   r   r-   r9   )rT   rU   rV   r   r:   r;   r<   s          r   squim_subjective_modelr[   z   sT     5
)84466I	(H--I(Q,00I9i;;;r   c                  (    t          dddd          S )zXBuild :class:`torchaudio.prototype.models.SquimSubjective` model with default arguments.wav2vec2_basei       rO   )rT   rU   rV   r   )r[   rC   r   r   squim_subjective_baser_      s%    ! 	   r   )typingr   r   torch.nnr   rY   rS   r   r-   r9   strr(   r[   r_   rC   r   r   <module>rc      s+                       bi   @    	   :2 2 2 2 2bi 2 2 2j<<< < 	<
 < < < <*      r   