# Source code for emvoice.pitch

"""Pitch-related voice features.
"""

import logging
from typing import List, Optional, Tuple

import librosa
import numpy as np
from scipy.interpolate import interp1d

from emvoice.frames import BaseFrames
from emvoice.signal import BaseSignal
from emvoice.spectral import SpecFrames


class PitchFrames(BaseFrames):
    """Estimate and store pitch frames.

    Estimate and store the voice pitch measured as the fundamental frequency F0 in Hz.

    Parameters
    ----------
    frames: numpy.ndarray
        Voice pitch frames in Hz with shape (num_frames,).
    flag: numpy.ndarray
        Boolean flags indicating which frames are voiced with shape (num_frames,).
    prob: numpy.ndarray
        Probabilities for frames being voiced with shape (num_frames,).
    sr: int
        Sampling rate of the signal the pitch was estimated from.
    lower: float
        Lower limit used for pitch estimation (in Hz).
    upper: float
        Upper limit used for pitch estimation (in Hz).
    frame_len: int
        Number of samples per frame.
    hop_len: int
        Number of samples between frame starting points.
    method: str
        Method used for estimating voice pitch.
    center: bool, default=True
        Whether the frames were centered and the signal padded for estimation.
    pad_mode: str, default='constant'
        How the signal was padded before framing. See :func:`numpy.pad`.

    See Also
    --------
    librosa.pyin
    librosa.yin

    """

    def __init__(
        self,
        frames: np.ndarray,
        flag: np.ndarray,
        prob: np.ndarray,
        sr: int,
        lower: float,
        upper: float,
        frame_len: int,
        hop_len: int,
        method: str,
        center: bool = True,
        pad_mode: str = "constant",
    ):
        self.logger = logging.getLogger("emvoice.frequency.PitchFrames")
        self.flag = flag
        self.prob = prob
        self.lower = lower
        self.upper = upper
        self.method = method
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)

    @classmethod
    def from_signal(
        cls,
        sig_obj: BaseSignal,
        frame_len: int,
        hop_len: Optional[int] = None,
        center: bool = True,
        pad_mode: str = "constant",
        lower: float = 75.0,
        upper: float = 600.0,
        method: str = "pyin",
    ):
        """Estimate the voice pitch frames from a signal.

        Currently, voice pitch can only be extracted with the *pYIN* method.

        Parameters
        ----------
        sig_obj: BaseSignal
            Signal object.
        frame_len: int
            Number of samples per frame.
        hop_len: int, optional, default=None
            Number of samples between frame starting points. If `None`, uses `frame_len // 4`.
        center: bool, default=True
            Whether to center the frames and apply padding.
        pad_mode: str, default='constant'
            How the signal is padded before framing. See :func:`numpy.pad`.
            Uses the default value 0 for `'constant'` padding. Ignored if `center=False`.
        lower: float, default = 75.0
            Lower limit for pitch estimation (in Hz).
        upper: float, default = 600.0
            Upper limit for pitch estimation (in Hz).
        method: str, default = 'pyin'
            Method for estimating voice pitch. Only `'pyin'` is currently available.

        Returns
        -------
        PitchFrames
            Pitch frames object carrying the framing options that were
            actually used for the estimation.

        Raises
        ------
        NotImplementedError
            If a method other than `'pyin'` is given.

        """
        # Reject unsupported methods before doing any work
        if method != "pyin":
            raise NotImplementedError(
                'Only the "pyin" method is currently available'
            )

        if hop_len is None:
            hop_len = frame_len // 4

        pitch_f0, flag, prob = librosa.pyin(
            sig_obj.sig,
            fmin=lower,
            fmax=upper,
            sr=sig_obj.sr,
            frame_length=frame_len,
            hop_length=hop_len,
            center=center,
            pad_mode=pad_mode,
        )

        # Forward `center` and `pad_mode` so the stored framing metadata
        # matches what pYIN used instead of silently falling back to the
        # constructor defaults.
        return cls(
            frames=pitch_f0,
            flag=flag,
            prob=prob,
            sr=sig_obj.sr,
            lower=lower,
            upper=upper,
            frame_len=frame_len,
            hop_len=hop_len,
            method=method,
            center=center,
            pad_mode=pad_mode,
        )


class PitchPulseFrames(BaseFrames):
    """Extract and store glottal pulse frames.

    Glottal pulses are peaks in the signal corresponding to the fundamental frequency F0.

    Parameters
    ----------
    frames: list
        Pulse frames. Each frame contains a list of pulses or an empty list if no pulses are detected.
        Pulses are stored as tuples (pulse timestamp, T0, amplitude).
    sr: int
        Sampling rate of the signal.
    frame_len: int
        Number of samples per frame.
    hop_len: int
        Number of samples between frame starting points.
    center: bool, default=True
        Whether the frames are centered and the signal was padded.
    pad_mode: str, default='constant'
        How the signal was padded before framing. See :func:`numpy.pad`.

    Notes
    -----
    See :ref:`Algorithms section <Glottal pulses>` for details.

    """

    def __init__(
        self,
        frames: List[List[Tuple]],
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool = True,
        pad_mode: str = "constant",
    ) -> None:
        self.logger = logging.getLogger("emvoice.frequency.PitchPulseFrames")
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)

    @property
    def idx(self) -> np.ndarray:
        """Frame indices, one per pulse frame.

        The frames are stored as a list, so the number of frames is taken
        with `len` and the index array is cached after the first access.
        """
        # NOTE(review): relies on `_idx` being initialised (presumably to
        # `None`) by `BaseFrames.__init__` -- confirm against the base class.
        if self._idx is None:
            self._idx = np.arange(len(self.frames))
        return self._idx

    @classmethod
    def from_signal_and_pitch_frames(
        cls, sig_obj: BaseSignal, pitch_frames_obj: PitchFrames
    ):
        """Extract glottal pulse frames from a signal and voice pitch frames.

        Parameters
        ----------
        sig_obj: BaseSignal
            Signal object.
        pitch_frames_obj: PitchFrames
            Voice pitch frames object.

        Returns
        -------
        PitchPulseFrames
            Pulse frames using the framing options of `pitch_frames_obj`.
            Frames without a pitch estimate contain an empty list.

        """
        frame_len = pitch_frames_obj.frame_len
        hop_len = pitch_frames_obj.hop_len

        # The padded signal is not stored anywhere, so the padding of centered
        # framing is re-created here. Uncentered frames were taken from the
        # raw signal and therefore need no padding.
        pad = frame_len // 2 if pitch_frames_obj.center else 0
        padding = [(0, 0) for _ in sig_obj.sig.shape]
        padding[-1] = (pad, pad)
        sig_padded = np.pad(
            sig_obj.sig, padding, mode=pitch_frames_obj.pad_mode
        )

        # Timestamps of the padded samples (last axis, same axis as padding).
        # NOTE(review): these count from the first sample of the padded
        # signal, i.e. they are offset by `pad` samples from the unpadded
        # signal -- confirm this matches the convention of
        # `pitch_frames_obj.ts`.
        sig_padded_ts = librosa.samples_to_time(
            np.arange(sig_padded.shape[-1]), sr=sig_obj.sr
        )

        # Frame padded signal
        sig_frames_obj = BaseFrames.from_signal(
            BaseSignal(sig_padded, sig_obj.sr),
            frame_len,
            hop_len,
            center=False,
        )

        # Frame ts of padded signal
        sig_ts_frames_obj = BaseFrames.from_signal(
            BaseSignal(sig_padded_ts, sig_obj.sr),
            frame_len,
            hop_len,
            center=False,
        )

        # Interpolate pitch F0 of the voiced frames at the sample timestamps.
        # `np.interp` raises on empty sample points, so a signal without any
        # voiced frame gets an all-NaN F0 track (every frame is then treated
        # as unvoiced below).
        voiced = pitch_frames_obj.flag
        if np.any(voiced):
            interp_f0 = np.interp(
                sig_padded_ts,
                pitch_frames_obj.ts[voiced],
                pitch_frames_obj.frames[voiced],
            )
        else:
            interp_f0 = np.full(sig_padded_ts.shape, np.nan)

        # Frame F0 interpolated signal
        pitch_interp_frames_obj = BaseFrames.from_signal(
            BaseSignal(interp_f0, sig_obj.sr),
            frame_len,
            hop_len,
            center=False,
        )

        # Detect pulses in each frame; objects are passed instead of arrays
        # because some of their attributes are needed
        pulses = [
            cls._detect_pulses_in_frame(
                i,
                sig_frames_obj,
                sig_ts_frames_obj,
                pitch_frames_obj,
                pitch_interp_frames_obj,
            )
            for i in pitch_frames_obj.idx
        ]

        return cls(
            pulses,
            pitch_frames_obj.sr,
            frame_len,
            hop_len,
            pitch_frames_obj.center,
            pitch_frames_obj.pad_mode,
        )

    @classmethod
    def _get_next_pulse(
        cls,
        sig: np.ndarray,
        ts: np.ndarray,
        t0_interp: np.ndarray,
        start: float,
        stop: float,
        left: bool = True,
        pulses: Optional[List] = None,
    ) -> List[Tuple]:
        """Collect pulses by walking through a frame in one direction.

        Starting with the interval [start, stop], the largest sample in the
        interval is stored as a pulse and the interval is moved to between
        0.8 and 1.25 periods away from that pulse, until the interval reaches
        the frame boundary in the walking direction.

        Parameters
        ----------
        sig: numpy.ndarray
            Samples of one frame.
        ts: numpy.ndarray
            Timestamps of the samples in `sig`.
        t0_interp: numpy.ndarray
            Interpolated period T0 at each sample in `sig`.
        start, stop: float
            Boundaries of the first search interval (same unit as `ts`).
        left: bool, default=True
            Walk towards the start (`True`) or the end (`False`) of the frame.
        pulses: list, optional
            List the pulses are appended to in place. A new list is created if `None`.

        Returns
        -------
        list
            `pulses` with a tuple (timestamp, T0, amplitude) appended for every detected pulse.

        """
        if pulses is None:
            pulses = []

        ts_min = ts.min()
        ts_max = ts.max()

        while True:
            # Stop when the interval reaches the frame boundary or is undefined
            if (
                (left and start <= ts_min)
                or (not left and stop >= ts_max)
                or np.isnan(start)
                or np.isnan(stop)
            ):
                return pulses

            # Get closest ts to boundaries start, stop
            start_idx = np.argmin(np.abs(ts - start))
            stop_idx = np.argmin(np.abs(ts - stop))
            interval = sig[start_idx:stop_idx]

            # Nothing to search: both boundaries map to the same sample or
            # the interval holds no valid sample (nanargmax would raise)
            if interval.size == 0 or np.all(np.isnan(interval)):
                return pulses

            # The max peak in the interval is the pulse
            peak_idx = np.nanargmax(interval)
            peak_ts = ts[start_idx:stop_idx][peak_idx]
            peak_t0 = t0_interp[start_idx:stop_idx][peak_idx]
            pulses.append((peak_ts, peak_t0, interval[peak_idx]))

            # Without a positive period the interval cannot advance
            if not peak_t0 > 0:
                return pulses

            if left:  # Move interval to left
                start = peak_ts - 1.25 * peak_t0
                stop = peak_ts - 0.8 * peak_t0
            else:  # Move interval to right
                start = peak_ts + 0.8 * peak_t0
                stop = peak_ts + 1.25 * peak_t0

    @classmethod
    def _detect_pulses_in_frame(
        cls,
        frame_idx: int,
        sig_frames_obj: BaseFrames,
        sig_ts_frames_obj: BaseFrames,
        pitch_obj: PitchFrames,
        pitch_interp_obj: BaseFrames,
    ) -> List[Tuple]:
        """Detect the pulses of a single frame.

        Returns a sorted list of unique (timestamp, T0, amplitude) tuples, or
        an empty list if the frame has no pitch estimate.
        """
        # Period and timestamp of the frame
        t0_mid = 1 / pitch_obj.frames[frame_idx]
        ts_mid = pitch_obj.ts[frame_idx]
        # Samples of the frame and their timestamps
        sig_frame = sig_frames_obj.frames[frame_idx, :]
        ts_sig_frame = sig_ts_frames_obj.frames[frame_idx, :]
        # Interpolated period at each sample of the frame
        t0 = 1 / pitch_interp_obj.frames[frame_idx, :]

        pulses = []

        # Return empty list if frame is unvoiced (no F0)
        if np.all(np.isnan(t0)) or np.isnan(t0_mid):
            return pulses

        # First interval spans one period around the frame timestamp
        start = ts_mid - t0_mid / 2
        stop = ts_mid + t0_mid / 2

        # Both walks start from the same interval and append to `pulses`
        cls._get_next_pulse(
            sig_frame, ts_sig_frame, t0, start, stop, True, pulses
        )
        cls._get_next_pulse(
            sig_frame, ts_sig_frame, t0, start, stop, False, pulses
        )

        # The shared first interval yields the same pulse twice: deduplicate
        return sorted(set(pulses))


class PitchPeriodFrames(BaseFrames):
    """Base class for frames computed from periods between glottal pulses.

    Stores the period limits and provides the helpers shared by
    `JitterFrames` and `ShimmerFrames`.

    Parameters
    ----------
    frames: numpy.ndarray
        Feature frames.
    sr: int
        Sampling rate of the signal.
    frame_len: int
        Number of samples per frame.
    hop_len: int
        Number of samples between frame starting points.
    center: bool
        Whether the frames are centered and the signal was padded.
    pad_mode: str
        How the signal was padded before framing. See :func:`numpy.pad`.
    lower: float
        Lower limit for periods between glottal pulses.
    upper: float
        Upper limit for periods between glottal pulses.

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool,
        pad_mode: str,
        lower: float,
        upper: float,
    ):
        self.logger = logging.getLogger("emvoice.frequency.PitchPeriodFrames")
        self.lower = lower
        self.upper = upper
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)

    @staticmethod
    def _calc_period_length(
        pulses: List[Tuple], lower: float, upper: float
    ) -> Tuple[List, np.ndarray]:
        """Compute the periods between pulses and group the accepted ones.

        Parameters
        ----------
        pulses: list
            Pulses as tuples whose first element is the pulse timestamp.
        lower, upper: float
            Exclusive limits; periods outside (lower, upper) are rejected.

        Returns
        -------
        periods: list
            Arrays with runs of accepted periods.
        mask: numpy.ndarray
            Boolean array marking which of the raw periods were accepted.

        """
        # Calc period length as first order diff of pulse ts
        periods = np.diff(np.array([puls[0] for puls in pulses]))

        # Filter out too short and long periods
        mask = np.logical_and(periods > lower, periods < upper)

        # Split periods according to mask and remove masked periods.
        # NOTE(review): the cut points are positions in the unfiltered array
        # but are applied to the filtered one. With two or more rejected
        # periods the later cuts land too far right (rejecting indices 1 and
        # 3 of five periods gives runs of length 1, 2, 0 instead of 1, 1, 1).
        # Left as is because `ShimmerFrames._get_amplitude` splits the
        # amplitudes with the same indices and both must stay aligned; fix
        # both together.
        periods = np.array_split(periods[mask], np.where(~mask)[0])

        return periods, mask

    @staticmethod
    def _check_ratio(x_arr: np.ndarray, threshold: float) -> np.ndarray:
        """Flag consecutive pairs whose ratio stays below a threshold.

        The ratio of a pair is the larger value divided by the smaller one,
        so a jump is rejected regardless of its direction. Pairs containing a
        non-finite or non-positive value are rejected.

        Parameters
        ----------
        x_arr: numpy.ndarray
            Values with shape (n,).
        threshold: float
            Exclusive upper limit for the ratio.

        Returns
        -------
        numpy.ndarray
            Boolean array with shape (n - 1,); element `i` refers to the pair
            (`x_arr[i]`, `x_arr[i + 1]`).

        """
        prev, curr = x_arr[:-1], x_arr[1:]

        valid = np.logical_and(
            np.logical_and(np.isfinite(prev), prev > 0),
            np.logical_and(np.isfinite(curr), curr > 0),
        )

        # Only divide where both values are usable; larger over smaller
        # makes the check symmetric
        larger = np.maximum(prev[valid], curr[valid])
        smaller = np.minimum(prev[valid], curr[valid])
        valid[valid] = larger / smaller < threshold

        return valid


class PitchHarmonicsFrames(BaseFrames):
    """Estimate and store voice pitch harmonics.

    Holds the spectrogram magnitude at the harmonics of the fundamental
    frequency, i.e. at `nF0` for n = 1, ..., `n_harmonics`.

    Parameters
    ----------
    frames: numpy.ndarray
        Harmonics frames with the shape (num_frames, n_harmonics)
    sr: int
        Sampling rate of the signal.
    frame_len: int
        Number of samples per frame.
    hop_len: int
        Number of samples between frame starting points.
    center: bool, default=True
        Whether the frames are centered and the signal was padded.
    pad_mode: str, default='constant'
        How the signal was padded before framing. See :func:`numpy.pad`.
    n_harmonics: int, default=100
        Number of estimated harmonics.

    See Also
    --------
    librosa.f0_harmonics

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool = True,
        pad_mode: str = "constant",
        n_harmonics: int = 100,
    ):
        logger_name = "emvoice.frequency.PitchHarmonicsFrames"
        self.logger = logging.getLogger(logger_name)
        self.n_harmonics = n_harmonics
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)

    @classmethod
    def from_spec_and_pitch_frames(
        cls,
        spec_frames_obj: SpecFrames,
        pitch_frames_obj: PitchFrames,
        n_harmonics: int = 100,
    ):
        """Estimate voice pitch harmonics from spectrogram frames and voice pitch frames.

        Parameters
        ----------
        spec_frames_obj: SpecFrames
            Spectrogram frames object.
        pitch_frames_obj: PitchFrames
            Pitch frames object.
        n_harmonics: int, default=100
            Number of harmonics to estimate.

        Returns
        -------
        PitchHarmonicsFrames
            Harmonics frames using the framing options of `spec_frames_obj`.

        """
        spec = spec_frames_obj

        harmonic_magnitudes = cls._calc_f0_harmonics(
            spec.frames,
            spec.freqs,
            pitch_frames_obj.frames,
            n_harmonics,
        )

        # Framing metadata is taken over from the spectrogram
        framing = (
            spec.sr,
            spec.frame_len,
            spec.hop_len,
            spec.center,
            spec.pad_mode,
        )

        return cls(harmonic_magnitudes, *framing, n_harmonics)

    @staticmethod
    def _calc_f0_harmonics(
        spec_frames: np.ndarray,
        freqs: np.ndarray,
        f0_frames: np.ndarray,
        n_harmonics: int,
    ) -> np.ndarray:
        """Interpolate the spectrogram magnitude at the F0 harmonics.

        For every frame, the magnitude spectrum is linearly interpolated over
        the finite entries of `freqs` and evaluated at `F0, 2 * F0, ...,
        n_harmonics * F0`. Harmonics outside the frequency range get 0.

        Adapted from librosa.f0_harmonics, see:
        https://librosa.org/doc/latest/generated/librosa.f0_harmonics.html#librosa.f0_harmonics
        """
        finite_bins = np.isfinite(freqs)
        freq_grid = freqs[finite_bins]

        # Harmonic numbers start at 1 so the first harmonic is F0 itself
        harmonic_numbers = np.arange(1, n_harmonics + 1)
        harmonic_freqs = np.multiply.outer(f0_frames, harmonic_numbers)

        def interp_frame(magnitudes, query_freqs):
            # One frame: magnitudes over frequency bins -> values at harmonics
            return interp1d(
                freq_grid,
                magnitudes[finite_bins],
                axis=0,
                copy=False,
                assume_sorted=False,
                bounds_error=False,
                fill_value=0,
            )(query_freqs)

        # Apply the per-frame interpolation along the leading axes
        interp_all_frames = np.vectorize(
            interp_frame, signature="(f),(h)->(h)"
        )

        return interp_all_frames(np.abs(spec_frames), harmonic_freqs)


class JitterFrames(PitchPeriodFrames):
    """Extract and store voice jitter frames.

    Parameters
    ----------
    frames: numpy.ndarray
        Voice jitter frames of shape (num_frames,).
    sr: int
        Sampling rate of the signal.
    frame_len: int
        Number of samples per frame.
    hop_len: int
        Number of samples between frame starting points.
    center: bool
        Whether the frames are centered and the signal was padded.
    pad_mode: str
        How the signal was padded before framing. See :func:`numpy.pad`.
    rel: bool
        Whether the voice jitter is relative to the average period length.
    lower: float
        Lower limit for periods between glottal pulses.
    upper: float
        Upper limit for periods between glottal pulses.
    max_period_ratio: float
        Maximum ratio between consecutive periods used for jitter extraction.

    Notes
    -----
    Compute jitter as the average absolute difference between consecutive fundamental periods with a ratio
    below `max_period_ratio` for each frame. If ``rel=True``, jitter is divided by the average fundamental period
    of each frame. Fundamental periods are calculated as the first-order temporal difference between consecutive
    glottal pulses.

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool,
        pad_mode: str,
        rel: bool,
        lower: float,
        upper: float,
        max_period_ratio: float,
    ):
        self.logger = logging.getLogger("emvoice.frequency.JitterFrames")
        self.rel = rel
        self.max_period_ratio = max_period_ratio
        super().__init__(
            frames, sr, frame_len, hop_len, center, pad_mode, lower, upper
        )

    @classmethod
    def from_pitch_pulse_frames(
        cls,
        pitch_pulse_frames_obj: PitchPulseFrames,
        rel: bool = True,
        lower: float = 0.0001,
        upper: float = 0.02,
        max_period_ratio: float = 1.3,
    ):
        """Extract voice jitter frames from glottal pulse frames.

        Parameters
        ----------
        pitch_pulse_frames_obj: PitchPulseFrames
            Glottal pulse frames object.
        rel: bool, optional, default=True
            Divide jitter by the average pitch period.
        lower: float, optional, default=0.0001
            Lower limit for periods between glottal pulses.
        upper: float, optional, default=0.02
            Upper limit for periods between glottal pulses.
        max_period_ratio: float, optional, default=1.3
            Maximum ratio between consecutive periods for jitter extraction.

        Returns
        -------
        JitterFrames
            Jitter frames using the framing options of `pitch_pulse_frames_obj`.
            Frames for which no jitter can be computed are NaN.

        """
        jitter_frames = np.array(
            [
                cls._calc_jitter_frame(
                    pulses, rel, lower, upper, max_period_ratio
                )
                for pulses in pitch_pulse_frames_obj.frames
            ]
        )

        return cls(
            jitter_frames,
            pitch_pulse_frames_obj.sr,
            pitch_pulse_frames_obj.frame_len,
            pitch_pulse_frames_obj.hop_len,
            pitch_pulse_frames_obj.center,
            pitch_pulse_frames_obj.pad_mode,
            rel,
            lower,
            upper,
            max_period_ratio,
        )

    @classmethod
    def _calc_jitter_frame(
        cls,
        pulses: List[Tuple],
        rel: bool,
        lower: float,
        upper: float,
        max_period_ratio: float,
    ) -> float:
        """Compute the jitter of a single frame.

        Returns NaN if the frame has no pulses, no run with at least two
        accepted periods, or no period pair passing the ratio check.
        """
        if len(pulses) == 0:
            return np.nan

        # Calc period length as first order diff of pulse ts
        periods, _ = cls._calc_period_length(pulses, lower, upper)

        # A difference needs at least two periods in a run
        runs = [period for period in periods if len(period) > 1]

        if len(runs) == 0:
            return np.nan

        # Absolute first order diff in period length per run;
        # only consider period pairs where ratio is < max_period_ratio
        period_diff = [
            np.abs(np.diff(period)[cls._check_ratio(period, max_period_ratio)])
            for period in runs
        ]

        # Drop runs without accepted pairs up front: taking their mean would
        # only produce NaN (skipped by nanmean anyway) plus a
        # "Mean of empty slice" RuntimeWarning
        period_diff = [diff for diff in period_diff if len(diff) > 0]

        if len(period_diff) == 0:
            return np.nan

        avg_period_diff = np.nanmean(
            np.array([np.mean(diff) for diff in period_diff])
        )

        if rel:  # Relative to mean period length
            avg_period_len = np.nanmean(
                np.array([np.mean(period) for period in runs])
            )
            return avg_period_diff / avg_period_len

        return avg_period_diff


class ShimmerFrames(PitchPeriodFrames):
    """Extract and store voice shimmer frames.

    Parameters
    ----------
    frames: numpy.ndarray
        Voice shimmer frames of shape (num_frames,).
    sr: int
        Sampling rate of the signal.
    frame_len: int
        Number of samples per frame.
    hop_len: int
        Number of samples between frame starting points.
    center: bool
        Whether the frames are centered and the signal was padded.
    pad_mode: str
        How the signal was padded before framing. See :func:`numpy.pad`.
    rel: bool
        Whether the voice shimmer is relative to the average pulse amplitude.
    lower: float
        Lower limit for periods between glottal pulses.
    upper: float
        Upper limit for periods between glottal pulses.
    max_period_ratio: float
        Maximum ratio between consecutive periods used for shimmer extraction.
    max_amp_factor: float
        Maximum ratio between consecutive amplitudes used for shimmer extraction.

    Notes
    -----
    Compute shimmer as the average absolute difference between consecutive pitch amplitudes with a
    fundamental period ratio below `max_period_ratio` and amplitude ratio below `max_amp_factor`
    for each frame. If ``rel=True``, shimmer is divided by the average amplitude
    of each frame. Fundamental periods are calculated as the first-order temporal difference
    between consecutive glottal pulses. Amplitudes are signal amplitudes at the glottal pulses.
    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool,
        pad_mode: str,
        rel: bool,
        lower: float,
        upper: float,
        max_period_ratio: float,
        max_amp_factor: float,
    ):
        self.logger = logging.getLogger("emvoice.frequency.ShimmerFrames")
        self.rel = rel
        self.max_period_ratio = max_period_ratio
        self.max_amp_factor = max_amp_factor
        super().__init__(
            frames, sr, frame_len, hop_len, center, pad_mode, lower, upper
        )

    @classmethod
    def from_pitch_pulse_frames(
        cls,
        pitch_pulse_frames_obj: PitchPulseFrames,
        rel: bool = True,
        lower: float = 0.0001,
        upper: float = 0.02,
        max_period_ratio: float = 1.3,
        max_amp_factor: float = 1.6,
    ):
        """Extract voice shimmer frames from glottal pulse frames.

        Parameters
        ----------
        pitch_pulse_frames_obj: PitchPulseFrames
            Glottal pulse frames object.
        rel: bool, optional, default=True
            Divide shimmer by the average pulse amplitude.
        lower: float, optional, default=0.0001
            Lower limit for periods between glottal pulses.
        upper: float, optional, default=0.02
            Upper limit for periods between glottal pulses.
        max_period_ratio: float, optional, default=1.3
            Maximum ratio between consecutive periods for shimmer extraction.
        max_amp_factor: float, optional, default=1.6
            Maximum ratio between consecutive amplitudes used for shimmer extraction.

        Returns
        -------
        ShimmerFrames
            Shimmer frames using the framing options of `pitch_pulse_frames_obj`.
            Frames for which no shimmer can be computed are NaN.

        """
        shimmer_frames = np.array(
            [
                cls._calc_shimmer_frame(
                    pulses, rel, lower, upper, max_period_ratio, max_amp_factor
                )
                for pulses in pitch_pulse_frames_obj.frames
            ]
        )

        return cls(
            shimmer_frames,
            pitch_pulse_frames_obj.sr,
            pitch_pulse_frames_obj.frame_len,
            pitch_pulse_frames_obj.hop_len,
            pitch_pulse_frames_obj.center,
            pitch_pulse_frames_obj.pad_mode,
            rel,
            lower,
            upper,
            max_period_ratio,
            max_amp_factor,
        )

    @classmethod
    def _calc_shimmer_frame(
        cls,
        pulses: List[Tuple],
        rel: bool,
        lower: float,
        upper: float,
        max_period_ratio: float,
        max_amp_factor: float,
    ) -> float:
        """Compute the shimmer of a single frame.

        Returns NaN if the frame has no pulses, no run with at least two
        accepted periods, or no pair passing both ratio checks.
        """
        if len(pulses) == 0:
            return np.nan

        # Calc period length as first order diff of pulse ts
        periods, mask = cls._calc_period_length(pulses, lower, upper)
        # Amplitudes grouped into the same runs as the periods
        amps = cls._get_amplitude(pulses, mask)

        if (
            len(periods) == 0
            or len(amps) == 0
            or all(len(period) <= 1 for period in periods)
        ):
            return np.nan

        # Absolute first order diff in amplitude per run;
        # only consider pairs where period ratio is < max_period_ratio and
        # where amplitude ratio is < max_amp_factor
        amp_diff = [
            np.abs(
                np.diff(amp)[
                    np.logical_and(
                        cls._check_ratio(period, max_period_ratio),
                        cls._check_ratio(amp, max_amp_factor),
                    )
                ]
            )
            for amp, period in zip(amps, periods)
            if len(period) > 1 and len(amp) > 1
        ]

        # Drop runs without accepted pairs up front: taking their mean would
        # only produce NaN (skipped by nanmean anyway) plus a
        # "Mean of empty slice" RuntimeWarning
        amp_diff = [diff for diff in amp_diff if len(diff) > 0]

        if len(amp_diff) == 0:
            return np.nan

        avg_amp_diff = np.nanmean(
            np.array([np.mean(diff) for diff in amp_diff])
        )

        if rel:  # Relative to mean amplitude
            avg_amp = np.nanmean(
                np.array([np.mean(amp) for amp in amps if len(amp) > 1])
            )
            return avg_amp_diff / avg_amp

        return avg_amp_diff

    @staticmethod
    def _get_amplitude(pulses: List[Tuple], mask: np.ndarray) -> List:
        """Group pulse amplitudes into the runs used for the periods.

        Parameters
        ----------
        pulses: list
            Pulses as tuples whose third element is the pulse amplitude.
        mask: numpy.ndarray
            Boolean array marking the accepted periods, as returned by
            `_calc_period_length`.

        Returns
        -------
        list
            Arrays with the amplitudes belonging to each run of accepted periods.

        """
        # Skip first amplitude to align with periods: period i ends at pulse i + 1
        amps = np.array([puls[2] for puls in pulses])[1:]

        # Split amplitudes according to mask and remove masked amplitudes;
        # this must use the same split as `_calc_period_length` so that
        # amplitude runs and period runs line up element by element
        amps = np.array_split(amps[mask], np.where(~mask)[0])

        return amps