Source code for emvoice.formants

"""Formant-related voice features."""

import logging
import math
from copy import copy
from typing import List, Optional, Tuple, Union

import librosa
import numpy as np
from scipy.signal.windows import get_window

from emvoice.frames import BaseFrames
from emvoice.pitch import PitchFrames, PitchHarmonicsFrames


[docs]class FormantFrames(BaseFrames): """Estimate and store formant frames. Parameters ---------- frames: list Formant frames. Each frame contains a list of tuples for each formant, where the first item is the central frequency and the second the bandwidth. max_formants: int, default=5 The maximum number of formants that were extracted. lower: float, default=50.0 Lower limit for formant frequencies (in Hz). upper: float, default=5450.0 Upper limit for formant frequencies (in Hz). preemphasis_from: float, default=50.0 Starting value for the applied preemphasis function. window: str Window function that was applied before formant estimation. Notes ----- See the :ref:`Algorithms section <Formant frequencies and amplitudes>` for details. """ def __init__( self, frames: List, sr: int, frame_len: int, hop_len: int, center: bool = True, pad_mode: str = "constant", max_formants: int = 5, lower: float = 50.0, upper: float = 5450.0, preemphasis_from: Optional[float] = 50.0, window: Optional[Union[str, float, Tuple]] = "praat_gaussian", ) -> None: self.logger = logging.getLogger("emvoice.frequency.FormantFrames") self.max_formants = max_formants self.lower = lower self.upper = upper self.preemphasis_from = preemphasis_from self.window = window super().__init__(frames, sr, frame_len, hop_len, center, pad_mode) @property
[docs] def idx(self) -> np.ndarray: if self._idx is None: self._idx = np.arange(len(self.frames)) return self._idx
@staticmethod def _praat_gauss_window(frame_len: int): # This is the Gaussian window that is used in Praat for formant estimation # See: https://github.com/YannickJadoul/Parselmouth/blob/master/praat/fon/Sound_to_Formant.cpp sample_idx = np.arange(frame_len) + 1 idx_mid = 0.5 * (frame_len + 1) edge = np.exp(-12.0) return ( np.exp(-48.0 * (sample_idx - idx_mid) ** 2 / (frame_len) ** 2) - edge ) / (1.0 - edge) @classmethod
[docs] def from_frames( cls, sig_frames_obj: BaseFrames, max_formants: int = 5, lower: float = 50.0, upper: float = 5450.0, preemphasis_from: Optional[float] = 50.0, window: Optional[Union[str, float, Tuple]] = "praat_gaussian", ): """Extract formants from signal frames. Parameters ---------- sig_frames_obj: BaseFrames Signal frames object. max_formants: int, default=5 The maximum number of formants that are extracted. lower: float, default=50.0 Lower limit for formant frequencies (in Hz). upper: float, default=5450.0 Upper limit for formant frequencies (in Hz). preemphasis_from: float, default=50.0 Starting value for the preemphasis function (in Hz). window: str Window function that is applied before formant estimation. """ frames = sig_frames_obj.frames if preemphasis_from is not None: pre_coef = math.exp( -2 * math.pi * preemphasis_from * (1 / sig_frames_obj.sr) ) frames = librosa.effects.preemphasis( sig_frames_obj.frames, coef=pre_coef ) if window is not None: if window == "praat_gaussian": win = cls._praat_gauss_window(sig_frames_obj.frame_len) else: win = get_window( window, sig_frames_obj.frame_len, fftbins=False ) frames = frames * win # Calc linear predictive coefficients coefs = librosa.lpc(frames, order=max_formants * 2) # Transform LPCs to formants formants = [ cls._calc_formants(coef, sig_frames_obj.sr, lower, upper) for coef in coefs ] return cls( formants, sig_frames_obj.sr, sig_frames_obj.frame_len, sig_frames_obj.hop_len, sig_frames_obj.center, sig_frames_obj.pad_mode, max_formants, lower, upper, preemphasis_from, window,
) @staticmethod def _calc_formants( coefs: np.ndarray, sr: int, lower: float = 50, upper: float = 5450 ) -> List: # Function to compute complex norm def complex_norm(x): return np.sqrt(np.abs(np.real(x) ** 2) + np.abs(np.imag(x) ** 2)) nf_pi = sr / (2 * math.pi) # sr/2 = Nyquist freq # Find roots of linear coefficients roots = np.roots(coefs) # Select roots with positive imag part mask = np.imag(roots) > 0 roots = roots[mask] # Calc angular frequency ang_freq = np.abs(np.arctan2(np.imag(roots), np.real(roots))) # Calc formant centre freq formant_freqs = ang_freq * nf_pi # Calc formant bandwidth formant_bws = ( -np.log(np.apply_along_axis(complex_norm, 0, roots)) * nf_pi ) # Select formants within boundaries in_bounds = np.logical_and(formant_freqs > lower, formant_freqs < upper) formants_sorted = sorted( list(zip(formant_freqs[in_bounds], formant_bws[in_bounds])) ) return formants_sorted def select_formant_attr( self, formant_idx: int, attr_idx: int ) -> np.ndarray: return np.array( [ f[formant_idx][attr_idx] if len(f) > formant_idx else np.nan for f in self.frames
] )
[docs]class FormantAmplitudeFrames(BaseFrames): """Estimate and store formant amplitudes. Parameters ---------- frames: np.ndarray Formant amplitude frames of shape (num_frames, max_formants) in dB. lower: float Lower boundary for peak amplitude search interval. upper: float Upper boundary for peak amplitude search interval. rel_f0: bool Whether the amplitude is relative to the fundamental frequency amplitude. Notes ----- Estimate the formant amplitude as the maximum amplitude of harmonics of the fundamental frequency within an interval ``[lower*f, upper*f]`` where `f` is the central frequency of the formant in each frame. If ``rel=True``, divide the amplitude by the amplitude of the fundamental frequency. """ def __init__( self, frames: np.ndarray, sr: int, frame_len: int, hop_len: int, center: bool, pad_mode: str, lower: float, upper: float, rel_f0: bool, ): self.lower = lower self.upper = upper self.rel_f0 = rel_f0 super().__init__(frames, sr, frame_len, hop_len, center, pad_mode) @property
[docs] def idx(self) -> np.ndarray: if self._idx is None: self._idx = np.arange(len(self.frames)) return self._idx
@classmethod
[docs] def from_formant_harmonics_and_pitch_frames( # pylint: disable=too-many-locals cls, formant_frames_obj: FormantFrames, harmonics_frames_obj: PitchHarmonicsFrames, pitch_frames_obj: PitchFrames, lower: float = 0.8, upper: float = 1.2, rel_f0: bool = True, ): """Estimate formant amplitudes from formant, pitch harmonics, and pitch frames. Parameters ---------- formant_frames_obj: FormantFrames Formant frames object. harmonics_frames_obj: PitchHarmonicsFrames Pitch harmonics frames object. pitch_frames_obj: PitchFrames Pitch frames object. lower: float, optional, default=0.8 Lower boundary for peak amplitude search interval. upper: float, optional, default=1.2 Upper boundary for peak amplitude search interval. rel_f0: bool, optional, default=True Whether the amplitude is divided by the fundamental frequency amplitude. """ amp_frames = [] for i in range(formant_frames_obj.max_formants): freqs = formant_frames_obj.select_formant_attr(i, 0) harmonic_freqs = ( pitch_frames_obj.frames[:, None] * (np.arange(harmonics_frames_obj.n_harmonics) + 1)[None, :] ) f0_amp = harmonics_frames_obj.frames[:, 0] freqs_lower = lower * freqs freqs_upper = upper * freqs freq_in_bounds = np.logical_and( harmonic_freqs > freqs_lower[:, None], harmonic_freqs < freqs_upper[:, None], ) harmonics_amp = copy(harmonics_frames_obj.frames) harmonics_amp[~freq_in_bounds] = np.nan # Set all-nan frames to nan (otherwise np.nanmax throws warning) harmonic_peaks = np.zeros(harmonics_amp.shape[0:1]) harmonics_amp_all_na = np.all(np.isnan(harmonics_amp), axis=1) harmonic_peaks[harmonics_amp_all_na] = np.nan harmonic_peaks[~harmonics_amp_all_na] = np.nanmax( harmonics_amp[~harmonics_amp_all_na], axis=1 ) harmonic_peaks_db = librosa.amplitude_to_db(harmonic_peaks) if rel_f0: harmonic_peaks_db = harmonic_peaks_db - librosa.amplitude_to_db( f0_amp ) amp_frames.append(harmonic_peaks_db) return cls( np.array(amp_frames).T, formant_frames_obj.sr, formant_frames_obj.frame_len, formant_frames_obj.hop_len, formant_frames_obj.center, formant_frames_obj.pad_mode, lower, upper, rel_f0,
)