Source code for emvoice.formants
"""Formant-related voice features."""
import logging
import math
from copy import copy
from typing import List, Optional, Tuple, Union
import librosa
import numpy as np
from scipy.signal.windows import get_window
from emvoice.frames import BaseFrames
from emvoice.pitch import PitchFrames, PitchHarmonicsFrames
[docs]class FormantFrames(BaseFrames):
"""Estimate and store formant frames.
Parameters
----------
frames: list
Formant frames. Each frame contains a list of tuples for each formant, where the first item
is the central frequency and the second the bandwidth.
max_formants: int, default=5
The maximum number of formants that were extracted.
lower: float, default=50.0
Lower limit for formant frequencies (in Hz).
upper: float, default=5450.0
Upper limit for formant frequencies (in Hz).
preemphasis_from: float, default=50.0
Starting value for the applied preemphasis function.
window: str
Window function that was applied before formant estimation.
Notes
-----
See the :ref:`Algorithms section <Formant frequencies and amplitudes>` for details.
"""
def __init__(
self,
frames: List,
sr: int,
frame_len: int,
hop_len: int,
center: bool = True,
pad_mode: str = "constant",
max_formants: int = 5,
lower: float = 50.0,
upper: float = 5450.0,
preemphasis_from: Optional[float] = 50.0,
window: Optional[Union[str, float, Tuple]] = "praat_gaussian",
) -> None:
self.logger = logging.getLogger("emvoice.frequency.FormantFrames")
self.max_formants = max_formants
self.lower = lower
self.upper = upper
self.preemphasis_from = preemphasis_from
self.window = window
super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)
@property
[docs] def idx(self) -> np.ndarray:
if self._idx is None:
self._idx = np.arange(len(self.frames))
return self._idx
@staticmethod
def _praat_gauss_window(frame_len: int):
# This is the Gaussian window that is used in Praat for formant estimation
# See: https://github.com/YannickJadoul/Parselmouth/blob/master/praat/fon/Sound_to_Formant.cpp
sample_idx = np.arange(frame_len) + 1
idx_mid = 0.5 * (frame_len + 1)
edge = np.exp(-12.0)
return (
np.exp(-48.0 * (sample_idx - idx_mid) ** 2 / (frame_len) ** 2)
- edge
) / (1.0 - edge)
@classmethod
[docs] def from_frames(
cls,
sig_frames_obj: BaseFrames,
max_formants: int = 5,
lower: float = 50.0,
upper: float = 5450.0,
preemphasis_from: Optional[float] = 50.0,
window: Optional[Union[str, float, Tuple]] = "praat_gaussian",
):
"""Extract formants from signal frames.
Parameters
----------
sig_frames_obj: BaseFrames
Signal frames object.
max_formants: int, default=5
The maximum number of formants that are extracted.
lower: float, default=50.0
Lower limit for formant frequencies (in Hz).
upper: float, default=5450.0
Upper limit for formant frequencies (in Hz).
preemphasis_from: float, default=50.0
Starting value for the preemphasis function (in Hz).
window: str
Window function that is applied before formant estimation.
"""
frames = sig_frames_obj.frames
if preemphasis_from is not None:
pre_coef = math.exp(
-2 * math.pi * preemphasis_from * (1 / sig_frames_obj.sr)
)
frames = librosa.effects.preemphasis(
sig_frames_obj.frames, coef=pre_coef
)
if window is not None:
if window == "praat_gaussian":
win = cls._praat_gauss_window(sig_frames_obj.frame_len)
else:
win = get_window(
window, sig_frames_obj.frame_len, fftbins=False
)
frames = frames * win
# Calc linear predictive coefficients
coefs = librosa.lpc(frames, order=max_formants * 2)
# Transform LPCs to formants
formants = [
cls._calc_formants(coef, sig_frames_obj.sr, lower, upper)
for coef in coefs
]
return cls(
formants,
sig_frames_obj.sr,
sig_frames_obj.frame_len,
sig_frames_obj.hop_len,
sig_frames_obj.center,
sig_frames_obj.pad_mode,
max_formants,
lower,
upper,
preemphasis_from,
window,
)
@staticmethod
def _calc_formants(
coefs: np.ndarray, sr: int, lower: float = 50, upper: float = 5450
) -> List:
# Function to compute complex norm
def complex_norm(x):
return np.sqrt(np.abs(np.real(x) ** 2) + np.abs(np.imag(x) ** 2))
nf_pi = sr / (2 * math.pi) # sr/2 = Nyquist freq
# Find roots of linear coefficients
roots = np.roots(coefs)
# Select roots with positive imag part
mask = np.imag(roots) > 0
roots = roots[mask]
# Calc angular frequency
ang_freq = np.abs(np.arctan2(np.imag(roots), np.real(roots)))
# Calc formant centre freq
formant_freqs = ang_freq * nf_pi
# Calc formant bandwidth
formant_bws = (
-np.log(np.apply_along_axis(complex_norm, 0, roots)) * nf_pi
)
# Select formants within boundaries
in_bounds = np.logical_and(formant_freqs > lower, formant_freqs < upper)
formants_sorted = sorted(
list(zip(formant_freqs[in_bounds], formant_bws[in_bounds]))
)
return formants_sorted
def select_formant_attr(
self, formant_idx: int, attr_idx: int
) -> np.ndarray:
return np.array(
[
f[formant_idx][attr_idx] if len(f) > formant_idx else np.nan
for f in self.frames
]
)
[docs]class FormantAmplitudeFrames(BaseFrames):
"""Estimate and store formant amplitudes.
Parameters
----------
frames: np.ndarray
Formant amplitude frames of shape (num_frames, max_formants) in dB.
lower: float
Lower boundary for peak amplitude search interval.
upper: float
Upper boundary for peak amplitude search interval.
rel_f0: bool
Whether the amplitude is relative to the fundamental frequency amplitude.
Notes
-----
Estimate the formant amplitude as the maximum amplitude of harmonics of the
fundamental frequency within an interval ``[lower*f, upper*f]`` where `f` is the
central frequency of the formant in each frame. If ``rel=True``, divide the amplitude by the amplitude of
the fundamental frequency.
"""
def __init__(
self,
frames: np.ndarray,
sr: int,
frame_len: int,
hop_len: int,
center: bool,
pad_mode: str,
lower: float,
upper: float,
rel_f0: bool,
):
self.lower = lower
self.upper = upper
self.rel_f0 = rel_f0
super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)
@property
[docs] def idx(self) -> np.ndarray:
if self._idx is None:
self._idx = np.arange(len(self.frames))
return self._idx
@classmethod
[docs] def from_formant_harmonics_and_pitch_frames( # pylint: disable=too-many-locals
cls,
formant_frames_obj: FormantFrames,
harmonics_frames_obj: PitchHarmonicsFrames,
pitch_frames_obj: PitchFrames,
lower: float = 0.8,
upper: float = 1.2,
rel_f0: bool = True,
):
"""Estimate formant amplitudes from formant, pitch harmonics, and pitch frames.
Parameters
----------
formant_frames_obj: FormantFrames
Formant frames object.
harmonics_frames_obj: PitchHarmonicsFrames
Pitch harmonics frames object.
pitch_frames_obj: PitchFrames
Pitch frames object.
lower: float, optional, default=0.8
Lower boundary for peak amplitude search interval.
upper: float, optional, default=1.2
Upper boundary for peak amplitude search interval.
rel_f0: bool, optional, default=True
Whether the amplitude is divided by the fundamental frequency amplitude.
"""
amp_frames = []
for i in range(formant_frames_obj.max_formants):
freqs = formant_frames_obj.select_formant_attr(i, 0)
harmonic_freqs = (
pitch_frames_obj.frames[:, None]
* (np.arange(harmonics_frames_obj.n_harmonics) + 1)[None, :]
)
f0_amp = harmonics_frames_obj.frames[:, 0]
freqs_lower = lower * freqs
freqs_upper = upper * freqs
freq_in_bounds = np.logical_and(
harmonic_freqs > freqs_lower[:, None],
harmonic_freqs < freqs_upper[:, None],
)
harmonics_amp = copy(harmonics_frames_obj.frames)
harmonics_amp[~freq_in_bounds] = np.nan
# Set all-nan frames to nan (otherwise np.nanmax throws warning)
harmonic_peaks = np.zeros(harmonics_amp.shape[0:1])
harmonics_amp_all_na = np.all(np.isnan(harmonics_amp), axis=1)
harmonic_peaks[harmonics_amp_all_na] = np.nan
harmonic_peaks[~harmonics_amp_all_na] = np.nanmax(
harmonics_amp[~harmonics_amp_all_na], axis=1
)
harmonic_peaks_db = librosa.amplitude_to_db(harmonic_peaks)
if rel_f0:
harmonic_peaks_db = harmonic_peaks_db - librosa.amplitude_to_db(
f0_amp
)
amp_frames.append(harmonic_peaks_db)
return cls(
np.array(amp_frames).T,
formant_frames_obj.sr,
formant_frames_obj.frame_len,
formant_frames_obj.hop_len,
formant_frames_obj.center,
formant_frames_obj.pad_mode,
lower,
upper,
rel_f0,
)