Source code for emvoice.energy

"""Energy-related voice features."""

import logging

import librosa
import numpy as np
from scipy.signal import find_peaks

from emvoice.frames import BaseFrames
from emvoice.spectral import SpecFrames


class HnrFrames(BaseFrames):
    """Estimate and store harmonics-to-noise ratios (HNRs).

    Parameters
    ----------
    frames: numpy.ndarray
        HNR frames in dB with shape (num_frames,).
    sr: int
        Sampling rate of the signal the frames were computed from.
    frame_len: int
        Forwarded unchanged to the ``BaseFrames`` constructor.
    hop_len: int
        Forwarded unchanged to the ``BaseFrames`` constructor.
    center: bool
        Forwarded unchanged to the ``BaseFrames`` constructor.
    pad_mode: str
        Forwarded unchanged to the ``BaseFrames`` constructor.
    lower: float
        Lower fundamental frequency limit for choosing pitch candidates.
    rel_silence_threshold: float
        Relative threshold for treating signal frames as silent.

    Notes
    -----
    Estimate the HNR for each signal frame with
    ``np.max(np.abs(frames), axis=1) > rel_silence_threshold*np.max(np.abs(frames))``
    by:

    1. Compute the autocorrelation function (ACF) using the short-term
       Fourier transform (STFT).
    2. Find the lags of peaks in the ACF excluding the zero-th lag.
    3. Filter out peaks that correspond to pitch candidates below `lower`
       and above the Nyquist frequency.
    4. Compute the harmonic component `R0` as the highest of the remaining
       peaks divided by the ACF at lag zero.
    5. Compute the HNR as `R0/(1-R0)` and convert to dB.

    Frames that are silent, have no valid pitch candidate, or yield a
    non-positive HNR are stored as NaN.
    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool,
        pad_mode: str,
        lower: float,
        rel_silence_threshold: float,
    ):
        self.logger = logging.getLogger("emvoice.frequency.HnrFrames")
        self.lower = lower
        self.rel_silence_threshold = rel_silence_threshold
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)

    @classmethod
    def from_frames(
        cls,
        sig_frames_obj: BaseFrames,
        lower: float = 75.0,
        rel_silence_threshold: float = 0.1,
    ):
        """Estimate the HNR from signal frames.

        Parameters
        ----------
        sig_frames_obj: BaseFrames
            Signal frames object. Its ``frames`` attribute is indexed as a
            2-D array with one signal frame per row.
        lower: float, default = 75.0
            Lower fundamental frequency limit for choosing pitch candidates.
        rel_silence_threshold: float, default = 0.1
            Relative threshold for treating signal frames as silent.

        Returns
        -------
        HnrFrames
            HNR in dB per frame; NaN for silent frames, frames without a
            valid pitch candidate, and frames with a non-positive HNR.
        """
        sig_frames = sig_frames_obj.frames

        # Row-wise autocorrelation (librosa operates on the last axis).
        auto_cor = librosa.autocorrelate(sig_frames)

        # Lag zero is excluded from the peak search; it is only used below
        # to normalize the strongest remaining peak.
        harmonic_strength = np.apply_along_axis(
            cls._find_max_peak,
            1,
            auto_cor[:, 1:],
            sr=sig_frames_obj.sr,
            lower=lower,
        )

        # All-zero frames have a zero ACF at lag zero and R0 == 1 divides by
        # zero; both cases end up as NaN/inf and are masked out right after,
        # so the floating point warnings carry no information here.
        with np.errstate(divide="ignore", invalid="ignore"):
            harmonic_comp = harmonic_strength / auto_cor[:, 0]
            hnr = harmonic_comp / (1 - harmonic_comp)
            non_positive = hnr <= 0

        # True for frames whose peak amplitude exceeds the relative threshold.
        abs_frames = np.abs(sig_frames)
        non_silent = np.max(abs_frames, axis=1) > rel_silence_threshold * np.max(
            abs_frames
        )

        hnr[np.logical_or(~non_silent, non_positive)] = np.nan

        # HNR is on power scale. `top_db=None` is required: the default
        # clips against the global maximum, which is NaN as soon as a single
        # frame is NaN and would turn *every* frame into NaN.
        hnr_db = librosa.power_to_db(hnr, top_db=None)

        return cls(
            hnr_db,
            sig_frames_obj.sr,
            sig_frames_obj.frame_len,
            sig_frames_obj.hop_len,
            sig_frames_obj.center,
            sig_frames_obj.pad_mode,
            lower,
            rel_silence_threshold,
        )

    @staticmethod
    def _find_max_peak(auto_cor: np.ndarray, sr: int, lower: float) -> float:
        """Return the highest ACF peak within the valid pitch range.

        Parameters
        ----------
        auto_cor: numpy.ndarray
            ACF of a single frame *without* the zero-th lag, i.e. element
            ``i`` holds the ACF at lag ``i + 1``.
        sr: int
            Sampling rate.
        lower: float
            Lower fundamental frequency limit for pitch candidates.

        Returns
        -------
        float
            Height of the strongest peak whose frequency lies strictly
            between `lower` and the Nyquist frequency ``sr / 2``, or NaN if
            the input is all NaN or no such peak exists.
        """
        if np.all(np.isnan(auto_cor)):
            return np.nan

        peak_idx = find_peaks(auto_cor)[0]

        # The input starts at lag one, so shift indices to obtain true lags
        # before converting them to pitch candidate frequencies.
        peak_freqs = sr / (peak_idx + 1)

        in_pitch_range = np.logical_and(peak_freqs > lower, peak_freqs < sr / 2)
        candidate_peaks = auto_cor[peak_idx][in_pitch_range]

        if candidate_peaks.size == 0:
            return np.nan

        return candidate_peaks.max()
class RmsEnergyFrames(SpecFrames):
    """Calculate and store the root mean squared (RMS) energy.

    Parameters
    ----------
    frames: numpy.ndarray
        RMS energy frames in dB with shape (num_frames,).
    """

    @classmethod
    def from_spec_frames(cls, spec_frames_obj: SpecFrames):
        """Calculate the RMS energy from spectrogram frames.

        Parameters
        ----------
        spec_frames_obj: SpecFrames
            Spectrogram frames object.

        Returns
        -------
        RmsEnergyFrames
            RMS energy in dB, one value per spectrogram frame.
        """
        # The stored frames are transposed before being handed to librosa as
        # the magnitude spectrogram `S`.
        magnitudes = np.abs(spec_frames_obj.frames).T

        rms = librosa.feature.rms(
            S=magnitudes,
            frame_length=spec_frames_obj.frame_len,
            hop_length=spec_frames_obj.hop_len,
            center=spec_frames_obj.center,
            pad_mode=spec_frames_obj.pad_mode,
        )

        # RMS is an amplitude quantity, hence the amplitude-to-dB conversion.
        rms_db = librosa.amplitude_to_db(rms)

        # Drop only the singleton feature axis of the librosa result. A bare
        # `squeeze()` would also drop the frame axis when there is exactly
        # one frame and yield a 0-d array instead of shape (1,).
        rms_frames = np.squeeze(rms_db, axis=-2)

        return cls(
            rms_frames,
            spec_frames_obj.sr,
            spec_frames_obj.window,
            spec_frames_obj.frame_len,
            spec_frames_obj.hop_len,
            spec_frames_obj.center,
            spec_frames_obj.pad_mode,
        )