Source code for emvoice.spectral

"""Spectral voice features."""

import logging
from typing import Optional, Tuple, Union

import librosa
import numpy as np
from sklearn.linear_model import LinearRegression

from emvoice.frames import BaseFrames
from emvoice.signal import BaseSignal


class SpecFrames(BaseFrames):
    """Create and store spectrogram frames.

    Computes a spectrogram of a signal using the short-time Fourier
    transform (STFT).

    Parameters
    ----------
    frames: numpy.ndarray
        Spectrogram frames.
    sr: int
        Sampling rate, forwarded to :class:`BaseFrames`.
    window: str
        The window that was applied before the STFT.
    frame_len: int
        Number of samples per frame, forwarded to :class:`BaseFrames`.
    hop_len: int
        Number of samples between frame starting points, forwarded to
        :class:`BaseFrames`.
    center: bool, default=True
        Whether the frames were centered, forwarded to :class:`BaseFrames`.
    pad_mode: str, default='constant'
        Padding mode, forwarded to :class:`BaseFrames`.

    Notes
    -----
    Frames contain complex arrays `x` where ``np.abs(x)`` is the magnitude
    and ``np.angle(x)`` is the phase of the signal for different frequency
    bins.

    See Also
    --------
    librosa.stft

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        window: str,
        frame_len: int,
        hop_len: int,
        center: bool = True,
        pad_mode: str = "constant",
    ) -> None:
        self.logger = logging.getLogger("emvoice.frequency.SpecFrames")
        self.window = window
        # Frequency bins are computed lazily on first access of `freqs`.
        self._freqs = None
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)

    @property
    def freqs(self):
        """Frequency bins from :func:`librosa.fft_frequencies` (cached)."""
        if self._freqs is None:
            self._freqs = librosa.fft_frequencies(
                sr=self.sr, n_fft=self.frame_len
            )

        return self._freqs

    @classmethod
    def from_signal(
        cls,
        sig_obj: BaseSignal,
        frame_len: int,
        hop_len: Optional[int] = None,
        center: bool = True,
        pad_mode: str = "constant",
        window: Union[str, float, Tuple] = "hann",
    ):
        """Transform a signal into spectrogram frames.

        Parameters
        ----------
        sig_obj: BaseSignal
            Signal object.
        frame_len: int
            Number of samples per frame.
        hop_len: int, optional, default=None
            Number of samples between frame starting points. If `None`, uses
            `frame_len // 4`.
        center: bool, default=True
            Whether to center the frames and apply padding.
        pad_mode: str, default='constant'
            How the signal is padded before framing. See :func:`numpy.pad`.
            Uses the default value 0 for `'constant'` padding. Ignored if
            `center=False`.
        window: str
            The window that is applied before the STFT.

        Returns
        -------
        SpecFrames
            Object holding the STFT with frames on the first axis.

        """
        # Resolve the documented default up front so that the object stores
        # the hop length that was actually used instead of `None`.
        if hop_len is None:
            hop_len = frame_len // 4

        spec_frames = librosa.stft(
            sig_obj.sig,
            n_fft=frame_len,
            hop_length=hop_len,
            window=window,
            center=center,
            pad_mode=pad_mode,
        )

        # librosa returns (freq_bins, frames); store frames on the first axis.
        return cls(
            np.swapaxes(spec_frames, 0, 1),
            sig_obj.sr,
            window,
            frame_len,
            hop_len,
            center,
            pad_mode,
        )
class MelSpecFrames(SpecFrames):
    """Calculate and store Mel spectrograms.

    Parameters
    ----------
    frames: numpy.ndarray
        Spectrogram frames on the Mel power scale with shape
        (num_frames, n_mels).
    sr, window, frame_len, hop_len, center, pad_mode
        Passed on unchanged to :class:`SpecFrames`.
    n_mels: int
        Number of Mel filters.
    lower: float
        Lower frequency boundary in Hz.
    upper: float
        Upper frequency boundary in Hz.

    See Also
    --------
    librosa.feature.melspectrogram

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        window: str,
        frame_len: int,
        hop_len: int,
        center: bool,
        pad_mode: str,
        n_mels: int,
        lower: float,
        upper: float,
    ):
        self.logger = logging.getLogger("emvoice.frequency.MelSpecFrames")
        self.n_mels, self.lower, self.upper = n_mels, lower, upper
        super().__init__(
            frames, sr, window, frame_len, hop_len, center, pad_mode
        )

    @classmethod
    def from_spec_frames(
        cls,
        spec_frames_obj: SpecFrames,
        n_mels: int = 26,
        lower: float = 20.0,
        upper: float = 8000.0,
    ):
        """Calculate Mel spectrograms from spectrogram frames.

        Parameters
        ----------
        spec_frames_obj: SpecFrames
            Spectrogram frames object.
        n_mels: int, default=26
            Number of Mel filters.
        lower: float, default=20.0
            Lower frequency boundary in Hz.
        upper: float, default=8000.0
            Upper frequency boundary in Hz.

        """
        src = spec_frames_obj

        # The Mel filter bank is applied to the power spectrum, laid out as
        # (freq_bins, frames), hence the transpose.
        power_spec = np.abs(src.frames.T) ** 2

        mel_power = librosa.feature.melspectrogram(
            S=power_spec,
            sr=src.sr,
            n_fft=src.frame_len,
            hop_length=src.hop_len,
            window=src.window,
            center=src.center,
            pad_mode=src.pad_mode,
            n_mels=n_mels,
            fmin=lower,
            fmax=upper,
        )

        # Transpose back so that frames are on the first axis again.
        return cls(
            mel_power.T,
            src.sr,
            src.window,
            src.frame_len,
            src.hop_len,
            src.center,
            src.pad_mode,
            n_mels,
            lower,
            upper,
        )
class MfccFrames(MelSpecFrames):
    """Estimate and store Mel frequency cepstral coefficients (MFCCs).

    Parameters
    ----------
    frames: numpy.ndarray
        MFCC frames with shape (num_frames, n_mfcc).
    sr, window, frame_len, hop_len, center, pad_mode, n_mels, lower, upper
        Passed on unchanged to :class:`MelSpecFrames`.
    n_mfcc: int
        Number of coefficients that were estimated per frame.
    lifter: float
        Cepstral liftering coefficient. Must be >= 0. If zero, no liftering
        is applied.

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        window: str,
        frame_len: int,
        hop_len: int,
        center: bool,
        pad_mode: str,
        n_mels: int,
        lower: float,
        upper: float,
        n_mfcc: int,
        lifter: float,
    ):
        self.logger = logging.getLogger("emvoice.frequency.MfccFrames")
        self.n_mfcc, self.lifter = n_mfcc, lifter
        super().__init__(
            frames,
            sr,
            window,
            frame_len,
            hop_len,
            center,
            pad_mode,
            n_mels,
            lower,
            upper,
        )

    @classmethod
    def from_mel_spec_frames(
        cls,
        mel_spec_frames_obj: MelSpecFrames,
        n_mfcc: int = 4,
        lifter: float = 22.0,
    ):
        """Estimate MFCCs from Mel spectrogram frames.

        Parameters
        ----------
        mel_spec_frames_obj: MelSpecFrames
            Mel spectrogram frames object.
        n_mfcc: int, default=4
            Number of coefficients that are estimated per frame.
        lifter: float, default=22.0
            Cepstral liftering coefficient. Must be >= 0. If zero, no
            liftering is applied.

        See Also
        --------
        librosa.feature.mfcc

        """
        mel = mel_spec_frames_obj

        # Convert the Mel power spectrum to dB, laid out as (n_mels, frames).
        mel_db = librosa.power_to_db(mel.frames.T)

        coefficients = librosa.feature.mfcc(
            S=mel_db,
            sr=mel.sr,
            n_mfcc=n_mfcc,
            lifter=lifter,
        )

        # Transpose back so that frames are on the first axis again.
        return cls(
            coefficients.T,
            mel.sr,
            mel.window,
            mel.frame_len,
            mel.hop_len,
            mel.center,
            mel.pad_mode,
            mel.n_mels,
            mel.lower,
            mel.upper,
            n_mfcc,
            lifter,
        )
class AlphaRatioFrames(BaseFrames):
    """Calculate and store spectrogram alpha ratios.

    Parameters
    ----------
    frames: numpy.ndarray
        Alpha ratio frames in dB with shape (num_frames,).
    sr, frame_len, hop_len, center, pad_mode
        Passed on unchanged to :class:`BaseFrames`.
    lower_band: tuple
        Boundaries of the lower frequency band (start, end) in Hz.
    upper_band: tuple
        Boundaries of the upper frequency band (start, end) in Hz.

    Notes
    -----
    The alpha ratio is the energy (sum of magnitudes) in the lower frequency
    band divided by the energy in the upper frequency band, converted to dB.

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool,
        pad_mode: str,
        lower_band: Tuple[float],
        upper_band: Tuple[float],
    ):
        self.logger = logging.getLogger("emvoice.frequency.AlphaRatioFrames")
        self.lower_band, self.upper_band = lower_band, upper_band
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)

    @staticmethod
    def _band_energy(spec_frames_obj, band):
        """Sum the magnitudes per frame over bins in ``(band[0], band[1]]``."""
        in_band = np.logical_and(
            spec_frames_obj.freqs > band[0],
            spec_frames_obj.freqs <= band[1],
        )
        magnitudes = np.abs(spec_frames_obj.frames[:, in_band])
        return np.nansum(magnitudes, axis=1)

    @classmethod
    def from_spec_frames(
        cls,
        spec_frames_obj: SpecFrames,
        lower_band: Tuple = (50.0, 1000.0),
        upper_band: Tuple = (1000.0, 5000.0),
    ):
        """Calculate the alpha ratio from spectrogram frames.

        Parameters
        ----------
        spec_frames_obj: SpecFrames
            Spectrogram frames object.
        lower_band: tuple, default=(50.0, 1000.0)
            Boundaries of the lower frequency band (start, end) in Hz.
        upper_band: tuple, default=(1000.0, 5000.0)
            Boundaries of the upper frequency band (start, end) in Hz.

        """
        numerator = cls._band_energy(spec_frames_obj, lower_band)
        denominator = cls._band_energy(spec_frames_obj, upper_band)

        # Frames whose upper-band energy is zero or non-finite stay NaN.
        usable = np.isfinite(denominator) & (denominator != 0)
        ratio = np.full(numerator.shape, np.nan)
        ratio[usable] = numerator[usable] / denominator[usable]

        ratio_db = 20.0 * np.log10(ratio)

        return cls(
            ratio_db,
            spec_frames_obj.sr,
            spec_frames_obj.frame_len,
            spec_frames_obj.hop_len,
            spec_frames_obj.center,
            spec_frames_obj.pad_mode,
            lower_band,
            upper_band,
        )
class HammarIndexFrames(BaseFrames):
    """Calculate and store the spectrogram Hammarberg index.

    Parameters
    ----------
    frames: numpy.ndarray
        Hammarberg index frames in dB with shape (num_frames,).
    sr, frame_len, hop_len, center, pad_mode
        Passed on unchanged to :class:`BaseFrames`.
    pivot_point: float
        Point separating the lower and upper frequency regions in Hz.
    upper: float
        Upper limit for the upper frequency region in Hz.

    Notes
    -----
    Calculate the Hammarberg index by dividing the peak magnitude in the
    spectrogram region up to `pivot_point` by the peak magnitude in the
    region between `pivot_point` and `upper`. The ratio is then converted
    to dB.

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool,
        pad_mode: str,
        pivot_point: float,
        upper: float,
    ):
        self.logger = logging.getLogger("emvoice.frequency.HammarIndexFrames")
        self.pivot_point = pivot_point
        self.upper = upper
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)

    @classmethod
    def from_spec_frames(
        cls,
        spec_frames_obj: SpecFrames,
        pivot_point: float = 2000.0,
        upper: float = 5000.0,
    ):
        """Calculate the Hammarberg index from spectrogram frames.

        Frames whose upper region contains a non-positive or non-finite
        magnitude are set to NaN; all other frames are unaffected by them.

        Parameters
        ----------
        spec_frames_obj: SpecFrames
            Spectrogram frames object.
        pivot_point: float, default=2000.0
            Point separating the lower and upper frequency regions in Hz.
        upper: float, default=5000.0
            Upper limit for the upper frequency region in Hz.

        """
        lower_band = np.abs(
            spec_frames_obj.frames[:, spec_frames_obj.freqs <= pivot_point]
        )
        upper_band_freqs = np.logical_and(
            spec_frames_obj.freqs > pivot_point, spec_frames_obj.freqs <= upper
        )
        upper_band = np.abs(spec_frames_obj.frames[:, upper_band_freqs])

        hammar_index_frames = np.zeros(lower_band.shape[0])
        upper_band_is_valid = np.logical_and(
            np.any(np.isfinite(upper_band), axis=1),
            np.all(upper_band > 0, axis=1),
        )
        hammar_index_frames[~upper_band_is_valid] = np.nan
        hammar_index_frames[upper_band_is_valid] = np.nanmax(
            lower_band[upper_band_is_valid, :], axis=1
        ) / np.nanmax(upper_band[upper_band_is_valid, :], axis=1)

        # Disable `top_db`: the default clips against the maximum over all
        # frames, so a single NaN frame would turn every frame into NaN and
        # valid frames would depend on each other.
        hammar_index_frames_db = librosa.amplitude_to_db(
            hammar_index_frames, top_db=None
        )

        return cls(
            hammar_index_frames_db,
            spec_frames_obj.sr,
            spec_frames_obj.frame_len,
            spec_frames_obj.hop_len,
            spec_frames_obj.center,
            spec_frames_obj.pad_mode,
            pivot_point,
            upper,
        )
class SpectralSlopeFrames(BaseFrames):
    """Estimate and store spectral slopes.

    Parameters
    ----------
    frames: numpy.ndarray
        Spectral slope frames with shape (num_frames, num_bands).
    sr, frame_len, hop_len, center, pad_mode
        Passed on unchanged to :class:`BaseFrames`.
    bands: tuple
        Frequency bands in Hz for which slopes were estimated.

    Notes
    -----
    Estimate spectral slopes by fitting linear models to frequency bands
    predicting power in dB from frequency in Hz. Fits separate models for
    each frame and band.

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool,
        pad_mode: str,
        bands: Tuple[Tuple[float]],
    ):
        # Use this class's own logger name, matching the other frame classes.
        self.logger = logging.getLogger(
            "emvoice.frequency.SpectralSlopeFrames"
        )
        self.bands = bands
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)

    @classmethod
    def from_spec_frames(
        cls,
        spec_frames_obj: SpecFrames,
        bands: Tuple[Tuple[float]] = ((0.0, 500.0), (500.0, 1500.0)),
    ):
        """Estimate spectral slopes from spectrogram frames.

        Parameters
        ----------
        spec_frames_obj: SpecFrames
            Spectrogram frames object.
        bands: tuple, default=((0.0, 500.0), (500.0, 1500.0))
            Frequency bands in Hz for which slopes are estimated. Each band
            covers the bins in ``(band[0], band[1]]``.

        """
        spectral_slopes = np.zeros(
            shape=(spec_frames_obj.idx.shape[0], len(bands))
        )

        for i, band in enumerate(bands):
            band_freqs_mask = np.logical_and(
                spec_frames_obj.freqs > band[0],
                spec_frames_obj.freqs <= band[1],
            )
            band_power = np.abs(spec_frames_obj.frames[:, band_freqs_mask])
            band_freqs = spec_frames_obj.freqs[band_freqs_mask]
            # One slope per frame; flatten explicitly so the result is 1-D
            # regardless of the number of frames.
            spectral_slopes[:, i] = np.apply_along_axis(
                cls._calc_spectral_slope, 1, band_power, band_freqs=band_freqs
            ).reshape(-1)

        return cls(
            spectral_slopes,
            spec_frames_obj.sr,
            spec_frames_obj.frame_len,
            spec_frames_obj.hop_len,
            spec_frames_obj.center,
            spec_frames_obj.pad_mode,
            bands,
        )

    @staticmethod
    def _calc_spectral_slope(
        band_power: np.ndarray, band_freqs: np.ndarray
    ) -> np.ndarray:
        """Fit a line to one frame's band magnitudes in dB over frequency.

        Returns an array of shape (1,) holding the slope, or NaN if the band
        has no finite, positive magnitude.
        """
        band_power_is_valid = np.logical_and(
            np.isfinite(band_power), band_power > 0
        )

        if np.all(~band_power_is_valid):
            # Same shape as `coef_` below: `np.apply_along_axis` sizes its
            # output from the first call, so both branches must agree.
            return np.full(1, np.nan)

        band_freqs_finite = band_freqs[band_power_is_valid]
        band_power_finite_db = librosa.amplitude_to_db(
            band_power[band_power_is_valid]
        )

        linear_model = LinearRegression()
        linear_model.fit(band_freqs_finite.reshape(-1, 1), band_power_finite_db)

        return linear_model.coef_
class SpectralFluxFrames(SpecFrames):
    """Calculate and store spectral flux.

    Parameters
    ----------
    frames: numpy.ndarray
        Spectral flux frames with shape (num_frames-1,).
    sr, window, frame_len, hop_len, center, pad_mode
        Passed on unchanged to :class:`SpecFrames`.
    lower: float
        Lower limit for frequency bins.
    upper: float
        Upper limit for frequency bins.

    Notes
    -----
    The spectral flux is computed in three steps:

    1. Normalize the magnitudes of each frame spectrum by dividing every
       frequency bin by the sum over all frequency bins of that frame.
    2. Take the first-order difference across frames of the normalized
       magnitudes for the bins within [`lower`, `upper`).
    3. Sum the squared differences for each frame.

    Because of the first-order difference, the object has one frame less
    than the spectrogram from which it has been computed.

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        window: str,
        frame_len: int,
        hop_len: int,
        center: bool,
        pad_mode: str,
        lower: float,
        upper: float,
    ) -> None:
        self.logger = logging.getLogger("emvoice.frequency.SpectralFluxFrames")
        self.lower, self.upper = lower, upper
        super().__init__(
            frames, sr, window, frame_len, hop_len, center, pad_mode
        )

    @classmethod
    def from_spec_frames(
        cls,
        spec_frames_obj: SpecFrames,
        lower: float = 0.0,
        upper: float = 5000.0,
    ):
        """Calculate the spectral flux from spectrogram frames.

        Parameters
        ----------
        spec_frames_obj: SpecFrames
            Spectrogram frames object.
        lower: float, default=0.0
            Lower limit for frequency bins (inclusive).
        upper: float, default=5000.0
            Upper limit for frequency bins (exclusive).

        """
        src = spec_frames_obj

        in_range = np.logical_and(src.freqs >= lower, src.freqs < upper)

        magnitudes = np.abs(src.frames)
        # Normalization uses all bins of a frame, not only those in range.
        frame_totals = magnitudes.sum(axis=1)
        normalized = magnitudes[:, in_range] / frame_totals[:, None]

        frame_deltas = np.diff(normalized, axis=0)
        flux = np.sum(frame_deltas**2, axis=1)

        return cls(
            flux,
            src.sr,
            src.window,
            src.frame_len,
            src.hop_len,
            src.center,
            src.pad_mode,
            lower,
            upper,
        )