Source code for emvoice.pitch

"""Pitch-related voice features.
"""

import logging
from typing import List, Optional, Tuple

import librosa
import numpy as np
from scipy.interpolate import interp1d

from emvoice.frames import BaseFrames
from emvoice.signal import BaseSignal
from emvoice.spectral import SpecFrames


class PitchFrames(BaseFrames):
    """Estimate and store pitch frames.

    Estimate and store the voice pitch measured as the fundamental frequency
    F0 in Hz.

    Parameters
    ----------
    frames: numpy.ndarray
        Voice pitch frames in Hz with shape (num_frames,).
    flag: numpy.ndarray
        Boolean flags indicating which frames are voiced with shape
        (num_frames,).
    prob: numpy.ndarray
        Probabilities for frames being voiced with shape (num_frames,).
    sr: int
        Sampling rate of the signal the frames were estimated from.
    lower: float
        Lower limit used for pitch estimation (in Hz).
    upper: float
        Upper limit used for pitch estimation (in Hz).
    frame_len: int
        Number of samples per frame.
    hop_len: int
        Number of samples between frame starting points.
    method: str
        Method used for estimating voice pitch.
    center: bool, default=True
        Whether the frames were centered (and the signal padded).
    pad_mode: str, default='constant'
        How the signal was padded before framing.

    See Also
    --------
    librosa.pyin
    librosa.yin

    """

    def __init__(
        self,
        frames: np.ndarray,
        flag: np.ndarray,
        prob: np.ndarray,
        sr: int,
        lower: float,
        upper: float,
        frame_len: int,
        hop_len: int,
        method: str,
        center: bool = True,
        pad_mode: str = "constant",
    ):
        self.logger = logging.getLogger("emvoice.frequency.PitchFrames")
        self.flag = flag
        self.prob = prob
        self.lower = lower
        self.upper = upper
        self.method = method
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)

    @classmethod
    def from_signal(
        cls,
        sig_obj: BaseSignal,
        frame_len: int,
        hop_len: Optional[int] = None,
        center: bool = True,
        pad_mode: str = "constant",
        lower: float = 75.0,
        upper: float = 600.0,
        method: str = "pyin",
    ) -> "PitchFrames":
        """Estimate the voice pitch frames from a signal.

        Currently, voice pitch can only be extracted with the *pYIN* method.

        Parameters
        ----------
        sig_obj: BaseSignal
            Signal object.
        frame_len: int
            Number of samples per frame.
        hop_len: int, optional, default=None
            Number of samples between frame starting points. If `None`, uses
            `frame_len // 4`.
        center: bool, default=True
            Whether to center the frames and apply padding.
        pad_mode: str, default='constant'
            How the signal is padded before framing. See :func:`numpy.pad`.
            Uses the default value 0 for `'constant'` padding. Ignored if
            `center=False`.
        lower: float, default = 75.0
            Lower limit for pitch estimation (in Hz).
        upper: float, default = 600.0
            Upper limit for pitch estimation (in Hz).
        method: str, default = 'pyin'
            Method for estimating voice pitch. Only `'pyin'` is currently
            available.

        Returns
        -------
        PitchFrames
            Pitch frames together with the voiced flags and voicing
            probabilities returned by :func:`librosa.pyin`.

        Raises
        ------
        NotImplementedError
            If a method other than `'pyin'` is given.

        """
        if method != "pyin":
            raise NotImplementedError(
                'Only the "pyin" method is currently available'
            )

        if hop_len is None:
            hop_len = frame_len // 4

        pitch_f0, flag, prob = librosa.pyin(
            sig_obj.sig,
            fmin=lower,
            fmax=upper,
            sr=sig_obj.sr,
            frame_length=frame_len,
            hop_length=hop_len,
            center=center,
            pad_mode=pad_mode,
        )

        # Forward `center` and `pad_mode` so the stored framing settings match
        # the ones actually used for estimation; omitting them would silently
        # record the constructor defaults instead.
        return cls(
            frames=pitch_f0,
            flag=flag,
            prob=prob,
            sr=sig_obj.sr,
            lower=lower,
            upper=upper,
            frame_len=frame_len,
            hop_len=hop_len,
            method=method,
            center=center,
            pad_mode=pad_mode,
        )
class PitchPulseFrames(BaseFrames):
    """Extract and store glottal pulse frames.

    Glottal pulses are peaks in the signal corresponding to the fundamental
    frequency F0.

    Parameters
    ----------
    frames: list
        Pulse frames. Each frame contains a list of pulses or an empty list if
        no pulses are detected. Pulses are stored as tuples
        (pulse timestamp, T0, amplitude).
    sr: int
        Sampling rate.
    frame_len: int
        Number of samples per frame.
    hop_len: int
        Number of samples between frame starting points.
    center: bool, default=True
        Whether the frames are centered.
    pad_mode: str, default='constant'
        How the signal was padded before framing.

    Notes
    -----
    See :ref:`Algorithms section <Glottal pulses>` for details.

    """

    def __init__(
        self,
        frames: List[Tuple],
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool = True,
        pad_mode: str = "constant",
    ) -> None:
        self.logger = logging.getLogger("emvoice.frequency.PitchPulseFrames")
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)

    @property
    def idx(self) -> np.ndarray:
        """Frame indices ``0 .. len(frames) - 1``, created on first access."""
        # `_idx` is not set in this class; presumably initialised to `None`
        # by `BaseFrames.__init__` - verify against the base class.
        if self._idx is None:
            self._idx = np.arange(len(self.frames))
        return self._idx

    @classmethod
    def from_signal_and_pitch_frames(
        cls, sig_obj: BaseSignal, pitch_frames_obj: PitchFrames
    ) -> "PitchPulseFrames":
        """Extract glottal pulse frames from a signal and voice pitch frames.

        Parameters
        ----------
        sig_obj: BaseSignal
            Signal object.
        pitch_frames_obj: PitchFrames
            Voice pitch frames object.

        Returns
        -------
        PitchPulseFrames
            One list of pulses per pitch frame; the list is empty for frames
            without a usable F0 (including the case that no frame is voiced).

        """
        frame_len = pitch_frames_obj.frame_len
        hop_len = pitch_frames_obj.hop_len
        sr = sig_obj.sr

        # The padded signal is needed to cut out frames, so the padding is
        # redone here with half a frame on both sides of the last axis.
        # NOTE(review): padding is applied regardless of
        # `pitch_frames_obj.center` - confirm this is intended for pitch
        # frames that were estimated without centering.
        padding = [(0, 0) for _ in sig_obj.sig.shape]
        padding[-1] = (frame_len // 2, frame_len // 2)
        sig_padded = np.pad(
            sig_obj.sig, padding, mode=pitch_frames_obj.pad_mode
        )

        # Timestamps of the padded signal
        sig_padded_ts = librosa.samples_to_time(
            np.arange(sig_padded.shape[0]), sr=sr
        )

        # Interpolate F0 of the voiced frames at the padded signal timestamps.
        # `np.interp` raises on an empty set of sample points, so a signal
        # without any voiced frame gets an all-NaN contour instead, which makes
        # every frame come out as "no pulses" further below.
        voiced = pitch_frames_obj.flag
        if np.any(voiced):
            interp_f0 = np.interp(
                sig_padded_ts,
                pitch_frames_obj.ts[voiced],
                pitch_frames_obj.frames[voiced],
            )
        else:
            interp_f0 = np.full(sig_padded_ts.shape, np.nan)

        def to_frames(values: np.ndarray) -> BaseFrames:
            # Padding was already applied above, hence `center=False`
            return BaseFrames.from_signal(
                BaseSignal(values, sr), frame_len, hop_len, center=False
            )

        sig_frames_obj = to_frames(sig_padded)
        sig_ts_frames_obj = to_frames(sig_padded_ts)
        pitch_interp_frames_obj = to_frames(interp_f0)

        # Objects are passed instead of arrays because several of their
        # attributes are needed for pulse detection
        pulses = [
            cls._detect_pulses_in_frame(
                i,
                sig_frames_obj,
                sig_ts_frames_obj,
                pitch_frames_obj,
                pitch_interp_frames_obj,
            )
            for i in pitch_frames_obj.idx
        ]

        return cls(
            pulses,
            pitch_frames_obj.sr,
            frame_len,
            hop_len,
            pitch_frames_obj.center,
            pitch_frames_obj.pad_mode,
        )

    @classmethod
    def _get_next_pulse(
        cls,
        sig: np.ndarray,
        ts: np.ndarray,
        t0_interp: np.ndarray,
        start: float,
        stop: float,
        left: bool = True,
        pulses: Optional[List] = None,
    ):
        """Collect pulses by moving a search interval through one frame.

        Starting from the interval ``[start, stop]``, the maximum of `sig` in
        the interval is stored as a pulse and the interval is moved to between
        0.8 and 1.25 periods away from that pulse, to the left or to the right,
        until it reaches the edge of the frame.

        Parameters
        ----------
        sig: numpy.ndarray
            Signal samples of the frame.
        ts: numpy.ndarray
            Timestamps of the samples in `sig`.
        t0_interp: numpy.ndarray
            Interpolated period (1 / F0) at each sample in `sig`.
        start, stop: float
            Boundaries of the first search interval (as timestamps).
        left: bool, default=True
            Move the interval to the left if `True`, otherwise to the right.
        pulses: list, optional
            List to which detected pulses are appended *in place*. A new list
            is created if `None`.

        Returns
        -------
        list
            `pulses` with the detected ``(timestamp, T0, amplitude)`` tuples
            appended.

        """
        if pulses is None:
            pulses = []

        ts_first, ts_last = ts.min(), ts.max()
        prev_peak_ts = None

        # Iterative instead of recursive so that the number of pulses per
        # frame is not bounded by the interpreter's recursion limit
        while True:
            # Stop once the interval reaches the frame edge or is undefined
            if (
                (left and start <= ts_first)
                or (not left and stop >= ts_last)
                or np.isnan(start)
                or np.isnan(stop)
            ):
                return pulses

            # Samples closest to the interval boundaries
            start_idx = np.argmin(np.abs(ts - start))
            stop_idx = np.argmin(np.abs(ts - stop))
            interval = sig[start_idx:stop_idx]

            # `np.nanargmax` raises for empty or all-NaN input; there is no
            # peak to find in such an interval, so the search ends here
            if interval.size == 0 or np.all(np.isnan(interval)):
                return pulses

            peak_idx = np.nanargmax(interval)
            peak_ts = ts[start_idx:stop_idx][peak_idx]
            peak_t0 = t0_interp[start_idx:stop_idx][peak_idx]

            # Guard against an interval that no longer moves in the search
            # direction (e.g., a period shorter than the sample spacing),
            # which would otherwise repeat the same pulse forever
            if prev_peak_ts is not None and (
                (left and peak_ts >= prev_peak_ts)
                or (not left and peak_ts <= prev_peak_ts)
            ):
                return pulses

            pulses.append((peak_ts, peak_t0, interval[peak_idx]))
            prev_peak_ts = peak_ts

            # Next interval lies 0.8 to 1.25 periods away from the new pulse
            if left:
                start = peak_ts - 1.25 * peak_t0
                stop = peak_ts - 0.8 * peak_t0
            else:
                start = peak_ts + 0.8 * peak_t0
                stop = peak_ts + 1.25 * peak_t0

    @classmethod
    def _detect_pulses_in_frame(
        cls,
        frame_idx: int,
        sig_frames_obj: BaseFrames,
        sig_ts_frames_obj: BaseFrames,
        pitch_obj: PitchFrames,
        pitch_interp_obj: BaseFrames,
    ) -> List[Tuple]:
        """Detect all glottal pulses in a single frame.

        Parameters
        ----------
        frame_idx: int
            Index of the frame.
        sig_frames_obj: BaseFrames
            Frames of the (padded) signal.
        sig_ts_frames_obj: BaseFrames
            Frames of the timestamps of the (padded) signal.
        pitch_obj: PitchFrames
            Voice pitch frames.
        pitch_interp_obj: BaseFrames
            Frames of the F0 contour interpolated at the signal timestamps.

        Returns
        -------
        list
            Sorted, unique ``(timestamp, T0, amplitude)`` tuples, or an empty
            list if the frame has no F0.

        """
        # Period of the frame and interpolated period at every sample
        t0_mid = 1 / pitch_obj.frames[frame_idx]
        t0 = 1 / pitch_interp_obj.frames[frame_idx, :]

        # Unvoiced frame (no F0): nothing to detect
        if np.all(np.isnan(t0)) or np.isnan(t0_mid):
            return []

        ts_mid = pitch_obj.ts[frame_idx]
        sig_frame = sig_frames_obj.frames[frame_idx, :]
        ts_sig_frame = sig_ts_frames_obj.frames[frame_idx, :]

        # First search interval: one period around the frame timestamp
        start = ts_mid - t0_mid / 2
        stop = ts_mid + t0_mid / 2

        pulses: List[Tuple] = []

        # Both searches begin with the same interval and append to `pulses`
        # in place; the pulse they share is removed by `set` below
        cls._get_next_pulse(
            sig_frame, ts_sig_frame, t0, start, stop, True, pulses
        )
        cls._get_next_pulse(
            sig_frame, ts_sig_frame, t0, start, stop, False, pulses
        )

        return sorted(set(pulses))
class PitchPeriodFrames(BaseFrames):
    """Base class for features computed from periods between glottal pulses.

    Parameters
    ----------
    frames: numpy.ndarray
        Feature frames.
    sr: int
        Sampling rate.
    frame_len: int
        Number of samples per frame.
    hop_len: int
        Number of samples between frame starting points.
    center: bool
        Whether the frames are centered.
    pad_mode: str
        How the signal was padded before framing.
    lower: float
        Lower limit for periods between glottal pulses.
    upper: float
        Upper limit for periods between glottal pulses.

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool,
        pad_mode: str,
        lower: float,
        upper: float,
    ):
        self.logger = logging.getLogger("emvoice.frequency.PitchPeriodFrames")
        self.lower = lower
        self.upper = upper
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)

    @staticmethod
    def _split_valid_runs(
        values: np.ndarray, mask: np.ndarray
    ) -> List[np.ndarray]:
        """Split `values` into runs of consecutive elements accepted by `mask`.

        Rejected elements are dropped and act as separators, so that elements
        within one returned run were direct neighbours in `values`. There is
        always one more run than rejected elements; runs may be empty.
        """
        rejected_idx = np.flatnonzero(~mask)
        # Split positions must refer to the *filtered* array: every rejected
        # element before a position shifts it one step to the left. Using the
        # unshifted positions misplaces all cuts after the first one.
        split_idx = rejected_idx - np.arange(rejected_idx.size)
        return np.array_split(values[mask], split_idx)

    @staticmethod
    def _calc_period_length(
        pulses: List[Tuple], lower: float, upper: float
    ) -> Tuple[List, np.ndarray]:
        """Calculate the periods between consecutive pulses of one frame.

        Parameters
        ----------
        pulses: list
            Pulses as tuples whose first element is the pulse timestamp.
        lower, upper: float
            Exclusive limits; periods outside ``(lower, upper)`` are rejected.

        Returns
        -------
        periods: list
            Runs of consecutive accepted periods (see `_split_valid_runs`).
        mask: numpy.ndarray
            Boolean array marking which of the raw periods were accepted.

        """
        # Period length is the first-order difference of pulse timestamps
        raw_periods = np.diff(np.array([pulse[0] for pulse in pulses]))

        # Reject too short and too long periods
        mask = np.logical_and(raw_periods > lower, raw_periods < upper)

        return PitchPeriodFrames._split_valid_runs(raw_periods, mask), mask

    @staticmethod
    def _check_ratio(x_arr: np.ndarray, threshold: float) -> np.ndarray:
        """Flag consecutive pairs in `x_arr` whose ratio is below `threshold`.

        Returns a boolean array with one element per consecutive pair (same
        length as ``np.diff(x_arr)``). A pair is flagged if its second value
        is finite and positive and ``first / second < threshold``.
        """
        # NOTE(review): only `first / second` is tested, so a pair whose second
        # value is much larger than the first always passes - confirm that
        # this asymmetry is intended.
        valid = np.logical_and(np.isfinite(x_arr[1:]), x_arr[1:] > 0)
        # Divide only where the denominator is usable
        valid[valid] = x_arr[:-1][valid] / x_arr[1:][valid] < threshold
        return valid


class PitchHarmonicsFrames(BaseFrames):
    """Estimate and store voice pitch harmonics.

    Compute the energy of the signal at harmonics (`nF0` for any integer n) of
    the fundamental frequency.

    Parameters
    ----------
    frames: numpy.ndarray
        Harmonics frames with the shape (num_frames, n_harmonics)
    sr: int
        Sampling rate.
    frame_len: int
        Number of samples per frame.
    hop_len: int
        Number of samples between frame starting points.
    center: bool, default=True
        Whether the frames are centered.
    pad_mode: str, default='constant'
        How the signal was padded before framing.
    n_harmonics: int, default=100
        Number of estimated harmonics.

    See Also
    --------
    librosa.f0_harmonics

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool = True,
        pad_mode: str = "constant",
        n_harmonics: int = 100,
    ):
        self.logger = logging.getLogger(
            "emvoice.frequency.PitchHarmonicsFrames"
        )
        self.n_harmonics = n_harmonics
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)

    @classmethod
    def from_spec_and_pitch_frames(
        cls,
        spec_frames_obj: SpecFrames,
        pitch_frames_obj: PitchFrames,
        n_harmonics: int = 100,
    ) -> "PitchHarmonicsFrames":
        """Estimate voice pitch harmonics from spectrogram frames and voice pitch frames.

        Parameters
        ----------
        spec_frames_obj: SpecFrames
            Spectrogram frames object.
        pitch_frames_obj: PitchFrames
            Pitch frames object.
        n_harmonics: int, default=100
            Number of harmonics to estimate.

        Returns
        -------
        PitchHarmonicsFrames
            Harmonics frames that take over the framing attributes of
            `spec_frames_obj`.

        """
        harmonics = cls._calc_f0_harmonics(
            spec_frames_obj.frames,
            spec_frames_obj.freqs,
            pitch_frames_obj.frames,
            n_harmonics,
        )

        return cls(
            harmonics,
            spec_frames_obj.sr,
            spec_frames_obj.frame_len,
            spec_frames_obj.hop_len,
            spec_frames_obj.center,
            spec_frames_obj.pad_mode,
            n_harmonics,
        )

    @staticmethod
    def _calc_f0_harmonics(
        spec_frames: np.ndarray,
        freqs: np.ndarray,
        f0_frames: np.ndarray,
        n_harmonics: int,
    ) -> np.ndarray:
        """Interpolate spectrogram magnitudes at the harmonics of F0.

        Adapted from librosa.f0_harmonics, see:
        https://librosa.org/doc/latest/generated/librosa.f0_harmonics.html#librosa.f0_harmonics

        Parameters
        ----------
        spec_frames: numpy.ndarray
            Spectrogram frames; the last axis is matched against `freqs`.
        freqs: numpy.ndarray
            Frequencies of the spectrogram bins.
        f0_frames: numpy.ndarray
            F0 per frame.
        n_harmonics: int
            Number of harmonics (multiples ``1 .. n_harmonics`` of F0).

        Returns
        -------
        numpy.ndarray
            Magnitudes at the harmonics, with `n_harmonics` values per frame.

        """
        is_valid = np.isfinite(freqs)

        def interp_frame(frame_mags, harmonic_freqs):
            # Magnitude of one frame at the requested frequencies; requests
            # outside the range of `freqs` yield 0 instead of an error
            interp = interp1d(
                freqs[is_valid],
                frame_mags[is_valid],
                axis=0,
                copy=False,
                assume_sorted=False,
                bounds_error=False,
                fill_value=0,
            )
            return interp(harmonic_freqs)

        # Apply the interpolation frame by frame
        interp_all_frames = np.vectorize(
            interp_frame, signature="(f),(h)->(h)"
        )

        # Harmonic frequencies per frame: F0 times 1, 2, ..., n_harmonics
        harmonic_freqs = np.multiply.outer(
            f0_frames, np.arange(n_harmonics) + 1
        )

        return interp_all_frames(np.abs(spec_frames), harmonic_freqs)


class JitterFrames(PitchPeriodFrames):
    """Extract and store voice jitter frames.

    Parameters
    ----------
    frames: numpy.ndarray
        Voice jitter frames of shape (num_frames,).
    sr: int
        Sampling rate.
    frame_len: int
        Number of samples per frame.
    hop_len: int
        Number of samples between frame starting points.
    center: bool
        Whether the frames are centered.
    pad_mode: str
        How the signal was padded before framing.
    rel: bool
        Whether the voice jitter is relative to the average period length.
    lower: float
        Lower limit for periods between glottal pulses.
    upper: float
        Upper limit for periods between glottal pulses.
    max_period_ratio: float
        Maximum ratio between consecutive periods used for jitter extraction.

    Notes
    -----
    Compute jitter as the average absolute difference between consecutive
    fundamental periods with a ratio below `max_period_ratio` for each frame.
    If ``rel=True``, jitter is divided by the average fundamental period of
    each frame. Fundamental periods are calculated as the first-order temporal
    difference between consecutive glottal pulses.

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool,
        pad_mode: str,
        rel: bool,
        lower: float,
        upper: float,
        max_period_ratio: float,
    ):
        self.logger = logging.getLogger("emvoice.frequency.JitterFrames")
        self.rel = rel
        self.max_period_ratio = max_period_ratio
        super().__init__(
            frames, sr, frame_len, hop_len, center, pad_mode, lower, upper
        )

    @classmethod
    def from_pitch_pulse_frames(
        cls,
        pitch_pulse_frames_obj: PitchPulseFrames,
        rel: bool = True,
        lower: float = 0.0001,
        upper: float = 0.02,
        max_period_ratio: float = 1.3,
    ) -> "JitterFrames":
        """Extract voice jitter frames from glottal pulse frames.

        Parameters
        ----------
        pitch_pulse_frames_obj: PitchPulseFrames
            Glottal pulse frames object.
        rel: bool, optional, default=True
            Divide jitter by the average pitch period.
        lower: float, optional, default=0.0001
            Lower limit for periods between glottal pulses.
        upper: float, optional, default=0.02
            Upper limit for periods between glottal pulses.
        max_period_ratio: float, optional, default=1.3
            Maximum ratio between consecutive periods for jitter extraction.

        Returns
        -------
        JitterFrames
            One jitter value per pulse frame (NaN where it cannot be
            computed).

        """
        jitter_frames = np.array(
            [
                cls._calc_jitter_frame(
                    pulses, rel, lower, upper, max_period_ratio
                )
                for pulses in pitch_pulse_frames_obj.frames
            ]
        )

        return cls(
            jitter_frames,
            pitch_pulse_frames_obj.sr,
            pitch_pulse_frames_obj.frame_len,
            pitch_pulse_frames_obj.hop_len,
            pitch_pulse_frames_obj.center,
            pitch_pulse_frames_obj.pad_mode,
            rel,
            lower,
            upper,
            max_period_ratio,
        )

    @classmethod
    def _calc_jitter_frame(
        cls,
        pulses: List[Tuple],
        rel: bool,
        lower: float,
        upper: float,
        max_period_ratio: float,
    ):
        """Calculate the jitter of a single frame.

        Returns NaN if the frame has no pulses or no pair of consecutive
        periods that passes the `max_period_ratio` check.
        """
        if len(pulses) == 0:
            return np.nan

        periods, _ = cls._calc_period_length(pulses, lower, upper)

        # Only runs with at least two periods contain a consecutive pair
        runs = [run for run in periods if len(run) > 1]
        if not runs:
            return np.nan

        # Absolute differences between consecutive periods, restricted to
        # pairs whose ratio is below `max_period_ratio`
        diffs = [
            np.abs(np.diff(run)[cls._check_ratio(run, max_period_ratio)])
            for run in runs
        ]
        # Runs in which no pair passed the check do not contribute; dropping
        # them here avoids averaging empty arrays
        diffs = [diff for diff in diffs if len(diff) > 0]
        if not diffs:
            return np.nan

        # Average over the per-run averages
        avg_period_diff = np.nanmean(np.array([np.mean(diff) for diff in diffs]))

        if rel:
            # Relative to the mean period length
            avg_period_len = np.nanmean(
                np.array([np.mean(run) for run in runs])
            )
            return avg_period_diff / avg_period_len

        return avg_period_diff


class ShimmerFrames(PitchPeriodFrames):
    """Extract and store voice shimmer frames.

    Parameters
    ----------
    frames: numpy.ndarray
        Voice shimmer frames of shape (num_frames,).
    sr: int
        Sampling rate.
    frame_len: int
        Number of samples per frame.
    hop_len: int
        Number of samples between frame starting points.
    center: bool
        Whether the frames are centered.
    pad_mode: str
        How the signal was padded before framing.
    rel: bool
        Whether the voice shimmer is relative to the average amplitude.
    lower: float
        Lower limit for periods between glottal pulses.
    upper: float
        Upper limit for periods between glottal pulses.
    max_period_ratio: float
        Maximum ratio between consecutive periods used for shimmer extraction.
    max_amp_factor: float
        Maximum ratio between consecutive amplitudes used for shimmer
        extraction.

    Notes
    -----
    Compute shimmer as the average absolute difference between consecutive
    pitch amplitudes with a fundamental period ratio below `max_period_ratio`
    and amplitude ratio below `max_amp_factor` for each frame. If ``rel=True``,
    shimmer is divided by the average amplitude of each frame. Fundamental
    periods are calculated as the first-order temporal difference between
    consecutive glottal pulses. Amplitudes are signal amplitudes at the glottal
    pulses.

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool,
        pad_mode: str,
        rel: bool,
        lower: float,
        upper: float,
        max_period_ratio: float,
        max_amp_factor: float,
    ):
        self.logger = logging.getLogger("emvoice.frequency.ShimmerFrames")
        self.rel = rel
        self.max_period_ratio = max_period_ratio
        self.max_amp_factor = max_amp_factor
        super().__init__(
            frames, sr, frame_len, hop_len, center, pad_mode, lower, upper
        )

    @classmethod
    def from_pitch_pulse_frames(
        cls,
        pitch_pulse_frames_obj: PitchPulseFrames,
        rel: bool = True,
        lower: float = 0.0001,
        upper: float = 0.02,
        max_period_ratio: float = 1.3,
        max_amp_factor: float = 1.6,
    ) -> "ShimmerFrames":
        """Extract voice shimmer frames from glottal pulse frames.

        Parameters
        ----------
        pitch_pulse_frames_obj: PitchPulseFrames
            Glottal pulse frames object.
        rel: bool, optional, default=True
            Divide shimmer by the average pulse amplitude.
        lower: float, optional, default=0.0001
            Lower limit for periods between glottal pulses.
        upper: float, optional, default=0.02
            Upper limit for periods between glottal pulses.
        max_period_ratio: float, optional, default=1.3
            Maximum ratio between consecutive periods for shimmer extraction.
        max_amp_factor: float, optional, default=1.6
            Maximum ratio between consecutive amplitudes used for shimmer
            extraction.

        Returns
        -------
        ShimmerFrames
            One shimmer value per pulse frame (NaN where it cannot be
            computed).

        """
        shimmer_frames = np.array(
            [
                cls._calc_shimmer_frame(
                    pulses, rel, lower, upper, max_period_ratio, max_amp_factor
                )
                for pulses in pitch_pulse_frames_obj.frames
            ]
        )

        return cls(
            shimmer_frames,
            pitch_pulse_frames_obj.sr,
            pitch_pulse_frames_obj.frame_len,
            pitch_pulse_frames_obj.hop_len,
            pitch_pulse_frames_obj.center,
            pitch_pulse_frames_obj.pad_mode,
            rel,
            lower,
            upper,
            max_period_ratio,
            max_amp_factor,
        )

    @classmethod
    def _calc_shimmer_frame(
        cls,
        pulses: List[Tuple],
        rel: bool,
        lower: float,
        upper: float,
        max_period_ratio: float,
        max_amp_factor: float,
    ) -> float:
        """Calculate the shimmer of a single frame.

        Returns NaN if the frame has no pulses or no pair of consecutive
        pulses that passes both the period and the amplitude ratio check.
        """
        if len(pulses) == 0:
            return np.nan

        # Periods and amplitudes are split with the same mask, so the i-th
        # amplitude run lines up element by element with the i-th period run
        periods, mask = cls._calc_period_length(pulses, lower, upper)
        amps = cls._get_amplitude(pulses, mask)

        # Only runs with at least two elements contain a consecutive pair
        runs = [
            (amp, period)
            for amp, period in zip(amps, periods)
            if len(period) > 1 and len(amp) > 1
        ]
        if not runs:
            return np.nan

        # Absolute differences between consecutive amplitudes, restricted to
        # pairs whose period ratio is below `max_period_ratio` and whose
        # amplitude ratio is below `max_amp_factor`
        diffs = [
            np.abs(
                np.diff(amp)[
                    np.logical_and(
                        cls._check_ratio(period, max_period_ratio),
                        cls._check_ratio(amp, max_amp_factor),
                    )
                ]
            )
            for amp, period in runs
        ]
        # Runs in which no pair passed the checks do not contribute; dropping
        # them here avoids averaging empty arrays
        diffs = [diff for diff in diffs if len(diff) > 0]
        if not diffs:
            return np.nan

        # Average over the per-run averages
        avg_amp_diff = np.nanmean(np.array([np.mean(diff) for diff in diffs]))

        if rel:
            # Relative to the mean amplitude
            avg_amp = np.nanmean(
                np.array([np.mean(amp) for amp in amps if len(amp) > 1])
            )
            return avg_amp_diff / avg_amp

        return avg_amp_diff

    @staticmethod
    def _get_amplitude(pulses: List[Tuple], mask: np.ndarray) -> List:
        """Get pulse amplitudes split into the same runs as the periods.

        Parameters
        ----------
        pulses: list
            Pulses as tuples whose third element is the amplitude.
        mask: numpy.ndarray
            Mask of accepted periods as returned by `_calc_period_length`.

        Returns
        -------
        list
            Runs of amplitudes belonging to consecutive accepted periods.

        """
        # Skip the first amplitude to align with the periods (each period is
        # paired with the amplitude of the pulse that ends it)
        amps = np.array([pulse[2] for pulse in pulses])[1:]

        return PitchPeriodFrames._split_valid_runs(amps, mask)