# -*- coding: utf-8 -*-
#
# This file is part of SIDEKIT.
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
#
# SIDEKIT is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# SIDEKIT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
"""
Copyright 2014-2020 Anthony Larcher and Sylvain Meignier
:mod:`frontend` provides methods to process an audio signal in order to extract
useful parameters for speaker verification.
"""
import numpy
import numpy.matlib
import scipy
from scipy.fftpack.realtransforms import dct
from sidekit.frontend.vad import pre_emphasis
import numpy.matlib
PARAM_TYPE = numpy.float32
__author__ = "Anthony Larcher and Sylvain Meignier"
__copyright__ = "Copyright 2014-2020 Anthony Larcher and Sylvain Meignier"
__license__ = "LGPL"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
def hz2mel(f, htk=True):
    """Convert frequencies in Hz into mel.

    :param f: frequency (scalar or array-like) to convert
    :param htk: if True use the HTK formula, otherwise use the formula
        matching Slaney's Auditory Toolbox (linear below 1 kHz, log above)
    :return: the equivalent value(s) on the mel scale; a scalar input
        returns a scalar
    """
    if htk:
        return 2595 * numpy.log10(1 + f / 700.)
    else:
        # Mel fn to match Slaney's Auditory Toolbox mfcc.m.
        # atleast_1d: a bare scalar used to produce a 0-d array on which the
        # boolean indexing below raised an IndexError.
        f = numpy.atleast_1d(numpy.asarray(f, dtype=float))
        f_0 = 0.
        f_sp = 200. / 3.  # ~66.67 Hz per mel in the linear region
        brkfrq = 1000.
        brkpt = (brkfrq - f_0) / f_sp
        # Step in the log region: 6.4 spans 27 filters in Slaney's design
        logstep = numpy.exp(numpy.log(6.4) / 27)
        linpts = f < brkfrq
        z = numpy.zeros_like(f)
        # fill in linear and logarithmic parts separately
        z[linpts] = (f[linpts] - f_0) / f_sp
        z[~linpts] = brkpt + (numpy.log(f[~linpts] / brkfrq)) / numpy.log(logstep)
        if z.shape == (1,):
            return z[0]
        else:
            return z
def mel2hz(z, htk=True):
    """Convert mel values into Hz.

    :param z: mel value(s) (scalar or array-like) to convert
    :param htk: if True use the HTK formula, otherwise use the formula
        matching Slaney's Auditory Toolbox (linear below 1 kHz, log above)
    :return: the equivalent value(s) in Hertz; a scalar input returns a scalar
    """
    if htk:
        return 700. * (10**(z / 2595.) - 1)
    else:
        # atleast_1d: a bare scalar used to produce a 0-d array on which the
        # boolean indexing below raised an IndexError.
        z = numpy.atleast_1d(numpy.asarray(z, dtype=float))
        f_0 = 0
        f_sp = 200. / 3.
        brkfrq = 1000.
        brkpt = (brkfrq - f_0) / f_sp
        logstep = numpy.exp(numpy.log(6.4) / 27)
        linpts = (z < brkpt)
        f = numpy.zeros_like(z)
        # fill in linear and logarithmic parts separately
        f[linpts] = f_0 + f_sp * z[linpts]
        f[~linpts] = brkfrq * numpy.exp(numpy.log(logstep) * (z[~linpts] - brkpt))
        if f.shape == (1,):
            return f[0]
        else:
            return f
def hz2bark(f):
    """Convert frequencies in Hertz to Bark frequencies.

    :param f: the input frequency in Hz (scalar or ndarray)
    :return: the corresponding Bark frequency
    """
    return 6. * numpy.arcsinh(f / 600.)
def bark2hz(z):
    """Convert Bark frequencies to Hertz (Hz).

    :param z: the input Bark frequency (scalar or ndarray)
    :return: the corresponding frequency in Hz
    """
    return 600. * numpy.sinh(z / 6.)
def compute_delta(features,
                  win=3,
                  method='filter',
                  filt=numpy.array([.25, .5, .25, 0, -.25, -.5, -.25])):
    """Compute delta coefficients of a 2D feature stream (one frame per row).

    :param features: the feature frames to compute the delta coefficients
    :param win: parameter that sets the length of the computation window.
        The size of the window is (win x 2) + 1
    :param method: method used to compute the delta coefficients,
        can be 'diff' or 'filter'
    :param filt: definition of the filter to use in 'filter' mode; its length
        must be (win x 2) + 1. Default one is similar to SPRO4:
        filt=numpy.array([.2, .1, 0, -.1, -.2])
    :return: the delta coefficients computed on the original features
    """
    # First and last frames are replicated at the beginning and the end of the
    # stream to avoid border effects.
    x = numpy.zeros((features.shape[0] + 2 * win, features.shape[1]), dtype=PARAM_TYPE)
    x[:win, :] = features[0, :]
    x[win:-win, :] = features
    x[-win:, :] = features[-1, :]

    if method == 'diff':
        # Simple difference filter: delta[t] = f[t - win] - f[t + win]
        filt = numpy.zeros(2 * win + 1, dtype=PARAM_TYPE)
        filt[0] = -1
        filt[-1] = 1

    delta = numpy.zeros(features.shape, dtype=PARAM_TYPE)
    # Convolve the PADDED stream: the previous code convolved the raw
    # features (zero-padded borders) and left x unused, contradicting the
    # replication performed above.
    for i in range(features.shape[1]):
        delta[:, i] = numpy.convolve(x[:, i], filt, mode='valid')
    return delta
def pca_dct(cep, left_ctx=12, right_ctx=12, p=None):
    """Apply DCT-PCA feature post-processing as in [McLaren 2015]:
    Mitchell McLaren and Yun Lei, 'Improved Speaker Recognition
    Using DCT coefficients as features', ICASSP, 2015.

    A 1D-DCT is applied to the cepstral coefficients over a temporal sliding
    window; the resulting matrix is then flattened and reduced with a
    Principal Component Analysis.

    :param cep: matrix of cepstral coefficients, one feature vector per row
    :param left_ctx: number of frames of left context
    :param right_ctx: number of frames of right context
    :param p: PCA (portrait) matrix trained on a development set, used to
        reduce the dimension of the features; identity when None
    """
    ctx_size = left_ctx + 1 + right_ctx
    # Replicate the first and last frames to pad the borders
    padded = numpy.r_[numpy.resize(cep[0, :], (left_ctx, cep.shape[1])),
                      cep,
                      numpy.resize(cep[-1, :], (right_ctx, cep.shape[1]))]
    windows = framing(padded, win_size=ctx_size).transpose(0, 2, 1)
    bases = (dct_basis(ctx_size, ctx_size)).T
    if p is None:
        p = numpy.eye(bases.shape[0] * cep.shape[1], dtype=PARAM_TYPE)
    projected = numpy.dot(windows.reshape(-1, bases.shape[0]), bases)
    return projected.reshape(windows.shape[0], -1).dot(p)
def shifted_delta_cepstral(cep, d=1, p=3, k=7):
    """Compute the Shifted-Delta-Cepstral features for language identification.

    :param cep: matrix of features, one vector per line
    :param d: time advance and delay used for the delta computation
    :param p: time shift between consecutive blocks
    :param k: number of delta-cepstral blocks whose delta-cepstral
        coefficients are stacked to form the final feature vector
    :return: cepstral coefficients concatenated with shifted deltas
    """
    # Pad the stream so that every frame has enough future context
    padded = numpy.r_[numpy.resize(cep[0, :], (d, cep.shape[1])),
                      cep,
                      numpy.resize(cep[-1, :], (k * 3 + d, cep.shape[1]))]
    delta = compute_delta(padded, win=d, method='diff')

    sdc = numpy.empty((cep.shape[0], cep.shape[1] * k))
    # Boolean mask selecting the k delta blocks, spaced p frames apart
    selection = numpy.zeros(delta.shape[0], dtype='bool')
    for block in range(k):
        selection[d + block * p] = True
    # Slide the mask one frame at a time and stack the selected deltas
    for frame in range(len(cep)):
        sdc[frame, :] = delta[selection, :].reshape(1, -1)
        selection = numpy.roll(selection, 1)
    return numpy.hstack((cep, sdc))
def trfbank(fs, nfft, lowfreq, maxfreq, nlinfilt, nlogfilt, midfreq=1000):
    """Compute a triangular filterbank for cepstral coefficient computation.

    :param fs: sampling frequency of the original signal.
    :param nfft: number of points for the Fourier Transform
    :param lowfreq: lower limit of the frequency band filtered
    :param maxfreq: higher limit of the frequency band filtered
    :param nlinfilt: number of linear filters to use in low frequencies
    :param nlogfilt: number of log-linear filters to use in high frequencies
    :param midfreq: frequency boundary between linear and log-linear filters
    :return: the filter bank and the central frequencies of each filter
    """
    # Total number of filters
    nfilt = nlinfilt + nlogfilt

    # ------------------------
    # Compute the filter bank
    # ------------------------
    # Start/middle/end points of the triangular filters in the spectral domain
    frequences = numpy.zeros(nfilt + 2, dtype=PARAM_TYPE)
    if nlogfilt == 0:
        # All filters linearly spaced
        linsc = (maxfreq - lowfreq) / (nlinfilt + 1)
        frequences[:nlinfilt + 2] = lowfreq + numpy.arange(nlinfilt + 2) * linsc
    elif nlinfilt == 0:
        # All filters mel-spaced
        low_mel = hz2mel(lowfreq)
        max_mel = hz2mel(maxfreq)
        mels = numpy.zeros(nlogfilt + 2)
        melsc = (max_mel - low_mel) / (nfilt + 1)
        mels[:nlogfilt + 2] = low_mel + numpy.arange(nlogfilt + 2) * melsc
        # Back to the frequency domain
        frequences = mel2hz(mels)
    else:
        # Linear filters on [lowfreq; midfreq]
        linsc = (min([midfreq, maxfreq]) - lowfreq) / (nlinfilt + 1)
        frequences[:nlinfilt] = lowfreq + numpy.arange(nlinfilt) * linsc
        # Log-linear filters on [midfreq; maxfreq]
        low_mel = hz2mel(min([1000, maxfreq]))
        max_mel = hz2mel(maxfreq)
        mels = numpy.zeros(nlogfilt + 2, dtype=PARAM_TYPE)
        melsc = (max_mel - low_mel) / (nlogfilt + 1)

        # Verify that mel2hz(melsc) > linsc; otherwise move filters from the
        # log part to the linear part until the mel step is wide enough
        while mel2hz(melsc) < linsc:
            # in this case, we add a linear filter
            nlinfilt += 1
            nlogfilt -= 1
            frequences[:nlinfilt] = lowfreq + numpy.arange(nlinfilt) * linsc
            low_mel = hz2mel(frequences[nlinfilt - 1] + 2 * linsc)
            max_mel = hz2mel(maxfreq)
            mels = numpy.zeros(nlogfilt + 2, dtype=PARAM_TYPE)
            melsc = (max_mel - low_mel) / (nlogfilt + 1)

        mels[:nlogfilt + 2] = low_mel + numpy.arange(nlogfilt + 2) * melsc
        # Back to the frequency domain
        frequences[nlinfilt:] = mel2hz(mels)

    heights = 2. / (frequences[2:] - frequences[0:-2])

    # Compute filterbank coefficients (in fft domain, in bins)
    fbank = numpy.zeros((nfilt, int(numpy.floor(nfft / 2)) + 1), dtype=PARAM_TYPE)
    # FFT bins (in Hz)
    n_frequences = numpy.arange(nfft) / (1. * nfft) * fs
    for i in range(nfilt):
        low = frequences[i]
        cen = frequences[i + 1]
        hi = frequences[i + 2]
        # builtin int instead of numpy.int (alias removed in NumPy >= 1.24)
        lid = numpy.arange(numpy.floor(low * nfft / fs) + 1, numpy.floor(cen * nfft / fs) + 1, dtype=int)
        left_slope = heights[i] / (cen - low)
        rid = numpy.arange(numpy.floor(cen * nfft / fs) + 1,
                           min(numpy.floor(hi * nfft / fs) + 1, nfft), dtype=int)
        right_slope = heights[i] / (hi - cen)
        fbank[i][lid] = left_slope * (n_frequences[lid] - low)
        fbank[i][rid[:-1]] = right_slope * (hi - n_frequences[rid[:-1]])
    return fbank, frequences
def mel_filter_bank(fs, nfft, lowfreq, maxfreq, widest_nlogfilt, widest_lowfreq, widest_maxfreq):
    """Compute a triangular filterbank restricted to a frequency sub-band,
    for cepstral coefficient computation.

    :param fs: sampling frequency of the original signal.
    :param nfft: number of points for the Fourier Transform
    :param lowfreq: lower limit of the frequency band filtered
    :param maxfreq: higher limit of the frequency band filtered
    :param widest_nlogfilt: number of log filters
    :param widest_lowfreq: lower frequency of the filter bank
    :param widest_maxfreq: higher frequency of the filter bank
    :return: the filter bank and the central frequencies of each filter
    """
    # ------------------------
    # Compute the filter bank
    # ------------------------
    # Start/middle/end points of the triangular filters in the spectral domain
    low_mel = hz2mel(widest_lowfreq)
    max_mel = hz2mel(widest_maxfreq)
    mels = numpy.zeros(widest_nlogfilt + 2)
    melsc = (max_mel - low_mel) / (widest_nlogfilt + 1)
    mels[:widest_nlogfilt + 2] = low_mel + numpy.arange(widest_nlogfilt + 2) * melsc
    # Back to the frequency domain
    widest_freqs = mel2hz(mels)
    # Select the filters that fall inside the narrow band
    sub_band_freqs = numpy.array([fr for fr in widest_freqs if lowfreq <= fr <= maxfreq], dtype=PARAM_TYPE)
    heights = 2. / (sub_band_freqs[2:] - sub_band_freqs[0:-2])
    nfilt = sub_band_freqs.shape[0] - 2

    # Compute filterbank coefficients (in fft domain, in bins).
    # Fixed: the shape used to be written (nfilt, floor(nfft/2)+1).astype(int),
    # which called .astype on a tuple and raised an AttributeError.
    fbank = numpy.zeros((nfilt, int(numpy.floor(nfft / 2)) + 1), dtype=PARAM_TYPE)
    # FFT bins (in Hz)
    nfreqs = numpy.arange(nfft) / (1. * nfft) * fs
    for i in range(nfilt):
        low = sub_band_freqs[i]
        cen = sub_band_freqs[i + 1]
        hi = sub_band_freqs[i + 2]
        # builtin int instead of numpy.int (alias removed in NumPy >= 1.24)
        lid = numpy.arange(numpy.floor(low * nfft / fs) + 1, numpy.floor(cen * nfft / fs) + 1, dtype=int)
        left_slope = heights[i] / (cen - low)
        rid = numpy.arange(numpy.floor(cen * nfft / fs) + 1,
                           min(numpy.floor(hi * nfft / fs) + 1, nfft), dtype=int)
        right_slope = heights[i] / (hi - cen)
        fbank[i][lid] = left_slope * (nfreqs[lid] - low)
        fbank[i][rid[:-1]] = right_slope * (hi - nfreqs[rid[:-1]])
    return fbank, sub_band_freqs
def power_spectrum(input_sig,
                   fs=8000,
                   win_time=0.025,
                   shift=0.01,
                   prefac=0.97):
    """Compute the power spectrum and the per-frame log-energy of a signal.

    :param input_sig: mono input signal
    :param fs: sampling frequency in Hz
    :param win_time: duration of the analysis window in seconds
    :param shift: shift between two analyses in seconds
    :param prefac: pre-emphasis coefficient
    :return: tuple (power spectrum, log-energy per frame)
    """
    window_length = int(round(win_time * fs))
    overlap = window_length - int(shift * fs)
    framed = framing(input_sig, window_length, win_shift=window_length - overlap).copy()
    # Pre-emphasis filtering is applied after framing to be consistent with stream processing
    framed = pre_emphasis(framed, prefac)
    nb_frames = framed.shape[0]
    n_fft = 2 ** int(numpy.ceil(numpy.log2(window_length)))
    # Hanning window, chosen over Hamming for its lower sidelobes
    window = numpy.hanning(window_length)

    spec = numpy.ones((nb_frames, int(n_fft / 2) + 1), dtype=PARAM_TYPE)
    log_energy = numpy.log((framed ** 2).sum(axis=1))

    # Process frames by chunks to bound the memory used by the FFT
    chunk = 500000
    start = 0
    stop = min(chunk, nb_frames)
    while start < nb_frames:
        windowed = framed[start:stop, :] * window
        mag = numpy.fft.rfft(windowed, n_fft, axis=-1)
        spec[start:stop, :] = mag.real ** 2 + mag.imag ** 2
        start = stop
        stop = min(stop + chunk, nb_frames)
    return spec, log_energy
def mfcc(input_sig,
         lowfreq=100, maxfreq=8000,
         nlinfilt=0, nlogfilt=24,
         nwin=0.025,
         fs=16000,
         nceps=13,
         shift=0.01,
         get_spec=False,
         get_mspec=False,
         prefac=0.97):
    """Compute Mel Frequency Cepstral Coefficients.

    :param input_sig: input signal from which the coefficients are computed.
        Input audio is supposed to be RAW PCM 16bits
    :param lowfreq: lower limit of the frequency band filtered. Default is 100Hz.
    :param maxfreq: higher limit of the frequency band filtered. Default is 8000Hz.
    :param nlinfilt: number of linear filters to use in low frequencies. Default is 0.
    :param nlogfilt: number of log-linear filters to use in high frequencies. Default is 24.
    :param nwin: length of the sliding window in seconds. Default is 0.025.
    :param fs: sampling frequency of the original signal. Default is 16000Hz.
    :param nceps: number of cepstral coefficients to extract. Default is 13.
    :param shift: shift between two analyses. Default is 0.01 (10ms).
    :param get_spec: boolean, if true returns the spectrogram
    :param get_mspec: boolean, if true returns the output of the filter banks
    :param prefac: pre-emphasis filter value
    :return: list [cepstra, log-energy, spectrum or None, mel-spectrum or None]

    .. note:: MFCC are computed as follows:

        - Pre-processing in time-domain (pre-emphasizing)
        - Compute the spectrum amplitude by windowing
        - Filter the signal in the spectral domain with a triangular
          filter-bank whose filters are approximatively linearly spaced on
          the mel scale and have equal bandwith in the mel scale
        - Compute the DCT of the log-spectrum
        - Log-energy is returned as first coefficient of the feature vector.

    For more details, refer to [Davis80]_.
    """
    # Power spectrum of the pre-emphasized, windowed signal
    spec, log_energy = power_spectrum(input_sig,
                                      fs,
                                      win_time=nwin,
                                      shift=shift,
                                      prefac=prefac)
    # Apply the triangular filter-bank in the spectral domain
    n_fft = 2 ** int(numpy.ceil(numpy.log2(int(round(nwin * fs)))))
    fbank = trfbank(fs, n_fft, lowfreq, maxfreq, nlinfilt, nlogfilt)[0]
    mspec = numpy.log(numpy.dot(spec, fbank.T))
    # The DCT 'compresses' the coefficients (spectrum -> cepstrum domain);
    # the constant C0 term is discarded
    ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:, 1:nceps + 1]
    return [ceps,
            log_energy,
            spec if get_spec else None,
            mspec if get_mspec else None]
def fft2barkmx(n_fft, fs, nfilts=0, width=1., minfreq=0., maxfreq=8000):
    """
    Generate a matrix of weights to combine FFT bins into Bark
    bins. n_fft defines the source FFT size at sampling rate fs.
    Optional nfilts specifies the number of output bands required
    (else one per bark), and width is the constant width of each
    band in Bark (default 1).
    While wts has n_fft columns, the second half are all zero.
    Hence, Bark spectrum is fft2barkmx(n_fft,fs) * abs(fft(xincols, n_fft));
    2004-09-05 dpwe@ee.columbia.edu based on rastamat/audspec.m

    :param n_fft: the source FFT size at sampling rate fs
    :param fs: sampling rate
    :param nfilts: number of output bands required
    :param width: constant width of each band in Bark (default 1)
    :param minfreq: lower edge of the frequency band
    :param maxfreq: upper edge of the frequency band
    :return: a matrix of weights to combine FFT bins into Bark bins
    """
    maxfreq = min(maxfreq, fs / 2.)
    min_bark = hz2bark(minfreq)
    nyqbark = hz2bark(maxfreq) - min_bark
    if nfilts == 0:
        # numpy.ceil returns a float; cast so it is usable as a shape and
        # as a range() bound
        nfilts = int(numpy.ceil(nyqbark)) + 1
    wts = numpy.zeros((nfilts, n_fft))
    # bark per filter
    step_barks = nyqbark / (nfilts - 1)
    # Frequency of each FFT bin in Bark
    binbarks = hz2bark(numpy.arange(n_fft / 2 + 1) * fs / n_fft)
    for i in range(nfilts):
        f_bark_mid = min_bark + i * step_barks
        # Linear slopes in log-space (i.e. dB) intersect to trapezoidal window
        lof = (binbarks - f_bark_mid - 0.5)
        hif = (binbarks - f_bark_mid + 0.5)
        wts[i, :n_fft // 2 + 1] = 10 ** (numpy.minimum(numpy.zeros_like(hif), numpy.minimum(hif, -2.5 * lof) / width))
    return wts
def fft2melmx(n_fft,
              fs=8000,
              nfilts=0,
              width=1.,
              minfreq=0,
              maxfreq=4000,
              htkmel=False,
              constamp=False):
    """
    Generate a matrix of weights to combine FFT bins into Mel
    bins. n_fft defines the source FFT size at sampling rate fs.
    Optional nfilts specifies the number of output bands required
    (else one per "mel/width"), and width is the constant width of each
    band relative to standard Mel (default 1).
    While wts has n_fft columns, the second half are all zero.
    Hence, Mel spectrum is fft2melmx(n_fft,fs)*abs(fft(xincols,n_fft));
    minfreq is the frequency (in Hz) of the lowest band edge;
    default is 0, but 133.33 is a common standard (to skip LF).
    maxfreq is frequency in Hz of upper edge; default fs/2.
    You can exactly duplicate the mel matrix in Slaney's mfcc.m
    as fft2melmx(512, 8000, 40, 1, 133.33, 6855.5, 0);
    htkmel=1 means use HTK's version of the mel curve, not Slaney's.
    constamp=1 means make integration windows peak at 1, not sum to 1.
    frqs returns bin center frqs.
    2004-09-05 dpwe@ee.columbia.edu based on fft2barkmx

    :param n_fft: the source FFT size
    :param fs: sampling rate
    :param nfilts: number of output bands required
    :param width: constant width of each band relative to standard Mel
    :param minfreq: frequency of the lowest band edge in Hz
    :param maxfreq: frequency of the upper edge in Hz
    :param htkmel: use HTK's version of the mel curve if True
    :param constamp: make integration windows peak at 1 instead of summing to 1
    :return: tuple (weight matrix, bin center frequencies)
    """
    maxfreq = min(maxfreq, fs / 2.)
    if nfilts == 0:
        # numpy.ceil returns a float; cast so it is usable as a shape
        nfilts = int(numpy.ceil(hz2mel(maxfreq, htkmel) / 2.))
    wts = numpy.zeros((nfilts, n_fft))
    # Center freqs of each FFT bin
    fftfrqs = numpy.arange(n_fft / 2 + 1) / n_fft * fs
    # 'Center freqs' of mel bands - uniformly spaced between limits
    minmel = hz2mel(minfreq, htkmel)
    maxmel = hz2mel(maxfreq, htkmel)
    binfrqs = mel2hz(minmel + numpy.arange(nfilts + 2) / (nfilts + 1) * (maxmel - minmel), htkmel)
    for i in range(nfilts):
        _fs = binfrqs[i + numpy.arange(3, dtype=int)]
        # scale by width
        _fs = _fs[1] + width * (_fs - _fs[1])
        # lower and upper slopes for all bins
        # fixed: the lower slope denominator referenced the undefined name
        # __fs[0] and raised a NameError
        loslope = (fftfrqs - _fs[0]) / (_fs[1] - _fs[0])
        hislope = (_fs[2] - fftfrqs) / (_fs[2] - _fs[1])
        wts[i, 1 + numpy.arange(n_fft // 2 + 1)] = numpy.maximum(numpy.zeros_like(loslope),
                                                                 numpy.minimum(loslope, hislope))
    if not constamp:
        # Slaney-style mel is scaled to be approx constant E per channel
        wts = numpy.dot(numpy.diag(2. / (binfrqs[2 + numpy.arange(nfilts)] - binfrqs[numpy.arange(nfilts)])), wts)
    # Make sure 2nd half of FFT is zero
    wts[:, n_fft // 2 + 1: n_fft] = 0
    return wts, binfrqs
def audspec(power_spectrum,
            fs=16000,
            nfilts=None,
            fbtype='bark',
            minfreq=0,
            maxfreq=8000,
            sumpower=True,
            bwidth=1.):
    """Map a power spectrum onto an auditory frequency axis.

    :param power_spectrum: power spectrum, one frame per row
    :param fs: sampling frequency
    :param nfilts: number of output bands; one per Bark when None
    :param fbtype: filter bank type, 'bark', 'mel', 'htkmel' or 'fcmel'
    :param minfreq: lower edge of the frequency band
    :param maxfreq: upper edge of the frequency band
    :param sumpower: if True sum the power, otherwise sum magnitudes
    :param bwidth: width of each band
    :return: tuple (auditory spectrum, filter weights)
    :raise ValueError: if fbtype is not recognized
    """
    if nfilts is None:
        nfilts = int(numpy.ceil(hz2bark(fs / 2)) + 1)
    if not fs == 16000:
        maxfreq = min(fs / 2, maxfreq)
    nframes, nfreqs = power_spectrum.shape
    n_fft = (nfreqs - 1) * 2
    if fbtype == 'bark':
        wts = fft2barkmx(n_fft, fs, nfilts, bwidth, minfreq, maxfreq)
    elif fbtype == 'mel':
        # fft2melmx returns (weights, bin_center_freqs): keep the weights only.
        # The previous code sliced the whole tuple and raised a TypeError.
        wts = fft2melmx(n_fft, fs, nfilts, bwidth, minfreq, maxfreq)[0]
    elif fbtype == 'htkmel':
        wts = fft2melmx(n_fft, fs, nfilts, bwidth, minfreq, maxfreq, True, True)[0]
    elif fbtype == 'fcmel':
        wts = fft2melmx(n_fft, fs, nfilts, bwidth, minfreq, maxfreq, True, False)[0]
    else:
        # Previously only printed and then crashed on the unbound wts below
        raise ValueError('fbtype {} not recognized'.format(fbtype))
    wts = wts[:, :nfreqs]
    if sumpower:
        audio_spectrum = power_spectrum.dot(wts.T)
    else:
        audio_spectrum = numpy.dot(numpy.sqrt(power_spectrum), wts.T)**2
    return audio_spectrum, wts
def postaud(x, fmax, fbtype='bark', broaden=0):
    """
    Do loudness equalization and cube root compression.

    :param x: critical band energies, one frame per row
    :param fmax: maximum frequency (Hz) covered by the bands of x
    :param fbtype: type of filter bank: 'bark', 'mel', 'htkmel' or 'fcmel'
    :param broaden: if 1, keep the duplicated extremal bands
    :return: tuple (compressed band energies, equal-loudness weights)
    :raise ValueError: if fbtype is not recognized
    """
    nframes, nbands = x.shape
    # Include frequency points at extremes, discard later
    nfpts = nbands + 2 * broaden
    if fbtype == 'bark':
        bandcfhz = bark2hz(numpy.linspace(0, hz2bark(fmax), num=nfpts))
    elif fbtype == 'mel':
        # Fixed: the upper bound of the mel axis was computed with hz2bark
        # instead of hz2mel (cf. rastamat's postaud.m)
        bandcfhz = mel2hz(numpy.linspace(0, hz2mel(fmax), num=nfpts))
    elif fbtype == 'htkmel' or fbtype == 'fcmel':
        bandcfhz = mel2hz(numpy.linspace(0, hz2mel(fmax, 1), num=nfpts), 1)
    else:
        # Previously only printed and then crashed on the unbound bandcfhz
        raise ValueError('unknown fbtype {}'.format(fbtype))
    # Remove extremal bands (the ones that will be duplicated)
    bandcfhz = bandcfhz[broaden:(nfpts - broaden)]
    # Hynek's magic equal-loudness-curve formula
    fsq = bandcfhz ** 2
    ftmp = fsq + 1.6e5
    eql = ((fsq / ftmp) ** 2) * ((fsq + 1.44e6) / (fsq + 9.61e6))
    # weight the critical bands
    z = numpy.matlib.repmat(eql.T, nframes, 1) * x
    # cube root compress
    z = z ** .33
    # replicate first and last band (because they are unreliable as calculated)
    if broaden == 1:
        y = z[:, numpy.hstack((0, numpy.arange(nbands), nbands - 1))]
    else:
        y = z[:, numpy.hstack((1, numpy.arange(1, nbands - 1), nbands - 2))]
    return y, eql
def dolpc(x, model_order=8):
    """Compute an autoregressive model from spectral magnitude samples.

    :param x: spectral magnitude samples, one frame per row
    :param model_order: order of the autoregressive model
    :return: matrix of gain-normalized LPC polynomials, one frame per row
    """
    nframes, nbands = x.shape
    # Autocorrelation via inverse FFT of the symmetrized spectrum
    r = numpy.real(numpy.fft.ifft(numpy.hstack((x, x[:, numpy.arange(nbands - 2, 0, -1)]))))
    # Keep the first half only
    r = r[:, :nbands]
    # LPC coefficients via the Levinson-Durbin recursion, frame by frame
    y_lpc = numpy.ones((r.shape[0], model_order + 1))
    for frame in range(r.shape[0]):
        y_lpc[frame, 1:], err, _ = levinson(r[frame, :-1].T, order=model_order, allow_singularity=True)
        # Normalize each polynomial by its gain
        y_lpc[frame, :] /= err
    return y_lpc
def lpc2cep(a, nout):
    """
    Convert the LPC 'a' coefficients in each row into frames of cepstra.

    2003-04-11 dpwe@ee.columbia.edu

    :param a: LPC coefficients, one frame per row; a[:, 0] is the gain term.
        The input array is no longer modified in place.
    :param nout: number of cepstra to produce; defaults to the LPC order + 1
    :return: matrix of cepstral coefficients, one frame per row
    """
    ncol, nin = a.shape
    order = nin - 1
    if nout is None:
        nout = order + 1
    c = numpy.zeros((ncol, nout))
    # First cep is log(Error) from Durbin
    c[:, 0] = -numpy.log(a[:, 0])
    # Renormalize LPC A coefficients on a COPY: the previous code divided the
    # caller's array in place (a /= ...), silently corrupting it
    a = a / numpy.tile(a[:, 0][:, None], (1, nin))
    for n in range(1, nout):
        acc = 0  # renamed from `sum`, which shadowed the builtin
        for m in range(1, n):
            acc += (n - m) * a[:, m] * c[:, n - m]
        c[:, n] = -(a[:, n] + acc / n)
    return c
def lpc2spec(lpcas, nout=17):
    """
    Convert LPC coeffs back into spectra.
    nout is number of freq channels, default 17 (i.e. for 8 kHz)

    :param lpcas: LPC coefficients, one frame per row; column 0 is the gain
    :param nout: number of frequency channels evaluated on the unit circle
    :return: tuple (features, F, M) where features is the power spectrum
        sampled at nout points, F holds per-frame pole angles (rad) and M the
        corresponding magnitudes, both trimmed of all-zero columns
    """
    [cols, rows] = lpcas.shape
    order = rows - 1
    gg = lpcas[:, 0]
    # Normalize each LPC polynomial by its gain term
    aa = lpcas / numpy.tile(gg, (rows, 1)).T
    # Calculate the actual z-plane polyvals: nout points around unit circle
    zz = numpy.exp((-1j * numpy.pi / (nout - 1)) * numpy.outer(numpy.arange(nout).T, numpy.arange(order + 1)))
    # Actual polyvals, in power (mag^2)
    features = (1. / numpy.abs(aa.dot(zz.T))**2) / numpy.tile(gg, (nout, 1)).T
    F = numpy.zeros((cols, rows - 1))
    M = numpy.zeros((cols, rows - 1))
    for c in range(cols):
        aaa = aa[c, :]
        # Poles of the LPC polynomial give the resonance frequencies
        rr = numpy.roots(aaa)
        ff = numpy.angle(rr.T)
        # Re-evaluate the polynomial at the pole angles to get magnitudes
        zz = numpy.exp(1j * numpy.outer(ff, numpy.arange(len(aaa))))
        mags = numpy.sqrt(((1. / numpy.abs(zz.dot(aaa)))**2) / gg[c])
        # Keep only poles with positive angle, sorted by frequency
        ix = numpy.argsort(ff)
        keep = ff[ix] > 0
        ix = ix[keep]
        F[c, numpy.arange(len(ix))] = ff[ix]
        M[c, numpy.arange(len(ix))] = mags[ix]
    # Drop columns that are zero for every frame
    F = F[:, F.sum(axis=0) != 0]
    M = M[:, M.sum(axis=0) != 0]
    return features, F, M
def spec2cep(spec, ncep=13, type=2):
    """
    Calculate cepstra from spectral samples (in columns of spec).
    Return ncep cepstral rows (defaults to 13).
    This one does type II dct, or type I if type is specified as 1.
    dctm returns the DCT matrix that spec was multiplied by to give cep.

    .. warning:: this is an unfinished port of rastamat's spec2cep.m: the
       DCT computation below is still commented out and the function
       currently returns (None, None, None).

    :param spec: spectral samples, one frame per row
    :param ncep: number of cepstral coefficients to return
    :param type: DCT type (2 by default)
    :return: (None, None, None) until the port is completed
    """
    nrow, ncol = spec.shape
    # Make the DCT matrix. Fixed: numpy.zeros(ncep, nrow) passed nrow as a
    # dtype and raised a TypeError; the shape must be a tuple.
    dctm = numpy.zeros((ncep, nrow))
    #if type == 2 || type == 3
    # # this is the orthogonal one, the one you want
    # for i = 1:ncep
    # dctm(i,:) = cos((i-1)*[1:2:(2*nrow-1)]/(2*nrow)*pi) * sqrt(2/nrow);
    # if type == 2
    # # make it unitary! (but not for HTK type 3)
    # dctm(1,:) = dctm(1,:)/sqrt(2);
    #elif type == 4: # type 1 with implicit repeating of first, last bins
    # """
    # Deep in the heart of the rasta/feacalc code, there is the logic
    # that the first and last auditory bands extend beyond the edge of
    # the actual spectra, and they are thus copied from their neighbors.
    # Normally, we just ignore those bands and take the 19 in the middle,
    # but when feacalc calculates mfccs, it actually takes the cepstrum
    # over the spectrum *including* the repeated bins at each end.
    # Here, we simulate 'repeating' the bins and an nrow+2-length
    # spectrum by adding in extra DCT weight to the first and last
    # bins.
    # """
    # for i = 1:ncep
    # dctm(i,:) = cos((i-1)*[1:nrow]/(nrow+1)*pi) * 2;
    # # Add in edge points at ends (includes fixup scale)
    # dctm(i,1) = dctm(i,1) + 1;
    # dctm(i,nrow) = dctm(i,nrow) + ((-1)^(i-1));
    # dctm = dctm / (2*(nrow+1));
    #else % dpwe type 1 - same as old spec2cep that expanded & used fft
    # for i = 1:ncep
    # dctm(i,:) = cos((i-1)*[0:(nrow-1)]/(nrow-1)*pi) * 2 / (2*(nrow-1));
    # dctm(:,[1 nrow]) = dctm(:, [1 nrow])/2;
    #cep = dctm*log(spec);
    return None, None, None
def lifter(x, lift=0.6, invs=False):
    """
    Apply liftering to a matrix of cepstra (one frame per row).

    lift is the exponent of the x i^n liftering or, as a negative integer,
    the length of an HTK-style sin-curve liftering.
    If invs is True, undo the liftering.

    :param x: matrix of cepstra
    :param lift: liftering exponent (> 0) or negative HTK lifter length
    :param invs: if True, undo the liftering
    :return: the liftered cepstra
    """
    nfrm, ncep = x.shape
    if lift == 0:
        return x
    if lift > 0:
        if lift > 10:
            print('Unlikely lift exponent of {} did you mean -ve?'.format(lift))
        # Exponential liftering weights: 1, 1^lift, 2^lift, ...
        liftwts = numpy.hstack((1, numpy.arange(1, ncep) ** lift))
    elif lift < 0:
        # Hack to support HTK liftering
        L = float(-lift)
        if L != numpy.round(L):
            print('HTK liftering value {} must be integer'.format(L))
        liftwts = numpy.hstack((1, 1 + L / 2 * numpy.sin(numpy.arange(1, ncep) * numpy.pi / L)))
    if invs:
        liftwts = 1 / liftwts
    return x.dot(numpy.diag(liftwts))
def plp(input_sig,
        nwin=0.025,
        fs=16000,
        plp_order=13,
        shift=0.01,
        get_spec=False,
        get_mspec=False,
        prefac=0.97,
        rasta=True):
    """
    Compute PLP (optionally RASTA-PLP) features.

    output is matrix of features, row = feature, col = frame

    :param input_sig: input signal
    :param nwin: length of the sliding analysis window in seconds
    :param fs: sampling rate of samples, default is 16000
    :param plp_order: order of the PLP model; 0 means no PLP smoothing
    :param shift: shift between two analyses in seconds
    :param get_spec: if True, also return the power spectrum
    :param get_mspec: if True, also return the post-processed auditory spectrum
    :param prefac: pre-emphasis coefficient
    :param rasta: default is True; if False, just compute PLP
    :return: list [cepstra, log-energy, power spectrum or None,
        auditory spectrum or None]
    """
    plp_order -= 1
    # first compute power spectrum
    powspec, log_energy = power_spectrum(input_sig, fs, nwin, shift, prefac)
    # next group to critical bands
    audio_spectrum = audspec(powspec, fs)[0]
    nbands = audio_spectrum.shape[0]
    if rasta:
        # put in log domain
        nl_aspectrum = numpy.log(audio_spectrum)
        # next do rasta filtering
        ras_nl_aspectrum = rasta_filt(nl_aspectrum)
        # do inverse log
        audio_spectrum = numpy.exp(ras_nl_aspectrum)
    # do final auditory compressions
    post_spectrum = postaud(audio_spectrum, fs / 2.)[0]
    if plp_order > 0:
        # LPC analysis
        lpcas = dolpc(post_spectrum, plp_order)
        # convert lpc to cepstra
        cepstra = lpc2cep(lpcas, plp_order + 1)
        # .. or to spectra
        spectra, F, M = lpc2spec(lpcas, nbands)
    else:
        # No LPC smoothing of spectrum
        # NOTE(review): spec2cep is an unfinished port that returns None
        # values, so this branch looks broken — confirm before relying on
        # plp_order <= 1
        spectra = post_spectrum
        cepstra = spec2cep(spectra)
    cepstra = lifter(cepstra, 0.6)
    lst = list()
    lst.append(cepstra)
    lst.append(log_energy)
    if get_spec:
        lst.append(powspec)
    else:
        lst.append(None)
        del powspec
    if get_mspec:
        lst.append(post_spectrum)
    else:
        lst.append(None)
        del post_spectrum
    return lst
def framing(sig, win_size, win_shift=1, context=(0, 0), pad='zeros'):
    """Split a signal into (possibly overlapping) frames using stride tricks.

    :param sig: input signal, can be mono or multi dimensional
    :param win_size: size of the window in term of samples
    :param win_shift: shift of the sliding window in term of samples
    :param context: tuple of left and right context (in samples) added to
        each frame
    :param pad: padding mode for the context, 'zeros' or 'edge'
    :return: an ndarray of frames
    :raise ValueError: if pad is neither 'zeros' nor 'edge' (the previous
        version silently returned None in that case)
    """
    dsize = sig.dtype.itemsize
    if sig.ndim == 1:
        sig = sig[:, numpy.newaxis]
    # Manage padding: pad only along the time axis
    c = (context, ) + (sig.ndim - 1) * ((0, 0), )
    _win_size = win_size + sum(context)
    shape = (int((sig.shape[0] - win_size) / win_shift) + 1, 1, _win_size, sig.shape[1])
    strides = tuple(map(lambda x: x * dsize, [win_shift * sig.shape[1], 1, sig.shape[1], 1]))
    if pad == 'zeros':
        padded = numpy.lib.pad(sig, c, 'constant', constant_values=(0,))
    elif pad == 'edge':
        padded = numpy.lib.pad(sig, c, 'edge')
    else:
        raise ValueError("pad must be 'zeros' or 'edge', got {}".format(pad))
    return numpy.lib.stride_tricks.as_strided(padded, shape=shape, strides=strides).squeeze()
def dct_basis(nbasis, length):
    """Build a basis of DCT coefficients.

    :param nbasis: number of DCT coefficients to keep
    :param length: length of the matrix to process
    :return: a (nbasis x length) basis of DCT coefficients
    """
    identity = numpy.eye(nbasis, length)
    return scipy.fftpack.idct(identity, norm='ortho')
def levinson(r, order=None, allow_singularity=False):
    r"""Levinson-Durbin recursion.

    Find the coefficients of a length(r)-1 order autoregressive linear process

    :param r: autocorrelation sequence of length N + 1 (first element being
        the zero-lag autocorrelation)
    :param order: requested order of the autoregressive coefficients,
        default is N.
    :param allow_singularity: false by default. Other implementations may be
        True (e.g., octave)
    :return:
        * the `N+1` autoregressive coefficients :math:`A=(1, a_1...a_N)`
        * the prediction errors
        * the `N` reflections coefficients values

    This algorithm solves the set of complex linear simultaneous equations
    using Levinson algorithm.

    .. math::
        \bold{T}_M \left( \begin{array}{c} 1 \\ \bold{a}_M \end{array} \right) =
        \left( \begin{array}{c} \rho_M \\ \bold{0}_M \end{array} \right)

    where :math:`\bold{T}_M` is a Hermitian Toeplitz matrix with elements
    :math:`T_0, T_1, \dots ,T_M`.

    .. note:: Solving this equations by Gaussian elimination would
        require :math:`M^3` operations whereas the levinson algorithm
        requires :math:`M^2+M` additions and :math:`M^2+M` multiplications.

    .. doctest::

        >>> import numpy; from spectrum import LEVINSON
        >>> T = numpy.array([3., -2+0.5j, .7-1j])
        >>> a, e, k = LEVINSON(T)
    """
    #from numpy import isrealobj
    T0 = numpy.real(r[0])
    T = r[1:]
    M = len(T)

    if order is None:
        M = len(T)
    else:
        assert order <= M, 'order must be less than size of the input data'
        M = order

    # Real input keeps the whole recursion in real arithmetic
    realdata = numpy.isrealobj(r)
    if realdata is True:
        A = numpy.zeros(M, dtype=float)
        ref = numpy.zeros(M, dtype=float)
    else:
        A = numpy.zeros(M, dtype=complex)
        ref = numpy.zeros(M, dtype=complex)

    # P is the prediction error power, updated at each order
    P = T0

    for k in range(M):
        # Compute the reflection coefficient for order k+1
        save = T[k]
        if k == 0:
            temp = -save / P
        else:
            #save += sum([A[j]*T[k-j-1] for j in range(0,k)])
            for j in range(0, k):
                save = save + A[j] * T[k-j-1]
            temp = -save / P
        if realdata:
            P = P * (1. - temp**2.)
        else:
            P = P * (1. - (temp.real**2+temp.imag**2))
        # A vanishing/negative error power means the Toeplitz matrix is singular
        if (P <= 0).any() and allow_singularity==False:
            raise ValueError("singular matrix")
        A[k] = temp
        ref[k] = temp # save reflection coeff at each step
        if k == 0:
            continue

        # Update the lower-order coefficients in place (symmetric update,
        # so only the first half of the vector needs an explicit loop)
        khalf = (k+1)//2
        if realdata is True:
            for j in range(0, khalf):
                kj = k-j-1
                save = A[j]
                A[j] = save + temp * A[kj]
                if j != kj:
                    A[kj] += temp*save
        else:
            for j in range(0, khalf):
                kj = k-j-1
                save = A[j]
                A[j] = save + temp * A[kj].conjugate()
                if j != kj:
                    A[kj] = A[kj] + temp * save.conjugate()
    return A, P, ref