Source code for alex.utils.mfcc

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np

from scipy.fftpack import dct
from collections import deque


class MFCCKaldi:
    """Placeholder for a Python port of the Kaldi MFCC front end.

    TODO: port the Kaldi MFCC computation to Python, using parameters
    similar to the ones accepted by ``__init__``.

    At the moment the class only records its configuration; ``param`` is an
    empty stub.
    """

    def __init__(self, sourcerate=16000, framesize=512,
                 usehamming=True, preemcoef=0.97,
                 numchans=26, ceplifter=22, numceps=12,
                 enormalise=True, zmeansource=True, usepower=True,
                 usec0=True, usecmn=False,
                 usedelta=True, useacc=True, n_last_frames=0,
                 lofreq=125, hifreq=3800, mel_banks_only=False):
        """Store the configuration; no processing state is initialised.

        Every argument is kept verbatim as an attribute of the same name.
        """
        self.sourcerate = sourcerate
        self.framesize = framesize
        self.usehamming = usehamming
        self.preemcoef = preemcoef
        self.numchans = numchans
        self.ceplifter = ceplifter
        self.enormalise = enormalise
        self.zmeansource = zmeansource
        self.usepower = usepower
        self.usec0 = usec0
        self.usecmn = usecmn
        self.usedelta = usedelta
        self.useacc = useacc
        self.numceps = numceps
        self.lofreq = lofreq
        self.hifreq = hifreq
        self.mel_banks_only = mel_banks_only
        # Previously accepted but silently discarded; kept so the stored
        # configuration mirrors the constructor arguments.
        self.n_last_frames = n_last_frames

    def param(self, frame):
        """Not implemented yet: ignores ``frame`` and returns ``None``.

        Intended to compute MFCC coefficients for one frame once the Kaldi
        port exists.
        """
class MFCCFrontEnd:
    """Close approximation of the MFCC coefficients computed by the HTK.

    The frame size should be a power of 2.

    TODO: CMN is not implemented. It should normalise only the cepstrum,
    not the delta or acc coefficients.

    It was not tested to give exactly the same results as the HTK. As a
    result, it should not be used in conjunction with models trained on
    speech parametrised with the HTK.

    Overall it appears that this implementation of MFCC is worse than the
    one from the HTK. On the VAD task, the HTK features score 90.8% and
    these features score only 88.7%.

    The object is stateful: ``param`` must be called on consecutive frames
    because pre-emphasis, delta and acceleration coefficients depend on
    previously processed frames.
    """

    def __init__(self, sourcerate=16000, framesize=512,
                 usehamming=True, preemcoef=0.97,
                 numchans=26, ceplifter=22, numceps=12,
                 enormalise=True, zmeansource=True, usepower=True,
                 usec0=True, usecmn=False,
                 usedelta=True, useacc=True, n_last_frames=0,
                 lofreq=125, hifreq=3800, mel_banks_only=False):
        """Store the configuration and precompute window, filters, lifter.

        Arguments as used by this class:

        sourcerate      -- sampling rate; sets the width of one FFT bin.
        framesize       -- number of samples per frame.
        usehamming      -- multiply each frame by a Hamming window.
        preemcoef       -- pre-emphasis coefficient.
        numchans        -- number of triangular mel filters.
        ceplifter       -- cepstral liftering parameter.
        numceps         -- number of cepstral coefficients kept (c1..cN).
        zmeansource     -- subtract the frame mean before processing.
        usepower        -- use the power spectrum; otherwise the magnitude.
        usec0           -- append c0 after the liftered cepstra.
        usedelta/useacc -- append delta / acceleration coefficients.
        n_last_frames   -- number of previous base vectors to append.
        lofreq, hifreq  -- frequency range covered by the mel filters.
        mel_banks_only  -- output log mel energies instead of cepstra.

        ``enormalise`` and ``usecmn`` are stored but not used by this
        implementation.
        """
        self.sourcerate = sourcerate
        self.framesize = framesize
        self.usehamming = usehamming
        self.preemcoef = preemcoef
        self.numchans = numchans
        self.ceplifter = ceplifter
        self.enormalise = enormalise
        self.zmeansource = zmeansource
        self.usepower = usepower
        self.usec0 = usec0
        self.usecmn = usecmn
        self.usedelta = usedelta
        self.useacc = useacc
        self.numceps = numceps
        self.lofreq = lofreq
        self.hifreq = hifreq
        self.mel_banks_only = mel_banks_only

        # Last sample of the previously processed frame (for pre-emphasis).
        self.prior = 0.0

        self.n_last_frames = n_last_frames
        # Histories of base feature vectors and of delta vectors.
        self.mfcc_queue = deque(maxlen=4 + n_last_frames)
        self.mfcc_delta_queue = deque(maxlen=4 + n_last_frames)

        self.init_hamming()
        self.init_mel_filter_bank()
        self.init_cep_liftering_weights()

    def freq_to_mel(self, freq):
        """Convert a frequency to mel: ``1127 * ln(1 + freq / 700)``."""
        return 1127 * np.log(1.0 + freq / 700.0)

    def mel_to_freq(self, mel):
        """Inverse of ``freq_to_mel``: ``700 * (exp(mel / 1127) - 1)``."""
        return 700 * (np.exp(mel / 1127) - 1.0)

    def init_hamming(self):
        """Precompute the Hamming window of ``framesize`` samples."""
        self.hamming = np.hamming(self.framesize)

    def init_mel_filter_bank(self):
        """Initialise the triangular mel frequency filters.

        Sets ``self.mel_filter_bank`` to an array of shape
        ``(framesize // 2 + 1, numchans)``: one column per filter, one row
        per FFT bin.
        """
        min_mel = self.freq_to_mel(self.lofreq)
        max_mel = self.freq_to_mel(self.hifreq)

        # One row per filter while building. Floor division keeps the bin
        # count an integer (true division would yield a float shape).
        num_bins = self.framesize // 2 + 1
        filter_matrix = np.zeros((self.numchans, num_bins))

        # numchans + 2 points equally spaced on the mel scale; filter i
        # spans points i .. i + 2 and peaks at point i + 1.
        mel_range = np.arange(self.numchans + 2)
        mel_points = (mel_range * (max_mel - min_mel)
                      / (self.numchans + 1) + min_mel)

        # Width of one FFT bin. Explicit true division: integer division
        # would truncate it (16000 / 512 -> 31 instead of 31.25) and shift
        # the filter centres.
        dfreq = float(self.sourcerate) / self.framesize

        # FFT bin index of each mel point.
        bin_index = np.array(
            np.round(self.mel_to_freq(mel_points) / dfreq), int)

        for i in range(self.numchans):
            start, centre, end = bin_index[i:i + 3]
            rise_width = float(centre - start)
            fall_width = float(end - centre)
            # Rising edge: 0 at `start`, approaching 1 towards `centre`.
            filter_matrix[i][start:centre] = (
                (np.arange(start, centre) - start) / rise_width)
            # Falling edge: 1 at `centre`, approaching 0 towards `end`.
            filter_matrix[i][centre:end] = (
                (end - np.arange(centre, end)) / fall_width)

        self.mel_filter_bank = filter_matrix.transpose()

    def init_cep_liftering_weights(self):
        """Precompute the ``numceps`` cepstral liftering weights.

        Weight ``i`` is ``1 + (ceplifter / 2) * sin(i * pi / ceplifter)``
        for ``i = 0 .. numceps - 1``.
        """
        # NOTE(review): weight i (starting at 0) multiplies cepstral
        # coefficient i + 1 in param(). HTK is believed to index its lifter
        # from 1 - confirm whether this offset is intentional before
        # changing it, because doing so alters the produced features.
        a = np.pi / self.ceplifter
        b = self.ceplifter / 2.0
        self.cep_lift_weights = 1.0 + b * np.sin(np.arange(self.numceps) * a)

    def preemphasis(self, frame):
        """Apply the first-order pre-emphasis filter to one frame.

        Computes ``out[i] = frame[i] - preemcoef * frame[i - 1]``, using the
        last sample of the previously processed frame (``self.prior``) as
        the predecessor of ``frame[0]``, and then updates ``self.prior``.

        The computation is done in float64 so that integer input is not
        truncated back to integers. An empty frame yields an empty array
        and leaves ``self.prior`` untouched.
        """
        samples = np.asarray(frame, dtype=np.float64)
        if samples.size == 0:
            return samples.copy()

        out_frame = np.empty_like(samples)
        out_frame[0] = samples[0] - self.preemcoef * self.prior
        out_frame[1:] = samples[1:] - self.preemcoef * samples[:-1]

        self.prior = samples[-1]
        return out_frame

    @staticmethod
    def _mean_step(history, like):
        """Return the mean difference between consecutive ``history`` items.

        ``history`` must hold at least two vectors; ``like`` provides the
        shape and dtype of the accumulator.
        """
        total = np.zeros_like(like)
        for i in range(1, len(history)):
            total += history[i] - history[i - 1]
        total /= len(history) - 1
        return total

    def param(self, frame):
        """Compute the MFCC coefficients in a way similar to the HTK.

        ``frame`` is one frame of ``framesize`` samples. The call updates
        the internal pre-emphasis state and the feature histories.

        Returns a 1-D float32 array made of, in order:

        * the base features: the ``numceps`` liftered cepstra followed by
          c0 when ``usec0`` is set, or the ``numchans`` log mel energies
          when ``mel_banks_only`` is set;
        * the delta coefficients when ``usedelta`` is set;
        * the acceleration coefficients when ``useacc`` is set;
        * the base features of the ``n_last_frames`` previous frames, most
          recent first, zero-filled while the history is too short.
        """
        # zero mean
        if self.zmeansource:
            frame = frame - np.mean(frame)

        frame = self.preemphasis(frame)

        # apply hamming window
        if self.usehamming:
            frame = self.hamming * frame

        complex_spectrum = np.fft.rfft(frame)
        power_spectrum = (complex_spectrum.real * complex_spectrum.real
                          + complex_spectrum.imag * complex_spectrum.imag)

        # fall back to the magnitude spectrum when power is not requested
        if not self.usepower:
            power_spectrum = np.sqrt(power_spectrum)

        mel_spectrum = np.dot(power_spectrum, self.mel_filter_bank)

        # apply mel floor so that the logarithm is never negative
        mel_spectrum = np.log(np.maximum(mel_spectrum, 1.0))

        if self.mel_banks_only:
            mfcc = mel_spectrum
        else:
            cepstrum = dct(mel_spectrum, type=2, norm='ortho')
            c0 = cepstrum[0]
            htk_cepstrum = cepstrum[1:self.numceps + 1]

            # cepstral liftering
            mfcc = self.cep_lift_weights * htk_cepstrum
            if self.usec0:
                mfcc = np.append(mfcc, c0)

        # Remember the base features for deltas and frame stacking.
        self.mfcc_queue.append(mfcc)

        # compute delta and acceleration coefficients if requested
        if self.usedelta:
            if len(self.mfcc_queue) >= 2:
                delta = self._mean_step(self.mfcc_queue, mfcc)
                self.mfcc_delta_queue.append(delta)
            else:
                # Not enough history yet; this zero vector is deliberately
                # not added to the delta history.
                delta = np.zeros_like(mfcc)

        if self.useacc:
            if len(self.mfcc_delta_queue) >= 2:
                acc = self._mean_step(self.mfcc_delta_queue, mfcc)
            else:
                acc = np.zeros_like(mfcc)

        if self.usedelta:
            mfcc = np.append(mfcc, delta)

        if self.useacc:
            mfcc = np.append(mfcc, acc)

        # Stack the base features of the previous frames.
        for i in range(self.n_last_frames):
            if len(self.mfcc_queue) > i + 1:
                mfcc = np.append(mfcc, self.mfcc_queue[-1 - i - 1])
            else:
                mfcc = np.append(mfcc, np.zeros_like(self.mfcc_queue[-1]))

        return mfcc.astype(np.float32)