Source code for alex.tools.vad.train_vad_gmm

#!/usr/bin/env python
# -*- coding: utf-8 -*-
if __name__ == '__main__':
    import autopath

import numpy as np
import datetime
from multiprocessing import *


from alex.ml.gmm import GMM
from alex.utils.htk import *

n_max_frames = 5000000
n_crossvalid_frames = int((0.20 * n_max_frames ))  # cca 20% of all training data

max_files = 100000
max_frames_per_segment = 50
trim_segments = 0
n_iter = 10
n_mixies = 64 # 32 # 16


[docs]def load_mlf(train_data_sil_aligned, max_files, max_frames_per_segment):
    mlf_sil = MLF(train_data_sil_aligned, max_files=max_files)
    mlf_sil.filter_zero_segments()
    # map all sp, _noise_, _laugh_, _inhale_ to sil
    mlf_sil.sub('sp', 'sil')
    mlf_sil.sub('_noise_', 'sil')
    mlf_sil.sub('_laugh_', 'sil')
    mlf_sil.sub('_inhale_', 'sil')
    # map everything except of sil to speech
    mlf_sil.sub('sil', 'speech', False)
    mlf_sil.merge()
    #mlf_sil.times_to_seconds()
    mlf_sil.times_to_frames()
    mlf_sil.trim_segments(trim_segments)
    mlf_sil.shorten_segments(max_frames_per_segment)

    return mlf_sil


[docs]def mixup(gmm, vta, name):
    i = len(gmm.weights)

    if i >= 256:
        gmm.mixup(12)
    if i >= 128:
        gmm.mixup(10)
    if i >= 64:
        gmm.mixup(8)
    elif i >= 32:
        gmm.mixup(6)
    elif i >= 16:
        gmm.mixup(4)
    elif i >= 8:
        gmm.mixup(2)
    else:
        gmm.mixup(1)

    gmm.fit(vta)
    print "%s weights: %d" % (name, len(gmm.weights))
    print gmm.weights
    print "%s LP: %f" % (name, gmm.log_probs[-1])
    print datetime.datetime.now()
    print "-" * 120


[docs]def train_gmm(name, vta):

    vta = [frame for frame, label in vta if label == name]

    gmm = GMM(n_features=36, n_components=1, n_iter=n_iter)
    gmm.fit(vta)
    while len(gmm.weights) < n_mixies:
        mixup(gmm, vta, name)
    gmm.save_model('model_voip/vad_%s_sds_mfcc.gmm' % name)
    return


if __name__ == '__main__':

    train_data_sil = 'data_vad_sil/data/*.wav'
    train_data_sil_aligned = 'data_vad_sil/vad-silence.mlf'

    train_data_speech = 'data_voip_en/train/*.wav'
    train_data_speech_aligned = 'asr_model_voip_en/aligned_best.mlf'

    mlf_sil = load_mlf(train_data_sil_aligned, max_files, max_frames_per_segment)
    mlf_speech = load_mlf(train_data_speech_aligned, max_files, max_frames_per_segment)

    print datetime.datetime.now()
    # print "The length of sil segments in sil:    ", mlf_sil.count_length('sil')
    # print "The length of speech segments in sil: ", mlf_sil.count_length('speech')
    print "The length of sil segments in speech:    ", mlf_speech.count_length('sil')
    print "The length of speech segments in speech: ", mlf_speech.count_length('speech')

    vta = MLFMFCCOnlineAlignedArray(usec0=False)
    # vta.append_mlf(mlf_sil)
    # vta.append_trn(train_data_sil)
    vta.append_mlf(mlf_speech)
    vta.append_trn(train_data_speech)

    print "Generating the MFCC features"
    train = []
    test = []
    i = 0
    for frame, label in vta:
        if i % (n_max_frames / 10) == 0:
            print "Already processed: %.2f%% of data" % (100.0*i/n_max_frames)

        if i > n_max_frames:
            break


        if i < n_crossvalid_frames:
            test.append((frame, label))
        else:
            train.append((frame, label))

        i += 1

    p_speech = Process(target=train_gmm, args=('speech',train))
    p_sil = Process(target=train_gmm, args=('sil', train))
    p_speech.start()
    p_sil.start()

    p_sil.join()
    print "Sil GMM training finished"
    print datetime.datetime.now()
    p_speech.join()
    print "Speech GMM training finished"
    print datetime.datetime.now()

    #train_speech_gmm()
    #train_sil_gmm()


    print '-' * 120
    print 'VAD GMM test'
    print datetime.datetime.now()
    print '-' * 120
    gmm_speech = GMM(n_features=0)
    gmm_speech.load_model('model_voip/vad_speech_sds_mfcc.gmm')
    gmm_sil = GMM(n_features=0)
    gmm_sil.load_model('model_voip/vad_sil_sds_mfcc.gmm')


    vta = test

    print "Length of test data:", len(vta)
    print datetime.datetime.now()

    accuracy = 0.0
    n = 0
    for frame, label in vta:
        log_prob_speech = gmm_speech.score(frame)
        log_prob_sil = gmm_sil.score(frame)

        ratio = log_prob_speech - log_prob_sil
        if ratio >= 0:
            rec_label = 'speech'
        else:
            rec_label = 'sil'

        if rec_label == label:
            accuracy += 1.0

        n += 1

    accuracy = accuracy * 100.0 / n

    print "VAD accuracy : %0.3f%% " % accuracy
    print datetime.datetime.now()