Source code for alex.components.hub.asr

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# This code is PEP8-compliant. See http://www.python.org/dev/peps/pep-0008.

from __future__ import unicode_literals

if __name__ == "__main__":
    import autopath

from collections import deque
import multiprocessing
import time

from alex.components.asr.common import asr_factory
from alex.components.asr.exceptions import ASRException
from alex.components.asr.utterance import UtteranceNBList, UtteranceConfusionNetwork
from alex.components.hub.messages import Command, Frame, ASRHyp
from alex.utils.procname import set_proc_name


[docs]class ASR(multiprocessing.Process):

    """
    ASR recognizes input audio and returns an N-best list hypothesis or
    a confusion network.

    Recognition starts with the "speech_start()" command in the input audio
    stream and ends with the "speech_end()" command.

    When the "speech_end()" command is received, the component asks responsible
    ASR module to return hypotheses and sends them to the output.

    This component is a wrapper around multiple recognition engines which
    handles inter-process communication.

    Attributes:
        asr -- the ASR object itself

    """

    def __init__(self, cfg, commands, audio_in, asr_hypotheses_out, close_event):
        """
        Initialises an ASR object according to the configuration (cfg['ASR']
        is the relevant section), and stores pipe ends to other processes.

        Arguments:
            cfg: a Config object specifying the configuration to use
            commands: our end of a pipe (multiprocessing.Pipe) for receiving
                commands
            audio_in: our end of a pipe (multiprocessing.Pipe) for receiving
                audio frames (from VAD)
            asr_hypotheses_out: our end of a pipe (multiprocessing.Pipe) for
                sending ASR hypotheses

        """

        multiprocessing.Process.__init__(self)

        self.cfg = cfg
        self.commands = commands
        self.local_commands = deque()
        self.audio_in = audio_in
        self.local_audio_in = deque()
        self.asr_hypotheses_out = asr_hypotheses_out
        self.close_event = close_event

        # Load the ASR
        self.asr = asr_factory(cfg)

        self.system_logger = self.cfg['Logging']['system_logger']
        self.session_logger = self.cfg['Logging']['session_logger']

        self.recognition_on = False

[docs]    def recv_input_locally(self):
        """ Copy all input from input connections into local queue objects.

        This will prevent blocking the senders.
        """

        while self.commands.poll():
            command = self.commands.recv()
            self.local_commands.append(command)

        while self.audio_in.poll():
            frame = self.audio_in.recv()
            self.local_audio_in.append(frame)

[docs]    def process_pending_commands(self):
        """Process all pending commands.

        Available commands:
          stop() - stop processing and exit the process
          flush() - flush input buffers.
            Now it only flushes the input connection.

        Returns True iff the process should terminate.
        """

        while self.local_commands:
            command = self.local_commands.popleft()
            if self.cfg['ASR']['debug']:
                self.system_logger.debug(command)

            if isinstance(command, Command):
                if command.parsed['__name__'] == 'stop':
                    return True

                if command.parsed['__name__'] == 'flush':
                    # Discard all data in input buffers.
                    while self.audio_in.poll():
                        self.audio_in.recv()

                    self.local_audio_in.clear()
                    self.asr.flush()
                    self.recognition_on = False

                    self.commands.send(Command("flushed()", 'ASR', 'HUB'))

                    return False

        return False

[docs]    def read_audio_write_asr_hypotheses(self):
        # Read input audio.
        if self.local_audio_in:
            if len(self.local_audio_in) > 40:
                print "ASR unprocessed frames:", len(self.local_audio_in)

            if len(self.local_audio_in) > 200:
                print "ASR too many unprocessed frames:", len(self.local_audio_in)
                print "    skipping everything until the end of the segment:", len(self.local_audio_in)
                while len(self.local_audio_in) > 2 and isinstance(self.local_audio_in[0], Frame):
                    skip = self.local_audio_in.popleft()

            # read recorded audio
            data_rec = self.local_audio_in.popleft()

            if isinstance(data_rec, Frame):
                if self.recognition_on:
                    self.asr.rec_in(data_rec)
            elif isinstance(data_rec, Command):
                dr_speech_start = False
                fname = None

                if data_rec.parsed['__name__'] == "speech_start":
                    # check whether there are more then one speech segments
                    segments = [ cmd for cmd in self.local_audio_in
                                 if isinstance(cmd, Command) and cmd.parsed['__name__'] == "speech_start"]
                    if len(segments):
                        # there are multiple unprocessed segments in the queue
                        # remove all unprocessed segments except the last
                        print "ASR too many unprocessed speech segments:", len(segments)
                        print "    removed all segments but the last"
                        removed_segments = 0
                        while removed_segments < len(segments):
                            data_rec = self.local_audio_in.popleft()
                            if isinstance(data_rec, Command) and data_rec.parsed['__name__'] == "speech_start":
                                removed_segments += 1

                    dr_speech_start = "speech_start"
                    fname = data_rec.parsed['fname']
                elif data_rec.parsed['__name__'] == "speech_end":
                    dr_speech_start = "speech_end"
                    fname = data_rec.parsed['fname']

                # Check consistency of the input command.
                if dr_speech_start:
                    if ((not self.recognition_on and dr_speech_start != "speech_start")
                        or
                        (self.recognition_on and dr_speech_start != "speech_end")):
                        msg = ('Commands received by the ASR component are '
                               'inconsistent (recognition_on: {rec}; the new '
                               'command: {cmd}').format(
                                   rec=self.recognition_on,
                                   cmd=dr_speech_start)
                        self.system_logger.exception(msg)

                if dr_speech_start == "speech_start":
                    self.commands.send(Command('asr_start(fname="%s")' % fname, 'ASR', 'HUB'))
                    self.recognition_on = True

                    if self.cfg['ASR']['debug']:
                        self.system_logger.debug('ASR: speech_start(fname="%s")' % fname)

                elif dr_speech_start == "speech_end":
                    self.recognition_on = False

                    if self.cfg['ASR']['debug']:
                        self.system_logger.debug('ASR: speech_end(fname="%s")' % fname)

                    try:
                        asr_hyp = self.asr.hyp_out()

                        if self.cfg['ASR']['debug']:
                            msg = list()
                            msg.append("ASR Hypothesis")
                            msg.append("-" * 60)
                            msg.append(unicode(asr_hyp))
                            msg.append(u"")
                            msg = u'\n'.join(msg)
                            self.system_logger.debug(msg)

                    except (ASRException, JuliusASRTimeoutException):
                        self.system_logger.debug("Julius ASR Result Timeout.")
                        if self.cfg['ASR']['debug']:
                            msg = list()
                            msg.append("ASR Alternative hypothesis")
                            msg.append("-" * 60)
                            msg.append("sil")
                            msg.append("")
                            msg = u'\n'.join(msg)
                            self.system_logger.debug(msg)

                        asr_hyp = UtteranceConfusionNetwork()
                        asr_hyp.add([[1.0, "_other_"], ])

                    # The ASR component can return either NBList or a confusion
                    # network.
                    if isinstance(asr_hyp, UtteranceNBList):
                        self.session_logger.asr("user", fname, asr_hyp, None)
                    elif isinstance(asr_hyp, UtteranceConfusionNetwork):
                        self.session_logger.asr("user", fname, asr_hyp.get_utterance_nblist(), asr_hyp)
                    else:
                        self.session_logger.asr("user", fname, [(-1, asr_hyp)], None)

                    self.commands.send(Command('asr_end(fname="%s")' % fname, 'ASR', 'HUB'))
                    self.asr_hypotheses_out.send(ASRHyp(asr_hyp, fname=fname))
            else:
                raise ASRException('Unsupported input.')

[docs]    def run(self):
        try:
            set_proc_name("Alex_ASR")
            self.cfg['Logging']['session_logger'].cancel_join_thread()

            while 1:
                # Check the close event.
                if self.close_event.is_set():
                    print 'Received close event in: %s' % multiprocessing.current_process().name
                    return

                time.sleep(self.cfg['Hub']['main_loop_sleep_time'])

                s = (time.time(), time.clock())

                self.recv_input_locally()

                # Process all pending commands.
                if self.process_pending_commands():
                    return

                # Process audio data.
                for i in range(self.cfg['ASR']['n_rawa']):
                    self.read_audio_write_asr_hypotheses()

                d = (time.time() - s[0], time.clock() - s[1])
                if d[0] > 0.200:
                    print "EXEC Time inner loop: ASR t = {t:0.4f} c = {c:0.4f}\n".format(t=d[0], c=d[1])

        except KeyboardInterrupt:
            print 'KeyboardInterrupt exception in: %s' % multiprocessing.current_process().name
            self.close_event.set()
            return
        except:
            self.cfg['Logging']['system_logger'].exception('Uncaught exception in ASR process.')
            self.close_event.set()
            raise

        print 'Exiting: %s. Setting close event' % multiprocessing.current_process().name
        self.close_event.set()