# Source code for alex.components.nlg.tectotpl.block.t2a.cs.generatewordforms

#!/usr/bin/env python
# coding=utf-8
#
# A Treex block
#
from __future__ import unicode_literals

from alex.components.nlg.tectotpl.core.block import Block
from alex.components.nlg.tectotpl.core.exception import LoadingException
from alex.components.nlg.tectotpl.tool.ml.model import Model
from alex.components.nlg.tectotpl.core.util import first
import re
import os.path


__author__ = "Ondřej Dušek"
__date__ = "2012"


class GenerateWordForms(Block):
    """
    Inflect word forms according to filled-in tags.

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
        model: name of the model file, relative to the scenario data directory
    """

    # A "back" change: '>' + number of characters to chop off the end of the
    # form + the string to append in their place.
    BACK_REGEX = re.compile(r'^>([0-9]+)(.*)$')

    # Values of morphcat_pos for which the form is simply copied from the lemma.
    NON_INFLECTED_POS = ('Z', 'J', 'R', '!')

    def __init__(self, scenario, args):
        """\
        Constructor, just checking the argument values.

        Raises LoadingException if self.language is None after the base-class
        constructor has run; a missing 'model' key in args raises KeyError.
        """
        Block.__init__(self, scenario, args)
        if self.language is None:
            raise LoadingException('Language must be defined!')
        # The model itself is only loaded later, in load().
        self.model = None
        self.model_file = args['model']

    def load(self):
        """\
        Load the model from the model file located in the scenario's
        data directory.
        """
        model_path = os.path.join(self.scenario.data_dir, self.model_file)
        self.model = Model.load_from_file(model_path)

    def process_atree(self, aroot):
        """\
        Inflect word forms in the given a-tree.

        Nodes whose morphcat_pos is listed in NON_INFLECTED_POS get their
        lemma as the form; all the other nodes are classified by the model
        in one batch and inflected according to the predicted patterns.
        """
        to_process = []
        for anode in aroot.get_descendants(ordered=True):
            if anode.morphcat_pos in self.NON_INFLECTED_POS:
                # set hard form = lemma for non-inflected words
                anode.form = anode.lemma
            else:
                to_process.append(anode)
        # inflect the rest
        instances = [self.__get_features(anode) for anode in to_process]
        inflections = self.model.classify(instances)
        for anode, inflection in zip(to_process, inflections):
            self.__inflect(anode, inflection)

    def __get_features(self, anode):
        """\
        Retrieve all the features needed for morphological inflection
        and return them as a dictionary.
        """
        # lemma and morphological information
        feats = {
            'Lemma': anode.lemma,
            'Tag_POS': anode.morphcat_pos,
            'Tag_SubPOS': anode.morphcat_subpos,
            'Tag_Gen': anode.morphcat_gender,
            'Tag_Num': anode.morphcat_number,
            'Tag_Cas': anode.morphcat_case,
            'Tag_PGe': anode.morphcat_possgender,
            'Tag_PNu': anode.morphcat_possnumber,
            'Tag_Per': anode.morphcat_person,
            'Tag_Ten': anode.morphcat_tense,
            'Tag_Gra': anode.morphcat_grade,
            'Tag_Neg': anode.morphcat_negation,
            'Tag_Voi': anode.morphcat_voice,
        }
        # concatenated features; '?' stands in for an unset (falsy) category
        cas = anode.morphcat_case or '?'
        num = anode.morphcat_number or '?'
        gen = anode.morphcat_gender or '?'
        feats['Tag_Cas-Num-Gen'] = cas + num + gen
        feats['Tag_Num-Gen'] = num + gen
        feats['Tag_Cas-Gen'] = cas + gen
        feats['Tag_Cas-Num'] = cas + num
        # lemma suffixes of length 1 - 8 (inclusive); a slice longer than the
        # lemma just yields the whole lemma.  range() instead of xrange()
        # keeps this working on both Python 2 and Python 3.
        for suff_len in range(1, 9):
            feats['LemmaSuff_' + str(suff_len)] = anode.lemma[-suff_len:]
        return feats

    def __inflect(self, anode, inflection):
        """\
        Set the anode's form according to the given inflection pattern.

        The pattern is either empty (form = lemma), '*' followed by a complete
        irregular form, or a comma-separated list of changes:
            '<xyz'   -- front change: prepend 'xyz'
            '>Nxyz'  -- back change: chop off N characters, append 'xyz'
            'a-b'    -- mid change: replace the last 'a' found before the
                        final character with 'b' (if 'a' is empty, 'b' is
                        inserted before the final character)

        Supports front, back and mid changes (front changes currently
        unsupported by the model, there must be a different model to do them).
        """
        # start from lemma
        form = anode.lemma
        if inflection.startswith('*'):
            # replace irregular
            form = inflection[1:]
        elif inflection != '':
            # find out the front, mid, back changes
            # (first() is assumed to return None when nothing matches --
            # the code below relies on that)
            diffs = inflection.split(",")
            front = first(lambda x: x.startswith('<'), diffs)
            back = first(lambda x: x.startswith('>'), diffs)
            mid = first(lambda x: '-' in x, diffs)
            add_back = ''
            # chop off the things from the back
            if back is not None:
                chop, add_back = self.BACK_REGEX.match(back).groups()
                chop = int(chop)
                # form[0:-0] would empty the form, hence the explicit check
                if chop != 0:
                    form = form[0:-chop]
            # change mid vowel
            if mid is not None:
                orig, changed = mid.split('-')
                if orig:
                    # last occurrence, not reaching into the final character
                    pos = form.lower().rfind(orig, 0, -1)
                else:
                    # pure insertion before the final character; clamped so
                    # that an empty form simply receives the inserted string
                    pos = max(len(form) - 1, 0)
                # rfind() returns -1 when `orig` is absent: leave the form
                # untouched then, instead of splicing at a bogus position
                if pos >= 0:
                    form = form[:pos] + changed + form[pos + len(orig):]
            # add things to beginning and end
            if front is not None:
                form = front[1:] + form
            form = form + add_back
        # set the resulting form to the anode
        anode.form = form