#!/usr/bin/env python
# -*- coding: utf-8 -*-
###
### Script for extracting transfer rules from mrs phrase tables
###
### Command-line arguments: LOGONROOT CORPUS TRANSDIR
### TRANSDIR is concatenated directly with file names, so it must end
### with a path separator.
###

import sys

logonroot = sys.argv[1]
corpus = sys.argv[2]
transdir = sys.argv[3]

# Prefix added to source language predicates in order to avoid loops
# in the transfer grammar. Change the value to '' if the source
# language is not Japanese
src_prefix = 'ja:'

# All template types
types = set(['adjective_', 'noun_', 'adj_v_', 'arg1_v_', 'arg12_v_',
             'arg123_v_', 'n+n_adj+n_', 'n+n_n+n_', 'n_named_', 'pp-adj_',
             'pp_pp_', 'arg12+np_arg12+np_', 'n+adj-adj-',
             'p+n+arg12_arg12_', 'pp+arg12_arg12_', 'pp+np_np+pp_',
             'arg1+pp_arg1+pp_', 'n_n+n_', 'n_adj+n_', 'n+n_n_',
             'preposition_', 'adj_pp_'])

# Thresholds handed unchanged to jaen.templates.templates()
threshold = 0.1
mwe_thresh = 0.01


def _read_lexicon(path):
    """Return a dict mapping column 2 (relation) to column 1 (lemma).

    The file at *path* is read as tab-separated text. Lines are split
    as-is, so a relation that is the last column keeps its trailing
    newline. A line without a tab raises IndexError.
    """
    rel2lem = {}
    with open(path) as lexfile:
        for line in lexfile:
            columns = line.split('\t')
            rel2lem[columns[1]] = columns[0]
    return rel2lem


# Reading the source language lexicon
srcrel2lem = _read_lexicon(transdir + 'source-lex.tab')

# Reading the target language lexicon
tgtrel2lem = _read_lexicon(transdir + 'target-lex.tab')

# Reading the existing transfer rules
transferset = set([])
enset = set([])
alltrans = set([])
allsource = set([])
with open(transdir + 'hand-rules') as transfer:
    for line in transfer:
        # Strip only the line terminator; slicing off the last character
        # would eat real data on a final line without a newline.
        line = line.rstrip('\n')
        alltrans.add(line)
        items = line.split('\t')
        source = items[0].replace(src_prefix, '')
        target = items[1]
        transferset.add(source.replace('&', ' '))
        enset.add(target)

# State shared between readfile() and the output loop below.
# rulecheck, ruledict and transdict are filled by the templates function,
# which is given in 'jaen/templates.py'.
rulecheck = set([])
transdict = {}
ruledict = {}
freqs = {}
trans2freq = {}
trans2prob = {}


def readfile(phrtab):
    """Loop through the alignment file and check alignments against templates.

    Reads ``transdir + corpus + '-profiles/mrs-thin'`` as tab-separated
    lines of at least four columns: source string, target string,
    probability and phrase-table tag. For every line the translation
    ``source >> target`` (with '@...' suffixes and 'nmd_' removed) is
    recorded in ``trans2freq`` and ``trans2prob``. Lines whose tag equals
    *phrtab* are passed on to ``jaen.templates.templates``.

    phrtab -- phrase-table tag to select. When it is 'any', every source
              string is additionally added to ``transferset``.

    Raises IndexError on lines with fewer than four columns and
    ValueError when the third column is not a number.
    """
    # Imported once per call instead of once per matching line.
    from jaen.templates import templates

    freq = 1
    opt = ''
    with open(transdir + corpus + '-profiles/mrs-thin') as infile:
        for line in infile:
            # Assumes the tag is the last column, so stripping the line
            # terminator here leaves it clean for the comparison below.
            items = line.rstrip('\n').split('\t')
            sourcestr = items[0]
            targetstr = items[1]
            prob = float(items[2])

            # Drop the '@...' suffix from every target token.
            newtargetstr = ' '.join(token.split('@')[0]
                                    for token in targetstr.split())
            # NOTE(review): the source is cut at its first '@' as a whole
            # string, not per token -- confirm this is intended for
            # multi-word sources.
            newsourcestr = sourcestr.split('@')[0]
            trans = (newsourcestr + ' >> ' + newtargetstr).replace('nmd_', '')

            if phrtab == 'any':
                transferset.add(sourcestr)
            trans2freq[trans] = freq
            trans2prob.setdefault(trans, []).append(prob)

            # Quote quantifier predicates that are not already quoted.
            source = []
            for pred in sourcestr.split():
                if '_q_' in pred and not pred.endswith('"'):
                    pred = '"' + pred + '"'
                source.append(pred)
            target = targetstr.split()

            if phrtab == items[3]:
                templates(source, target, prob, threshold, opt, trans,
                          phrtab, src_prefix, alltrans, rulecheck, ruledict,
                          transdict, mwe_thresh)


def _report(count, path):
    """Print how many rules were written to *path*."""
    plural = 's'
    if count == 1:
        plural = ''
    print('Wrote ' + str(count) + ' rule' + plural + ' in \'' + path + '\'')


readfile('any')
readfile('mos')

# Order the rules by descending input length (second element of each
# ruledict value). A stable ascending sort followed by reverse() puts
# equal-length rules in reverse dict order, which reverse=True would not.
items = sorted(ruledict.items(), key=lambda entry: entry[1][1])
items.reverse()
rulekeys = [entry[0] for entry in items]

# Number of rules sharing the same source side
source_count = {}
for trans in ruledict:
    source = trans.split(' := ')[0].split('--')[0]
    source_count[source] = source_count.get(source, 0) + 1

transset = set([])
evaldict = {}
newtrans2prob = {}

# Printing the transfer rules
single_path = transdir + corpus + '.single.mtr'
mwe_path = transdir + corpus + '.mwe.mtr'
x = 0
y = 0
with open(single_path, 'w') as a:
    with open(mwe_path, 'w') as b:
        for key in rulekeys:
            rulevalue = ruledict[key]
            rule = rulevalue[0]
            inlen = rulevalue[1]

            # Every rule but the last one for a given source side is
            # made optional.
            sourcerule = rule.split('--')[0]
            if source_count[sourcerule] > 1:
                source_count[sourcerule] -= 1
                rule = rule.replace('_mtr', '_omtr')

            # Single-predicate rules and multi-word rules go to separate
            # files; rules with inlen < 1 are not written at all.
            out = None
            if inlen == 1:
                out = a
            elif inlen > 1:
                out = b

            for translation in set(transdict[key]):
                # Look everything up before writing so an unknown
                # translation never leaves a half-written comment line.
                try:
                    freq = trans2freq[translation]
                    best_prob = max(trans2prob[translation])
                except KeyError:
                    # Translation was never seen by readfile(); report it
                    # and carry on. Other errors (e.g. failed writes) are
                    # allowed to propagate.
                    print(translation)
                    continue
                if out is not None:
                    out.write('; ' + translation + '\t' + str(freq) +
                              '\t' + str(best_prob) + '\n')
                newtrans2prob[translation] = best_prob
                transset.add(translation)
                evaldict.setdefault(translation, []).append(key)

            if out is not None:
                out.write(rule)
                if out is a:
                    x = x + 1
                else:
                    y = y + 1

_report(x, single_path)
_report(y, mwe_path)