#-*- coding: utf-8 -*-
#
# Script for selecting transfer rules relevant to batch files
# from the automatically derived transfer rules. The script reads the
# file(s) given as argument(s) and selects the transfer rules that could
# apply to the text in the batch file(s). To run the script, give the
# following command:
#
# $ python select-rule.py LOGONROOT BATCHFILE1 BATCHFILE2 ...
#
# $ python select-rule.py /home/petterha/logon /dfki/jacy/tsdb/skeletons/tanaka/tc-000/item /dfki/jacy/tsdb/skeletons/tanaka/tc-001/item /dfki/jacy/tsdb/skeletons/tanaka/tc-002/item
#
# To install MeCab, try
#
# sudo apt-get install python-yaml
# sudo apt-get install mecab-ipadic-utf8 python-mecab
#
# NOTE: the code below is written so that its syntax is valid under both
# Python 2 and Python 3.  Files are opened with the plain built-in open(),
# so under Python 3 they are decoded with the locale's default encoding.

import sys
import MeCab
import os
import re
from operator import itemgetter

if len(sys.argv) < 2:
    sys.exit('usage: python select-rule.py LOGONROOT BATCHFILE1 BATCHFILE2 ...')

# Rules whose probability exceeds this value are always accepted.
threshold = 0.1
# Divisor applied to probabilities read from the single phr-tab rule file.
division = 1
logonroot = sys.argv[1]
path = logonroot + '/uio/tm/jaen/auto/'
mecab = MeCab.Tagger('-Ochasen')


def _rel_lemma(rel):
    """Return the second '_'-separated field of *rel*, or '' if there is none."""
    parts = rel.split('_')
    return parts[1] if len(parts) > 1 else ''


# Reading Edict
# edictset holds every line of the file; unfulltrans holds, for entries whose
# second tab-separated field consists of exactly two whitespace-separated
# tokens, the first field paired with each of the two tokens.
edictset = set()
unfulltrans = set()
with open(logonroot + '/uio/tm/jaen/auto/edict.ja-en.txt') as edict:
    for line in edict:
        entry = line.rstrip('\n')
        edictset.add(entry)
        items = entry.split('\t')
        if len(items) < 2:
            # Blank or malformed line: nothing to pair up.
            continue
        ens = items[1].split()
        if len(ens) == 2:
            unfulltrans.add(items[0] + '\t' + ens[0])
            unfulltrans.add(items[0] + '\t' + ens[1])

# Reading Jacy
# jrel:    surface form -> list of lemma names taken from the predicate
# id2rel:  lexical id (column 1) -> predicate (column 6)
# rel2lem: predicate (column 6) -> orthography (column 5)
jrel = {}
id2rel = {}
rel2lem = {}
with open(logonroot + '/dfki/jacy/lex/Jacy.rev') as jacy:
    for line in jacy:
        items = line.split('\t')
        if len(items) < 6:
            # Too few tab-separated columns to hold orthography and predicate.
            continue
        jap = items[4]
        rel = items[5]
        rel2lem[rel] = jap
        id2rel[items[0]] = rel
        reljap = _rel_lemma(rel)
        if reljap:
            for form in jap.split():
                jrel.setdefault(form, []).append(reljap)

## FIXME
## Added 'saseru' to the dictionaries. The _saseru_v_cause_rel is not specified
## in the Jacy lexicon.
jrel.setdefault('させる', []).append("saseru")
rel2lem['"_saseru_v_cause_rel"'] = 'させる'
id2rel['saseru-intrans-end'] = '"_saseru_v_cause_rel"'
id2rel['saseru-trans-obj-end'] = '"_saseru_v_cause_rel"'
## End FIXME

# Reading the ERG lexicon
# erel2lem maps each token containing '_rel' on a KEYREL.PRED line to the
# quoted words of the most recently seen ORTH line, joined by single spaces.
erel2lem = {}
orth = None
with open(logonroot + '/lingo/erg/lexicon.tdl') as erg:
    for line in erg:
        if 'ORTH' in line:
            words = []
            for item in line.split():
                if item[-1] == ',':
                    item = item[:-1]
                if '"' in item:
                    words.append(item.replace('"', ''))
            orth = ' '.join(words)
        # Skip predicates that appear before any ORTH line has been seen.
        if 'KEYREL.PRED' in line and orth is not None:
            for item in line.split():
                if '_rel' in item:
                    if item[-1] == ',':
                        item = item[:-1]
                    erel2lem[item] = orth

# Reading Jacy fullform
# Lines made of exactly three space-separated fields that mention INFL-RULE
# add the inflected form (field 1) under the lemma of the lexical id (field 2,
# lower-cased).  Ids unknown to id2rel are ignored.
with open(path + '/jacy-fullform.txt') as fullform:
    for line in fullform:
        items = line.split(' ')
        if len(items) == 3 and 'INFL-RULE' in line:
            rel = id2rel.get(items[1].lower())
            if rel is None:
                continue
            reljap = _rel_lemma(rel)
            if reljap:
                for form in items[0].split():
                    jrel.setdefault(form, []).append(reljap)

jrelkeys = set(jrel)

# Reading transfer rule files and writing three dictionaries:
# ruledict[ruleid] = rule
# r2l[ruleid] = lemmas
# l2r[lemma] = ruleids
sgids = []
mweids = []
ruleset = set()
r2l = {}
ruledict = {}
l2r = {}
ja2jaen = {}
jaen2rule = {}
jaen2prob = {}


def findrule(mtrfile):
    """Read one transfer-rule file and register every rule it contains.

    mtrfile is a file name relative to the module-level `path`.

    A line containing ';' sets the probability used for the rules that
    follow: when the line has exactly three tab-separated fields the third
    one is parsed as a float (divided by `division` for the single phr-tab
    file), otherwise the probability becomes 0.09.  A rule starts on a line
    containing ':=' whose first space-separated token contains '--' and
    ends on the first line containing '].'.

    Fills the module-level ja2jaen, r2l, ruledict, jaen2prob and l2r
    dictionaries and appends the rule id to mweids or sgids depending on
    the file name.  Raises ValueError if a probability field is not a
    number, and IOError/OSError if the file cannot be opened.
    """
    is_mwe = mtrfile in ('lex-auto-jaen.mwe.mrs-tab.mtr',
                         'lex-auto-jaen.mwe.phr-tab.mtr')
    prob = 0.1
    rule = ''
    inrule = False
    with open(path + mtrfile) as infile:
        for line in infile:
            items = line.split(' ')
            if ';' in line:
                comment = line.rstrip('\n').split('\t')
                if len(comment) == 3:
                    prob = float(comment[2])
                    if mtrfile == 'lex-auto-jaen.single.phr-tab.mtr':
                        prob = prob / division
                else:
                    prob = 0.09
            if ':=' in line and len(items[0].split('--')) > 1:
                inrule = True
                idparts = items[0].split('--')
                jpid = idparts[0]
                jaen = jpid + '--' + idparts[1]
                jlist = []
                for jp in jpid.split('+'):
                    # Lemma = text before the first '_' with digits removed.
                    j = re.sub(r'[0-9]', '', jp.split('_')[0])
                    if j != 'udef':
                        jlist.append(j)
            if inrule:
                rule = rule + line
                if '].' in line:
                    inrule = False
                    ja2jaen.setdefault(jpid, []).append(jaen)
                    r2l[jaen] = jlist
                    ruledict[jaen] = rule
                    jaen2prob.setdefault(jaen, []).append(prob)
                    for lemma in jlist:
                        l2r.setdefault(lemma, []).append(jaen)
                    if is_mwe:
                        mweids.append(jaen)
                    else:
                        sgids.append(jaen)
                    rule = ''


findrule('lex-auto-jaen.mwe.mrs-tab.mtr')
findrule('lex-auto-jaen.mwe.phr-tab.mtr')
# findrule('enamdict.mtr') is intentionally disabled.
findrule('lex-auto-jaen.single.mrs-tab.mtr')
findrule('lex-auto-jaen.single.phr-tab.mtr')

# Reading the batch file(s) and choosing rules that apply to the lemmas
# in each sentence
l2rkeys = set(l2r)
profiles = sys.argv[2:]
for profile in profiles:
    with open(logonroot + profile) as batchfile:
        for line in batchfile:
            items = line.split('@')
            # The text is the seventh '@'-separated field; lines with fewer
            # fields are used as they are.
            text = items[6] if len(items) > 6 else line
            relations = set()
            for word in text.split():
                if word in jrelkeys:
                    relations.update(jrel[word])
                else:
                    relations.add(word)
            node = mecab.parseToNode(text)
            while node:
                relations.add(node.surface)
                # presumably field 7 of the feature string is the base form
                # (IPADIC layout) -- TODO confirm for other dictionaries
                features = node.feature.split(",")
                if len(features) > 6:
                    lemma = features[6]
                    if lemma in jrelkeys:
                        relations.update(jrel[lemma])
                    else:
                        relations.add(lemma)
                node = node.next
            for rel in relations:
                if rel not in l2rkeys:
                    continue
                for ruleid in l2r[rel]:
                    if ruleid in ruleset or '"sei_7' in ruleid:
                        continue
                    # A rule is selected only when every one of its lemmas
                    # was found in this sentence.
                    if all(lemma in relations for lemma in r2l[ruleid]):
                        ruleset.add(ruleid)


def sortfunc(x, y):
    """cmp-style comparator on the second element of two pairs.

    No longer used by this script (sorting is done with key functions);
    kept so the module-level name stays available.
    """
    a, b = x[1], y[1]
    return (a > b) - (a < b)


# Japanese sides of single rules accepted with a probability above
# `threshold`; filled while writing the single rules and consulted while
# writing the mwe rules.
jasgrels = set()


def _select_translations(transprobs, outfilename):
    """Choose which rule ids to keep for one Japanese input.

    transprobs is a non-empty list of (ruleid, probability) pairs that all
    share the same Japanese side, ordered from most to least probable.

    Accepting rules with a probability higher than `threshold`, and
    accepting rules of lower probability i) for single rules, the most
    probable rule with the given Japanese input, and ii) for the most
    probable mwe rule where at least one Japanese predicate is not among
    the input predicates of the single rules (of probability higher than
    `threshold`).  Adds to the module-level jasgrels as a side effect.
    """
    is_single = 'single' in outfilename
    is_mwe = 'mwe' in outfilename
    best_id = transprobs[0][0]
    best_prob = float(transprobs[0][1])
    jaside = best_id.split('--')[0]

    if len(transprobs) == 1:
        translist = []
        if is_single:
            translist = [best_id]
            if best_prob > threshold:
                jasgrels.add(jaside)
        if is_mwe:
            if best_prob < threshold:
                # Blocked unless some predicate (other than those containing
                # '_q' or '_p') is not already covered by a single rule.
                blocked = True
                for jarel in jaside.split('+'):
                    if (jarel not in jasgrels and '_q' not in jarel
                            and '_p' not in jarel):
                        blocked = False
                if not blocked:
                    translist = [best_id]
            else:
                translist = [best_id]
        return translist

    translist = [best_id]
    if is_single and best_prob > threshold:
        jasgrels.add(jaside)
    # English categories (second '_'-separated field of the English side)
    # that already have an accepted rule.
    engcats = set()
    engparts = best_id.split('--')[1].split('_')
    if len(engparts) > 1:
        engcats.add(engparts[1])
    for ruleid, rawprob in transprobs[1:]:
        prob = float(rawprob)
        engparts = ruleid.split('--')[1].split('_')
        engcat = engparts[1] if len(engparts) > 1 else ''
        if prob > threshold or prob == best_prob:
            translist.append(ruleid)
            if is_single and prob > threshold:
                jasgrels.add(jaside)
        elif is_single and engcat not in engcats:
            # Low-probability single rule kept because it is the first one
            # with this English category.
            translist.append(ruleid)
            engcats.add(engcat)
    return translist


def writerules(ids, outfilename):
    """Write the selected rules among *ids* to path + outfilename.

    Only ids present in the module-level ruleset are considered.  They are
    grouped by Japanese side (the part before '--'); for each group
    _select_translations decides which rules are kept.  Groups are written
    by decreasing number of '+'-separated parts of the Japanese side.
    Within a group every rule except the least probable one is written with
    '_mtr'/'-mtr' replaced by '_omtr'/'-omtr'; the least probable one is
    written with '_omtr'/'-omtr' replaced by '_mtr'/'-mtr'.  Each rule is
    preceded by a '; <probability>' line, using the highest probability
    recorded for that rule.

    Returns the number of rules written.
    """
    # Japanese side -> rule ids (insertion order follows set iteration).
    grouped = {}
    for key in set(ids):
        if key in ruleset:
            grouped.setdefault(key.split('--')[0], []).append(key)

    jap2len = {}
    jap2translist = {}
    for jap in grouped:
        jap2len[jap] = len(jap.split('+'))
        trans2prob = {}
        for key in grouped[jap]:
            # Selection uses the first probability recorded for the rule.
            trans2prob[key] = jaen2prob[key][0]
        # Ascending stable sort followed by reverse, so that ties come out
        # in the same order as with the original comparator-based sort.
        transprobs = sorted(trans2prob.items(), key=itemgetter(1))
        transprobs.reverse()
        jap2translist[jap] = _select_translations(transprobs, outfilename)

    japs = [pair[0] for pair in sorted(jap2len.items(), key=itemgetter(1))]
    japs.reverse()

    # Writing the rules
    x = 0
    with open(path + outfilename, 'w') as outfile:
        for jap in japs:
            jaens2prob = {}
            for jaen in jap2translist[jap]:
                # max() instead of an in-place sort: jaen2prob is shared
                # module state and must not be reordered here.
                jaens2prob[jaen] = max(jaen2prob[jaen])
            jaenprobs = sorted(jaens2prob.items(), key=itemgetter(1))

            # Everything but the least probable rule, most probable first.
            while len(jaenprobs) > 1:
                jaen, prob = jaenprobs.pop()
                rule = ruledict[jaen]
                rule = rule.replace('_mtr', '_omtr')
                rule = rule.replace('-mtr', '-omtr')
                outfile.write('; ' + str(prob) + '\n')
                outfile.write(rule + '\n')
                x = x + 1

            if jaenprobs:
                jaen, prob = jaenprobs[0]
                rule = ruledict[jaen]
                ruleitems = rule.split()
                ruletype = ruleitems[2]
                ruleid = ruleitems[0]
                if ruletype in ('n+n_n_mtr', 'n+n_n_omtr'):
                    iditems = ruleid.split('--')
                    jas = iditems[0].split('+')
                    eng = iditems[1]
                    try:
                        # The resulting string is not used for anything yet
                        # (the original compared it against unfulltrans and
                        # did nothing); only a failed lookup has an effect.
                        testrule = ('"' + rel2lem['"_' + jas[0] + '_rel"']
                                    + '" "' + rel2lem['"_' + jas[1] + '_rel"']
                                    + '"\t' + '"'
                                    + erel2lem['"_' + eng + '_rel"'] + '"')
                    except (KeyError, IndexError):
                        # Report rules whose lemmas cannot be looked up.
                        print(rule)
                rule = rule.replace('_omtr', '_mtr')
                rule = rule.replace('-omtr', '-mtr')
                outfile.write('; ' + str(prob) + '\n')
                outfile.write(rule + '\n')
                x = x + 1
    return x


# Writing the rules.  The single rules must be written first: doing so
# fills jasgrels, which the mwe selection reads.
x = writerules(sgids, 'single.selected.mtr')
y = writerules(mweids, 'mwe.selected.mtr')
print('Wrote ' + str(x) + ' rules into \'' + path + 'single.selected.mtr\'')
print('Wrote ' + str(y) + ' rules into \'' + path + 'mwe.selected.mtr\'')