#!/usr/bin/env python #-*- coding: utf-8 -*- ### ### Script for choosing the most probable phrase alignments ### # # phrase tables should be in: # corpus + '-profiles/moses.mrs' # corpus + '-anymalign.mrs' # # and are written to # corpus + '-profiles/mrs-thin import sys, os corpus = sys.argv[1] #pt = open('/home/petterha/jaentools/parmrs/moses.mrs.2012-01-06') try: # This is the Moses phrase table pt = open(corpus + '-profiles/moses.mrs') except: pt = [] ptt = open(corpus + '-profiles/mrs-thin','w') thindict = {} for line in pt: items = line.split(' ||| ') source = items[0] target = items[1] targetlen = len(target.split()) sourcelen = len(source.split()) probs = items[2].split() prob = probs[2] try: prob = float(prob) except: prob == 0 if sourcelen < 6 and targetlen < 6 and prob > 0.01: thindict[source + '\t' + target + '\tmos'] = prob # This is the Anymalign phrase table pt = open(corpus + '-anymalign.mrs') for line in pt: items = line.split('\t') source = items[0] target = items[1] targetlen = len(target.split()) sourcelen = len(source.split()) probs = items[3] pitems = probs.split(' ') try: prob = float(pitems[0]) except: prob = 0 freq = int(items[4][:-1]) if freq > 1 and sourcelen < 6 and targetlen < 6 and prob > 0.01: thindict[source + '\t' + target + '\tany'] = prob def sortfunc(x,y): return cmp(x[1],y[1]) items=thindict.items() items.sort(sortfunc) items.reverse() for item in items: thinitems = item[0].split('\t') source = thinitems[0] target = thinitems[1] table = thinitems[2] prob = item[1] ptt.write(source + '\t' + target + '\t' + str(prob) + '\t' + table + '\n')