# # This is a procedure for extracting transfer rules from parallel mrs # corpora # # The program requires that the LOGON system is installed: # http://moin.delph-in.net/LogonInstallation # # You need a parallel corpus in two files. You can execute this script # with the following command, where SOURCE is the file with the source # language items, TARGET is the file with the target language items, # CORPUS is the name of your corpus, and TRANSDIR refers to a # subdirectory with language specific data: # # bash rule-extr.bash SOURCE TARGET CORPUS TRANSDIR # # bash extr-rule.bash corpora/mini.ja corpora/mini.en mini jaen # infile=$1 outfile=$2 corpus=$3 transdir=$4/ # Part-of-speech tag the Japanese corpus with MeCab: # # To install MeCab, do: # # sudo apt-get install python-yaml # sudo apt-get install mecab-ipadic-utf8 python-mecab # # Comment the next three lines out if the source language is not # Japanese echo "Part-of-speech tagging the Japanese corpus" python ${transdir}ja2yy.py $infile > $infile.pos infile=$infile.pos # Divide the corpus into profiles of 1500 items each echo "Dividing the corpus into profiles in '"$transdir$corpus"-profiles/'" v=$(python divide-corpus.py 1500 $corpus $infile $outfile $transdir) # Batch parse the Japanese corpus with Jacy echo "Batch parsing the Japanese corpus" i="0" while [ $i -lt $v ] do i=$[$i+1] mkdir -p $transdir$corpus-profiles/$corpus${i}/source/ cheap -comment-passthrough -mrs -nsolutions=1 -results=1 -packing=15 -timeout=10 -yy -default-les -tsdbdum=$transdir$corpus-profiles/$corpus${i}/source -inputfile=$transdir$corpus-profiles/$corpus${i}/bitext/original ~/logon/dfki/jacy/japanese &> $transdir$corpus-profiles/$corpus${i}/source/log done # Batch parse the English corpus with the ERG echo "Batch parsing the English corpus" i="0" while [ $i -lt $v ] do i=$[$i+1] mkdir -p $transdir$corpus-profiles/$corpus${i}/target/ cheap -repp -tagger -default-les=all -cm -packing -mrs -nsolutions=1 -results=1 -packing=15 -timeout=10 -inputfile=$transdir$corpus-profiles/$corpus${i}/bitext/object -tsdbdump $transdir$corpus-profiles/$corpus${i}/target ~/logon/lingo/erg/english.grm &> $transdir$corpus-profiles/$corpus${i}/target/log done # Create a parallel corpus of MRSs: # * indicating valency of verbs with suffixes to the relations # * marking nominalized verb relations with an 'nmz_' prefix # * marking proper name predicates with an 'nmd_' prefix echo "Creating a parallel corpus of MRSs" python profiles2mrsparcorp.py $transdir $corpus $v # Use the Anymalign phrase aligner to produce a phrase table from the parallel # corpus of MRSs. The program runs until it is stopped with Ctrl-C # Can be downloaded from: # wget http://perso.limsi.fr/Individu/alardill/anymalign/latest/anymalign2.5.zip # unzip anymalign2.5.zip echo "Running Anymalign on the parallel MRS corpus" python anymalign2.5/anymalign.py $transdir${corpus}_mrs_source.txt $transdir${corpus}_mrs_target.txt > $transdir$corpus-anymalign.mrs # Choosing the most probable phrase alignments echo "Choosing the most probable phrase alignments" python phrtab-thin.py $transdir$corpus # Reading the existing transfer rule files echo "Finding exisiting transfer rules" python hand-rules.py $LOGONROOT $transdir > $transdir/hand-rules # Representing the lexicons of the parsing grammar and generating # grammar as tables python lex.py ${LOGONROOT}/lingo/erg/lexicon.tdl > $transdir/target-lex.tab python lex.py ${LOGONROOT}/dfki/jacy/lexicon.tdl > $transdir/source-lex.tab # Reading the processed phrase table and matching with templates. If # your source language is not Japanese you need to change the # src_prefix in the top of the file to ''. The script calls a function # in 'jaen/templates.py' with language specific templates. You may # need to modify the templates in this file. echo "Writing transfer rules" python thin2mtr.py $LOGONROOT $corpus $transdir