#include "mrg.h"
// NOTE(review): the names of the five system headers that originally sat
// between "mrg.h" and "process.h" were lost (angle-bracket contents stripped).
// The list below covers every standard/Boost facility this file uses directly;
// confirm against the original and add any ICU header that is not already
// pulled in transitively.
#include <cctype>
#include <cstdlib>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include <boost/regex.hpp>
#include "process.h"
#include "align.h"

using namespace std;

namespace {

// NOTE(review): every template argument list in this file was stripped from
// the text under review. The types below are reconstructed from how the values
// are used (five-argument eTuple constructor, integer keys, string labels) and
// are the single place to correct if they disagree with the declarations in
// mrg.h / align.h -- TODO confirm.
typedef eTuple<IndexType, IndexType, string> MrgTuple;  // (first, second, label)
typedef map<string, int> DeleteLabelMap;                // only .count() is used here
typedef pair<int, string> OpenNode;                     // (index of first token, label)

// True when the character of rmap at index idx may be skipped when widening a
// span boundary: whitespace always, punctuation only when fuzzy is set.
bool isSkippable(UnicodeString &rmap, IndexType idx, bool fuzzy) {
  return u_isspace(rmap.charAt(idx))
         || (fuzzy && u_ispunct(rmap.charAt(idx)));
}

// Consumes every node-closing paren at the front of rest. For each one the
// innermost open node is popped and recorded in brackets, spanning from the
// node's first token to lastTok; when the pop brings depth back to 0 a "SENT"
// bracket from sentstart to lastTok is recorded as well. A closing paren with
// no open node is reported on cerr and terminates the program with status 1.
void closeNodes(string &rest, const boost::regex &endnode,
                vector<OpenNode> &stack, int &depth,
                vector<MrgTuple> &brackets, DeleteLabelMap &deletelabels,
                int lastTok, int sentstart, int sentcnt, int lineno) {
  boost::smatch res;
  while (boost::regex_search(rest, res, endnode)) {
    if (depth <= 0 || stack.empty()) {
      cerr << "Error parsing MRG at line " << lineno << ": depth "
           << depth << " and stack size " << stack.size() << endl;
      exit(1);
    }
    const OpenNode node = stack.back();
    brackets.push_back(MrgTuple(node.first, lastTok, node.second, -1, -1));
    brackets.back().setSent(sentcnt);
    if (deletelabels.count(node.second))
      brackets.back().setStatus(DEL);
    stack.pop_back();
    depth--;
    if (depth == 0) {
      brackets.push_back(MrgTuple(sentstart, lastTok, "SENT", -1, -1));
      brackets.back().setSent(sentcnt);
    }
    rest = res.suffix();
  }
}

} // namespace

// Reads bracketed trees from mrgf and stores one tuple per bracket in elts,
// keyed by the tuple's start position.
//
//   rmap         text the tokens are aligned against (passed to
//                createRawFromGold / lcsAlign, then read for fuzzy boundaries)
//   mrgf         input stream; lines seen at depth 0 that do not start with
//                "(" are skipped as header/comment lines
//   elts         output: start position -> tuples starting there
//   deletelabels node labels whose brackets are marked DEL
//   fuzzy        also treat punctuation (not only whitespace) as skippable
//                when collecting alternative boundaries
//   boundary     collapse "SENT" tuples to a zero-length span at their end
//   rawFromGold  use createRawFromGold instead of lcsAlign for alignment
//
// For every token a "POS:<label>" and a "TOK" bracket are recorded; for every
// closed node a bracket with the node label; for every completed top-level
// tree a "SENT" bracket. Parse errors and an input without any bracket are
// reported on cerr and terminate the program with exit(1).
void processMRG(UnicodeString &rmap, istream &mrgf,
  map<IndexType, vector<MrgTuple> > &elts, DeleteLabelMap &deletelabels,
  bool fuzzy, bool boundary, bool rawFromGold)
{
  const boost::regex startnode("^\\s*(\\([^(\\s]*)(?:\\s*|$)");
  const boost::regex token("^([^)]+)\\)\\s*");
  const boost::regex endnode("^\\s*\\)\\s*");
  boost::smatch res;

  int depth = 0;
  int sentcnt = 0;
  int sentstart = -1;          //token the last sentence started with
  vector<OpenNode> stack;      //for tree processing
  vector<UnicodeString> toks;  //tokens for alignment
  vector<MrgTuple> brackets;   //temporary store, pre-alignment
  vector<int> tokdel;          //1 if the token's label is in deletelabels
  int lineno = 0;              //was uninitialised: diagnostics printed garbage
  string mrgfline;

  // Looping on getline itself (rather than on !eof()) also processes a final
  // line that lacks a trailing newline and stops on any stream failure.
  while (getline(mrgf, mrgfline)) {
    ++lineno;
    if (mrgfline.empty())
      continue;
    if (depth == 0 && mrgfline.compare(0, 1, "(") != 0)
      continue; //header/comment

    string rest = mrgfline;
    // cast: isspace is undefined for negative char values
    while (!rest.empty() && isspace(static_cast<unsigned char>(rest.at(0))))
      rest.erase(0, 1);

    while (!rest.empty()) {
      if (rest.compare(0, 1, "(") == 0) {
        while (boost::regex_search(rest, res, startnode)) {
          string node = res[1];
          if (node.length() > 1)
            node.erase(0, 1); //remove paren unless empty node
          stack.push_back(OpenNode(static_cast<int>(toks.size()), node));
          if (depth == 0) {
            sentcnt++;
            sentstart = static_cast<int>(toks.size());
          }
          depth++;
          rest = res.suffix();
        }
        if (rest.empty())
          break; //line ended right after node opens

        //eaten up node starts, should be mid token node
        if (!boost::regex_search(rest, res, token)) {
          cerr << "MRG parsing error. No token at line " << lineno << endl;
          exit(1);
        }
        string tok = res[1];
        UnicodeString utok = Conv->convert(tok);
        const OpenNode pos = stack.back();
        if (pos.second == "-NONE-") {
          // token strings for traces won't align
          utok = UnicodeString();
        }
        toks.push_back(utok);
        const int tokidx = static_cast<int>(toks.size()) - 1;
        const bool deleted = deletelabels.count(pos.second) != 0;

        brackets.push_back(MrgTuple(pos.first, tokidx,
          string("POS:" + pos.second), -1, -1));
        brackets.back().setSent(sentcnt);
        if (deleted)
          brackets.back().setStatus(DEL);

        brackets.push_back(MrgTuple(tokidx, tokidx, string("TOK"), -1, -1));
        brackets.back().setSent(sentcnt);
        //record tok string as comment for debugging
        brackets.back().setComment(tok);
        if (deleted)
          brackets.back().setStatus(DEL);
        tokdel.push_back(deleted ? 1 : 0);

        //token regex includes the closing paren
        stack.pop_back();
        depth--;
        rest = res.suffix();

        //finished processing token
        closeNodes(rest, endnode, stack, depth, brackets, deletelabels,
          tokidx, sentstart, sentcnt, lineno);
      } else { //line starts with node close, rather than node open
        if (!boost::regex_search(rest, res, endnode)) {
          cerr << "MRG parsing error at line " << lineno << "." << endl;
          cerr << "Is this an MRG file?" << endl;
          exit(1);
        }
        // Shares the error path with the branch above: previously a surplus
        // ")" at depth 0 left rest unchanged and this loop never terminated.
        closeNodes(rest, endnode, stack, depth, brackets, deletelabels,
          static_cast<int>(toks.size()) - 1, sentstart, sentcnt, lineno);
      }
    }
  } //finished reading MRG file

  if (brackets.empty()) {
    cerr << "No trees found in MRG file." << " Not an MRG file?" << endl;
    exit(1);
  }

  //align tokens to raw character positions
  // NOTE(review): element type reconstructed as int (the values are read
  // into int locals below); it must match what lcsAlign/createRawFromGold
  // take -- TODO confirm.
  vector<int> tokstart(toks.size(), -1);
  vector<int> tokend(toks.size(), -1);
  if (rawFromGold) {
    createRawFromGold(rmap, toks, tokstart, tokend);
  } else {
    lcsAlign (rmap, toks, tokstart, tokend);
    //alignRaw (rmap, toks, tokstart, tokend);
    //positions, not chars
    for (size_t x = 0; x < toks.size(); ++x) {
      if (!toks[x].isEmpty() && tokend[x] != -1)
        tokend[x]++;
    }
  }

  //Mark brackets not to be evaluated as DELeted, which includes those
  //explicitly listed in the prm file, and any others with a span that only
  //includes deleted tokens. Update character spans to only incorporate
  //non-deleted tokens.
  const int lastTok = static_cast<int>(toks.size()) - 1;
  for (vector<MrgTuple>::iterator bitr = brackets.begin();
    bitr != brackets.end(); ++bitr) {
    if (bitr->getThird() == "(")
      continue; //don't store empty brackets

    int start = -1;
    int end = -1;
    bool del = true;
    //check token span, to make sure there is a non-deleted token
    for (int t = bitr->getFirst(); t <= bitr->getSecond(); ++t) {
      if (tokdel[t] != 0)
        continue;
      del = false;
      if (start == -1 && tokstart[t] > -1)
        start = tokstart[t];
      if (tokend[t] > -1 && end < tokend[t])
        end = tokend[t];
    }

    //give unaligned tuples zero-length span, at end of last aligned token
    if (start == -1) {
      if (end != -1) {
        start = end;
      } else {
        // A node closed without any token inside starts at toks.size(),
        // one past the last valid index; clamp before scanning backwards.
        int x = bitr->getFirst();
        if (x > lastTok)
          x = lastTok;
        for (; x >= 0; --x) {
          if (tokend[x] != -1) {
            start = tokend[x];
            end = tokend[x];
            break;
          }
        }
        if (start == -1) {
          start = 0;
          end = 0;
        }
      }
    }
    if (end == -1)
      end = start;

    if (boundary && bitr->getThird() == "SENT")
      start = end;

    // operator[] creates the (empty) vector on first use of this key
    vector<MrgTuple> &bucket = elts[start];
    bucket.push_back(MrgTuple(start, end, bitr->getThird(), -1, -1));
    MrgTuple &stored = bucket.back();
    stored.setSent(bitr->getSent());
    stored.setComment(bitr->getComment());
    if (del) {
      stored.setStatus(DEL);
    } else {
      stored.setStatus(bitr->getStatus());
    }
  }

  //Collect alternative ("fuzzy") boundaries reachable by skipping whitespace
  //(and punctuation, if fuzzy) around each tuple's start and end.
  for (map<IndexType, vector<MrgTuple> >::iterator litr = elts.begin();
    litr != elts.end(); ++litr) {
    for (vector<MrgTuple>::iterator titr = litr->second.begin();
      titr != litr->second.end(); ++titr) {
      IndexType x = titr->getSecond() - 2; //char not pos
      while (x >= 0 && isSkippable(rmap, x, fuzzy)) {
        titr->addFuzzySecond(x+1); //pos not char
        x--;
      }
      x = titr->getSecond(); //char not pos
      while (x < rmap.length() && isSkippable(rmap, x, fuzzy)) {
        titr->addFuzzySecond(x+1); //pos not char
        x++;
      }

      if (titr->getFirst() < titr->getSecond()) { //non-zero span
        x = titr->getFirst() - 1;
        while (x >= 0 && isSkippable(rmap, x, fuzzy)) {
          titr->addFuzzyFirst(x);
          x--;
        }
        x = titr->getFirst() + 1;
        while (x < rmap.length() && isSkippable(rmap, x, fuzzy)) {
          titr->addFuzzyFirst(x);
          x++;
        }
      } else { //zero span: start alternatives mirror the end alternatives
        // Iterate over a local copy so begin/end are guaranteed to belong to
        // the same set even if getFuzzySecond() returns by value.
        // NOTE(review): element type reconstructed as IndexType -- confirm.
        const set<IndexType> ends = titr->getFuzzySecond();
        for (set<IndexType>::const_iterator sit = ends.begin();
          sit != ends.end(); ++sit) {
          titr->addFuzzyFirst(*sit);
        }
      }
    }
  }
}