#ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include #include "tRepp.h" #include "postag.h" #include "unicode.h" #include "grammar.h" #include "profile.h" #include "resultReader.h" #include "resultReader.cpp" using namespace std; enum Tokeniser {NONE, REPP, CHASEN, YY}; enum TagType {LETYPE, LETYPEMORPH}; TagType tagtype; void collectleaves(string s, vector &ancestors, vector &leaves); void parse_options(int argc, char **argv, string *gfname, string *pname, Tokeniser *tok, string *tokfname, vector *calllist, bool *tnt, string *tntmodel, TagType *tagtype, tTokenFormat *format, bool *numopt, int *lim); vector splitOnSpace(string item); bool is_infl(string rule); void align(vector &itok, vector &rtok, vector *tmap); double alignscore(UChar a, UChar b); int main (int argc, char **argv) { // setting option variables string gfname, pname, tokfname, tntmodel; Tokeniser tok; vector calllist; tTokenFormat format; bool tnt, numopt; int lim; parse_options(argc, argv, &gfname, &pname, &tok, &tokfname, &calllist, &tnt, &tntmodel, &tagtype, &format, &numopt, &lim); // UTF-8 encoder initialize_encoding_converter("utf-8"); tRepp *tokrepp; if (tok == REPP) { boost::filesystem::path tokenizer(tokfname); tokrepp = new tRepp(tokenizer.parent_path().string(), tokenizer.stem()); } Grammar g(gfname); Profile p(pname); vector leaves; //used in collectleaves() ResultReader > reader(g, leaves, NULL, NULL, &collectleaves); pair,string> result = p.getResult(); int context = -1; // context is item id int event = -1; // event is parse id vector itokens; while (result.first.first >= 0) {//new item/context if (result.first.first != context) {//new item/context if (context != -1) { //finish last context }//finished last context switch (tok) { case REPP: itokens = tokrepp->tokenize(p.getItem(result.first.first), calllist); break; case CHASEN: break; case YY: // itokens = readYY(p.getItem(result.first.first)); break; case NONE: itokens = splitOnSpace(p.getItem(result.first.first)); break; } if(tnt) { tnttag(&itokens,""); } } context = result.first.first; event = result.first.second; if (lim > 0 && p.getReadings(context) >= lim) { //skip items with result = p.getResult(); // greater than lim continue; // readings } //do stuff with result leaves.clear(); reader.readResult(result.second); vectortokmap(itokens.size()); align(itokens, leaves, &tokmap); if (numopt) cout << context << " " << event << itokens[0].separator(format); if (format == FSC) { stringstream ss; ss << context; string contextstring = ss.str(); string itemstring = p.getItem(context); print_header(format, contextstring, itemstring, itokens.size()); } for (int idx=0; idx < itokens.size(); ++idx) { string tag("NULL"); string morph("NULL"); if (tokmap[idx] != -1) { tag = leaves[tokmap[idx]].tag(); UnicodeString us = Conv->convert(itokens[idx].surface()); if (us.length() == 1 && u_ispunct(us.charAt(0))) { if (u_charType(us.charAt(0)) == U_START_PUNCTUATION) tag = "PUNCT"; else { if (idx > 0 && tokmap[idx-1] != -1 && tag.compare(leaves[tokmap[idx-1]].tag()) == 0) tag = "PUNCT"; } } } itokens[idx].print(format, pair(tag,1)); if (idx != (itokens.size()-1)) cout << itokens[idx].separator(format); } print_footer(format); result = p.getResult(); } //finish last context return 0; } void parse_options(int argc, char **argv, string *gfname, string *pname, Tokeniser *tok, string *tokfname, vector *calllist, bool *tnt, string *tntmodel, TagType *tagtype, tTokenFormat *format, bool *numopt, int *lim) { namespace po = boost::program_options; po::options_description visible("Options"); visible.add_options() ("help,h", "This usage information.") ("tok,t", po::value()->default_value("none"), "tokeniser: none, repp, chasen,yy (default: none)" " Tokeniser will be applied to the item string in the item file." " (YY and ChaSen unimplemented as yet. Let Rebecca know if you want them.)") ("rpp,r", po::value(tokfname), "tokenizer .rpp file") ("call,c", po::value< vector >(calllist)->composing(), "rpp calls, multiple options valid.") ("pos,p", "Use TnT POS tags.") ("model,m", po::value(tntmodel), "TnT model, defaults to WSJ model in LOGON tree.") ("infl,i", "Tags include morphological inflection rules.") ("format", po::value()->default_value("TNT"), "token format:TNT,CANDC,FSC (default: TNT)") ("num,n", "Output item and parse number.") ("limit,l", po::value(lim)->default_value(0), "Number of readings at which a context is ignored. " "Set to nbest to negate the effect of using a model during parsing.") ; po::options_description hidden("Hidden options"); hidden.add_options() ("grammar-file", po::value(gfname), "grammar .tdl file") ("profile", po::value(pname), "profile") ; po::options_description cmd_line ("Command line options"); cmd_line.add(visible).add(hidden); po::positional_options_description p; p.add("grammar-file",1).add("profile",1); po::variables_map vm; po::store(po::command_line_parser(argc, argv). options(cmd_line).positional(p).run(), vm); notify(vm); if (vm.count("help")) { cout << "Usage: " << argv[0] << " [options] " << "grammar-file profile" << endl; cout << visible << endl; exit(0); } if (!vm.count("grammar-file") || !vm.count("profile")) { cerr << "Insufficient arguments given." << endl; cerr << "Usage: " << argv[0] << " [options] " << "grammar-file profile" << endl; cerr << visible << endl; exit(1); } string t(vm["tok"].as()); if (t.compare("none") == 0) *tok= NONE; else if (t.compare("repp") == 0) *tok= REPP; else if (t.compare("chasen") == 0) *tok= CHASEN; else if (t.compare("yy") == 0) *tok= YY; else { cerr << "Warning: invalid tokeniser (--tok) " << t << " given. " << "Setting tokeniser to none" << endl; *tok= NONE; } if (*tok == REPP && !vm.count("rpp")) { cerr << "A repp file must be specified when using REPP tokenisation" << endl; cerr << "Usage: " << argv[0] << " [options] " << "grammar-file profile" << endl; cerr << visible << endl; exit(1); } if (vm.count("infl")) *tagtype = LETYPEMORPH; else *tagtype = LETYPE; string f(vm["format"].as()); if (f.compare("FSC") == 0) *format = FSC; else if (f.compare("CANDC") == 0) *format = CANDC; else if (f.compare("TNT") == 0) *format = TNT; else { cerr << "Warning: Invalid format " << f << " given. " << "Setting format to TNT" << endl; *format = TNT; } if (vm.count("pos")) { *tnt = true; if (*format == TNT) cerr << "Warning: TnT output format and POS tag option set. " << endl << " When using TnT as a supertagger, do not use POS tags " << "in the training input." << endl; } else { *tnt = false; if (vm.count("model")) cerr << "Warning: TnT model given, but TnT option not set." << endl; } if (!vm.count("model")) *tntmodel = ""; if (vm.count("num")) *numopt = true; else *numopt = false; } vector splitOnSpace(string item) { vector toks; int start=0; int end=0; end = item.find(' ', start); while (end != string::npos) { toks.push_back(tToken(item.substr(start, end-start))); start=end+1; end = item.find(' ', start); } toks.push_back(tToken(item.substr(start))); return toks; } bool is_infl(string rule) { if (rule.size() > 3 && ( rule.compare(rule.size()-3,3,"olr") == 0 || rule.compare(rule.size()-3,3,"ilr") == 0)) return true; if (rule.size() > 9 && rule.compare(rule.size()-9,9,"infl-rule") == 0) return true; return false; } void collectleaves(string s, vector &ancestors, vector &leaves) { leaves.push_back(tToken(s)); string tag(ancestors.back().surface()); if (tagtype == LETYPEMORPH) { for (int x=ancestors.size()-2; x >= 0 && is_infl(ancestors[x].surface()); x--) { if (ancestors[x].surface().compare(0,3,"plr") != 0) { tag+=":"; tag+=ancestors[x].surface(); } } } leaves.back().tag(tag); } class AlignNode { public: int data; double score; AlignNode *back; AlignNode(int d, double s, AlignNode *b):data(d),score(s),back(b){}; }; void align(vector &itok, vector &rtok, vector *tmap) { typedef pair > Slot; //create char map from rtok vector cmap; vector ctmap; cmap.push_back('^'); ctmap.push_back(-1); int tokcount = 0; for (vector::iterator ti = rtok.begin(); ti != rtok.end(); ++ti) { UnicodeString tmptok = Conv->convert((*ti).surface()); for (int x=0; x < tmptok.length(); ++x) { cmap.push_back(tmptok.charAt(x)); ctmap.push_back(tokcount); } tokcount++; } //foreach char in itok, find possible aligns, find possible pre-aligns, score tokcount = 0; vector slots; for (vector::iterator ti = itok.begin(); ti != itok.end(); ++ti) { UnicodeString tmptok = Conv->convert((*ti).surface()); for (int x=0; x < tmptok.length(); ++x) { Slot nextslot(tokcount,vector()); for (int y=1; y < cmap.size(); ++y) { double score = alignscore(tmptok.charAt(x), cmap[y]); if (score > 0) { AlignNode *alignment = new AlignNode(y,score,NULL); if (slots.size() == 0) { nextslot.second.push_back(alignment); } else { //work out path to here and adjust score for (vector::iterator si = slots.back().second.begin(); si != slots.back().second.end(); ++si) { if (abs((*si)->data) < y) { //possible predecessor if ( ((*si)->data >= 0 && ((*si)->score)+score >= alignment->score) || (*si)->score + score > alignment->score) { //best path seen so far alignment->score = (*si)->score + score; alignment->back = *si; } } } nextslot.second.push_back(alignment); } } } if (slots.size() == 0) { //align first char to null AlignNode *anode = new AlignNode(0,0, NULL); nextslot.second.push_back(anode); } else { //add null matches that record highest char in path so far map nullNodes; for (vector::iterator si = slots.back().second.begin(); si != slots.back().second.end(); ++si) { if ((*si)->data > 0) { AlignNode * anode = new AlignNode(-1*(*si)->data,(*si)->score,*si); nextslot.second.push_back(anode); nullNodes[-1*(*si)->data] = *si; } else { if (nullNodes.count((*si)->data)) { //already seen this null if ((*si)->score > nullNodes[(*si)->data]->score ) { nullNodes[(*si)->data]->score = (*si)->score; nullNodes[(*si)->data]->back = *si; } } else { AlignNode *anode = new AlignNode((*si)->data,(*si)->score,*si); nextslot.second.push_back(anode); } } } } slots.push_back(nextslot); } tokcount++; } //we have the best char to char mapping, now calculate tok to tok int currentitok; AlignNode *maxnode, *curr; double maxscore = 0; map rtoks; for (vector::reverse_iterator ri = slots.rbegin(); ri != slots.rend(); ++ri) { currentitok = ri->first; if (ri == slots.rbegin()) { for (vector::iterator si = (*ri).second.begin(); si != (*ri).second.end(); ++si) { if ((*si)->score > maxscore) { maxscore = (*si)->score; maxnode = *si; } curr = maxnode; } //found end of best path } if (curr->data > 0) { if (rtoks.count(ctmap[curr->data])) { rtoks[ctmap[curr->data]]++; } else rtoks[ctmap[curr->data]] = 1; } if (ri+1 == slots.rend() || (ri+1)->first != currentitok) {//end of input tok if (rtoks.size() == 1) { //only one matching rtok found (*tmap)[currentitok] = (*rtoks.begin()).first; } else { if (rtoks.size() > 1) { for (map::reverse_iterator mi=rtoks.rbegin(); mi != rtoks.rend(); ++mi) { UnicodeString tok = Conv->convert(rtok[mi->first].surface()); if (!(tok.length() == 1 && u_ispunct(tok.charAt(0)))) { (*tmap)[currentitok] = mi->first; break; } } } else { cerr << "couldn't match " << itok[currentitok].surface() << endl; (*tmap)[currentitok] = -1; } } rtoks.clear(); } curr = curr->back; } } double alignscore(UChar a, UChar b) { if (u_foldCase(a,U_FOLD_CASE_DEFAULT) == u_foldCase(b,U_FOLD_CASE_DEFAULT)) return 1; else { if (!u_isalpha(a) && u_charType(a) == u_charType(b)) return 0.5; else { if (u_ispunct(a) && u_ispunct(b)) return 0.1; else return 0; } } }