#include "eTuple.h"
#include "unicode.h"
#include "resa.h"
#include "mrg.h"
#include "process.h"
#include "align.h"
#include "eval.h"

// NOTE(review): the original listed eight system headers whose names were
// stripped during extraction. The list below is reconstructed from the
// identifiers this file uses (<set> and <vector> are guesses for the stripped
// container arguments) -- confirm against the upstream file.
#include <boost/program_options.hpp>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using namespace std;

// Annotation file formats selectable through --gformat / --tformat.
enum FileType {LINE, TAB, MRG, CONLLX, CHAR};

void parse_options(int argc, char **argv, string *rawfn, string *goldfn,
    string *testfn, FileType *glt, FileType *tlt, bool *interim, bool *v,
    string *paramfn, bool *stats, bool *fuzzy, bool *boundary, bool *multi,
    bool *unlabelled, bool *rawFromGold, bool *sent_counts);

// Returns `path` with everything up to and including the last '/' removed.
// The search result is kept in string::size_type: storing it in a narrower
// unsigned type makes the comparison against string::npos unreliable.
static string strip_directories(const string &path)
{
  const string::size_type slash = path.find_last_of('/');
  if (slash == string::npos)
    return path;
  return path.substr(slash + 1);
}

// Writes `elts` through writeInterim() to "<basename of srcfn><suffix>" in the
// current directory. Does nothing when `srcfn` is empty or `elts` holds no
// entries; an unopenable output file is reported on stderr and skipped.
template <class TupleMap>
static void write_interim_file(const string &srcfn, const string &suffix,
    TupleMap &elts)
{
  if (srcfn.empty() || elts.size() == 0)
    return;

  const string outfn(strip_directories(srcfn) + suffix);
  ofstream outf(outfn.c_str());
  if (!outf.is_open()) {
    cerr << "Couldn't open interim file " << outfn
      << " for writing. Skipping." << endl;
    return;
  }
  ostream &output = outf;
  writeInterim(elts, output);
  outf.close();
}

// Maps a format name as given on the command line to its FileType.
// Returns false (leaving *type untouched) when the name is not recognised.
static bool format_from_name(const string &name, FileType *type)
{
  if (name == "LINE") *type = LINE;
  else if (name == "TAB") *type = TAB;
  else if (name == "MRG") *type = MRG;
  else if (name == "CONLLX") *type = CONLLX;
  else if (name == "CHAR") *type = CHAR;
  else return false;
  return true;
}

// Prints the usage line and the option summary to stderr, then exits with
// status 1.
// NOTE(review): the text after the program name is a single space; a
// "<...>" placeholder may have been lost in extraction -- confirm upstream.
static void exit_with_usage(const char *prog,
    const boost::program_options::options_description &visible)
{
  cerr << "Usage: " << prog << " " << endl;
  cerr << visible;
  exit(1);
}

// Program entry point.
//
// Parses the command line, optionally reads the parameter file and the raw
// text, reads the gold and test annotation files in their selected formats,
// optionally writes the interim tuple files, and compares gold against test
// when both produced at least one entry. With --verbose the unmatched tuples
// are printed to stderr.
//
// Exits with status 1 on an invalid command line or when a gold/test file
// cannot be opened; returns 0 otherwise.
int main (int argc, char **argv)
{
  string rawfn, goldfn, testfn, paramfn, delimchars;
  FileType glt, tlt;
  bool interim, verbose, stats, fuzzy, boundary, multi, unlabelled,
    rawFromGold, sent_counts;
  // NOTE(review): the template arguments of these two maps were stripped
  // during extraction and cannot be recovered from this file; the tokens are
  // kept exactly as found -- restore them from the upstream file.
  map deletelabels;
  map > equivlabels;

  try {
    parse_options(argc, argv, &rawfn, &goldfn, &testfn, &glt, &tlt,
      &interim, &verbose, &paramfn, &stats, &fuzzy, &boundary, &multi,
      &unlabelled, &rawFromGold, &sent_counts);
  } catch (const boost::program_options::error& e) {
    cerr << "Invalid command: " << e.what() << endl;
    cerr << "Exiting." << endl;
    exit(1);
  } catch (...) {
    std::cerr << "Unknown error in option handling!" << "\n";
    exit(1);
  }

  initialize_encoding_converter("utf-8");

  UnicodeString rmap; //raw string

  if (!paramfn.empty())
    readParam(paramfn, deletelabels, equivlabels, delimchars);
  if (!rawfn.empty())
    readRaw(rawfn, rmap);

  // NOTE(review): template arguments stripped during extraction, as above.
  map > > gelts;
  map > > telts;

  // Gold annotations. Only the gold readers are handed rawFromGold.
  if (!goldfn.empty()) {
    ifstream goldf(goldfn.c_str());
    if (!goldf.is_open()) {
      cerr << "Couldn't open gold file " << goldfn << endl;
      exit(1);
    }
    istream &gf = goldf;
    switch (glt) {
      case MRG:
        processMRG(rmap, gf, gelts, deletelabels, fuzzy, boundary,
          rawFromGold);
        break;
      case CHAR:
        processCHAR(gf, gelts, boundary);
        break;
      case LINE:
        processLINE(rmap, gf, gelts, fuzzy, boundary, rawFromGold);
        break;
      case TAB:
        processTAB(rmap, gf, gelts, fuzzy, boundary, multi, rawFromGold);
        break;
      case CONLLX:
        processCONLLX(rmap, gf, gelts, fuzzy, boundary, rawFromGold);
        break;
      default:
        cerr << "Gold file format not implemented yet!" << endl;
        break;
    }
    goldf.close();
  }

  // Test annotations. These calls omit the trailing rawFromGold argument and
  // so use whatever default the process* declarations provide.
  if (!testfn.empty()) {
    ifstream testf(testfn.c_str());
    if (!testf.is_open()) {
      cerr << "Couldn't open test file " << testfn << endl;
      exit(1);
    }
    istream &tf = testf;
    switch (tlt) {
      case MRG:
        processMRG(rmap, tf, telts, deletelabels, fuzzy, boundary);
        break;
      case CHAR:
        processCHAR(tf, telts, boundary);
        break;
      case LINE:
        processLINE(rmap, tf, telts, fuzzy, boundary);
        break;
      case TAB:
        processTAB(rmap, tf, telts, fuzzy, boundary, multi);
        break;
      case CONLLX:
        processCONLLX(rmap, tf, telts, fuzzy, boundary);
        break;
      default:
        cerr << "Test file format not implemented yet!" << endl;
        break;
    }
    testf.close();
  }

  if (interim) {
    write_interim_file(goldfn, ".gold.tuples", gelts);
    write_interim_file(testfn, ".test.tuples", telts);
  }

  if (gelts.size() > 0 && telts.size() > 0) {
    compareTuples(gelts, telts, equivlabels, stats, multi, unlabelled,
      delimchars, sent_counts);
  }

  if (verbose) {
    printSortedUnmatched(gelts, telts, cerr);
  }

  finalize_encoding_converter();
  return 0;
}

// Parses the command line and stores every setting through the out-pointers.
//
// rawfn, goldfn, testfn and paramfn receive the values of --raw, --gold,
// --test and --param when those options are given. glt and tlt receive the
// gold/test file formats; an unrecognised format name produces a warning and
// falls back to MRG. Each bool is set to true exactly when its switch is
// present, except:
//   - *multi is forced to false, with a warning, unless the test format is
//     TAB;
//   - *rawFromGold is true only when --extractraw is given together with
//     --gold and a gold format other than CHAR (otherwise a usage error).
//
// Prints usage and exits with status 0 for --help. Prints an error plus usage
// and exits with status 1 when --extractraw is unusable, when --gold/--test
// are missing without --interim, or when --raw is missing although required.
// Exceptions raised by Boost.Program_options propagate to the caller.
void parse_options(int argc, char **argv, string *rawfn, string *goldfn,
    string *testfn, FileType *glt, FileType *tlt, bool *interim, bool *v,
    string *paramfn, bool *stats, bool *fuzzy, bool *boundary, bool *multi,
    bool *unlabelled, bool *rawFromGold, bool *sent_counts)
{
  namespace po = boost::program_options;

  po::options_description visible("Options", 74);
  visible.add_options()
    ("help,h", "This usage information.")
    ("raw,r", po::value<string>(rawfn), "Raw text file, "
      "required unless gformat and tformat are CHAR, or extractraw option given.")
    ("extractraw,e", "Extract pseudo-raw from gold tokens.")
    ("gold,g", po::value<string>(goldfn), "Gold annotation file.")
    ("test,t", po::value<string>(testfn), "Test annotation file.")
    ("verbose,v", "Unmatched tuples printed to STDERR.")
    ("stats,s", "Print summary output in tab-delimited form.")
    ("counts,c",
      "Print sentence by sentence counts, suppresses summary output.")
    ("fuzzy,f", "Allow fuzziness in spans around punctuation")
    ("unlabelled,u",
      "Unlabelled evaluation for phrase structure labels or dependencies")
    // The code below only honours --multi for the TAB format, so the help
    // text names TAB (it previously said LINE).
    ("multi,m", "Allow multiple tags (only valid for POS tags in TAB format).")
    ("boundary,b", "Use sentence boundary end point, rather than span.")
    ("interim,i", "Output interim characterised files.\n"
      "\tFiles will be of form\n"
      ".{gold,test}.tuples")
    ("param,p", po::value<string>(paramfn), "evalb-style parameter file.")
    ("gformat,G", po::value<string>()->default_value("MRG"),
      "Annotation format of gold file:\n"
      " LINE: \t1 sentence per line\n"
      " TAB: \t1 token per line, second+ column(s) (if present) "
      "considered to be POS. Empty lines considered sentence breaks.\n"
      " MRG: \t.mrg format as in Penn Treebank.\n"
      " CONLLX: \tCONLL-X format.\n"
      " CHAR: \tCharacterised tuples as from option --interim")
    ("tformat,T", po::value<string>()->default_value("MRG"),
      "Annotation format of test file, options as for gold format.")
  ;

  po::options_description cmd_line ("Command line options");
  cmd_line.add(visible);

  po::variables_map vm;
  po::store(po::command_line_parser(argc, argv).options(cmd_line).run(), vm);

  if (vm.count("help")) {
    cout << "Usage: " << argv[0] << " " << endl;
    cout << visible;
    exit(0);
  }

  po::notify(vm);

  // Plain switches: present on the command line means true.
  *v = vm.count("verbose") > 0;
  *unlabelled = vm.count("unlabelled") > 0;
  *stats = vm.count("stats") > 0;
  *sent_counts = vm.count("counts") > 0;
  *fuzzy = vm.count("fuzzy") > 0;
  *boundary = vm.count("boundary") > 0;
  *interim = vm.count("interim") > 0;

  const string gt(vm["gformat"].as<string>());
  if (!format_from_name(gt, glt)) {
    cerr << "Warning: invalid format (--gformat) " << gt << " given. "
      << "Setting gformat to MRG." << endl;
    *glt = MRG;
  }

  if (vm.count("extractraw")) {
    if (vm.count("gold") && *glt != CHAR) {
      *rawFromGold = true;
    } else {
      cerr << "ERROR: can't extractraw from gold, unless "
        << "--gold is given and --gformat is not CHAR." << endl;
      exit_with_usage(argv[0], visible);
    }
  } else {
    *rawFromGold = false;
  }

  const string tt(vm["tformat"].as<string>());
  if (!format_from_name(tt, tlt)) {
    cerr << "Warning: invalid format (--tformat) " << tt << " given. "
      << "Setting tformat to MRG." << endl;
    *tlt = MRG;
  }

  if (vm.count("multi")) {
    // An unrecognised tformat has already fallen back to MRG, so testing the
    // resolved type is equivalent to testing the name.
    if (*tlt == TAB) {
      *multi = true;
    } else {
      cerr << "Warning: -multi option currently only valid with tformat=TAB."
        << " Setting -multi to false." << endl;
      *multi = false;
    }
  } else {
    *multi = false;
  }

  if ((!vm.count("gold") || !vm.count("test")) && !(*interim)) {
    cerr << "ERROR: the options --gold and --test are required, unless "
      << "--interim is given." << endl;
    exit_with_usage(argv[0], visible);
  }

  if (!vm.count("raw") && !(*glt == CHAR && *tlt == CHAR)
      && !vm.count("extractraw")) {
    cerr << "ERROR: --raw is required unless both gformat and tformat "
      << "are CHAR, or --extractraw is given." << endl;
    exit_with_usage(argv[0], visible);
  }
}