#include "eTuple.h"
#include "unicode.h"
#include "resa.h"
#include "process.h"
#include "align.h"

// NOTE(review): this file originally carried eight system #include
// directives whose header names were lost. The first six below are required
// by names this file visibly uses (boost::program_options, exit, ifstream /
// ofstream, cerr / cout, map, string); <utility> and <vector> are guesses for
// the nested container type of `selts` -- confirm against the original.
#include <boost/program_options.hpp>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

using namespace std;

// Annotation layout of the segmented input, selected with --format.
enum FileType {LINE, TAB};

void parse_options(int argc, char **argv, string *rawfn, ifstream *inf,
                   ofstream *outf, FileType *slt, bool *boundary);

// Program entry point.
//
// Parses the command line, reads the raw text named by --raw, runs the
// segmented input (file or STDIN) through processLINE / processTAB, and, if
// that produced any entries, writes them with writeInterim to the output
// file or STDOUT.
//
// Exits with status 1 when option handling fails; otherwise returns 0.
int main (int argc, char **argv) {
  string rawfn;
  ifstream inf;
  ofstream outf;
  // Given defined defaults so they are never read uninitialised, even if
  // parse_options were to return early in the future.
  FileType slt = LINE;
  bool boundary = false;

  try {
    parse_options(argc, argv, &rawfn, &inf, &outf, &slt, &boundary);
  } catch (const boost::program_options::error& e) {
    cerr << "Invalid command: " << e.what() << endl;
    cerr << "Exiting." << endl;
    exit(1);
  } catch (...) {
    std::cerr << "Unknown error in option handling!" << "\n";
    exit(1);
  }

  initialize_encoding_converter("utf-8");

  UnicodeString rmap; //raw string
  if (!rawfn.empty())
    readRaw(rawfn, rmap);

  // NOTE(review): the template arguments of this map were lost from the file
  // and cannot be recovered from the code visible here. Restore them from
  // the declarations of processLINE / processTAB / writeInterim in the
  // project headers -- TODO. The file does not compile until this is done.
  map > > selts;

  // Fall back to STDIN when no --input file was opened.
  istream &ins = (inf.is_open() ? inf : cin);

  // NOTE(review): the meaning of the literal `false` flags is defined by the
  // project's processLINE / processTAB declarations, not visible here.
  switch (slt) {
  case LINE:
    processLINE(rmap, ins, selts, false, boundary, false);
    break;
  case TAB:
    processTAB(rmap, ins, selts, false, boundary, false, false);
    break;
  default:
    // parse_options only ever stores LINE or TAB, so this is a safety net.
    cerr << "File format not implemented yet!" << endl;
    break;
  }

  if (inf.is_open())
    inf.close();

  // Nothing is written (and no output stream is touched) for an empty result.
  if (selts.size() > 0) {
    ostream &output = (outf.is_open() ? outf : cout);
    writeInterim(selts, output, false);
    if (outf.is_open())
      outf.close();
  }

  finalize_encoding_converter();
  return 0;
}

// Parses the command line and fills in the caller's settings.
//
//   rawfn    - receives the --raw file name (mandatory option).
//   inf      - opened on the --input file when that option is given.
//   outf     - opened on the --output file when that option is given.
//   slt      - receives LINE or TAB according to --format; an unrecognised
//              value produces a warning and falls back to LINE.
//   boundary - set to true iff --boundary was given.
//
// Terminates the process with exit(0) after printing usage for --help, and
// with exit(1) when --raw is missing or a named file cannot be opened.
// Exceptions derived from boost::program_options::error raised while parsing
// propagate to the caller.
void parse_options(int argc, char **argv, string *rawfn, ifstream *inf,
                   ofstream *outf, FileType *slt, bool *boundary) {
  namespace po = boost::program_options;

  po::options_description visible("Options", 74);
  visible.add_options()
    ("help,h", "This usage information.")
    ("raw,r", po::value<string>(rawfn), "Raw text file, required")
    ("input,i", po::value<string>(), "Segmented file (otherwise STDIN).")
    ("output,o", po::value<string>(), "Output file (otherwise STDOUT)")
    ("boundary,b", "Use sentence boundary end point, rather than span.")
    ("format,f", po::value<string>()->default_value("LINE"),
     "Annotation format of segmented file:\n"
     " LINE: \t1 sentence per line\n"
     " TAB: \t1 token per line, empty lines are"
     " considered sentence breaks.")
    ;

  po::options_description cmd_line ("Command line options");
  cmd_line.add(visible);

  po::variables_map vm;
  po::store(po::command_line_parser(argc, argv).options(cmd_line).run(), vm);

  // --help is handled before notify() so it works even when other options
  // are missing or malformed.
  if (vm.count("help")) {
    cout << "Usage: " << argv[0] << " " << endl;
    cout << visible;
    exit(0);
  }

  po::notify(vm);

  *boundary = (vm.count("boundary") > 0);

  const string st(vm["format"].as<string>());
  if (st.compare("LINE") == 0) {
    *slt = LINE;
  } else if (st.compare("TAB") == 0) {
    *slt = TAB;
  } else {
    cerr << "Warning: invalid format (--format) " << st << " given. "
         << "Setting format to LINE." << endl;
    *slt = LINE;
  }

  // Validate the mandatory option BEFORE opening any file: opening the
  // --output file truncates it, which must not happen for an invocation
  // that is going to be rejected anyway.
  if (!vm.count("raw")) {
    cerr << "ERROR: --raw is required." << endl;
    cerr << "Usage: " << argv[0] << " " << endl;
    cerr << visible;
    exit(1);
  }

  if (vm.count("input")) {
    const string &in_path = vm["input"].as<string>();
    inf->open(in_path.c_str());
    if (!inf->is_open()) {
      cerr << "Couldn't open " << in_path << endl;
      exit(1);
    }
  }

  if (vm.count("output")) {
    const string &out_path = vm["output"].as<string>();
    outf->open(out_path.c_str());
    if (!outf->is_open()) {
      cerr << "Couldn't open " << out_path << endl;
      exit(1);
    }
  }
}