#include "tRepp.h"
#include <boost/program_options.hpp>
#include <boost/filesystem.hpp>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include "postag.h"

using namespace std;

// Parses the command line and fills in every out-parameter. Exits the
// process on --help, on a missing tokenizer file, on an unreadable input
// file and on malformed options.
void parse_options(int ac, char **argv, string *tokfname,
  ifstream *inputfile, vector<string> *calllist, tTokenFormat *format,
  bool *tnt, bool *numopt, string *tntmodel, bool *quiet);

// Writes the one-line usage summary followed by the visible options.
static void print_usage(ostream &out, const char *progname,
  const boost::program_options::options_description &opts)
{
  out << "Usage: " << progname
    << " [options] tokenizer-file [input-file]" << endl;
  out << opts << endl;
}

// Prints one tokenised input item to stdout in the requested format.
//
// Templated on the container returned by tRepp::tokenize so that the token
// type does not have to be spelled out here; the container is taken by value
// because tnttag receives a pointer to it.
//
//   tokens   - tokens of the current item
//   format   - output format; a header is printed for FSC and TNT only
//   tnt      - when true, tnttag is run over the tokens before printing
//   tntmodel - model name handed to tnttag
//   numopt   - when true, "#<itemno>" is appended after the tokens
//   itemno   - item identifier of the current line
//   line     - the input text the tokens were produced from
template <typename Tokens>
static void print_item(Tokens tokens, tTokenFormat format, bool tnt,
  const string &tntmodel, bool numopt, const string &itemno,
  const string &line)
{
  if (tnt) {
    tnttag(&tokens, tntmodel);
  }

  if (format == FSC || format == TNT)
    print_header(format, itemno, line, tokens.size());

  for (typename Tokens::iterator iter = tokens.begin();
       iter != tokens.end(); ++iter) {
    iter->print(format);
    // separator goes between tokens only, never after the last one
    if (iter + 1 != tokens.end()) {
      cout << iter->separator(format);
    }
  }

  if (numopt)
    cout << "#" << itemno;
  print_footer(format);
}

// Reads the input line by line (from the input file if one was given,
// otherwise from stdin), tokenises each line and prints the result.
int main(int argc, char **argv)
{
  //option handling
  tTokenFormat format;
  vector<string> calllist;
  string tokfname;
  string tntmodel;
  ifstream inputfile;
  bool tnt, numopt, quiet;
  parse_options(argc, argv, &tokfname, &inputfile, &calllist, &format,
    &tnt, &numopt, &tntmodel, &quiet);

  //tokeniser
  boost::filesystem::path tokenizer(tokfname);
#if BOOST_FILESYSTEM_VERSION >= 3
  tRepp repp(tokenizer.parent_path().string(), tokenizer.stem().string(),
    quiet);
#else
  tRepp repp(tokenizer.parent_path().string(), tokenizer.stem(), quiet);
#endif

  //take input from file if given, else stdin
  istream &input = (inputfile.is_open()?inputfile:cin);

  // Loop on the result of getline itself: a final line without a trailing
  // newline is still processed, and a failed stream ends the loop.
  string line;
  while (getline(input, line)) {
    string itemno = "UNKNOWN";
    if (numopt) {
      //input file can have itemno in first column, separated by @
      const string::size_type at = line.find('@');
      if (at != string::npos) {
        itemno = line.substr(0, at);
        line.erase(0, at + 1);
      }
    }

    print_item(repp.tokenize(line, calllist), format, tnt, tntmodel,
      numopt, itemno, line);
  }

  return 0;
}

// Parses the command line.
//
//   ac, av    - argument count and vector as passed to main
//   tokfname  - receives the first positional argument (tokenizer file)
//   inputfile - opened on the second positional argument when present,
//               otherwise left closed
//   calls     - receives every --call/-c value
//   format    - receives the token format; an unrecognised name falls back
//               to STRING with a warning
//   tnt, numopt, quiet - set from the presence of --tnt, --num and --quiet
//   tntmodel  - receives --model, or the empty string when it is not given
//
// Exits with status 0 after printing --help, and with status 1 when the
// options cannot be parsed, the tokenizer file is missing or the input file
// cannot be opened.
void parse_options(int ac, char **av, string *tokfname,
  ifstream *inputfile, vector<string> *calls, tTokenFormat *format,
  bool *tnt, bool *numopt, string *tntmodel, bool *quiet)
{
  namespace po = boost::program_options;

  po::options_description opts("Options");
  opts.add_options()
    ("help,h,?", "this usage information")
    ("call,c", po::value< vector<string> >(calls)->composing(),
     "rpp calls, multiple options valid.")
    ("format", po::value<string>()->default_value("STRING"),
     "token format:STRING,PET,FSC,CANDC,TNT (default: STRING)")
    ("tnt,t", "Use TnT POS tags")
    ("num,n", "Output item and parse number.")
    ("model,m", po::value<string>(tntmodel),
     "TnT model file (defaults to wsj model in LOGON tree)")
    ("quiet,q", "Suppress informational messages.")
  ;

  // positional arguments are declared as hidden options so that they do
  // not show up in the usage text
  po::options_description hidden("Hidden options");
  hidden.add_options()
    ("tokenizer-file", po::value<string>(tokfname), "tokenizer .rpp file")
    ("input-file", po::value<string>(), "input file")
  ;

  po::options_description cmd_line("Command line options");
  cmd_line.add(opts).add(hidden);

  po::positional_options_description p;
  p.add("tokenizer-file", 1).add("input-file", 1);

  po::variables_map vm;
  try {
    po::store(po::command_line_parser(ac, av).
      options(cmd_line).positional(p).run(), vm);
    po::notify(vm);
  } catch (const po::error &e) {
    // unknown option, missing argument, too many positionals, ...
    cerr << "Error: " << e.what() << endl;
    print_usage(cerr, av[0], opts);
    exit(1);
  }

  if (vm.count("help")) {
    print_usage(cout, av[0], opts);
    exit(0);
  }
  if (!vm.count("tokenizer-file")) {
    print_usage(cerr, av[0], opts);
    exit(1);
  }

  if (vm.count("input-file")) {
    const string &fname = vm["input-file"].as<string>();
    inputfile->open(fname.c_str());
    if (!inputfile->is_open()) {
      cerr << "Couldn't open " << fname << endl;
      exit(1);
    }
  }

  if (!vm.count("model")) {
    *tntmodel = "";
  } else if (!vm.count("tnt")) {
    // a model on its own has no effect, so tell the user
    cerr << "Warning: TnT model given, but TnT is not requested." << endl;
  }

  const string f(vm["format"].as<string>());
  if (f == "STRING")
    *format = STRING;
  else if (f == "PET")
    *format = PET;
  else if (f == "FSC")
    *format = FSC;
  else if (f == "CANDC")
    *format = CANDC;
  else if (f == "TNT")
    *format = TNT;
  else {
    cerr << "Warning: invalid format " << f << " given. "
      << "Setting format to STRING" << endl;
    *format = STRING;
  }

  *tnt = vm.count("tnt") > 0;
  *numopt = vm.count("num") > 0;
  *quiet = vm.count("quiet") > 0;
}