#include #include #include #include #include #include #include "trigram.h" using namespace std; namespace fs = boost::filesystem; void parse_options(int argc, char **argv, tdlOptions *tdlopts, string *mname, vector *ifiles, bool *lambdaset, double *l1, double *l2, double *l3, bool *quiet, string *comment); string uppercase(string orig); int main (int argc, char **argv) { // setting option variables string mname, efname, tfname, comment; vector ifiles; bool lambdaset, quiet, genmap; double l1, l2, l3; tdlOptions *tdlopts = new tdlOptions(); try { parse_options(argc, argv, tdlopts, &mname, &ifiles, &lambdaset, &l1, &l2, &l3, &quiet, &comment); } catch ( const boost::program_options::error& e ) { cerr << "Invalid command: " << e.what() << "\nExiting." << endl; exit(1); } efname = mname + ".ex.gz"; tfname = mname + ".tx.gz"; ifstream testf(efname.c_str()); if (testf.is_open()) { cerr << "Won't overwrite existing model file " << efname << ". Remove to re-run." << endl; exit(1); } testf.open(tfname.c_str()); if (testf.is_open()) { cerr << "Won't overwrite existing model file " << tfname << ". Remove to re-run." << endl; exit(1); } if (ifiles.empty()) //use stdin ifiles.push_back("-"); tTrigramModel model(ifiles, tdlopts, lambdaset, l1, l2, l3, quiet); if (!lambdaset) { model.trigramDeletedInterpolation(); } else { cerr << "using given lambda " << l1 << "," << l2 << "," << l3 << endl; } // cerr << "λ1:" << model.lambda1 << ", λ2:" << model.lambda2 << ", λ3:" // << model.lambda3 << endl; model.calculateEmissionScores(); model.calculateTransitionScores(); model.writeCompiled(mname, quiet, comment); return 0; } void parse_options(int argc, char **argv, tdlOptions *tdl_opts, string *mname, vector *ifiles, bool *lambdaset, double *l1, double *l2, double *l3, bool *quiet, string *comment) { namespace po = boost::program_options; po::options_description general("Options"); general.add_options() ("help,h", "This usage information.") ("tagtype,t", po::value(), "FULL, NOAFFIX, LETYPE, MSCAFFIX, MSC (default FULL)") ("map,m", "Map generic letypes to appropriate native letypes.") ("details,d", po::value(comment), "Comment to add to model header") ("config,c", po::value(), "Configuration file") ("quiet,q", "Don't write status messages to stderr.") ; string smoothing_desc = "Smoothing options to force interpolation weights, " "instead of calculating \nfrom data." "The total of l1+l2+l3 must be 1.\n" "Setting any of l1, l2 or l3 will " "implicitly set the unset to an equal\nproportion of 1"; po::options_description smoothing(smoothing_desc); smoothing.add_options() ("l1", po::value(l1), "Interpolation weight for unigrams.") ("l2", po::value(l2), "Interpolation weight for bigrams.") ("l3", po::value(l3), "Interpolation weight for trigrams.") ; po::options_description hidden("Hidden options"); hidden.add_options() ("model", po::value(mname), "HMM model basename") ("input", po::value< vector >(ifiles)->composing(), "Input files") ; po::options_description cmd_line ("Command line options"); cmd_line.add(general).add(smoothing).add(hidden); po::positional_options_description p; p.add("model",1).add("input", -1); po::variables_map vm; po::store(po::command_line_parser(argc, argv). options(cmd_line).positional(p).run(), vm); notify(vm); if (vm.count("help")) { cout << "Usage: " << argv[0] << " [options] " << "modelname [input file(s)]" << endl; cout << general << endl; cout << smoothing << endl; exit(0); } if (!vm.count("model")) { cerr << "Model name required." << endl; cerr << "Usage: " << argv[0] << " [options] " << "modelname [input file(s)]" << endl; cerr << general << endl; cerr << smoothing << endl; exit(1); } if (vm.count("config")) { try { string cfname = vm["config"].as(); fs::path cpath(cfname); if (cpath.has_parent_path()) tdl_opts->set("ut-basedir", cpath.parent_path().string()); else tdl_opts->set("ut-basedir", fs::current_path().string()); tdl_opts->read(cfname); } catch (exception &e) { cerr << "Error reading config file: " << e.what() << endl; exit(1); } } if (vm.count("quiet")) *quiet = true; else *quiet = false; if (vm.count("map")) tdl_opts->set("ut-mapgen", "true"); if (vm.count("tagtype")) { string type = vm["tagtype"].as(); type = uppercase(type); tdl_opts->set("ut-tagtype", type); } if (vm.count("l1") || vm.count("l2") || vm.count("l3")) { *lambdaset = true; if (vm.count("l1") && vm.count("l2") && vm.count("l3")) { if (*l1+*l2+*l3 != 1) { cerr << "Error: If setting all lambda values, they must sum to 1!" << endl; exit(1); } } else { if (vm.count("l1")) { if (vm.count("l2")) { *l3 = 1 - (*l1 + *l2); } else { if (vm.count("l3")) { *l2 = 1 - (*l1 + *l3); } else { *l2 = (1 - *l1)/2; *l3 = *l2; } } } else { if (vm.count("l2")) { if (vm.count("l3")) { *l1 = 1 - (*l2 + *l3); } else { *l1 = (1 - *l2)/2; *l3 = *l1; } } else {//only l3 set *l1 = (1 - *l3)/2; *l2 = *l1; } } } } else { *lambdaset = false; } } string uppercase(string orig) { for(int i = 0; orig[i]; i++){ orig[i] = toupper(orig[i]); } return orig; }