#include "postag.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include <unistd.h>

using namespace std;

// NOTE(review): the element type of the `tokens` vectors below was not
// recoverable from the reviewed text; `Token` is assumed. Confirm that it
// matches the declarations in postag.h. The only members used here are
// surface(), tag(...) and prob(...).

namespace {

// Prints `message` to stderr, removes up to two scratch files, and exits
// with status 1.
void die(const string &message,
         const string &scratch1 = string(),
         const string &scratch2 = string())
{
  cerr << message << endl;
  if (!scratch1.empty()) remove(scratch1.c_str());
  if (!scratch2.empty()) remove(scratch2.c_str());
  exit(1);
}

// Returns the directory for scratch files: $TMPDIR when set and non-empty,
// otherwise /tmp.
string temp_dir()
{
  const char *tmpvar = getenv("TMPDIR");
  return (tmpvar && *tmpvar) ? string(tmpvar) : string("/tmp");
}

// Creates a uniquely named, empty scratch file "<tmpdir>/<stem>.XXXXXX" and
// returns its path. mkstemp creates the file atomically, so the name is not
// predictable and cannot collide with a concurrent run. On failure the file
// named by `remove_on_failure` (if any) is deleted and the process exits.
string make_temp_file(const string &stem,
                      const string &remove_on_failure = string())
{
  const string pattern = temp_dir() + "/" + stem + ".XXXXXX";
  vector<char> path(pattern.begin(), pattern.end());
  path.push_back('\0');
  const int fd = mkstemp(&path[0]);
  if (fd == -1)
    die("Couldn't open temp file " + pattern, remove_on_failure);
  close(fd);
  return string(&path[0]);
}

// Wraps `word` in single quotes so that the shell passes it through
// verbatim; embedded single quotes are emitted as '\''.
string shell_quote(const string &word)
{
  string quoted("'");
  for (string::size_type i = 0; i < word.size(); ++i) {
    if (word[i] == '\'')
      quoted += "'\\''";
    else
      quoted += word[i];
  }
  quoted += "'";
  return quoted;
}

// Returns $LOGONROOT + `relpath`; exits with a message naming `what` when
// LOGONROOT is not set.
string logon_default(const string &relpath, const string &what)
{
  const char *logonroot = getenv("LOGONROOT");
  if (!logonroot)
    die("Please set LOGONROOT so " + what + " can be found.");
  return string(logonroot) + relpath;
}

// Runs `command` through the shell. Returns true when it exits with status
// 0; otherwise reports the failure, prefixed by `label`, and returns false.
bool run_command(const string &label, const string &command)
{
  const int status = system(command.c_str());
  if (status == 0)
    return true;
  if (status == -1)
    // errno is only meaningful when the shell could not be started at all.
    perror((label + command).c_str());
  else
    cerr << label << command << " (wait status " << status << ")" << endl;
  return false;
}

} // namespace

// Tags `tokens` with the external `tnt` tagger.
//
// The token surfaces are written one per line to a scratch file, `tnt` is
// run on it with the given `model` (or $LOGONROOT/coli/tnt/models/wsj.tnt
// when `model` is empty), and every tag/probability pair on each output
// line is passed to the matching token's tag() and prob(). Any failure
// (missing LOGONROOT, scratch file problems, non-zero tnt status, output
// that does not line up with the input tokens) prints a message to stderr
// and exits with status 1.
void tnttag(vector<Token> *tokens, string model)
{
  string tntm(model);
  if (tntm.empty()) {
    // use default wsj model from the LOGON tree
    tntm = logon_default("/coli/tnt/models/wsj.tnt", "the TnT model");
  }

  const string tntinname = make_temp_file("tntin");
  {
    ofstream tntinput(tntinname.c_str());
    if (!tntinput.is_open())
      die("Couldn't open temp file " + tntinname, tntinname);
    // one token per line
    for (size_t i = 0; i < tokens->size(); ++i)
      tntinput << (*tokens)[i].surface() << '\n';
  } // stream closed (and flushed) here, before tnt reads the file

  const string tntoutname = make_temp_file("tntout", tntinname);

  // The model path is passed through unquoted, as before, so callers that
  // rely on shell expansion in it keep working; only the scratch file names
  // generated here are quoted.
  const string tntcall = "tnt -v0 -z100 " + tntm + " "
    + shell_quote(tntinname) + " >" + shell_quote(tntoutname);
  const bool ran = run_command("Error in TnT call: ", tntcall);
  remove(tntinname.c_str());
  if (!ran) {
    remove(tntoutname.c_str());
    exit(1);
  }

  ifstream tntout(tntoutname.c_str());
  if (!tntout.is_open())
    die("Couldn't open TnT output in " + tntoutname, tntoutname);

  string line;
  size_t line_count = 0;
  // Looping on getline itself also handles a last line without a newline.
  while (getline(tntout, line)) {
    istringstream tagline(line);
    string tok, tag, prob;
    tagline >> tok;

    if (line_count >= tokens->size()) {
      // Never index past the token list: tolerate blank trailing lines,
      // reject anything else.
      if (tok.empty())
        continue;
      die("TnT produced more output lines than input tokens: " + line,
          tntoutname);
    }

    if (tok.compare((*tokens)[line_count].surface()) != 0) {
      cerr << "Token mismatch " << tok << " vs "
           << (*tokens)[line_count].surface() << endl;
      remove(tntoutname.c_str());
      exit(1);
    }

    // Remaining fields come in tag/probability pairs; a tag without a
    // probability is malformed, trailing whitespace is not.
    while (tagline >> tag) {
      if (!(tagline >> prob))
        die("Malformed TnT ouput: " + line, tntoutname);
      (*tokens)[line_count].tag(tag);
      (*tokens)[line_count].prob(prob);
    }
    ++line_count;
  }
  tntout.close();
  remove(tntoutname.c_str());
}

// Tags `tokens` with the external `chasen` analyser.
//
// The token surfaces are joined with single spaces into one line that is
// fed to `chasen -i w` on standard input, using the rc file `chasenrc` (or
// $LOGONROOT/.chasenrc when `chasenrc` is empty). For each output line that
// corresponds to an input token, every field after the surface is passed to
// the token's tag() with a fixed probability of "1.00". Output lines beyond
// the number of input tokens are ignored. Failures print a message to
// stderr and exit with status 1.
void chasentag(vector<Token> *tokens, string chasenrc)
{
  string chasenrcfile(chasenrc);
  if (chasenrcfile.empty()) {
    // use default .chasenrc from the LOGON tree
    chasenrcfile = logon_default("/.chasenrc", ".chasenrc file");
  }

  // this format needs to match how the output is read below
  const string cformat("'%m %P-+%Tn-%Fn\\n'");

  // The input goes through a scratch file instead of `echo "..."`, so token
  // text containing quotes, `$`, backticks or backslashes reaches chasen
  // verbatim and can never be interpreted by the shell.
  const string chaseninname = make_temp_file("chasenin");
  {
    ofstream chaseninput(chaseninname.c_str());
    if (!chaseninput.is_open())
      die("Couldn't open temp file " + chaseninname, chaseninname);
    for (size_t i = 0; i < tokens->size(); ++i) {
      if (i != 0)
        chaseninput << ' ';
      chaseninput << (*tokens)[i].surface();
    }
    chaseninput << '\n';
  }

  const string chasenoutname = make_temp_file("chasenout", chaseninname);

  // The rc file path is passed through unquoted, as before.
  const string chasencall = "chasen -i w -r " + chasenrcfile
    + " -F " + cformat
    + " < " + shell_quote(chaseninname)
    + " > " + shell_quote(chasenoutname);
  const bool ran = run_command("Error in chasen call: ", chasencall);
  remove(chaseninname.c_str());
  if (!ran) {
    remove(chasenoutname.c_str());
    exit(1);
  }

  ifstream chasenout(chasenoutname.c_str());
  if (!chasenout.is_open())
    die("Couldn't open chasen output in " + chasenoutname, chasenoutname);

  string line;
  size_t line_count = 0;
  while (getline(chasenout, line)) {
    istringstream tagline(line);
    string tok, tag;
    tagline >> tok;
    if (line_count < tokens->size()) {
      if (tok.compare((*tokens)[line_count].surface()) != 0) {
        cerr << "Token mismatch " << tok << " vs "
             << (*tokens)[line_count].surface() << endl;
        remove(chasenoutname.c_str());
        exit(1);
      }
      while (tagline >> tag) {
        (*tokens)[line_count].tag(tag);
        (*tokens)[line_count].prob("1.00");
      }
    }
    ++line_count;
  }
  chasenout.close();
  remove(chasenoutname.c_str());
}