/*
 * Implementation of tRepp (a rule-driven string preprocessor and tokeniser)
 * and of its rule classes tReppFSRule, tReppGroupRule and tReppIncludeRule.
 *
 * NOTE(review): the text of this file looks damaged and should be restored
 * from version control before anyone tries to build or change it:
 *   - the code is collapsed onto a few very long physical lines, so every
 *     "//" comment now swallows everything that follows it on that line;
 *   - the contents of angle brackets seem to have been stripped (bare
 *     "#include" directives; "vector", "map" and "std::stack" without
 *     template arguments);
 *   - inside the constructor, the text between
 *     '" spurious group close" <' and '::iterator iter = _repps.begin()'
 *     is missing; it held the rest of the constructor and the header of
 *     what appears to be the destructor.
 * The descriptions below cover only what is still visible.
 *
 * tRepp::tRepp(path, fname, quiet)
 *   Stores fname/path/quiet in _id/_path/_quiet, calls
 *   initialize_encoding_converter("utf-8") and opens _path + "/" + _id +
 *   ".rpp". Each line that is neither empty nor a ';' comment is tested
 *   against the regexes declared at the top, in this order:
 *     @$Date: ... $  -> capture 1 is stored in _version
 *     :pattern       -> pattern is compiled into _tokenizer
 *     <name          -> read_file(name) (marked TODO in the code)
 *     >digits        -> new tReppGroupRule(">G", number) appended to _rules
 *     >name          -> new tRepp(_path, name, _quiet) inserted into _repps
 *                       and new tReppIncludeRule(">I", name) appended to
 *                       _rules
 *     #digits        -> number pushed on the in_group stack and a new
 *                       tReppGroup inserted into _groups under that number
 *     #              -> group close; when in_group is empty a
 *                       "spurious group close" message goes to cerr
 *   ">digits" is tested before ">name", so a purely numeric name is always
 *   treated as a group call. The handling of ordinary rule lines (rulere)
 *   is in the missing part.
 *
 * Destructor fragment (header missing)
 *   Deletes every value in _repps, then for every entry of _groups deletes
 *   each rule in the group followed by the group itself, then deletes every
 *   pointer in _rules.
 *
 * tRepp::tokenize(item, calls)
 *   1. Allocates two offset maps whose size is the converted length of item
 *      plus 2, sets (*smap)[0] = 1 and the last cell of emap to -1, and
 *      pushes them on startmap / endmap.
 *   2. Applies every rule in _rules in order, feeding each result to the
 *      next rule. A rule whose get_type() is ">I" is skipped unless its
 *      name() occurs in calls.
 *   3. Repeatedly searches the remaining text with _tokenizer. The text in
 *      front of a match becomes a tToken whose id is its index in the
 *      result; when nothing matches, the whole remainder becomes the last
 *      token. idx is 1-based because of the anchor cells in the maps.
 *   4. For each token, tokstart and tokend are adjusted by walking startmap
 *      and endmap from the most recently pushed map to the oldest, adding
 *      the value stored at the current index; the token gets
 *      start(tokstart-1) and end(tokend).
 *   5. Deletes every map, clears startmap and endmap, returns the tokens.
 *   NOTE(review): only the endmap lookup in the "prefix matched" branch
 *   clamps its index to the map size; the startmap lookups and the endmap
 *   lookup in the no-match branch index unchecked - confirm these cannot
 *   run past the end.
 *   NOTE(review): if _tokenizer can match the empty string at the start of
 *   the remaining text, rest is assigned the unchanged suffix and the loop
 *   does not seem to make progress - confirm.
 *
 * tReppFSRule::tReppFSRule(type, target, format)
 *   Keeps target in _targetstr and format in _format. In a working copy of
 *   target, a leading '^' (or a '^' directly after a leading '(') is
 *   replaced by "\G", and every '{' and '}' is prefixed with a backslash;
 *   the result is compiled into _target. _cgroups starts with a 0 entry and
 *   then receives the position in format of each backslash-digit reference,
 *   as long as the digits appear in the order 1, 2, 3, ...; scanning stops
 *   at the first reference that is out of order. A backslash that is the
 *   last character of format produces "unescaped backslash in ..." on cerr.
 *   NOTE(review): the results of tmp.find() are stored in an int and then
 *   compared with string::npos - this relies on integer conversion.
 *
 * tReppGroupRule::name()
 *   Returns _group_id formatted through an ostringstream.
 *
 * tReppFSRule::apply(r, origitem)
 *   Searches a copy of origitem for _target again and again, each search
 *   starting at the end of the previous match, and builds the rewritten
 *   string from the text before each match plus res.format(_format). The
 *   new string is built with a leading "^" placeholder that is erased just
 *   before returning. While building, two vectors (smap, emap) are filled,
 *   one cell per character of the converted new string, with the running
 *   value of shift (or values derived from newstart / newend for replaced
 *   text outside capture groups); one extra cell is written at the end for
 *   zero-length replacements. On success the two vectors are pushed on
 *   r->startmap / r->endmap and the new string is returned. If start never
 *   moved from item.begin(), origitem is returned unchanged.
 *   NOTE(review): on that "never matched" return, smap and emap are neither
 *   deleted nor pushed anywhere - this looks like a leak.
 *   NOTE(review): smap/emap are created with item.length() cells and cell 0
 *   is written immediately, which is out of range for an empty item.
 *   NOTE(review): the trailing text is taken from res.suffix() after the
 *   search that ended the loop has failed - confirm that Boost.Regex leaves
 *   the match results usable in that case.
 *   NOTE(review): a zero-length match leaves start unchanged
 *   (start = res[0].second), so the loop may not terminate - confirm.
 *
 * tReppGroupRule::apply(r, item)
 *   Fetches the group with r->getGroup(_group_id) and applies all of its
 *   rules in order, repeating the whole pass until a pass leaves the string
 *   unchanged; returns that string.
 *
 * tReppIncludeRule::apply(r, item)
 *   Fetches the included tRepp with r->getRepp(_iname) and applies each of
 *   its rules() in order, passing the included tRepp as context. Afterwards
 *   the map pointers collected in increpp->startmap / increpp->endmap are
 *   appended to r->startmap / r->endmap and the included lists are cleared.
 */
#include "tRepp.h" #include #include #include #include #include #include "unicode.h" using namespace std; tRepp::tRepp(const string path, const string fname, bool quiet) :_id(fname), _path(path), _quiet(quiet) { initialize_encoding_converter("utf-8"); tRegex emptyre = boost::make_u32regex("^$"); tRegex commentre = boost::make_u32regex("^;.*$"); tRegex versionre = boost::make_u32regex("^@\\$Date: (.*) \\$$"); tRegex includere = boost::make_u32regex("^<(\\S+)$"); tRegex rungroupre = boost::make_u32regex("^>(\\d+)$"); tRegex readreppre = boost::make_u32regex("^>(\\S+)$"); tRegex tokre = boost::make_u32regex("^:(.*)$"); tRegex groupstartre = boost::make_u32regex("^#(\\d+)$"); tRegex groupendre = boost::make_u32regex("^#$"); tRegex rulere = boost::make_u32regex("^([!-+^])([^\\t]+)\\t+([^\\t]*)$"); ifstream mainf(string(_path+"/"+_id+".rpp").c_str()); if (mainf.is_open()) { int rule_count = 0; std::string line; std::stack in_group; int line_no=0; getline(mainf,line); while (!mainf.eof()) { line_no++; boost::smatch res; if (!boost::u32regex_match(line, res, emptyre) && //not blank !boost::u32regex_match(line, res, commentre)){ //or comment if (boost::u32regex_match(line, res, versionre)) _version = res[1]; else if (boost::u32regex_match(line, res, tokre)) _tokenizer.assign(boost::make_u32regex(res.str(1).c_str())); else if (boost::u32regex_match(line, res, includere)) { read_file(res.str(1).c_str()); //TODO } else if (boost::u32regex_match(line, res, rungroupre)) { int rgroup; istringstream(res[1]) >> rgroup; tReppGroupRule *newrule = new tReppGroupRule(">G", rgroup); _rules.push_back(newrule); rule_count++; } else if (boost::u32regex_match(line, res, readreppre)) { std::string const rname(res.str(1).c_str()); tRepp *rrepp = new tRepp(_path, rname, _quiet); _repps.insert(map::value_type(rname, rrepp)); tReppIncludeRule *newrule = new tReppIncludeRule(">I", rname); _rules.push_back(newrule); rule_count++; } else if (boost::u32regex_match(line, res, groupstartre)) { int 
rgroup; istringstream(res[1]) >> rgroup; in_group.push(rgroup); _groups.insert(map::value_type(rgroup, new tReppGroup)); } else if (boost::u32regex_match(line, res, groupendre)) { if (in_group.empty()) { cerr << fname << ":" << line_no << " spurious group close" <::iterator iter = _repps.begin(); iter != _repps.end(); ++iter) delete iter->second; for (map::iterator iter = _groups.begin(); iter != _groups.end(); ++iter) { //delete the group's rules for (vector::iterator iter2 = (iter->second)->begin(); iter2 != (iter->second)->end(); ++iter2) delete *iter2; delete iter->second; } for (vector::iterator iter = _rules.begin(); iter != _rules.end(); ++iter) delete *iter; } vector tRepp::tokenize(const string item, vector &calls) { vector tokens; boost::smatch res; string rest(item); //initialise the mappings to make the start and end anchor cells work vector *smap = new vector(Conv->convert(rest).length() + 2, 0); vector *emap = new vector(Conv->convert(rest).length() + 2, 0); (*smap)[0] = 1; (*emap)[emap->size()-1] = -1; startmap.push_back(smap); endmap.push_back(emap); for (vector::iterator iter = _rules.begin(); iter != _rules.end(); ++iter) { if ((*iter)->get_type() == ">I") { //conditional include string name = (*iter)->name(); bool found = false; for (vector::iterator citer = calls.begin(); citer != calls.end(); ++citer) { if (name == *citer) {//this include was in the calls list found = true; break; } } if (!found) //this include was not in the calls list, skip continue; } rest = (*iter)->apply(this, rest); } //all full string rules have been applied, now tokenise int idx = 1; // 1-based because of the anchors used in mapping while (rest.length() > 0) { if (boost::u32regex_search(rest, res, _tokenizer)) { if (res.prefix().matched) {//something found before the first match tokens.push_back(tToken(string(res.prefix().first, res[0].first))); tokens.back().id(tokens.size()-1); UnicodeString utok = Conv->convert(tokens.back().surface()); int tokstart = idx; int tokend 
= idx + utok.length() - 1; for (vector *>::reverse_iterator it = startmap.rbegin(); it != startmap.rend(); ++it) { tokstart += (*(*it))[tokstart]; } for (vector *>::reverse_iterator it = endmap.rbegin(); it != endmap.rend(); ++it) { if (tokend < (*it)->size() ) { tokend += (*(*it))[tokend]; } else { tokend += (*(*it))[(*it)->size()-1]; } } tokens.back().start(tokstart-1); tokens.back().end(tokend); idx += Conv->convert(string(res.prefix().first, res.suffix().first)).length(); } else { idx += Conv->convert(string(res[0])).length(); } rest = string(res.suffix().first, res.suffix().second); } else {//a one token string tokens.push_back(tToken(rest)); tokens.back().id(tokens.size()-1); UnicodeString utok = Conv->convert(tokens.back().surface()); int tokstart = idx; int tokend = idx + utok.length() - 1; for (vector *>::reverse_iterator it = startmap.rbegin(); it != startmap.rend(); ++it) { tokstart += (*(*it))[tokstart]; } for (vector *>::reverse_iterator it = endmap.rbegin(); it != endmap.rend(); ++it) { tokend += (*(*it))[tokend]; } tokens.back().start(tokstart-1); tokens.back().end(tokend); rest = string(); } } for (vector *>::iterator it = startmap.begin(); it != startmap.end(); ++it) delete *it; for (vector *>::iterator it = endmap.begin(); it != endmap.end(); ++it) delete *it; startmap.clear(); endmap.clear(); return tokens; } tReppFSRule::tReppFSRule(string type, const char *target, const char *format) : tReppRule(type), _targetstr(target) { _format = string(format); string tmp(target); if (tmp[0] == '^') tmp.replace(0,1, "\\G", 2); if (tmp[0] == '(' && tmp[1] == '^') tmp.replace(1, 1, "\\G", 2); //escape braces in target pattern int i = 0; while (true) { i = tmp.find('{', i); if (i == string::npos) break; tmp.replace(i, 1, "\\{", 2); i+=2; } i = 0; while (true) { i = tmp.find('}', i); if (i == string::npos) break; tmp.replace(i, 1, "\\}", 2); i+=2; } //record capture group reference positions in format _cgroups.push_back(0); UnicodeString uformat = 
Conv->convert(_format); int flen = uformat.length(); for (int x=0; x < flen; ++x) { if (uformat.charAt(x) == '\\') { if (x+1 < flen && isdigit(uformat.charAt(x+1))) { string temp = Conv->convert(uformat.charAt(x+1)); int groupno; istringstream(temp) >> groupno; if (groupno == _cgroups.size()) { _cgroups.push_back(x); } else break; //out of order group } else { if (x+1 == flen) cerr << "unescaped backslash in " << _format << endl; } } } _target = boost::make_u32regex(tmp); } string tReppGroupRule::name() { ostringstream out; out << _group_id; return(out.str()); } string tReppFSRule::apply(tRepp *r, string origitem) { string item = origitem; string::const_iterator start, end; start = item.begin(); end = item.end(); boost::smatch res; string newstring("^"); int stringindex = 0; vector *smap = new vector(item.length()); vector *emap = new vector(item.length()); (*smap)[stringindex] = 0; //represent zero length (*emap)[stringindex] = 0; //start anchor (^) stringindex++; int shift = 0; while (boost::u32regex_search(start, end, res, _target, boost::match_default|boost::format_sed)) { // copy string before match newstring += string(res.prefix().first, res.prefix().second); int nslen = Conv->convert(newstring).length(); if (nslen > smap->size()) { smap->resize(nslen); emap->resize(nslen); } for (; stringindex < nslen; stringindex++) { (*smap)[stringindex] = shift; (*emap)[stringindex] = shift; } // matched portion of string newstring += res.format(_format, boost::match_default|boost::format_sed); nslen = Conv->convert(newstring).length(); if (nslen > smap->size()) { smap->resize(nslen); emap->resize(nslen); } int mlen = Conv->convert(string(res[0])).length(); int sublength = nslen - stringindex; int nextgroup = 0; //next capture group to look for int ingroup = 0; //capture group we are inside int groupspan = 0; //length of capture group int nextgroupstart = 0; //start index of next capture group int endgroup = 0; //end index of next capture group (outside the group) int 
gap; //span between groups (or string boundaries and groups) int newstart; int newend; if (_cgroups.size() > 1) { nextgroup = 1; //looking for capture group 1 //first group so offset in format is fixed nextgroupstart = _cgroups[nextgroup]; gap = nextgroupstart; // gap from start of sentence to first group ref newstart = shift; //before first group, difference between matched and replaced length shift += Conv->convert(string(res[0].first,res[1].first)).length() - gap; newend = shift + gap - 1; } else { //no (in-order) capture groups gap = sublength; newstart = shift; //difference between matched and replaced length of full match shift += mlen - gap; newend = shift + gap - 1; } for (int count = 0; count < sublength; count++, stringindex++) { if (nextgroup > 0 && count == nextgroupstart) { //we are looking for a group and found one, set up numbers ingroup = nextgroup; //in span of capture group groupspan = Conv->convert(string(res[ingroup])).length(); endgroup = count + groupspan; if (_cgroups.size() > nextgroup+1) { nextgroup++; //next capture group to look for gap = _cgroups[nextgroup] - _cgroups[ingroup] - 2; //gap between nextgroupstart = endgroup + gap; } else { nextgroup = 0; //no more capture groups gap = sublength - endgroup; } } // cerr << "count: " << count << ", ingroup: " << ingroup << ", shift: " // << shift << ",newstart: " << newstart << ", newend: " << newend << // endl; if (ingroup) { if (count == endgroup) { //adding count so we can subtract count for the adjustment, //rather than subtracting index within the not_ingroup span newstart = shift + count; if (nextgroup > 0) { shift += (Conv->convert(string(res[ingroup].first, res[nextgroup].first)).length() - Conv->convert(string(res[ingroup])).length() - gap); } else { shift += (Conv->convert(string(res[ingroup].first, res[0].second)).length() - Conv->convert(string(res[ingroup])).length() - gap); } //adding count so we can subtract count for the adjustment, //rather than subtracting index within the 
not_ingroup span newend = shift + gap - 1 + count; ingroup = 0; // cerr << "-count: " << count << ", ingroup: " << ingroup << ", shift: " // << shift << ",newstart: " << newstart << ", newend: " << newend << // endl; } else { (*smap)[stringindex] = shift; (*emap)[stringindex] = shift; } } if (!ingroup) { (*smap)[stringindex] = newstart - count; (*emap)[stringindex] = newend - count; } } if (sublength == endgroup) { //end of match was end of group shift += (Conv->convert(string(res[ingroup].first, res.suffix().first)).length() - Conv->convert(string(res[ingroup])).length()); } start = res[0].second; } if (start == item.begin()) { //never matched return origitem; } else { // copy trailing portion of string newstring += string(res.suffix().first, res.suffix().second); int nslen = Conv->convert(newstring).length(); if (nslen >= smap->size()) { smap->resize(nslen+1); emap->resize(nslen+1); } for (; stringindex < nslen; stringindex++) { (*smap)[stringindex] = shift; (*emap)[stringindex] = shift; } // a zero length cell on the end to deal with zero length replacement (*smap)[nslen] = shift; (*emap)[nslen] = shift - 1; // cerr << "applied s/" << _targetstr << "/" << _format << "/g to \n\"^" // << origitem << "$\" and got \n\"" << newstring << "$\"" << endl; // int ctx = 0; // cerr << "smap:" << endl; // for (vector::iterator it = smap->begin(); // it != smap->end(); ++it) { // cerr << *it << " "; // ctx++; // if (ctx % 20 == 0) // cerr << endl; // } // cerr << endl; // // ctx = 0; // cerr << "emap:" << endl; // for (vector::iterator it = emap->begin(); // it != emap->end(); ++it) { // cerr << *it << " "; // ctx++; // if (ctx % 20 == 0) // cerr << endl; // } // cerr << endl; r->startmap.push_back(smap); r->endmap.push_back(emap); newstring.erase(0,1); return newstring; } } string tReppGroupRule::apply(tRepp *r, string item) { tReppGroup *gptr = r->getGroup(_group_id); string newitem(item); while (true) { string orig = newitem; for (vector::iterator iter = gptr->begin(); 
iter != gptr->end(); ++iter) newitem = (*iter)->apply(r, newitem); if (newitem == orig) break; } return newitem; } string tReppIncludeRule::apply(tRepp *r, string item) { string newitem(item); tRepp *increpp = r->getRepp(_iname); vector &incrules = increpp->rules(); for (vector::iterator iter = incrules.begin(); iter != incrules.end(); ++iter) { newitem = (*iter)->apply(increpp, newitem); } // copy the char maps from this included repp back to the main repp for (vector *>::iterator it = increpp->startmap.begin(); it != increpp->startmap.end(); ++it) { r->startmap.push_back(*it); } increpp->startmap.clear(); for (vector *>::iterator it = increpp->endmap.begin(); it != increpp->endmap.end(); ++it) { r->endmap.push_back(*it); } increpp->endmap.clear(); return newitem; }