;;;
;;; Normalize surviving XML punctuation marks: an em dash glued between two
;;; alphanumeric runs is rewritten as a plain hyphen.  Both the literal
;;; character and the unexpanded `&mdash;' entity are accepted.
;;;
xml-mdash-rule := inpmap-rule &
[ +INPUT < [ +FORM "/([A-Za-z0-9]+)(&mdash;|—)([A-Za-z0-9]*)/" ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}-${I1:+FORM:3}" ] > ].

; split off 's (possessive 's or contracted auxiliary): one input token
; becomes two output tokens, the stem and the clitic.
apostrophe-s-rule := inpmap-rule &
[ +INPUT < [ +FORM "/([A-Za-z]+)('s)/" ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}" ],
            [ +FORM "${I1:+FORM:2}" ] > ].

; split off other contracted auxiliaries ('ll, 'd, 've, 'm, 're).  The
; apostrophe is part of every alternative of the second group, so that only
; genuine contractions match (|we'll|) and plain words that merely end in the
; same letters (|well|, |have|, |here|) are left alone.
contracted-aux-ll-rule := inpmap-rule &
[ +INPUT < [ +FORM "/([A-Za-z]+)('ll|'LL)/" ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}" ],
            [ +FORM "${I1:+FORM:2}" ] > ].

contracted-aux-d-rule := inpmap-rule &
[ +INPUT < [ +FORM "/([A-Za-z]+)('[dD])/" ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}" ],
            [ +FORM "${I1:+FORM:2}" ] > ].

contracted-aux-ve-rule := inpmap-rule &
[ +INPUT < [ +FORM "/([A-Za-z]+)('ve|'VE)/" ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}" ],
            [ +FORM "${I1:+FORM:2}" ] > ].

contracted-aux-m-rule := inpmap-rule &
[ +INPUT < [ +FORM "/([A-Za-z]+)('[mM])/" ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}" ],
            [ +FORM "${I1:+FORM:2}" ] > ].

contracted-aux-re-rule := inpmap-rule &
[ +INPUT < [ +FORM "/([A-Za-z]+)('re|'RE)/" ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}" ],
            [ +FORM "${I1:+FORM:2}" ] > ].

;;;
;;; Parenthetical plurals - remove parens: |item(s)| becomes |items|
;;;
parenthetical-plural-rule := inpmap-rule &
[ +INPUT < [ +FORM "/([A-Za-z]+)(\(s\))/" ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}s" ] > ].

;;;
;;; Dates: 12-12-2005 12-12-05 12-2005 (with either `/' or `-' as separator)
;;;
date-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{4}/" ] >,
  +OUTPUT < [ +FORM "dateersatz" ] > ].

date-2-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2}/" ] >,
  +OUTPUT < [ +FORM "dateersatz" ] > ].

date-3-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{1,2}[/-][0-9]{4}/" ] >,
  +OUTPUT < [ +FORM "dateersatz" ] > ].

;;;
;;; phone numbers; making these a little more general would require more work.
;;;
; seven-digit US-style number written as three digits, hyphen, four digits
us-phone-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{3}-[0-9]{4}/" ] >,
  +OUTPUT < [ +FORM "threefournumberersatz" ] > ].

; |plus 47| followed by four two-digit groups, each space being optional
nor-phone-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/plus ?47 ?[0-9]{2} ?[0-9]{2} ?[0-9]{2} ?[0-9]{2}/" ] >,
  +OUTPUT < [ +FORM "norphoneersatz" ] > ].

;;;
;;; product number identifiers like 1234-5678 or 1234- 5678
;;;
prod-identif-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{4,}-[0-9]{4,}/" ] >,
  +OUTPUT < [ +FORM "numberersatz" ] > ].

; first half of a split identifier: four or more digits plus trailing hyphen
prod-identif-2-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{4,}-/" ] >,
  +OUTPUT < [ +FORM "numberersatz" ] > ].

;;;
;;; ranges, e.g 10-20
;;;
range-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{1,3}-[0-9]{1,3}/" ] >,
  +OUTPUT < [ +FORM "rangeersatz" ] > ].

;;; 2-2.5 or 2-2,5 (European)
range-2-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{1,3}-[0-9]{1,3}[.,][0-9]{1,2}/" ] >,
  +OUTPUT < [ +FORM "rangeersatz" ] > ].

;;;
;;; Decades: |in the |1970s|, at least, the world was still in order ...|
;;;
decade-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/1[0-9][0-9]0[sS]/" ] >,
  +OUTPUT < [ +FORM "decadeersatz" ] > ].

decade-2-rule := inpmap-rule &
[ +INPUT < [ +FORM "/1[0-9]0[sS]/" ] >,
  +OUTPUT < [ +FORM "decadeersatz" ] > ].

;;;
;;; mixed alphanumerics as identifiers; for the ecommerce corpus, we know that
;;; (by convention) five- and six-digit sequences are (product) identifiers.
;;;
mixed-alphanum-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]*[A-Z]+[0-9][A-Z0-9]*/" ] >,
  +OUTPUT < [ +FORM "identifierersatz" ] > ].

mixed-alphanum-2-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]+[A-Z]+[A-Z0-9]*/" ] >,
  +OUTPUT < [ +FORM "identifierersatz" ] > ].

mixed-alphanum-3-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]*[A-Z]+\.[A-Z]*[0-9]+/" ] >,
  +OUTPUT < [ +FORM "identifierersatz" ] > ].

; bare five- or six-digit sequences (see the convention stated above)
mixed-alphanum-4-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{5,6}/" ] >,
  +OUTPUT < [ +FORM "identifierersatz" ] > ].
;;;
;;; Item identifiers: 1a, 3d, 1-a (but crucially not 1st, 2nd, 3rd)
;;;
; digits followed by exactly one letter
mixed-alphanum-5-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]+[a-zA-Z]/" ] >,
  +OUTPUT < [ +FORM "identifierersatz" ] > ].

; four or more digits followed by one or more letters
mixed-alphanum-6-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{3}[0-9]+[a-zA-Z]+/" ] >,
  +OUTPUT < [ +FORM "identifierersatz" ] > ].

; digits, a hyphen, and exactly one letter
mixed-alphanum-7-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]+[-][a-zA-Z]/" ] >,
  +OUTPUT < [ +FORM "identifierersatz" ] > ].

;;;
;;; Identifiers used in Sciborg
;;;
mixed-alphanum-sciborg-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[?][/][b][0-9]{6}[a-z]/" ] >,
  +OUTPUT < [ +FORM "identifierersatz" ] > ].

;;;
;;; Measure NPs: a number with a unit suffix attached directly to it
;;;
measure-np-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]+([m'\"]|cm|mm|σ)/" ] >,
  +OUTPUT < [ +FORM "measnpersatz" ] > ].

measure-np-2-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]+[Kk][m]/" ] >,
  +OUTPUT < [ +FORM "measnpersatz" ] > ].

; as above, for numbers with a `,' or `.' separated fractional part
measure-np-3-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]+[,.][0-9]+[m'\"]/" ] >,
  +OUTPUT < [ +FORM "measnpersatz" ] > ].

measure-np-4-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]+[,.][0-9]+[Kk][m]/" ] >,
  +OUTPUT < [ +FORM "measnpersatz" ] > ].

; same shapes again, additionally admitting `-' between the two digit runs
measure-np-5-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]+[.,-][0-9]+[m'\"]/" ] >,
  +OUTPUT < [ +FORM "measnpersatz" ] > ].

measure-np-6-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]+[.,-][0-9]+[Kk][m]/" ] >,
  +OUTPUT < [ +FORM "measnpersatz" ] > ].

; |1:| followed by three or more digits
measure-np-7-rule := inpmap-rule &
[ +INPUT < [ +FORM "/1:[0-9][0-9][0-9]+/" ] >,
  +OUTPUT < [ +FORM "measnpersatz" ] > ].

;;;
;;; _fix_me_
;;; in the case of hyphens, if we have decided to strip these off in the string
;;; rewrite rules already, the ersatzing at this point may fail.
;;; it seems one would either have to allow ersatzing at the string level too
;;; and devise an encoding scheme (using 0x1 to 0x4, say, to number ersatz
;;; occurrences) that makes sure ersatzes are not mangled in further
;;; string-level processing; at the end of the day, then, look up the original
;;; surface string and put the readable ersatz into the token. (2-feb-02; oe)
;;;
email-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[-a-zA-Z0-9]+@[-.a-zA-Z0-9]+/" ] >,
  +OUTPUT < [ +FORM "emailersatz" ] > ].

;;;
;;; Fractions
;;;
; one digit, a slash, and a one- or two-digit ordinal (|1/21st|, |1/32nd|, ...)
fraction-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{1}\/[0-9]{1,2}st/" ] >,
  +OUTPUT < [ +FORM "fractionersatz" ] > ].

fraction-2-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{1}\/[0-9]{1,2}nd/" ] >,
  +OUTPUT < [ +FORM "fractionersatz" ] > ].

fraction-3-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{1}\/[0-9]{1,2}rd/" ] >,
  +OUTPUT < [ +FORM "fractionersatz" ] > ].

fraction-4-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{1}\/[0-9]{1,2}th/" ] >,
  +OUTPUT < [ +FORM "fractionersatz" ] > ].

; plain numerator/denominator
fraction-5-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{1,2}\/[0-9]{1,3}/" ] >,
  +OUTPUT < [ +FORM "fractionersatz" ] > ].

; comma-grouped numbers (|1,000| and |1,000,000| shapes) share this ersatz
fraction-6-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{1,3},[0-9]{3}/" ] >,
  +OUTPUT < [ +FORM "fractionersatz" ] > ].

fraction-7-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{1,3},[0-9]{1,3},[0-9]{3}/" ] >,
  +OUTPUT < [ +FORM "fractionersatz" ] > ].

;;;
;;; Cardinal numerals, one ersatz per digit count
;;;
; note that the one-digit pattern only covers 2 through 9
cardinal-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[2-9]{1}/" ] >,
  +OUTPUT < [ +FORM "onedigitersatz" ] > ].

cardinal-2-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{2}/" ] >,
  +OUTPUT < [ +FORM "twodigitersatz" ] > ].

cardinal-3-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{3}/" ] >,
  +OUTPUT < [ +FORM "threedigitersatz" ] > ].

cardinal-4-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{4}/" ] >,
  +OUTPUT < [ +FORM "fourdigitersatz" ] > ].
; cardinal numerals of five and more digits, one ersatz per digit count
cardinal-5-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{5}/" ] >,
  +OUTPUT < [ +FORM "fivedigitersatz" ] > ].

cardinal-6-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{6}/" ] >,
  +OUTPUT < [ +FORM "sixdigitersatz" ] > ].

cardinal-7-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{7}/" ] >,
  +OUTPUT < [ +FORM "sevendigitersatz" ] > ].

cardinal-8-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{8}/" ] >,
  +OUTPUT < [ +FORM "eightdigitersatz" ] > ].

cardinal-9-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{9}/" ] >,
  +OUTPUT < [ +FORM "ninedigitersatz" ] > ].

cardinal-10-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{10}/" ] >,
  +OUTPUT < [ +FORM "tendigitersatz" ] > ].

cardinal-11-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{11}/" ] >,
  +OUTPUT < [ +FORM "elevendigitersatz" ] > ].

cardinal-12-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{12}/" ] >,
  +OUTPUT < [ +FORM "twelvedigitersatz" ] > ].

; catch-all for thirteen digits or more
cardinal-13-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{13,}/" ] >,
  +OUTPUT < [ +FORM "thirteenplusdigitersatz" ] > ].

; negative integers: a leading hyphen and one or more digits
neg-digit-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/\-[0-9]+/" ] >,
  +OUTPUT < [ +FORM "negdigitersatz" ] > ].

; negative decimals.  The decimal point is escaped: a bare `.' matches any
; character, which made this pattern also accept plain negative integers of
; three or more digits (|-123|) as well as strings like |-1x2|.
neg-decimal-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/\-[0-9]+\.[0-9]+/" ] >,
  +OUTPUT < [ +FORM "negdecimalersatz" ] > ].

;;;
;;; Numerical ordinals like "360th"; the ersatz name records the digit count
;;;
ordinal-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/1st/" ] >,
  +OUTPUT < [ +FORM "onedigitordersatz" ] > ].

ordinal-2-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]1st/" ] >,
  +OUTPUT < [ +FORM "twodigitordersatz" ] > ].

ordinal-3-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{2}1st/" ] >,
  +OUTPUT < [ +FORM "threedigitordersatz" ] > ].

ordinal-4-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{3}1st/" ] >,
  +OUTPUT < [ +FORM "fourdigitordersatz" ] > ].

ordinal-5-rule := inpmap-rule &
[ +INPUT < [ +FORM "/2nd/" ] >,
  +OUTPUT < [ +FORM "onedigitordersatz" ] > ].
; numerical ordinals in `2nd' with two to four digits
ordinal-6-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]2nd/" ] >,
  +OUTPUT < [ +FORM "twodigitordersatz" ] > ].

ordinal-7-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{2}2nd/" ] >,
  +OUTPUT < [ +FORM "threedigitordersatz" ] > ].

ordinal-8-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{3}2nd/" ] >,
  +OUTPUT < [ +FORM "fourdigitordersatz" ] > ].

; numerical ordinals in `3rd' with one to four digits
ordinal-9-rule := inpmap-rule &
[ +INPUT < [ +FORM "/3rd/" ] >,
  +OUTPUT < [ +FORM "onedigitordersatz" ] > ].

ordinal-10-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]3rd/" ] >,
  +OUTPUT < [ +FORM "twodigitordersatz" ] > ].

ordinal-11-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{2}3rd/" ] >,
  +OUTPUT < [ +FORM "threedigitordersatz" ] > ].

ordinal-12-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{3}3rd/" ] >,
  +OUTPUT < [ +FORM "fourdigitordersatz" ] > ].

; numerical ordinals in `th' with one to four digits (any digits at all)
ordinal-13-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]th/" ] >,
  +OUTPUT < [ +FORM "onedigitordersatz" ] > ].

ordinal-14-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{2}th/" ] >,
  +OUTPUT < [ +FORM "twodigitordersatz" ] > ].

ordinal-15-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{3}th/" ] >,
  +OUTPUT < [ +FORM "threedigitordersatz" ] > ].

ordinal-16-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{4}th/" ] >,
  +OUTPUT < [ +FORM "fourdigitordersatz" ] > ].

;;;
;;; a couple of currencies, as they occur now and again
;;;
currency-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/US\$/" ] >,
  +OUTPUT < [ +FORM "currencyersatz" ] > ].

currency-2-rule := inpmap-rule &
[ +INPUT < [ +FORM "/HK\$/" ] >,
  +OUTPUT < [ +FORM "currencyersatz" ] > ].

currency-3-rule := inpmap-rule &
[ +INPUT < [ +FORM "/C\$/" ] >,
  +OUTPUT < [ +FORM "currencyersatz" ] > ].

;;;
;;; temperatures: optional minus, up to three digits, degree sign, optional
;;; |C| or |F|
;;;
temperature-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[-]?[0-9]{1,3}°[CF]?/" ] >,
  +OUTPUT < [ +FORM "temperatureersatz" ] > ].

;;;
;;; times
;;;
; colon-separated clock time such as |9:05| or |23:59|
time-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-2]?[0-9]:[0-5][0-9]/" ] >,
  +OUTPUT < [ +FORM "clocktimeersatz" ] > ].
; period-separated clock time (|12.30|), ambiguous with a decimal number, hence
; the ersatz name.  The period is escaped; a bare `.' matches any character and
; let this rule also fire on |12:30|, |12-30| and four-digit strings like |1230|.
time-2-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-2]?[0-9]\.[0-5][0-9]/" ] >,
  +OUTPUT < [ +FORM "clockordecimalersatz" ] > ].

; clock time with an am/pm suffix; the separator is optional (|930am|)
time-3-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-2]?[0-9][:.]?[0-5][0-9][aApP][mM]/" ] >,
  +OUTPUT < [ +FORM "clocktimeersatz" ] > ].

;;;
;;; ratios, e.g. 1:50,000
;;;
ratio-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{1,3}[:][0-9]{1,3}/" ] >,
  +OUTPUT < [ +FORM "ratioersatz" ] > ].

ratio-2-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{1,3}[:][0-9]{1,3},[0-9]{3}/" ] >,
  +OUTPUT < [ +FORM "ratioersatz" ] > ].

;;;
;;; General decimal conversion, ordered after clocktime
;;;
decimal-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]*\.[0-9]+/" ] >,
  +OUTPUT < [ +FORM "decimalersatz" ] > ].

;;;
;;; Section numbers: 3.2.4
;;;
section-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]+\.[0-9]+\.[0-9]+/" ] >,
  +OUTPUT < [ +FORM "identifierersatz" ] > ].

;;; Allow decimal variant in other countries, with comma rather than period
decimal-2-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]+\,[0-9]+/" ] >,
  +OUTPUT < [ +FORM "decimalersatz" ] > ].

;;;
;;; email and web addresses ... lots of room for improvement (2-jul-03; oe)
;;;
;;; NOTE(review): all five patterns in this group had been reduced to |/?/|,
;;; which is not a valid regular expression (a quantifier with nothing to
;;; repeat); presumably angle-bracketed pattern text was stripped as markup
;;; at some point.  The patterns below are a reconstruction (host name with an
;;; optional path, optionally wrapped in angle brackets) -- please confirm
;;; against an authoritative copy of the grammar before relying on them.
;;;
; |http://| and |https://| URLs
web-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/<?https?:\/\/[-.a-zA-Z0-9_]+(\/[^ ]*)?>?/" ] >,
  +OUTPUT < [ +FORM "webersatz" ] > ].

; |ftp://| URLs
web-2-rule := inpmap-rule &
[ +INPUT < [ +FORM "/<?ftp:\/\/[-.a-zA-Z0-9_]+(\/[^ ]*)?>?/" ] >,
  +OUTPUT < [ +FORM "webersatz" ] > ].

; scheme-less addresses starting in |www.|
web-3-rule := inpmap-rule &
[ +INPUT < [ +FORM "/<?www\.[-.a-zA-Z0-9_]+(\/[^ ]*)?>?/" ] >,
  +OUTPUT < [ +FORM "webersatz" ] > ].

; scheme-less host names followed by a path (|example.org/index.html|)
web-4-rule := inpmap-rule &
[ +INPUT < [ +FORM "/<?[-a-zA-Z0-9_]+(\.[-a-zA-Z0-9_]+)+\/[^ ]*>?/" ] >,
  +OUTPUT < [ +FORM "webersatz" ] > ].

; email addresses, optionally in angle brackets, with |.| and |_| admitted in
; the local part
email-2-rule := inpmap-rule &
[ +INPUT < [ +FORM "/<?[-._a-zA-Z0-9]+@[-.a-zA-Z0-9]+>?/" ] >,
  +OUTPUT < [ +FORM "emailersatz" ] > ].

;;;
;;; reduced year names; possibly another case where, in full generality, we
;;; would have to be able to strip off the leading apostrophe first and later,
;;; in the token-level part, introduce a tokenization alternative, re-uniting
;;; the apostrophe and two-digit year.
;;;
; apostrophe plus two digits, as in |'05|
year-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/'[0-9][0-9]/" ] >,
  +OUTPUT < [ +FORM "yearersatz" ] > ].

;;;
;;; Range of years, as in |1970-75|
;;;
year-2-rule := inpmap-rule &
[ +INPUT < [ +FORM "/[0-9]{3,4}-[0-9]{2,4}/" ] >,
  +OUTPUT < [ +FORM "yearersatz" ] > ].

;;;
;;; Special treatment for two-letter abbreviations like
;;; OR (Oregon), IN (Indiana), CO (Colorado), US, and IT
;;; And similarly for ON, OFF, as in "the ON switch".
;;; Each all-caps form is rewritten with a leading underscore.
;;;
abb-or-rule := inpmap-rule &
[ +INPUT < [ +FORM "/OR/" ] >,
  +OUTPUT < [ +FORM "_OR" ] > ].

abb-in-rule := inpmap-rule &
[ +INPUT < [ +FORM "/IN/" ] >,
  +OUTPUT < [ +FORM "_IN" ] > ].

abb-co-rule := inpmap-rule &
[ +INPUT < [ +FORM "/CO/" ] >,
  +OUTPUT < [ +FORM "_CO" ] > ].

abb-us-rule := inpmap-rule &
[ +INPUT < [ +FORM "/US/" ] >,
  +OUTPUT < [ +FORM "_US" ] > ].

abb-it-rule := inpmap-rule &
[ +INPUT < [ +FORM "/IT/" ] >,
  +OUTPUT < [ +FORM "_IT" ] > ].

abb-on-rule := inpmap-rule &
[ +INPUT < [ +FORM "/ON/" ] >,
  +OUTPUT < [ +FORM "_ON" ] > ].

abb-off-rule := inpmap-rule &
[ +INPUT < [ +FORM "/OFF/" ] >,
  +OUTPUT < [ +FORM "_OFF" ] > ].

;;;
;;; Squished compounds: |backcountry| is split into |back| and |country|
;;;
split-compound-1-rule := inpmap-rule &
[ +INPUT < [ +FORM "/(back)(country)/" ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}" ],
            [ +FORM "${I1:+FORM:2}" ] > ].