;;; -*- mode: fundamental; coding: utf-8; indent-tabs-mode: t; -*- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; first shot at a finite-state language for preprocessing, normalization, and ;;; tokenization in LKB grammars. requires LKB version after 1-feb-03. note ;;; that the syntax is rigid: everything starting in column 2 (i.e. right after ;;; the rule type marker) is used as the match pattern until the first `\t' ;;; (tabulator sign); one or more tabulator signs are considered the separator ;;; between the matching pattern and the replacement, but other whitespace will ;;; be considered part of the patterns. empty lines or lines with a semicolon ;;; in column 1 (i.e. in place of the rule type marker, this is not Lisp) will ;;; be ignored. ;;; ;;; rules are applied in order and, in the case of substitution rules, each sees ;;; the output of the previous iteration. token-level augmentation rules (the ;;; `+' type, for now) are different in that they add an alternative for the ;;; token but the original form remains in the input buffer for subsequent rule ;;; applications (i.e. the alternative is _not_ visible to further rules). ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; preprocessor rules versioning; auto-maintained upon CVS check-in. ;;; @$Date: 2005/11/16 19:11:45 $ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; tokenization pattern: after normalization, the string will be broken up at ;;; each occurrence of this pattern; the pattern match itself is deleted. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; :[ \t]+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; string rewrite rules: all matches, over the entire string, are replaced by ;;; the right-hand side; grouping (using `(' and `)' in the pattern) and group ;;; references (`\1' for the first group, et al.) 
carry over part of the match. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; pad the full string with trailing and leading whitespace; makes matches for ;;; word boundaries a little easier down the road. ;;; !^(.+)$ \1 ;;; ;;; separate hash sign from right-adjacent number(s) ;;; !([#])([0-9]) \1 \2 ;;; ;;; Replace three or more dots with token 'threedots ' ;;; !([.])([.])([.])+ threedots ;;; ;;; replace multiple sentence-final punctuation marks with only the first one ;;; ;!([?!.])[?!.]+ \1 ;;; ;;; Only for robust EC, VM where some items have final comma rather than period ;;; ;!([,]) ;;; ;;; _fix_me_ ;;; at least for hyphens, we should introduce a notion of `bound' tokens, ;;; e.g. |^.| for a hyphen that was stripped off from one or more tokens: ;;; |US-led| --> |US| |^-| |led|. this way, separating hyphens as ;;; individual tokens need not create ambiguity with the parenthetical hyphen, ;;; Collapse triple-hyphen with double-hyphen (for now), and pad on both ;;; sides with whitespace !([-]){3} __ ;;; Replace "--" with "__" to keep double-hyphen separate from single hyphen !([-]){2} __ ;;; Replace |«| and |»| with |"| !([«»]) " ;;; ;;; Separate funny punctuation with whitespace on either side from the ;;; preceding and following word(s). ;;; !([a-zA-Z0-9])([#$%~+]) \1 \2 ! ([#$%~+])([a-zA-Z0-9]) \1 \2 ;;; ;;; Eliminate spurious space preceding ordinary punctuation ;;; ! ([:;,.:!\)]) \1 ;;; Eliminate space preceding double quote when space on both sides ;;; (admittedly arbitrary, and not correct for S-initial stranded quote, ;;; but this is just a patch-up for odd punctuation convention). ! (["]) \1 ;;; ;;; Add white space to the right of squished commas and colons ;;; except for numbers on both sides (but separate e.g. |2-day| and |V-neck|) ;;; Same for periods between capital letters: "D.B. 
Smith", "2.Kim" ;;; Add white space on both sides for forward slash ;;; !([a-zA-Z])([,:])([a-zA-Z]) \1\2 \3 !([0-9])([,:])([a-zA-Z]) \1\2 \3 !([0-9])([-])([a-zA-Z]) \1 \2 \3 ! ([a-zA-Z])([,:-])([a-zA-Z]) \1\2 \3 !([a-zA-Z])([,:])([0-9]) \1\2 \3 !([A-Z0-9])([.])([A-Z]) \1\2 \3 !([a-zA-Z])([/])([a-zA-Z]) \1 \2 \3 !([0-9])([/])([a-zA-Z]) \1 \2 \3 !([a-zA-Z])([/])([0-9]) \1 \2 \3 !([a-zA-Z])([-])([a-zA-Z0-9]) \1\2 \3 ;;; ;;; Eliminate spurious space preceding right paren ;;; ! ([\)]) \1 ;;; ;;; Eliminate spurious space following left paren ;;; !([\(]) \1 ;;; ;;; Add white space to left of (. ;;; !([a-zA-Z0-9])([\(]) \1 \2 ;;; And the variants with surrounding punctuation !([a-zA-Z])([,:/])([a-zA-Z])([.?!,;]) \1\2 \3\4 !([0-9])([,:/])([a-zA-Z])([.?!,;]) \1\2 \3\4 !([a-zA-Z])([,:/])([0-9])([.?!,;]) \1\2 \3\4 ;;; ;;; For now, simplify punctuation clusters found in subordinate quoted Ss, as ;;; in "Who arrived?, she asked" since current suffixing machinery doesn't ;;; produce result. Also for "... two hrs., ..." FIX !([a-zA-Z0-9])([.?!])(,) \1\3 ;;; ;;; apostrophes are a bit tricky: generally, we want to separate leading and ;;; trailing single quotes from adjacent word material, so that they become a ;;; separate token (e.g. |abrams'| --> |abrams '|); the possessive |'s|, on the ;;; other hand, we want to separate but then consider a single token. ;;; !([sS])' \1 ' !([^ ])'[sS] \1 's !([^ ])'[sS]([.?!,;]) \1 's\2 ;;; ;;; contracted auxiliaries: separate contracted part from preceding word. ;;; !([^ ])'ll \1 'll !([^ ])'d \1 'd !([^ ])'ve \1 've !([^ ])'m \1 'm !([^ ])'re \1 're !([^ ])'LL \1 'LL !([^ ])'D \1 'D !([^ ])'VE \1 'VE !([^ ])'M \1 'M !([^ ])'RE \1 'RE ;;; ;;; Experimental: mark capitalization with preceding special character |_| ;;; but right now only for single letters used as proper names. ;;; ! ([A-Z]) _\1 ! 
([A-Z])([.?!,;":\)-]+) _\1\2 ;;; And for now, put back capital I since it's so frequent as pronoun !_I I ;;; Also add special treatment for abbrev OR for Oregon and for US ;;; (consider also IN) ! OR _OR ! US _US ;;; ;;; Try correcting squished compounds, here just by listing ;;; !backcountry back country ;;; ;;; Parenthetical plurals - remove parens !([A-Za-z]*)\(s\) \1s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; file inclusion: there is an ad hoc set of `spell correction' rules for the ;;; static ecommerce data sets which we want to keep in a separate file. ;;; NOTE(review): no inclusion directive is visible after this header, and the ;;; ersatz rules that follow look truncated (the first one references \2 with a ;;; single group; four of them share one pattern) -- TODO confirm against the ;;; upstream rule file. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;?([.?!,;":\)]*) \1WebErsatz\2 ^([\(]*)?([.?!,;":\)]*) \1WebErsatz\2 ^([\(]*)?([.?!,;":\)]*) \1WebErsatz\2 ^([\(]*)?([.?!,;":\)]*) \1WebErsatz\2 ^([\(]*)?([.?!,;":\)]*) \1EmailErsatz\2 ;;; ;;; reduced year names; possibly another case where, in full generality, we ;;; would have to be able to strip off the leading apostrophe first and later, ;;; in the token-level part, introduce a tokenization alternative, re-uniting ;;; the apostrophe and two-digit year. ;;; ^([\(]*)'[0-9][0-9]([.?!,;":\)]*) \1YearErsatz\2 ;;; |an 800m hill| ;;; ^~?([\(]*)[0-9]+[m'"]([.?!,;":\)]*) \1MeasNPErsatz\2 ^~?([\(]*)[0-9]+[Kk][m]([.?!,;":\)]*) \1MeasNPErsatz\2 ^~?([\(]*)[0-9]+[,.][0-9]+[m'"]([.?!,;":\)]*) \1MeasNPErsatz\2 ^~?([\(]*)[0-9]+[,.][0-9]+[Kk][m]([.?!,;":\)]*) \1MeasNPErsatz\2 ^~?([\(]*)[0-9]+[.,-][0-9]+[m'"]([.?!,;":\)]*) \1MeasNPErsatz\2 ^~?([\(]*)[0-9]+[.,-][0-9]+[Kk][m]([.?!,;":\)]*) \1MeasNPErsatz\2 ^~?([\(]*)1:[0-9]+([.?!,;":\)]*) \1MeasNPErsatz\2