;;; -*- mode: fundamental; coding: utf-8; indent-tabs-mode: t; -*- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; first shot at a finite-state language for preprocessing, normalization, and ;;; tokenization in LKB grammars. requires LKB version after 1-feb-03. note ;;; that the syntax is rigid: everything starting in column 2 (i.e. right after ;;; the rule type marker) is used as the match pattern until the first `\t' ;;; (tabulator sign); one or more tabulator signs are considered the separator ;;; between the matching pattern and the replacement, but other whitespace will ;;; be considered part of the patterns. empty lines or lines with a semicolon ;;; in column 1 (i.e. in place of the rule type marker, this is not Lisp) will ;;; be ignored. ;;; ;;; rules are applied in order and, in the case of substitution rules, each sees ;;; the output of the previous iteration. token-level augmentation rules (the ;;; `+' type, for now) are different in that they add an alternative for the ;;; token but the original form remains in the input buffer for subsequent rule ;;; applications (i.e. the alternative is _not_ visible to further rules). ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; preprocessor rules versioning; auto-maintained upon CVS check-in. ;;; @$Date: 2006/11/09 18:15:31 $ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; tokenization pattern: after normalization, the string will be broken up at ;;; each occurrence of this pattern; the pattern match itself is deleted. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; :[ \t]+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; string rewrite rules: all matches, over the entire string, are replaced by ;;; the right-hand side; grouping (using `(' and `)' in the pattern) and group ;;; references (`\1' for the first group, et al.) 
carry over part of the match. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; pad the full string with trailing and leading whitespace; makes matches for ;;; word boundaries a little easier down the road. ;;; !^(.+)$ \1 ;;; ;;; separate hash sign from right-adjacent number(s) ;;; !([#])([0-9]) \1 \2 ;;; ;;; Replace three or more dots with token 'threedots ' ;;; !\.{3,} threedots ;;; ;;; replace multiple sentence-final punctuation marks with only the first one ;;; ;!([?!.])[?!.]+ \1 ;;; ;;; Only for robust EC, VM where some items have final comma rather than period ;;; ;!([,]) ;;; ;;; _fix_me_ ;;; at least for hyphens, we should introduce a notion of `bound' tokens, ;;; e.g. |^.| for a hyphen that was stripped off from one or more tokens: ;;; |US-led| --> |US| |^-| |led|. this way, separating hyphens as ;;; individual tokens need not create ambiguity with the parenthetical hyphen, ;;; Collapse triple-hyphen with double-hyphen (for now), and pad on both ;;; sides with whitespace !-{3} __ ;;; Replace "--" with "__" to keep double-hyphen separate from single hyphen !-{2} __ ;;; Replace |«| and |»| with |"| !(?:«|») " ;;; DPF 18-oct-06 - Temporarily replace |+| with |plus| for PET tokenizer bug ! ([+]) plus ;;; ;;; Separate funny punctuation with whitespace on either side from the ;;; preceding and following word(s). ;;; !([a-zA-Z0-9])([#$%~+]) \1 \2 ! ([#$%~+])([a-zA-Z0-9]) \1 \2 ;;; ;;; Eliminate spurious space preceding ordinary punctuation ;;; ! ([;,.!\)\?]) \1 ;;; Eliminate space preceding double quote when space on both sides ;;; (admittedly arbitrary, and not correct for S-initial stranded quote, ;;; but this is just a patch-up for odd punctuation convention). ! (") \1 ;;; ;;; Add white space to the right of squished commas and colons ;;; except for numbers on both sides (but separate e.g. |2-day| and |V-neck|) ;;; Same for periods between capital letters: "D.B. 
Smith", "2.Kim" ;;; Add white space on both sides for forward slash ;;; !([a-zA-Z])([,:])([a-zA-Z]) \1\2 \3 !([0-9])([,:])([a-zA-Z]) \1\2 \3 !([0-9])(-)([a-zA-Z][a-zA-Z]+) \1 \2 \3 ! ([a-zA-Z])([,:-])([a-zA-Z]) \1\2 \3 !([a-zA-Z])([,:])([0-9]) \1\2 \3 !([A-Z0-9])(\.)([A-Z]) \1\2 \3 !([a-zA-Z])(/)([a-zA-Z]) \1 \2 \3 !([a-zA-Z])(/) ([a-zA-Z]) \1 \2 \3 !([0-9])(/)([a-zA-Z]) \1 \2 \3 !([a-zA-Z])(/)([0-9]) \1 \2 \3 !([a-zA-Z])(-)([a-zA-Z0-9]) \1\2 \3 ;;; ;;; Eliminate spurious space preceding right paren ;;; ! ([\)]) \1 ;;; ;;; Eliminate spurious space following left paren ;;; !([\(]) \1 ;;; ;;; Add white space to left of ( ;;; !([a-zA-Z0-9.])([\(]) \1 \2 ;;; ;;; Add white space around colon if sandwiched with following alphanumeric ;;; (but not e.g. |http://...| and not ratios with numbers on both sides) ;;; !([a-zA-Z0-9.])([:])([a-zA-Z]) \1 \2 \3 !([a-zA-Z.])([:])([a-zA-Z0-9]) \1 \2 \3 ;;; And add white space to the left of colon when followed by white space: !([a-zA-Z0-9.])([:]) \1 \2 ;;; And the variants with surrounding punctuation !([a-zA-Z])([,/])([a-zA-Z])([.?!,;]) \1\2 \3\4 !([0-9])([,/])([a-zA-Z])([.?!,;]) \1\2 \3\4 !([a-zA-Z])([,/])([0-9])([.?!,;]) \1\2 \3\4 ;;; ;;; For now, simplify punctuation clusters found in subordinate quoted Ss, as ;;; in "Who arrived?, she asked" since current suffixing machinery doesn't ;;; produce result. Also for "... two hrs., ..." FIX !([a-zA-Z0-9])[.?!](,) \1\2 ;;; ;;; apostrophes are a bit tricky: generally, we want to separate leading and ;;; trailing single quotes from adjacent word material, so that they become a ;;; separate token (e.g. |abrams'| --> |abrams '|); the possessive |'s|, on the ;;; other hand, we want to separate but then consider a single token. ;;; !([sS])' \1 ' !([^ ])'[sS] \1 's !([^ ])'[sS]([.?!,;]) \1 's\2 ;;; ;;; contracted auxiliaries: separate contracted part from preceding word. 
;;; !([^ ])'ll \1 'll !([^ ])'d \1 'd !([^ ])'ve \1 've !([^ ])'m \1 'm !([^ ])'re \1 're !([^ ])'LL \1 'LL !([^ ])'D \1 'D !([^ ])'VE \1 'VE !([^ ])'M \1 'M !([^ ])'RE \1 'RE ;;; ;;; Experimental: mark capitalization with preceding special character |_| ;;; but right now only for single letters used as proper names. ;;; Add special case for sequence of two capitals separated by space, since ;;; the space after the first one gets consumed by the simple rule. ;;; Exclude "I" since it's so frequent as pronoun ;;; ! ([A-HJ-Z]) ([A-HJ-Z]) _\1 _\2 ! ([A-HJ-Z]) _\1 ! ([A-HJ-Z])([.?!,;":\)-]+) _\1\2 ;;; ;;; Try correcting squished compounds, here just by listing ;;; !backcountry back country ;;; ;;; Parenthetical plurals - remove parens, e.g. |book(s)| --> |books| !([A-Za-z]*)\(s\) \1s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; file inclusion: there is an ad hoc set of `spell correction' rules for the ;;; static ecommerce data sets which we want to keep in a separate file. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ?([.?!,;":\)]*) \1WebErsatz\2 ^([\(]*)?([.?!,;":\)]*) \1WebErsatz\2 ^([\(]*)?([.?!,;":\)]*) \1WebErsatz\2 ^([\(]*)?([.?!,;":\)]*) \1WebErsatz\2 ^([\(]*)?([.?!,;":\)]*) \1EmailErsatz\2 ;;; ;;; reduced year names; possibly another case where, in full generality, we ;;; would have to be able to strip off the leading apostrophe first and later, ;;; in the token-level part, introduce a tokenization alternative, re-uniting ;;; the apostrophe and two-digit year. ;;; ^([\(]*)'[0-9][0-9]([.?!,;":\)]*) \1YearErsatz\2 ;;; Range of years, as in |1970-75| ^([\(]*)[0-9]{3,4}-[0-9]{2,4}([.?!,;":\)]*) \1YearErsatz\2 ;;; Also add special treatment for abbrev OR (Oregon), IN (Indiana) and US +OR _OR +IN _IN +US _US ;+([^:]+): \1 ;| _: