;;; -*- mode: fundamental; coding: utf-8; indent-tabs-mode: t; -*-

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; first shot at a finite-state language for preprocessing, normalization, and
;;; tokenization in LKB grammars.  requires LKB version after 1-feb-03.  note
;;; that the syntax is rigid: everything starting in column 2 (i.e. right after
;;; the rule type marker) is used as the match pattern until the first `\t'
;;; (tabulator sign); one or more tabulator signs are considered the separator
;;; between the matching pattern and the replacement, but other whitespace will
;;; be considered part of the patterns.  empty lines or lines with a semicolon
;;; in column 1 (i.e. in place of the rule type marker, this is not Lisp) will
;;; be ignored.
;;;
;;; rules are applied in order and, in the case of substitution rules, each sees
;;; the output of the previous iteration.  token-level augmentation rules (the
;;; `+' type, for now) are different in that they add an alternative for the 
;;; token but the original form remains in the input buffer for subsequent rule
;;; applications (i.e. the alternative is _not_ visible to further rules).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; preprocessor rules versioning; auto-maintained upon CVS check-in.
;;; the `@' line below carries the CVS `$Date$' keyword, i.e. its value is
;;; rewritten by CVS on every check-in.
;;;
@$Date: 2006/11/09 18:15:31 $

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; tokenization pattern: after normalization, the string will be broken up at
;;; each occurrence of this pattern; the pattern match itself is deleted.
;;; as written, the pattern is a run of one or more characters from the class
;;; `[ \t]', i.e. blanks and (presumably, via the `\t' escape) tabs.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
:[ \t]+

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; string rewrite rules: all matches, over the entire string, are replaced by
;;; the right-hand side; grouping (using `(' and `)') in the pattern) and group
;;; references (`\1' for the first group, et al.) carry over part of the match.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; pad the full string with trailing and leading whitespace; makes matches for
;;; word boundaries a little easier down the road.  the replacement is the
;;; complete match (group 1) with one blank added on either side; as `.+' needs
;;; at least one character, an empty string is not padded.
;;;
!^(.+)$					 \1 

;;;
;;; separate hash sign from right-adjacent number(s): |#3| --> |# 3|.  only the
;;; first digit is part of the match; further digits follow it unchanged.
;;;
!([#])([0-9])				\1 \2

;;;
;;; Replace three or more dots with token 'threedots '; the replacement has a
;;; blank on either side, so that it is set off from adjacent material.
;;;
!\.{3,}			 threedots 

;;;
;;; replace multiple sentence-final punctuation marks with only the first one
;;;
;!([?!.])[?!.]+		 		\1

;;;
;;; Only for robust EC, VM where some items have final comma rather than period
;;;
;!([,])					

;;;
;;; _fix_me_
;;; at least for hyphens, we should introduce a notion of `bound' tokens, 
;;; e.g. |^.| for a hyphen that was stripped off from one or more tokens: 
;;; |US-led| --> |US| |^-| |led|.  this way, separating hyphens as
;;; individual tokens need not create ambiguity with the parenthetical hyphen,

;;; Collapse triple-hyphen with double-hyphen (for now), and pad on both
;;; sides with whitespace.  This rule comes before the double-hyphen rule, which
;;; would otherwise rewrite the first two of the three hyphens.
!-{3}				 __ 

;;; Replace "--" with "__" to keep double-hyphen separate from single hyphen;
;;; the replacement is padded with a blank on both sides here, too.
!-{2}				 __ 

;;; Replace |«| and |»| with |"|
!(?:«|»)					"

;;; DPF 18-oct-06 - Temporarily replace |+| with |plus| for PET tokenizer bug.
;;; Only a |+| that is preceded by a blank is rewritten; that blank is kept and
;;; another one is added after |plus|.
! ([+])					 plus 

;;;
;;; Separate funny punctuation with whitespace on either side from the
;;; preceding and following word(s): |50%| --> |50 % |.  The second rule only
;;; applies when the symbol is itself preceded by a blank (which is kept), so
;;; that | $5| --> | $ 5|.
;;;
!([a-zA-Z0-9])([#$%~+])		\1 \2 
! ([#$%~+])([a-zA-Z0-9])		 \1 \2

;;;
;;; Eliminate spurious space preceding ordinary punctuation (including the
;;; closing parenthesis): |word ,| --> |word,|.
;;;
! ([;,.!\)\?])			\1

;;; Eliminate space preceding double quote when space on both sides
;;; (admittedly arbitrary, and not correct for S-initial stranded quote,
;;; but this is just a patch-up for odd punctuation convention).
;;; The blank after the quote is kept, i.e. the quote ends up attached to the
;;; material on its left.
! (") 			\1 

;;;
;;; Add white space to the right of squished commas and colons
;;; except for numbers on both sides (but separate e.g. |2-day| and |V-neck|)
;;; Same for periods between capital letters: "D.B. Smith", "2.Kim"
;;; Add white space on both sides for forward slash
;;;
;;; Examples: |red,green| --> |red, green|; |2-day| --> |2 - day|;
;;; | V-neck| --> | V- neck|; |D.B.| --> |D. B.|; |and/or| --> |and / or|;
;;; |US-led| --> |US- led|.  Note that only the digit-plus-hyphen rule puts a
;;; blank on both sides of the hyphen; the letter-plus-hyphen rules leave the
;;; hyphen attached to the material on its left.
;;;
!([a-zA-Z])([,:])([a-zA-Z])		\1\2 \3
!([0-9])([,:])([a-zA-Z])		\1\2 \3
!([0-9])(-)([a-zA-Z][a-zA-Z]+)	\1 \2 \3
! ([a-zA-Z])([,:-])([a-zA-Z])		 \1\2 \3
!([a-zA-Z])([,:])([0-9])		\1\2 \3

;;; period between a capital letter or digit and a following capital letter
!([A-Z0-9])(\.)([A-Z])			\1\2 \3

;;; forward slash: blank on both sides; the second rule covers a slash that
;;; already has a blank to its right
!([a-zA-Z])(/)([a-zA-Z])		\1 \2 \3
!([a-zA-Z])(/) ([a-zA-Z])		\1 \2 \3
!([0-9])(/)([a-zA-Z])			\1 \2 \3
!([a-zA-Z])(/)([0-9])			\1 \2 \3

;;; hyphen between a letter and a letter or digit: blank to the right only
!([a-zA-Z])(-)([a-zA-Z0-9])		\1\2 \3

;;;
;;; Eliminate spurious space preceding right paren
;;; (the earlier `ordinary punctuation' rule includes the right paren already;
;;; this rule catches blanks that were introduced in between)
;;;
! ([\)])				\1

;;;
;;; Eliminate spurious space following left paren
;;;
!([\(]) 				\1

;;;
;;; Add white space to left of (, when directly preceded by a letter, digit, or
;;; period: |f(x)| --> |f (x)|
;;;
!([a-zA-Z0-9.])([\(])			\1 \2

;;;
;;; Add white space around colon if sandwiched with following alphanumeric
;;; (but not e.g. |http://...| and not ratios with numbers on both sides)
;;; The first rule requires a letter to the right of the colon, the second one
;;; a letter or period to its left; digits on both sides match neither.
;;;
!([a-zA-Z0-9.])([:])([a-zA-Z])		\1 \2 \3
!([a-zA-Z.])([:])([a-zA-Z0-9])		\1 \2 \3

;;; And add white space to the left of colon when followed by white space:
!([a-zA-Z0-9.])([:]) 			\1 \2 

;;; And the variants with surrounding punctuation: a comma or slash between
;;; two single characters, where the right-hand one is directly followed by
;;; punctuation; a blank is inserted after the comma or slash, and the trailing
;;; punctuation (group 4) stays attached.

!([a-zA-Z])([,/])([a-zA-Z])([.?!,;])	\1\2 \3\4
!([0-9])([,/])([a-zA-Z])([.?!,;])	\1\2 \3\4
!([a-zA-Z])([,/])([0-9])([.?!,;])	\1\2 \3\4

;;;
;;; For now, simplify punctuation clusters found in subordinate quoted Ss, as
;;; in "Who arrived?, she asked"  since current suffixing machinery doesn't
;;; produce result.  Also for "... two hrs., ..." FIX
;;; The period, question mark, or exclamation mark in front of the comma is not
;;; captured and hence dropped: |arrived?,| --> |arrived,|
!([a-zA-Z0-9])[.?!](,)		\1\2

;;;
;;; apostrophes are a bit tricky: generally, we want to separate leading and
;;; trailing single quotes from adjacent word material, so that they become a
;;; separate token (e.g. |abrams'| --> |abrams '|); the possessive |'s|, on the
;;; other hand, we want to separate but then consider a single token.
;;;
;;; note that the two possessive rules match |'s| and |'S| alike but always
;;; write lower-case |'s| in the replacement; the second one covers a
;;; possessive directly followed by punctuation instead of a blank.
;;;
!([sS])' 				\1 ' 
!([^ ])'[sS] 				\1 's 
!([^ ])'[sS]([.?!,;])			\1 's\2

;;;
;;; contracted auxiliaries: separate contracted part from preceding word.
;;; each rule requires a blank after the contraction and comes in a lower-case
;;; and an all-caps variant; mixed case is not covered.
;;;
!([^ ])'ll 				\1 'll 
!([^ ])'d 				\1 'd 
!([^ ])'ve 				\1 've 
!([^ ])'m 				\1 'm 
!([^ ])'re 				\1 're 
!([^ ])'LL 				\1 'LL 
!([^ ])'D 				\1 'D 
!([^ ])'VE 				\1 'VE 
!([^ ])'M 				\1 'M 
!([^ ])'RE 				\1 'RE 

;;;
;;; Experimental: mark capitalization with preceding special character |_|
;;; but right now only for single letters used as proper names.
;;; Add special case for sequence of two capitals separated by space, since
;;; the space after the first one gets consumed by the simple rule.
;;; Exclude "I" since it's so frequent as pronoun
;;;
;;; The character class |[A-HJ-Z]| is the upper-case alphabet minus |I|.  The
;;; first two rules need a blank on either side of the letter(s); the third
;;; one covers a single capital that is directly followed by one or more
;;; punctuation marks, which are copied over unchanged.
;;;
! ([A-HJ-Z]) ([A-HJ-Z]) 		 _\1 _\2 
! ([A-HJ-Z]) 				 _\1 
! ([A-HJ-Z])([.?!,;":\)-]+)		 _\1\2

;;;
;;; Try correcting squished compounds, here just by listing
;;; (the pattern is not anchored to word boundaries, i.e. it also applies when
;;; |backcountry| is part of a longer string)
;;;
!backcountry				back country

;;;
;;; Parenthetical plurals - remove parens: |book(s)| --> |books|
;;;
;;; The earlier `add white space to left of (' rule has already turned
;;; |book(s)| into |book (s)| by the time these rules apply.  The first rule
;;; below therefore re-attaches a blank-separated |(s)| to the preceding word;
;;; without it, the original rule only ever saw the bare |(s)| (its letter group
;;; may be empty) and the result was |book s|.  The original rule is kept
;;; unchanged as a fallback for everything else it used to match.
;;;
!([A-Za-z]+) \(s\)				\1s
!([A-Za-z]*)\(s\)				\1s

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; file inclusion: there is an ad hoc set of `spell correction' rules for the
;;; static ecommerce data sets which we want to keep in a separate file.
;;; a `<' in column 1 is followed by the name of the file to include.
;;; NOTE(review): |gcide.fsr| is included here as well but is not described in
;;; this file; see that file for what it contains.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
<ecommerce.fsr
<gcide.fsr

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; while we are working on this against several grammar versions, keep rules
;;; for LOGON hiking data in a file of their own.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
<rondane.fsr

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; from here on, token-specific rules, i.e. the pattern has to match the full
;;; string of the token (implicit `^' and `$' anchoring).  three types of rules
;;; for now: (i) substitution (`-'), replacing the token with the right-hand
;;; side match, (ii) augmentation (`+'), adding an alternative spelling for the
;;; token, and (iii) ersatzing (`^'), effectively a substitution but recording
;;; what the original string was for later retrieval (to be implemented :-).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; another type of token-level substitution rules: replace the token surface
;;; form with the replacement string (the `ersatz') but keep the original for
;;; later retrieval.
;;;
;;; in the rules below, the first group picks up any opening parentheses in
;;; front of the token proper and the last group any punctuation following it;
;;; both are copied around the ersatz in the replacement.
;;;
;;; dates: two one- or two-digit numbers and a year, separated by |/| or |-|,
;;; with a four-digit or two-digit year; plus a two-part variant of a one- or
;;; two-digit number and a four-digit year.
;;;

^([\(]*)[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{4}([.?!,;":\)]*)	\1DateErsatz\2
^([\(]*)[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2}([.?!,;":\)]*)	\1DateErsatz\2
^([\(]*)[0-9]{1,2}[/-][0-9]{4}([.?!,;":\)]*)			\1DateErsatz\2
;; This one conflicts with rangeersatz: 'we hired 10-20 people'
;;^([\(]*)[0-9]{1,2}[/-][0-9]{2}([.?!,;":\)]*)			\1DateErsatz\2

;;;
;;; phone numbers; making these a little more general would require more work.
;;; the first rule covers three digits, a hyphen, and four digits; the second
;;; one a literal |plus|, |47|, and four groups of two digits.
;;; NOTE(review): the second pattern allows blanks inside the token, but tokens
;;; are split at whitespace, and the string-level rule for |+| puts a blank
;;; after |plus|; it is unclear whether this rule can still match --- confirm.
;;;
^([\(]*)[0-9]{3}-[0-9]{4}([.?!,;":\)]*)		\1ThreeFourNumberErsatz\2
^([\(]*)plus ?47 ?[0-9]{2} ?[0-9]{2} ?[0-9]{2} ?[0-9]{2}([.?!,;":\)]*)	\1NorPhoneErsatz\2

;;;
;;; product number identifiers like 1234-5678: four or more digits on either
;;; side of the hyphen
;;;
^([\(]*)[0-9]{4,}-[0-9]{4,}([.?!,;":\)]*)		\1NumberErsatz\2

;;;
;;; ranges, e.g 10-20: up to three digits on either side of the hyphen
;;;
^([\(]*)[0-9]{1,3}-[0-9]{1,3}([.?!,;":\)]*)		\1RangeErsatz\2
;;; 2-2.5 or 2-2,5 (European): the upper bound may carry one or two decimals
^([\(]*)[0-9]{1,3}-[0-9]{1,3}[.,][0-9]{1,2}([.?!,;":\)]*)	\1RangeErsatz\2


;;;
;;; in the |1970s|, at least, the world was still in order ...
;;; (four-digit decades starting in |1| and two-digit ones like |70s|, with
;;; either |s| or |S|)
;;;
^([\(]*)1[0-9][0-9]0[sS]([.?!,;":\)]*)		\1DecadeErsatz\2
^([\(]*)[0-9]0[sS]([.?!,;":\)]*)		\1DecadeErsatz\2

;;;
;;; mixed alphanumerics as identifiers; for the ecommerce corpus, we know that
;;; (by convention) five- and six-digit sequences are (product) identifiers.
;;; only upper-case letters count here; the third rule allows for a period
;;; between the letters and the rest.  the two rules for plain five- and
;;; six-digit sequences are commented out.
;;;
^([\(]*)[0-9]*[A-Z]+[0-9][A-Z0-9]*([.?!,;":\)]*)	\1IdentifierErsatz\2
^([\(]*)[0-9]+[A-Z]+[A-Z0-9]*([.?!,;":\)]*)		\1IdentifierErsatz\2
^([\(]*)[0-9]*[A-Z]+\.[A-Z0-9]+([.?!,;":\)]*)		\1IdentifierErsatz\2
;^([\(]*)[0-9]{5}([.?!,;":\)]*)				\1IdentifierErsatz\2
;^([\(]*)[0-9]{6}([.?!,;":\)]*)				\1IdentifierErsatz\2

;;; |an 800m hill|
;;;
;;; a number directly followed by a unit.  an optional leading |~| is matched
;;; outside of any group and hence does not show up in the ersatz.  in the first
;;; rule the unit alternation is a group of its own (group 2), which is why its
;;; replacement refers to the trailing punctuation as `\3' rather than `\2'.
;;; the two rules with `[.,-]' cover everything the two preceding `[,.]' rules
;;; match, plus a hyphen as the separator.  the last rule treats |1:| followed
;;; by digits as a measure, too.
;;;
^~?([\(]*)[0-9]+([m'"]|cm|mm|σ)([.?!,;":\)]*)	\1MeasNPErsatz\3
^~?([\(]*)[0-9]+[Kk][m]([.?!,;":\)]*)		\1MeasNPErsatz\2
^~?([\(]*)[0-9]+[,.][0-9]+[m'"]([.?!,;":\)]*)	\1MeasNPErsatz\2
^~?([\(]*)[0-9]+[,.][0-9]+[Kk][m]([.?!,;":\)]*)	\1MeasNPErsatz\2
^~?([\(]*)[0-9]+[.,-][0-9]+[m'"]([.?!,;":\)]*)	\1MeasNPErsatz\2
^~?([\(]*)[0-9]+[.,-][0-9]+[Kk][m]([.?!,;":\)]*)	\1MeasNPErsatz\2
^~?([\(]*)1:[0-9]+([.?!,;":\)]*)		\1MeasNPErsatz\2

;;;
;;; Item identifiers: 1a, 3d, 1-a  (but crucially not 1st, 2nd, 3rd)
;;; The first and third rules only allow a single letter after the digits,
;;; which keeps two-letter ordinal suffixes out.
;;; NOTE(review): the second rule (four or more digits followed by one or more
;;; letters) also matches tokens like |1000th| or |1001st|, so the four-digit
;;; ordinal rules further down may never get to see them --- confirm.
;;;
^([\(]*)[0-9]+[a-zA-Z]([.?!,;":\)]*)			\1IdentifierErsatz\2
^([\(]*)[0-9]{3}[0-9]+[a-zA-Z]+([.?!,;":\)]*)		\1IdentifierErsatz\2
^([\(]*)[0-9]+[-][a-zA-Z]([.?!,;":\)]*)			\1IdentifierErsatz\2

;;;
;;; _fix_me_
;;; in the case of hyphens, if we have decided to strip these off in the string
;;; rewrite rules already, the ersatzing at this point may fail.  it seems one
;;; would either have to allow ersatzing at the string level too and devise an
;;; encoding scheme (using 0x1 to 0x4, say, to number ersatz occurrences) that
;;; makes sure ersatzes are not mangled in further string-level processing; at
;;; the end of the day, then, look up the original surface string and put the
;;; readable ersatz into the token.                           (2-feb-02; oe)
;;;

;;;
;;; email addresses: letters, digits, and hyphens before the |@|; letters,
;;; digits, hyphens, and periods after it.
;;;
^([\(]*)[-a-zA-Z0-9]+@[-.a-zA-Z0-9]+([.?!,;":\)]*)	\1EmailErsatz\2

;;;
;;; _fix_me_
;;; our numbers will need a little more work sometime.        (22-feb-03; oe)
;;;

;;; Fractions
;;;
;;; the first four rules cover a one-digit numerator over a one- or two-digit
;;; denominator with an ordinal suffix (e.g. |1/3rd|); the fifth one is the
;;; plain fraction without suffix.  the final two rules are not fractions but
;;; numbers with commas as thousands separators (|1,000| and |1,000,000|).

^([\(]*)[0-9]{1}\/[0-9]{1,2}st([.?!,;":\)]*)		\1FractionErsatz\2
^([\(]*)[0-9]{1}\/[0-9]{1,2}nd([.?!,;":\)]*)		\1FractionErsatz\2
^([\(]*)[0-9]{1}\/[0-9]{1,2}rd([.?!,;":\)]*)		\1FractionErsatz\2
^([\(]*)[0-9]{1}\/[0-9]{1,2}th([.?!,;":\)]*)		\1FractionErsatz\2
^([\(]*)[0-9]{1,2}\/[0-9]{1,3}([.?!,;":\)]*)		\1FractionErsatz\2
^([\(]*)[0-9]{1,3},[0-9]{3}([.?!,;":\)]*)		\1NumberErsatz\2
^([\(]*)[0-9]{1,3},[0-9]{1,3},[0-9]{3}([.?!,;":\)]*)	\1NumberErsatz\2

;;; Cardinal numerals
;;;
;;; Plain digit strings are ersatzed by length, one rule per length from one to
;;; twelve digits, plus a catch-all for thirteen or more.  An optional leading
;;; |~| is matched outside of any group and hence does not show up in the
;;; ersatz; group 1 carries leading parentheses and group 2 the trailing
;;; punctuation.  Note that the one-digit rule only covers |2| to |9|, i.e.
;;; |0| and |1| are left as they are.

^~?([\(]*)[2-9]{1}([.?!,;":\)]*)		\1OneDigitErsatz\2
^~?([\(]*)[0-9]{2}([.?!,;":\)]*)		\1TwoDigitErsatz\2
^~?([\(]*)[0-9]{3}([.?!,;":\)]*)		\1ThreeDigitErsatz\2
^~?([\(]*)[0-9]{4}([.?!,;":\)]*)		\1FourDigitErsatz\2
^~?([\(]*)[0-9]{5}([.?!,;":\)]*)		\1FiveDigitErsatz\2
^~?([\(]*)[0-9]{6}([.?!,;":\)]*)		\1SixDigitErsatz\2
^~?([\(]*)[0-9]{7}([.?!,;":\)]*)		\1SevenDigitErsatz\2
^~?([\(]*)[0-9]{8}([.?!,;":\)]*)		\1EightDigitErsatz\2
^~?([\(]*)[0-9]{9}([.?!,;":\)]*)		\1NineDigitErsatz\2
^~?([\(]*)[0-9]{10}([.?!,;":\)]*)		\1TenDigitErsatz\2
^~?([\(]*)[0-9]{11}([.?!,;":\)]*)		\1ElevenDigitErsatz\2
^~?([\(]*)[0-9]{12}([.?!,;":\)]*)		\1TwelveDigitErsatz\2
^~?([\(]*)[0-9]{13,}([.?!,;":\)]*)		\1ThirteenPlusDigitErsatz\2

;;;
;;; Negative numbers.  In the decimal rule, the separator used to be an
;;; unescaped `.', i.e. any character at all, so that tokens like |-3x5| or
;;; |-1-2| came out as NegDecimalErsatz; it is now restricted to period or
;;; comma, the two separators that the unsigned DecimalErsatz rules accept.
;;;
^~?([\(]*)\-[0-9]+([.?!,;":\)]*)		\1NegDigitErsatz\2
^~?([\(]*)\-[0-9]+[.,][0-9]+([.?!,;":\)]*)		\1NegDecimalErsatz\2

;;; Numerical ordinals like "360th"
;;;
;;; One group of four rules per suffix (|st|, |nd|, |rd|, |th|), each ersatzed
;;; by the total number of digits, from one to four.  For |st|, |nd|, and |rd|
;;; the final digit is fixed (|1|, |2|, and |3|, respectively); the |th| rules
;;; accept any digits.
;;; NOTE(review): the earlier item-identifier rule for four or more digits
;;; followed by letters also matches tokens like |1000th|, so the Fourdigitord
;;; rules here may never apply --- confirm.

^([\(]*)1st([.?!,;":\)]*)			\1OnedigitordErsatz\2
^([\(]*)[0-9]1st([.?!,;":\)]*)			\1TwodigitordErsatz\2
^([\(]*)[0-9]{2}1st([.?!,;":\)]*)		\1ThreedigitordErsatz\2
^([\(]*)[0-9]{3}1st([.?!,;":\)]*)		\1FourdigitordErsatz\2
^([\(]*)2nd([.?!,;":\)]*)			\1OnedigitordErsatz\2
^([\(]*)[0-9]2nd([.?!,;":\)]*)			\1TwodigitordErsatz\2
^([\(]*)[0-9]{2}2nd([.?!,;":\)]*)		\1ThreedigitordErsatz\2
^([\(]*)[0-9]{3}2nd([.?!,;":\)]*)		\1FourdigitordErsatz\2
^([\(]*)3rd([.?!,;":\)]*)			\1OnedigitordErsatz\2
^([\(]*)[0-9]3rd([.?!,;":\)]*)			\1TwodigitordErsatz\2
^([\(]*)[0-9]{2}3rd([.?!,;":\)]*)		\1ThreedigitordErsatz\2
^([\(]*)[0-9]{3}3rd([.?!,;":\)]*)		\1FourdigitordErsatz\2
^([\(]*)[0-9]th([.?!,;":\)]*)			\1OnedigitordErsatz\2
^([\(]*)[0-9]{2}th([.?!,;":\)]*)		\1TwodigitordErsatz\2
^([\(]*)[0-9]{3}th([.?!,;":\)]*)		\1ThreedigitordErsatz\2
^([\(]*)[0-9]{4}th([.?!,;":\)]*)		\1FourdigitordErsatz\2


;;;
;;; a couple of currencies, as they occur now and again
;;; NOTE(review): the string-level `funny punctuation' rule puts a blank between
;;; a letter and a following |$|, so |US$| would seem to reach this point as two
;;; tokens already, in which case these rules cannot match --- confirm.
;;;
^([\(]*)US\$([.?!,;":\)]*)			\1CurrencyErsatz\2
^([\(]*)HK\$([.?!,;":\)]*)			\1CurrencyErsatz\2
^([\(]*)C\$([.?!,;":\)]*)			\1CurrencyErsatz\2

;;;
;;; times
;;;
;;; group 1 carries leading parentheses and group 2 the trailing punctuation;
;;; both are copied around the ersatz.  variants: |hh:mm|, optionally followed
;;; by |am| or |pm| (any case), the same with some other separator, and
;;; |hhmm| plus |am| or |pm| without a separator.
;;; NOTE(review): where a bare `.' stands between hours and minutes, it is
;;; unescaped and thus matches any single character, not only a period ---
;;; confirm whether that is intended.
;;; NOTE(review): a token like |1:30| is already claimed by the `1:[0-9]+'
;;; MeasNPErsatz rule further up and never gets here.
;;;
^([\(]*)[0-2]?[0-9]:[0-5][0-9]([.?!,;":\)]*)		\1ClockTimeErsatz\2
^([\(]*)[0-2]?[0-9]:[0-5][0-9][aA][mM]([.?!,;":\)]*)	\1ClockTimeErsatz\2
^([\(]*)[0-2]?[0-9]:[0-5][0-9][pP][mM]([.?!,;":\)]*)	\1ClockTimeErsatz\2
^([\(]*)[0-2]?[0-9].[0-5][0-9][aA][mM]([.?!,;":\)]*)	\1ClockTimeErsatz\2
^([\(]*)[0-2]?[0-9].[0-5][0-9][pP][mM]([.?!,;":\)]*)	\1ClockTimeErsatz\2
^([\(]*)[0-2]?[0-9][0-5][0-9][aA][mM]([.?!,;":\)]*)	\1ClockTimeErsatz\2
^([\(]*)[0-2]?[0-9][0-5][0-9][pP][mM]([.?!,;":\)]*)	\1ClockTimeErsatz\2
;;;
;;; |9.30| and the like are ambiguous between a clock time and a decimal; this
;;; rule comes before the general DecimalErsatz rule further down.  its
;;; replacement used to end in `\1', which repeated the leading parentheses
;;; after the ersatz and dropped the trailing punctuation; it now ends in `\2'
;;; like all of its neighbours.
;;;
^([\(]*)[0-2]?[0-9].[0-5][0-9]([.?!,;":\)]*)		\1ClockorDecimalErsatz\2

;;;
;;; ratios, e.g. 1:50,000
;;; (up to three digits on either side of the colon; the second rule allows for
;;; one comma-separated group of three more digits on the right)
;;;
^([\(]*)[0-9]{1,3}[:][0-9]{1,3}([.?!,;":\)]*)		\1RatioErsatz\2
^([\(]*)[0-9]{1,3}[:][0-9]{1,3},[0-9]{3}([.?!,;":\)]*)	\1RatioErsatz\2

; Move general decimal conversion to be after clocktime
; (tokens that the clock time rules have already replaced no longer match here)

^([\(]*)[0-9]+\.[0-9]+([.?!,;":\)]*)		\1DecimalErsatz\2

; Allow variant in some other countries, where comma rather than period is used

^([\(]*)[0-9]+\,[0-9]+([.?!,;":\)]*)		\1DecimalErsatz\2

;;;
;;; email and web addresses ... lots of room for improvement   (2-jul-03; oe)
;;;
;;; all rules allow the address to be wrapped in |<| and |>|.  group 1 carries
;;; leading parentheses and group 2 the trailing punctuation.
;;;
;;; in the |http://| rule, the body used to be a greedy `.*', which always ran
;;; to the end of the token, so that group 2 was empty and trailing punctuation
;;; disappeared into the ersatz (|http://x.org).| --> |WebErsatz|).  the body
;;; is now the non-greedy `.*?': exactly the same tokens match, but trailing
;;; punctuation ends up in group 2 and is preserved (--> |WebErsatz).|).
;;;
^([\(]*)<?http://.*?>?([.?!,;":\)]*)			\1WebErsatz\2
;;;
;;; addresses starting in |www.|, and bare two- or three-part names made up of
;;; letters only, each part at least two characters long
;;;
^([\(]*)<?www\.[a-zA-Z0-9.?%/_-]+>?([.?!,;":\)]*)	\1WebErsatz\2
^([\(]*)<?[a-zA-Z]{2,}\.[a-zA-Z]{2,}>?([.?!,;":\)]*)	\1WebErsatz\2
^([\(]*)<?[a-zA-Z]{2,}\.[a-zA-Z]{2,}\.[a-zA-Z]{2,}>?([.?!,;":\)]*)	\1WebErsatz\2
;;;
;;; email addresses, here also allowing |_| and the |<| |>| wrapping; plain
;;; addresses have mostly been replaced by the earlier EmailErsatz rule already
;;;
^([\(]*)<?[a-zA-Z0-9_-]{2,}@[a-zA-Z0-9._-]{2,}>?([.?!,;":\)]*)	\1EmailErsatz\2

;;;
;;; reduced year names; possibly another case where, in full generality, we
;;; would have to be able to strip off the leading apostrophe first and later,
;;; in the token-level part, introduce a tokenization alternative, re-uniting
;;; the apostrophe and two-digit year.
;;;
^([\(]*)'[0-9][0-9]([.?!,;":\)]*)			\1YearErsatz\2

;;; Range of years, as in |1970-75|
;;; note that tokens with four or more digits on both sides of the hyphen (e.g.
;;; |1970-1975|) have been turned into NumberErsatz by the product number rule
;;; further up, and those with up to three digits on both sides into
;;; RangeErsatz, before this rule is tried.
^([\(]*)[0-9]{3,4}-[0-9]{2,4}([.?!,;":\)]*)		\1YearErsatz\2

;;; Also add special treatment for abbrev OR (Oregon), IN (Indiana) and US
;;; these are augmentation rules (`+'): the form with the |_| prefix is added
;;; as an alternative, while the original token stays in place.
+OR					_OR
+IN					_IN
+US					_US

;;; the remaining two rules are commented out
;+([^:]+):				\1
;|					_: