;;; -*- mode: fundamental; coding: utf-8; indent-tabs-mode: t; -*-

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; first shot at a finite-state language for preprocessing, normalization, and
;;; tokenization in LKB grammars.  requires LKB version after 1-feb-03.  note
;;; that the syntax is rigid: everything starting in column 2 (i.e. right after
;;; the rule type marker) is used as the match pattern until the first `\t'
;;; (tabulator sign); one or more tabulator signs are considered the separator
;;; between the matching pattern and the replacement, but other whitespace will
;;; be considered part of the patterns.  empty lines or lines with a semicolon
;;; in column 1 (i.e. in place of the rule type marker, this is not Lisp) will
;;; be ignored.
;;;
;;; rules are applied in order and, in the case of substitution rules, each sees
;;; the output of the previous iteration.  token-level augmentation rules (the
;;; `+' type, for now) are different in that they add an alternative for the 
;;; token but the original form remains in the input buffer for subsequent rule
;;; applications (i.e. the alternative is _not_ visible to further rules).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; preprocessor rules versioning: the `@' rule below carries a CVS date
;;; keyword, whose timestamp is auto-maintained upon CVS check-in.
;;;
@$Date: 2005/08/10 13:27:38 $

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; tokenization pattern: after normalization, the string will be broken up at
;;; each occurrence of this pattern; the pattern match itself is deleted.  as
;;; written, the pattern matches any non-empty run of blanks and `\t' (tabulator
;;; sign) characters; there is no replacement part on this rule.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
:[ \t]+

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; string rewrite rules: all matches, over the entire string, are replaced by
;;; the right-hand side; grouping (using `(' and `)' in the pattern) and group
;;; references (`\1' for the first group, et al.) carry over part of the match.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; file inclusion: there is an ad hoc set of `spell correction' rules for the
;;; static ecommerce data sets which we want to keep in a separate file.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; from here on, token-specific rules, i.e. the pattern has to match the full
;;; string of the token (implicit `^' and `$' anchoring).  three types of rules
;;; for now: (i) substitution (`-'), replacing the token with the right-hand
;;; side match, (ii) augmentation (`+'), adding an alternative spelling for the
;;; token, and (iii) ersatzing (`^'), effectively a substitution but recording
;;; what the original string was for later retrieval (to be implemented :-).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; another type of token-level substitution rules: replace the token surface
;;; form with the replacement string (the `ersatz') but keep the original for
;;; later retrieval.
;;;

;^[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{4}		DateErsatz
;^[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2}		DateErsatz
;^[0-9]{1,2}[/-][0-9]{4}				DateErsatz
;^[0-9]{1,2}[/-][0-9]{2}				DateErsatz

;;;
;;; _fix_me_
;;; in the case of hyphens, if we have decided to strip these off in the string
;;; rewrite rules already, the ersatzing at this point may fail.  it seems one
;;; would have to allow ersatzing at the string level too and devise an
;;; encoding scheme (using 0x1 to 0x4, say, to number ersatz occurrences) that
;;; makes sure ersatzes are not mangled in further string-level processing; at
;;; the end of the day, then, look up the original surface string and put the
;;; readable ersatz into the token.                           (2-feb-02; oe)
;;;
;;;
;;; email addresses: one or more letters, digits, or hyphens, followed by `@'
;;; and one or more letters, digits, hyphens, or periods.  the letter ranges
;;; have to read `a-z' and `A-Z'; a range of `A-z' additionally admits the six
;;; ASCII characters located between `Z' and `a' (`[', `\', `]', `^', `_', and
;;; the backquote).  a single rule suffices here, as a second, identical copy
;;; of the same pattern and ersatz cannot contribute anything further.
;;;
^[-a-zA-Z0-9]+@[-.a-zA-Z0-9]+			EmailErsatz

;^[０-９]{2,}					NumberErsatz
;^[０-９]{2,}					NumberErsatz2