;;; -*- mode: fundamental; coding: utf-8; indent-tabs-mode: t; -*-

;;;
;;; another shot at a finite-state language for preprocessing, normalization,
;;; and tokenization in LKB grammars.  requires LKB version of 1-feb-09 or
;;; newer.  note that the syntax is rigid: everything starting in column 2
;;; (i.e. right after the rule type marker) is used as the match pattern until
;;; the first `\t' (tabulator character); one or more tabulators are considered
;;; the separator between the matching pattern and the replacement, but other
;;; whitespace will be considered part of the patterns.  empty lines or lines
;;; with a semicolon in column 1 (i.e. in place of the rule type marker, this
;;; is not Lisp) will be ignored.
;;;


;;
;; preprocessor rules versioning; auto-maintained upon CVS (or SVN) check-in.
;; the date keyword on the `@' line below is rewritten by the version control
;; system at check-in, so it should not be edited by hand.
;;
@$Date: 2009-02-06 08:33:49 +0100 (fre, 06 feb 2009) $

;;
;; tokenization pattern: after normalization, the string will be broken up at
;; each occurrence of this pattern; the pattern match itself is deleted.  the
;; pattern used here matches any non-empty run of spaces and tabulators.
;;
:[ \t]+

;;;
;;; string rewrite rules: all matches, over the entire string, are replaced by
;;; the right-hand side; grouping (using `(' and `)' in the pattern) and group
;;; references (`\1' for the first group, et al.) carry over part of the match.
;;;

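;;
;; pad the full string with trailing and leading whitespace; makes matches for
;; word boundaries a little easier down the road; also, squash multiple spaces
;; and replace tabulators with a space.  note that the right-hand sides of
;; these rules consist of (or end in) a literal space, so trailing whitespace
;; on the rule lines is significant and must not be stripped by an editor.
;;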
!^(.+)$								 \1 
!  +								 
!\t								 

;;
;; a set of `mark-up modules', often replacing mark-up character entities
;; with actual UniCode characters (e.g. |&mdash;| or |---|), or just ditching
;; mark-up that has no bearing on parsing for now (e.g. most wiki mark-up).
;; these modules can be activated selectively by name in the REPP environment
;; or the top-level call into REPP.  each `>' line below names one such module.
;;
>xml
>latex
>ascii
>wiki

;;
;; two special cases involving periods: map ASCII ellipsis (|...|) to a single
;; UniCode character (|…|), and convert |..| between numbers into an n-dash,
;; i.e. a numeric range (typically tokenized off, i.e. |42| |–| |43|).  maybe
;; the latter can also occur between non-numbers?  we could also just preserve
;; it, but always make it a token in its own right?
;;
;; _fix_me_
;; what about a sentence-final period following the ellipsis (as in cb/7060)?
;;                                                              (24-sep-08; oe)
;;
;; the bracketed form (|[...]|) has to be rewritten first: `[' and `]' are
;; non-period characters, so the general ellipsis rule further down would
;; otherwise turn |[...]| into |[ … ]| before the bracketed rule could match.
;;
!\[\.\.\.\]							…
;;
;; three or more periods between two non-period characters become an ellipsis
;; set off by spaces; the two neighbouring characters are carried over.
;;
!([^.])\.\.\.+([^.])						\1 … \2
;;
;; two periods between digits (optionally surrounded by spaces) become an
;; n-dash set off by spaces.
;;
!([0-9]) *\.\. *([0-9])						\1 – \2


;;
;; some UniCode characters force token boundaries: m-dash, n-dash, ellipsis.
;;
!([—–…])							 \1