;;; -*- mode: fundamental; coding: utf-8; indent-tabs-mode: t; -*-

;;;
;;; an optional REPP module to re-attach (affix) punctuation marks to adjacent
;;; tokens, effectively undoing the PTB-style separation.  this is a stand-in
;;; for the new token mapping rule set, which for now is only supported in PET.
;;; for the majority of inputs, these rules should allow parse success in the
;;; LKB.  however, compared to the token mapping rules, this is only an
;;; approximation.
;;;


;;
;; opening punctuation: a run of opening brackets or opening quotes followed by
;; a single space loses that space, i.e. it is prefixed to whatever follows.
;;
!([[({“‘]+) 						\1
;;
;; closing punctuation: a run of closing brackets, closing quotes, or sentence
;; punctuation preceded by a single space loses that space, i.e. it is suffixed
;; to whatever precedes.
;;
! ([])}”"’,;.!?]+)					\1
;;
;; apostrophes: remove the space(s) in front of an apostrophe, unless the
;; preceding character is |s| or the character right after the apostrophe is
;; one of d, l, m, r, s, or v (in either case).
;; NOTE(review): presumably these exclusions keep clitics like |'s|, |'d|,
;; |'ll|, |'m|, |'re|, and |'ve| as separate tokens; confirm against the grammar.
;;
!([^s]) +'([^dDlLmMrRsSvV])				\1'\2
;;
;; contracted negation: glue |n't| back onto the preceding character.
;;
!(.) +n't						\1n't

;;
;; replicate some of the NE rules from `tmr.tdl', purely for the benefit of
;; interactive use in the LKB, i.e. when giving demos.  each match is replaced
;; wholesale by the placeholder |_generic_proper_ne_|.
;;
;; email addresses, optionally wrapped in angle brackets: a local part, |@|,
;; and a host name with at least one dot.
;;
!<?[a-zA-Z0-9._-]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+>?	_generic_proper_ne_
;;
;; URLs starting in |http://|, optionally wrapped in angle brackets; the
;; optional path is matched greedily by |/.*|.
;; NOTE(review): |https://| is not covered by this pattern; confirm whether
;; that is intended (e.g. to stay in sync with `tmr.tdl').
;;
!<?http://[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+(/.*)?>?	_generic_proper_ne_
;;
;; scheme-less URLs starting in |www|, with the same optional brackets and path.
;;
!<?www(\.[a-zA-Z0-9_-]+)+(/.*)?>?			_generic_proper_ne_
;;
;; any other dotted host name (plus optional path), but only when both angle
;; brackets are present.
;;
!<[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+(/.*)?>		_generic_proper_ne_

;;
;; make the hyphen a token in its own right between numbers (rewriting it as an
;; en dash, actually), e.g. |50-60| becomes |50 – 60|.  both numbers may carry a
;; decimal part, and the first one an optional sign.
;;
!([+-]?[0-9]+(?:\.[0-9]*)?)-([0-9]+(?:\.[0-9]*)?)	\1 – \2
;;
;; otherwise, break at hyphens following a prefix, but keep the hyphen on the
;; prefix, e.g. |sub-discipline| becomes |sub- discipline|.
;;
!(.+-)([a-zA-Z0-9]+-?)					\1 \2
;;
;; undo the preceding split after the prefixes |mis-|, |re-|, and |pre-|, i.e.
;; re-join these with the alphanumeric string that follows.
;;
!((?:mis|p?re)-) ([a-zA-Z0-9]+)				\1\2
;;
;; finally, make a slash followed by an alphanumeric string a token in its own
;; right, e.g. |and/or| becomes |and / or|.
;;
!(.+)/([a-zA-Z0-9]+)					\1 / \2

;;
;; a second pass at lightweight NEs, now that we have further split up tokens
;; at hyphens and dashes.
;;
;; decades, e.g. |50s| or |1950s|: an optional two-digit century in the range
;; 10 to 19, a digit, |0|, and |s| or |S|.
;;
!(1[0-9])?[0-9]0[sS]					_generic_plur_ne_
;;
;; integers with no leading zero, optionally followed by a period, and decimals
;; with an optional integer part.  either may be preceded by one of |+|, |-|,
;; or |~|.  the hyphen is deliberately written last inside the character class,
;; so that it is a literal: spelled |[+-~]|, the class would denote the range
;; from |+| to |~|, which spans all ASCII digits and letters and thus would let
;; arbitrary alphanumeric characters be swallowed as the `sign'.
;;
![+~-]?[1-9][0-9]*\.?					generic_number
![+~-]?[0-9]*\.[0-9]+					generic_number
;;
;; numbers with three-digit groups separated by |,| or |.|, e.g. |1,000,000|,
;; with an optional sign and an optional final part made up of another
;; separator followed by digits or a single |-|.
;;
![+-]?[1-9][0-9]{0,2}([,.][0-9]{3})+([,.]([0-9]*|-))?	generic_number
;;
;; ordinals: |1st|, |2nd|, and |3rd| must not be preceded by |1|, whereas |th|
;; goes with 11, 12, and 13 or with a final digit of 0 or 4 to 9; any number of
;; leading digits is allowed.
;;
![0-9]*((^|[^1])(1st|2nd|3rd)|(11|12|13|[04-9])th)	_generic_ord_ne_