;;; -*- mode: fundamental; coding: utf-8; indent-tabs-mode: t; -*-

;;;
;;; an optional REPP module to re-attach (affix) punctuation marks to adjacent
;;; tokens, effectively un-doing the PTB-style separation.  this is a stand-in
;;; for the new token mapping rule set, which for now is only supported in PET.
;;; for the majority of inputs, these rules should allow parse success in the
;;; LKB.  however, compared to the token mapping rules, this is only an
;;; approximation.
;;;
;;; layout: one rule per line, written as `!' + pattern + TAB + replacement.
;;; spaces are significant inside both fields, so the two fields must be
;;; separated by tabs, never by spaces.
;;;

;;
;; opening brackets and quotes attach to the token on their right.  note that
;; the pattern ends in a single space, immediately before the tab.
;;
!([[({“‘]+) 	\1

;;
;; closing brackets, quotes, and sentence punctuation attach to the token on
;; their left; the pattern starts with a single space.
;;
! ([])}”"’,;.!?]+)	\1

;;
;; re-attach a separated apostrophe (unless it follows |s| or precedes one of
;; the letters d, l, m, r, s, v in either case), and re-attach |n't|.
;;
!([^s]) +'([^dDlLmMrRsSvV])	\1'\2
!(.) +n't	\1n't

;;
;; replicate some of the NE rules from `tmr.tdl', purely for the benefit of
;; interactive use in the LKB, i.e. when giving demos.
;;
;; NOTE(review): the next three rules reached this file with their patterns
;; reduced to a bare `?', which is not a valid regular expression (there is
;; nothing for the quantifier to apply to).  they are kept here, disabled,
;; until the original patterns are restored from `tmr.tdl' or from version
;; control.  presumably they covered e-mail addresses and |http://| and |www.|
;; URLs, by analogy with the surviving rule below -- TODO confirm.
;;
;; !?	_generic_proper_ne_
;; !?	_generic_proper_ne_
;; !?	_generic_proper_ne_
!<[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+(/.*)?>	_generic_proper_ne_

;;
;; make hyphen a token in its own right between numbers (an n-dash, actually),
;; e.g. |50-60|.  otherwise, break at hyphens following alphabetic prefixes,
;; but keep the hyphen on the prefix, e.g. |sub-discipline|.  the third rule
;; then re-joins the prefixes |mis-|, |pre-|, and |re-| with what follows.
;; finally, put a slash between tokens into a token of its own.
;;
!([+-]?[0-9]+(?:\.[0-9]*)?)-([0-9]+(?:\.[0-9]*)?)	\1 – \2
!(.+-)([a-zA-Z0-9]+-?)	\1 \2
!((?:mis|p?re)-) ([a-zA-Z0-9]+)	\1\2
!(.+)/([a-zA-Z0-9]+)	\1 / \2

;;
;; a second pass at lightweight NEs, now that we have further split up tokens
;; at hyphens and dashes.
;;
;; in the two simple number rules, the optional prefix is a sign or a tilde.
;; the hyphen is listed first inside the character class so that it is taken
;; literally; written as |[+-~]| it would denote the range from |+| to |~|,
;; which includes every ASCII digit and letter.
;;
!(1[0-9])?[0-9]0[sS]	_generic_plur_ne_
![-+~]?[1-9][0-9]*\.?	generic_number
![-+~]?[0-9]*\.[0-9]+	generic_number
![+-]?[1-9][0-9]{0,2}([,.][0-9]{3})+([,.]([0-9]*|-))?	generic_number
![0-9]*((^|[^1])(1st|2nd|3rd)|(11|12|13|[04-9])th)	_generic_ord_ne_