;;; -*- Mode: tdl; Coding: utf-8; -*-

;;;
;;; token mapping is the process of inspecting and re-arranging input tokens,
;;; i.e. a lattice of structured objects (feature structures), to best match
;;; the expectations of the grammar proper.  the general mechanism is described
;;; by Adolphs, et al. (2008); see:
;;;
;;;   http://www.lrec-conf.org/proceedings/lrec2008/summaries/349.html
;;;
;;; as of August 2008, we are assuming an initial tokenization that is (mostly)
;;; compatible with Penn Treebank (PTB) conventions; this may or may not turn
;;; out to be a good idea, but if nothing else it makes the core parser
;;; compatible with a wide variety of existing tools and pre-processing
;;; approaches.  for a critical (and subjective) discussion of PTB tokenization
;;; issues, see:
;;;
;;;   http://lingpipe-blog.com/2008/06/26/the-curse-of-intelligent-tokenization/
;;;
;;; in the process of token mapping, we move from a PTB-compatible tokenization
;;; to an ERG-compatible one: specifically, many punctuation marks are attached
;;; as prefixes or suffixes on other tokens.  the process is broken down into a
;;; number of (more or less) distinct phases, viz.
;;;
;;; - normalization: anything the (ideal) tokenizer _should_ have done.
;;; - NE recognition: surface-based identification of URLs, numbers, et al.
;;; - tokenization adjustments: hyphens, slashes, sandwiched punctuation.
;;; - decoration: filling in missing or underspecified token properties.
;;; - token combination: re-attach punctuation marks and contracted forms.
;;; - PoS explosion: multiply out alternate PoS readings as separate tokens.
;;; - PoS reduction: prune low-probability, unwanted, and overlapping tags.
;;;
;;; we hope we have (now) arrived at a relatively stable inventory of token
;;; properties, of which some typically are only introduced in token mapping:
;;; these are ONSET, PRED, and CARG.
;;; however, in principle a tokenizer might pass in any of these properties,
;;; or they could be introduced very early in the rewrite process.  hence, all
;;; rules must make sure to always preserve all token information.
;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; correct tokenization `damage', inherited from the PTB conventions.
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;
;; convert (PTB-style) ASCII directional quotes to Unicode characters.  when
;; running from an actual PTB file, these are what we would see as input.
;;
ptb_opening_single_quote_tmr := one_one_form_tmt &
[ +INPUT < [ +FORM "`" ] >,
  +OUTPUT < [ +FORM "‘" ] > ].

ptb_opening_double_quote_tmr := one_one_form_tmt &
[ +INPUT < [ +FORM "``" ] >,
  +OUTPUT < [ +FORM "“" ] > ].

ptb_closing_double_quote_tmr := one_one_form_tmt &
[ +INPUT < [ +FORM "''" ] >,
  +OUTPUT < [ +FORM "”" ] > ].

ptb_opening_bracket_tmr := one_one_form_tmt &
[ +INPUT < [ +FORM ^-[lL][sS][bB]-$ ] >,
  +OUTPUT < [ +FORM "[" ] > ].

ptb_closing_bracket_tmr := one_one_form_tmt &
[ +INPUT < [ +FORM ^-[rR][sS][bB]-$ ] >,
  +OUTPUT < [ +FORM "]" ] > ].

;;
;; _fix_me_
;; it seems several occurrences (though not all) of regular parentheses in the
;; PTB files show up as -LCB- and -RCB- (curly braces); according to the PoS
;; tags (and the context), there is no doubt these are just plain parentheses.
;; (28-sep-08; oe)
;;
ptb_opening_parenthesis_tmr := one_one_form_tmt &
[ +INPUT < [ +FORM ^-[lL][cCrR][bB]-$ ] >,
  +OUTPUT < [ +FORM "(" ] > ].

ptb_closing_parenthesis_tmr := one_one_form_tmt &
[ +INPUT < [ +FORM ^-[rR][cCrR][bB]-$ ] >,
  +OUTPUT < [ +FORM ")" ] > ].

ptb_ellipsis_tmr := one_one_form_tmt &
[ +INPUT < [ +FORM ^\.\.\.+$ ] >,
  +OUTPUT < [ +FORM "…" ] > ].

ptb_ndash_tmr := one_one_form_tmt &
[ +INPUT < [ +FORM "--" ] >,
  +OUTPUT < [ +FORM "–" ] > ].

;;
;; for whatever reason, my `.mrg' files systematically have slashes `escaped'
;; or something, e.g.
;; |1\/10th|.
;;
ptb_slash_tmr := one_one_form_tmt &
[ +INPUT < [ +FORM ^(.*)\\/(.*)$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}/${I1:+FORM:2}" ] > ].

;;
;; {|do| |has| |wo| ...} |n't| --> {|don't| |hasn't| |won't| ...}
;;
ptb_contracted_negation_tmr := two_one_initial_form_tmt &
[ +INPUT < [ +FORM ^([[:alpha:]]+)$ ], [ +FORM ^(n't|N'T)$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}${I2:+FORM:1}" ] > ].

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; spell correction: a few high-frequency typos, not introducing ambiguity.
;;; in principle, we should maybe also have confusion pairs (|their|, |there|),
;;; as ambiguity-introducing rules.  but then we would need a way of turning
;;; on this latter class selectively, i.e. when parsing carefully edited text,
;;; these rules would (at best) introduce spurious ambiguity.
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;
;; _fix_me_
;; i wonder whether we should stamp something like [ +CLASS.+ROBUST + ] onto
;; these tokens.  but then we would also need to make sure, at some point, to
;; default everyone else to a non-robust value.  (24-sep-08; oe)
;;
sc_didnt_tmr := one_one_form_tmt &
[ +INPUT < [ +FORM ^([dD])idn;?t$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}idn't" ] > ].

sc_dont_tmr := one_one_form_tmt &
[ +INPUT < [ +FORM ^([dD])ont$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}on't" ] > ].

sc_everytime_tmr := one_two_all_form_tmt &
[ +INPUT < [ +FORM ^([eE])verytime$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}very" ], [ +FORM "time" ] > ].

;;
;; _fix_me_
;; this one, in principle, conflicts with `many-syllabled' and `syllabling'.
;; maybe these would be better addressed as alternate, robust lexical entries?
;; (23-sep-08; oe)
;;
sc_lable_tmr := one_one_form_tmt &
[ +INPUT < [ +FORM ^([lL])abl(ed|ing)?$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}abel${I1:+FORM:2}" ] > ].
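As a quick sanity check outside the grammar, the slash-unescaping pattern of `ptb_slash_tmr` can be exercised in Python; `unescape_slash` is an illustrative name, not part of the grammar, and a single rule application is shown (the engine re-applies the rule as long as it matches).

```python
import re

# the same pattern as in ptb_slash_tmr: a PTB-escaped slash |\/| splits the
# form into two captured halves
PTB_SLASH = re.compile(r"^(.*)\\/(.*)$")

def unescape_slash(form):
    # mirror the rule's output template "${I1:+FORM:1}/${I1:+FORM:2}"
    m = PTB_SLASH.match(form)
    return f"{m.group(1)}/{m.group(2)}" if m else form
```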
sc_recieve_tmr := one_one_form_tmt &
[ +INPUT < [ +FORM ^([rR])eciev(e|ed|es|ing)$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}eceiv${I1:+FORM:2}" ] > ].

sc_wont_tmr := one_one_form_tmt &
[ +INPUT < [ +FORM ^([wW])ont$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}on't" ] > ].

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; lightweight NEs: form-driven generic entries (formerly `ersatz' entries)
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; email addresses
;;;

;;
;; any valid DNS string, prefixed by address, with optional angle brackets
;;
email_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^<?[[:alnum:]._-]+@[[:alnum:]_-]+(\.[[:alnum:]_-]+)+>?$ ] >,
  +OUTPUT < [ +CLASS email_ne ] > ].

;;;
;;; uniform resource locators (URLs)
;;;

;;
;; any valid DNS string, prefixed by `http://', with optional angle brackets
;;
url_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^<?http://[[:alnum:]_-]+(\.[[:alnum:]_-]+)+(/.*)?>?$ ] >,
  +OUTPUT < [ +CLASS url_ne ] > ].

;;
;; any valid DNS string, prefixed by `www', with optional angle brackets
;;
url_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^<?www(\.[[:alnum:]_-]+)+(/.*)?>?$ ] >,
  +OUTPUT < [ +CLASS url_ne ] > ].

;;
;; any valid DNS string, with obligatory angle brackets
;;
url_ne_3_tmr := ne_tmt &
[ +INPUT < [ +FORM ^<[[:alnum:]_-]+(\.[[:alnum:]_-]+)+(/.*)?>$ ] >,
  +OUTPUT < [ +CLASS url_ne ] > ].

;;;
;;; file names
;;;

;;
;; fully-qualified Un*x style, starting with a slash, e.g. |/etc/config|.
;;
;; _fix_me_
;; we require a minimum of two components, such that |/etc| by itself will not
;; match.  maybe we should allow these too but create an ambiguity here, i.e.
;; output two tokens, one [ +CLASS file_ne ], the other [ +CLASS non_ne ]?
;; (19-sep-08; oe)
;;
file_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(/[[:alnum:]._-]+){2,}/?$ ] >,
  +OUTPUT < [ +CLASS file_ne ] > ].

;;;
;;; time-of-day expressions: |9am|, |11:45pm|, |20:15|
;;;

;;
;; an |am| or |pm| suffix unambiguously indicates a time expression.  we also
;; grab all tokens of the form `H:M' where `H' and `M' are numbers in the
;; right ranges.
;;
;; _fix_me_
;; i wonder about `mix in a ratio of 1:15', which the second rule below would
;; consider a time-of-day expression.  should we approach such cases with more
;; `optional' NE rules, i.e. ones outputting two tokens?  or should we rather
;; introduce an abstraction over `time_ne' and `ratio_ne', such that a single
;; token can activate multiple lexical entries?  once we get regular expression
;; matching for lexical instantiation (peter is working on that), in principle,
;; we could just drop `time_ne_2_tmr', make `time_ne' a sub-type of `ratio_ne',
;; and put the `H:M' regular expression into the generic lexical entry.  with
;; great power comes great responsibility :-).  (19-sep-08; oe)
;;
time_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(0?[0-9]|1[0-2])([:.][0-5][0-9])?([aApP][mM])$ ] >,
  +OUTPUT < [ +CLASS time_ne ] > ].

;;
;; _fix_me_
;; dan wanted to add a period |.| as a possible separator, which seems to make
;; this rule overlap with floating point numbers (which have run already)?
;; (12-jan-09; oe)
;;
time_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(0?[0-9]|1[0-9]|2[0-4])[:.][0-5][0-9]$ ] >,
  +OUTPUT < [ +CLASS time_ne ] > ].

;;;
;;; dates
;;;

;;
;; _fix_me_
;; things are getting a little murky here: some of the current date formats
;; overlap with some of the identifiers and the fractions and ranges.  to do
;; justice to these ambiguities, we would have to introduce multiple NE tokens.
;; or find ways of underspecification in the lexicon, maybe?  (23-sep-08; oe)
;;

;;
;; US and European variants: |11-01-1957|, |11/01/1957|, |11-01-57|, |57/01/11|
;;
date_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^([0-9]{1,2}[-/]){2}[0-9]{2,4}$ ] >,
  +OUTPUT < [ +CLASS date_ne ] > ].

;;
;; _fix_me_
;; i am leaving out |12-2005|, |12/05|, |12-05|, and |'06| for now.  they
;; overlap too much with other patterns, so maybe should be optional rules?
;; (23-sep-08; oe)
;;

;;;
;;; ratios: |1:1000|, |1:100,000|, et al.
;;;

;;
;; we make the conservative assumption that the first element not exceed three
;; digits and not have leading zeros.
;;
ratio_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[1-9][0-9]{0,2}:[1-9][0-9]*$ ] >,
  +OUTPUT < [ +CLASS meas_ne ] > ].

ratio_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[1-9][0-9]{0,2}:[1-9][0-9]{0,2}(,[0-9]{3})*$ ] >,
  +OUTPUT < [ +CLASS meas_ne ] > ].

;;;
;;; fractions: |1/4|, |-1/3|, |51/100|, et al.
;;;
fraction_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]+(\.[0-9]*)?/[1-9][0-9]*(th)?$ ] >,
  +OUTPUT < [ +CLASS frct_ne ] > ].

;;;
;;; measure noun phrases (taking precedence over alphanumeric identifiers)
;;;

;;
;; |25cm| or |+37.3ºC|
;;
measure_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[+-~]?[0-9]+([,.][0-9]*)?([kKmMgG"']+|[ckm]m|σ|º[CF]?)$ ] >,
  +OUTPUT < [ +CLASS meas_ne ] > ].

;;
;; |1:1000m|
;;
ratio_measure_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[1-9][0-9]{0,2}:[1-9][0-9]*([kKmMgG"']+|[ckm]m|σ|º[CF]?)$ ] >,
  +OUTPUT < [ +CLASS meas_ne ] > ].

;;
;; |US$20|
;;
currency_measure_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(AU?|CA?|US)?\$[0-9]+$ ] >,
  +OUTPUT < [ +CLASS meas_ne ] > ].

;;;
;;; various kinds of identifiers (accumulated throughout the years); some of
;;; these are allowed to include punctuation marks that can (later) lead to
;;; additional token boundaries (i.e. |-| and |/|), hence we need to be fairly
;;; restrictive about our identifier patterns.  in principle, i guess, this is
;;; something that should be adapted specifically for a target domain and type
;;; of text.
;;;
;;; _fix_me_
;;; some of these patterns conflict with others, e.g. `56K data line' should
;;; be a measure NP, not an identifier.  in all-caps strings, `PRE-1980' would
;;; be mis-analyzed too.  i am not really sure what to do about these.
;;; (24-sep-08; oe)
;;;

;;
;; (at least) one or more letters, followed by digits, e.g. |ABC123DEF|
;;
alphanumeric_identifier_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]*[[:alpha:]]+[0-9]+[[:alnum:]]*$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].
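Two of the lightweight-NE patterns above can be checked outside the grammar; the following Python transcription is purely illustrative (the TDL rules use the same regex notation, and the variable names are ours), confirming for instance that `1/10th` is a fraction while `1/0` is not.

```python
import re

# time_ne_1: hour, optional :MM or .MM, obligatory am/pm suffix
TIME_1 = re.compile(r"^(0?[0-9]|1[0-2])([:.][0-5][0-9])?([aApP][mM])$")
# fraction_ne: numerator (optionally decimal), non-zero denominator, "th"
FRACTION = re.compile(r"^[0-9]+(\.[0-9]*)?/[1-9][0-9]*(th)?$")
```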
;;
;; a special case: a number, a hyphen, followed by a single letter: |22-b|
;;
alphanumeric_identifier_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]+-[[:alpha:]]$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].

;;
;; in a similar spirit, a number followed by letters in parentheses: |22(B)|;
;; note that the closing parenthesis will have been tokenized off, PTB-style.
;;
alphanumeric_identifier_ne_3_tmr := two_one_tmt &
[ +INPUT < [ +FORM ^([0-9]+\([[:alpha:]]+[[:alnum:]]*)$, +ONSET #onset,
             +CLASS non_ne, +PRED #pred, +CARG #carg ],
           [ +FORM ")", +CLASS non_ne ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1})", +ONSET #onset, +CLASS proper_ne,
              +PRED #pred, +CARG #carg, +TNT null_tnt ] > ].

;;
;; a number followed by one letter: |22a|
;;
alphanumeric_identifier_ne_4_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]+[[:alpha:]]$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].

;;
;; the following are maybe taken from chemistry: |B.25| |IL-10| |IL/10|
;;
chemistry_identifier_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]*[[:upper:]]+[-./][[:upper:]]*[0-9]+$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].

;;
;; strongly alphanumeric strings (i.e. including digits), with at least two
;; hyphens, e.g. |123-45-6789|.
;;
hyphenated_identifier_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[[:alpha:]]*[0-9]+[[:alpha:]0-9]*(-[[:alnum:]]+){2,}$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].

;;
;; section numbers (and the like): two or more decimal points
;;
section_number_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]+\.[0-9]+(\.[0-9]+)+$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; now with NEs out of our way, this would be a good time for adjustments to
;;; tokenization: introduce additional token boundaries (e.g. for hyphens and
;;; slashes) and maybe some robustness rules for `sandwiched' punctuation.
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;
;; make the hyphen a token in its own right between numbers (an n-dash,
;; actually), e.g. |50-60|.  otherwise, break at hyphens following alphabetic
;; prefixes, but keep the hyphen on the prefix, e.g. |sub-discipline|.
;;
numeric_hyphen_tmr := one_three_tmt &
[ +INPUT < [ +FORM ^([+-]?[0-9]+(?:\.[0-9]*)?)-([0-9]+(?:\.[0-9]*)?)$,
             +ONSET #onset, +CLASS non_ne, +PRED #pred, +CARG #carg,
             +TNT #tnt ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}", +ONSET #onset, +PRED #pred,
              +CARG #carg, +TNT #tnt ],
            [ +FORM "–", +ONSET con_or_voc, +PRED #pred, +CARG #carg,
              +TNT null_tnt ],
            [ +FORM "${I1:+FORM:2}", +ONSET #onset, +PRED #pred,
              +CARG #carg, +TNT #tnt ] > ].

;;
;; _fix_me_
;; when we break up tokens, it is not obvious which tag to assign to the first
;; segment.  often, especially for unknown words (which most hyphenated tokens
;; are), the PoS value will reflect the suffix.  for now, copy over +TNT to the
;; initial segment; if nothing else, names should still work when capitalized.
;; for tokens containing multiple hyphens, the rule will apply from the rear,
;; i.e. the final segment is guaranteed to carry the +TNT information.
;; i just re-tooled this rule a little, see whether dan likes it this way?
;; (12-jan-09; oe)
;;
alphabetic_hyphen_tmr := one_two_tmt &
[ +INPUT < [ +FORM ^(.+-)([[:alnum:]]+-?)$, +ONSET #onset, +CLASS non_ne,
             +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}", +ONSET #onset, +PRED #pred,
              +CARG #carg, +TNT #tnt ],
            [ +FORM "${I1:+FORM:2}", +ONSET #onset, +PRED #pred,
              +CARG #carg, +TNT #tnt ] > ].

;;
;; with the new addition of derivational lexical rules, immediately re-attach
;; certain verbal prefixes (e.g. |mis-| and |re-|).  it is a bit unfortunate
;; that we end up duplicating information from the orthographemic annotation
;; on those rules in token mapping, but i imagine the linguistic argument for
;; this particular treatment is overwhelming.
;;
derivational_prefix_tmr := two_one_final_form_tmt &
[ +INPUT < [ +FORM ^((?:mis|p?re)-)$ ], [ +FORM ^([[:alnum:]]+)$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}${I2:+FORM:1}" ] > ].

;;
;; _fix_me_
;; there will be more to do about slashes, no doubt ...  (12-jan-09; oe)
;;
alphabetic_slash_tmr := one_three_tmt &
[ +INPUT < [ +FORM ^(.+)/([[:alnum:]]+)$, +ONSET #onset, +CLASS non_ne,
             +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}", +ONSET #onset, +PRED #pred,
              +CARG #carg, +TNT #tnt ],
            [ +FORM "/", +ONSET con_or_voc, +PRED #pred, +CARG #carg,
              +TNT null_tnt ],
            [ +FORM "${I1:+FORM:2}", +ONSET #onset, +PRED #pred,
              +CARG #carg, +TNT #tnt ] > ].

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; a second pass at lightweight NEs, now that we have further split up tokens
;;; at hyphens and dashes.
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; decades: |1950s|, |50s|, |1950's|, and |50's|.
;;;
decade_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(1[0-9])?[0-9]0[sS]$ ] >,
  +OUTPUT < [ +CLASS plur_ne ] > ].

;;
;; _fix_me_
;; this latter type (plural of decades and, in principle, other names) actually
;; is ambiguous with the possessive (i suspect), e.g. `the 1950's style'.  so,
;; maybe, optional rules for those?  (24-sep-08; oe)
;;
decade_ne_2_tmr := two_one_tmt &
[ +INPUT < [ +FORM ^((1[0-9])?[0-9]0)$, +ONSET #onset, +CLASS non_ne,
             +PRED #pred, +CARG #carg ],
           [ +FORM ^('[sS])$, +CLASS non_ne ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}${I2:+FORM:1}", +ONSET #onset,
              +CLASS plur_ne, +PRED #pred, +CARG #carg, +TNT null_tnt ] > ].

;;;
;;; numerals, including some sub-sets (days of the month or years).
;;;

;;
;; days of the month: |1| to |9|, |10| to |29|, |30|, and |31|
;;
card_or_dom_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^([1-9]|[1-2][0-9]|3[01])$ ] >,
  +OUTPUT < [ +CLASS card_or_dom_ne ] > ].

;;
;; (candidate) years: |950|, |1805|, |1957|, |2005|, et al.
;;
card_or_year_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[12]?[0-9]{3}$ ] >,
  +OUTPUT < [ +CLASS card_or_year_ne ] > ].

;;
;; any sequence of digits, with optional sign and optional decimal point.
;;
card_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[+-~]?[1-9][0-9]*\.?$ ] >,
  +OUTPUT < [ +CLASS card_ne ] > ].

;;
;; floating point numbers, with optional sign and at least one decimal
;;
card_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[+-~]?[0-9]*\.[0-9]+$ ] >,
  +OUTPUT < [ +CLASS card_ne ] > ].

;;
;; US-style or German separators, optional sign and decimals: e.g. |23,000.-|
;;
card_ne_3_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[+-]?[1-9][0-9]{0,2}([,.][0-9]{3})+([,.]([0-9]*|-))?$ ] >,
  +OUTPUT < [ +CLASS card_ne ] > ].

;;
;; ordinals, mostly parallel to the numerals
;;
ord_or_dom_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^([23]?(1st|2nd|3rd)|(2?[1-9]|1[04-9]|20|30|31)th)$ ] >,
  +OUTPUT < [ +CLASS ord_or_dom_ne ] > ].

ord_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]*((^|[^1])(1st|2nd|3rd)|(11|12|13|[04-9])th)$ ] >,
  +OUTPUT < [ +CLASS ord_ne ] > ].

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; some tokenizers (e.g. the one of acrolinx) already distinguish a number of
;;; token classes.  our REPP tokenizer, however, does not; so, determine class
;;; values here, if need be.  with acrolinx, on the other hand, we might have
;;; to map their naming scheme into our type hierarchy.
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

numeric_class_tmr := token_class_tmt &
[ +INPUT < [ +FORM ^[[:digit:]]+$ ] >,
  +OUTPUT < [ +CLASS numeric ] > ].

punct_class_tmr := token_class_tmt &
[ +INPUT < [ +FORM ^[[:punct:]]+$ ] >,
  +OUTPUT < [ +CLASS non_alphanumeric ] > ].

alphabetic_class_tmr := token_class_tmt &
[ +INPUT < [ +FORM ^[[:alpha:]]+$ ] >,
  +OUTPUT < [ +CLASS alphabetic ] > ].

alphanumeric_class_tmr := token_class_tmt &
[ +INPUT < [ +FORM ^[[:alnum:]._-]+$ ] >,
  +OUTPUT < [ +CLASS alphanumeric ] > ].
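The separator pattern of `card_ne_3_tmr` is easy to get wrong, so here is an illustrative Python transcription (variable name ours); note that a plain `1234` is deliberately rejected, since digit-only strings are covered by `card_ne_1_tmr`.

```python
import re

# card_ne_3: optional sign, 1-3 leading digits, one or more 3-digit groups
# separated by , or ., and an optional decimal part (possibly the German |-|)
CARD_3 = re.compile(r"^[+-]?[1-9][0-9]{0,2}([,.][0-9]{3})+([,.]([0-9]*|-))?$")
```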
non_alphanumeric_class_tmr := token_class_tmt &
[ +OUTPUT < [ +CLASS non_alphanumeric ] > ].

;;
;; further decorate the token class with information about (a) sentence-initial
;; position and (b) capitalization.  because these are attributes of +CLASS
;; (and there is no way of overwriting), we play a nasty trick on +CARG, viz.
;; utilize it as a `scratch' slot to prevent cyclic rule applications.  we
;; (kind of) assume that no external tokenizer will pass in +CARG values; and
;; if it did, the worst that would happen is that the rules below cannot fire.
;;

;;
;; while we have tokenized off punctuation marks, the `initial' position is a
;; somewhat fuzzy notion.  we want to ignore punctuation-only tokens, i.e. the
;; quote marks in |``Cookies'', she said.| do not make |Cookies| non-initial.
;; but neither should |,| or |she| be regarded initial, hence we need to allow
;; the +CONTEXT token anywhere to the left, not just immediately preceding.
;;
non_initial_tmr := token_case_tmt &
[ +CONTEXT < [ +FORM ^.*[^[:punct:]].*$ ] >,
  +INPUT < [ +CARG anti_string, +ID #id, +FROM #from, +TO #to ] >,
  +OUTPUT < [ +CLASS [ +INITIAL - ], +CARG non_string,
              +ID #id, +FROM #from, +TO #to ] >,
  +POSITION "C1<I1" ].

initial_tmr := token_case_tmt &
[ +INPUT < [ +CARG anti_string ] >,
  +OUTPUT < [ +CLASS [ +INITIAL + ], +CARG non_string ] > ].

;;
;; various combinations of initial capitalization and mixed case (see the type
;; definitions below `token_case' for details).  `alphanumeric' introduces
;; +CASE, so the REs in the rules below presuppose an alphanumeric token.
;;
capitalized+lower_tmr := one_one_token_case_tmt &
[ +INPUT < [ +FORM ^[[:upper:]][[:lower:][:digit:]._-]+$,
             +CARG non_string ] >,
  +OUTPUT < [ +CLASS [ +CASE capitalized+lower ], +CARG anti_string ] > ].

capitalized+upper_tmr := one_one_token_case_tmt &
[ +INPUT < [ +FORM ^[[:upper:]][[:upper:][:digit:]._-]+$,
             +CARG non_string ] >,
  +OUTPUT < [ +CLASS [ +CASE capitalized+upper ], +CARG anti_string ] > ].
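The +CASE decoration patterns can be approximated in Python for a quick check; this sketch uses ASCII stand-ins for the POSIX classes and an illustrative `classify_case` helper (not part of the grammar) that tries the more specific patterns first, mirroring how only one case rule should fire per token.

```python
import re

# ASCII approximations of the POSIX classes in the +CASE rules
CAP_LOWER = re.compile(r"^[A-Z][a-z0-9._-]+$")     # capitalized+lower
CAP_UPPER = re.compile(r"^[A-Z][A-Z0-9._-]+$")     # capitalized+upper
CAP_MIXED = re.compile(r"^[A-Z][A-Za-z0-9._-]+$")  # capitalized+mixed

def classify_case(form):
    # most specific first: +mixed subsumes both of the other two patterns
    if CAP_LOWER.match(form):
        return "capitalized+lower"
    if CAP_UPPER.match(form):
        return "capitalized+upper"
    if CAP_MIXED.match(form):
        return "capitalized+mixed"
    return None
```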
capitalized+mixed_tmr := one_one_token_case_tmt &
[ +INPUT < [ +FORM ^[[:upper:]][[:alnum:]._-]+$, +CARG non_string ] >,
  +OUTPUT < [ +CLASS [ +CASE capitalized+mixed ], +CARG anti_string ] > ].

capitalized+non_mixed_tmr := one_one_token_case_tmt &
[ +INPUT < [ +FORM ^[[:upper:]]$, +CARG non_string ] >,
  +OUTPUT < [ +CLASS [ +CASE capitalized+non_mixed ], +CARG anti_string ] > ].

non_capitalized+lower_tmr := one_one_token_case_tmt &
[ +INPUT < [ +FORM ^[[:lower:][:digit:]._-]+$, +CARG non_string ] >,
  +OUTPUT < [ +CLASS [ +CASE non_capitalized+lower ], +CARG anti_string ] > ].

non_capitalized+mixed_tmr := one_one_token_case_tmt &
[ +INPUT < [ +FORM ^[[:lower:][:digit:]._-].*[[:upper:]].*$,
             +CARG non_string ] >,
  +OUTPUT < [ +CLASS [ +CASE non_capitalized+mixed ], +CARG anti_string ] > ].

;;
;; finally, we need to make sure no [ +CARG non_string ] values are left, as
;; these were only meaningful within the set of `decoration' rules above.
;;
no_case_tmr := one_one_token_case_tmt &
[ +INPUT < [ +CARG non_string ] >,
  +OUTPUT < [ +CARG anti_string ] > ].

;;
;; in case we are running without a PoS tagger, or something went wrong in the
;; creation of token AVMs from our input (in one form or another), make sure to
;; fully annul part-of-speech information.
;;
null_tnt_tmr := one_one_tmt &
[ +INPUT < [ +FORM #form, +ONSET #onset, +CLASS #class, +PRED #pred,
             +CARG #carg, +TNT [ +TAGS < anti_string, ... > ] ] >,
  +OUTPUT < [ +FORM #form, +ONSET #onset, +CLASS #class, +PRED #pred,
              +CARG #carg, +TNT null_tnt ] > ].

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; re-combine punctuation marks with adjacent tokens, based on directionality
;;; of punctuation marks, e.g. opening vs. closing quotes and brackets.  doing
;;; one such re-combination at a time is sufficient, as each rewrite rule will
;;; apply as many times as it possibly can, seeing its own output from earlier
;;; applications.
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;
;; but first, preserve the current (non-punctuated) form in +CARG, for later
;; reference, e.g. in constructing +PRED values for generics.  NE rules have
;; done this already, hence make sure to not overwrite existing +CARGs.
;;
default_carg_tmr := one_one_tmt &
[ +INPUT < [ +FORM #form, +ONSET #onset, +CLASS #class, +PRED #pred,
             +CARG anti_string, +TNT #tnt ] >,
  +OUTPUT < [ +FORM #form, +ONSET #onset, +CLASS #class, +PRED #pred,
              +CARG #form, +TNT #tnt ] > ].

;;
;; _fix_me_
;; when re-attaching pre- or suffix punctuation to NEs, we should find a way of
;; forcing the application of corresponding punctuation rules eventually.  as
;; things are now, an NE with adjacent punctuation creates spurious ambiguity:
;; |oe@yy.com.| is matched as an NE prior to re-attaching the period.  when the
;; token with the trailing period is sent through the morphology, two analyses
;; are created, one with, another without a `punct_period' expectation.  both
;; succeed, as there is no testing against a lexical stem with the generic LE.
;; for NEs, at least, i think one could work around this by adding properties
;; to each token, +PRFX and +SFFX, say, each a list of strings.  in the case of
;; |oe@yy.com.|, the suffix punctuation rule would add to the +SFFX front, say:
;; [ +SFFX < "." > ].  the corresponding orthographemic rules would then have
;; to `pop' the list (to make things simpler, non-generic tokens could leave
;; +SFFX underspecified), and at some point (syntactic rules, for example), an
;; empty +SFFX (and +PRFX) would be the pre-requisite to further rule
;; applications.  --- discuss this with dan one day.  (8-feb-09; oe)
;;

;;
;; _fix_me_
;; there is a problem here: where we `multiply out' tokens earlier, we need to
;; be able to (re-)attach prefix and suffix punctuation to more than one host.
;; that would require not consuming the punctuation mark(s) at this point, but
;; rather picking them up as CONTEXT (and later throwing out any isolated
;; punctuation marks).  (26-sep-08; oe)
;;
prefix_punctuation_tmr := two_one_final_form_tmt &
[ +INPUT < [ +FORM ^([[({“‘]+)$ ], [ +FORM ^(.+)$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}${I2:+FORM:1}" ] > ].

;;
;; _fix_me_
;; there is a special case here: |'| following a token ending in |s| could be a
;; possessive marker (which should remain a token in its own right), or could
;; be a closing single quote.  in principle, the same is true for |"|, but the
;; `inches' measure unit, maybe, will have been detected during NE recognition
;; earlier.  in either case, we would need a way of keeping a separate |'| in
;; the chart, and also re-combining it with the preceding token.  (14-sep-08; oe)
;;
;; _fix_me_
;; in principle, the single closing quote should be in the suffix class too,
;; but we need to address the token-level ambiguity first.  (13-nov-08; oe)
;;
suffix_punctuation_tmr := two_one_initial_form_tmt &
[ +INPUT < [ +FORM ^(.+)$ ], [ +FORM ^([])}”",;.!?]+)$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}${I2:+FORM:1}" ] > ].

suffix_apostrophe_tmr := two_one_initial_form_tmt &
[ +INPUT < [ +FORM ^(.+[^sS])$ ], [ +FORM ^(['’][])}”",;.!?]?)$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}${I2:+FORM:1}" ] > ].

;;
;; two similar rules, converting (some) directional GML tokens into affixes
;;
prefix_markup_tmr := two_one_final_form_tmt &
[ +INPUT < [ +FORM ^([(`“]?¦i)$ ], [ +FORM ^(.+)$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}${I2:+FORM:1}" ] > ].

suffix_markup_tmr := two_one_initial_form_tmt &
[ +INPUT < [ +FORM ^(.+)$ ], [ +FORM ^(i¦[?,.!)”"]?)$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}${I2:+FORM:1}" ] > ].
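The directional re-attachment performed by the prefix and suffix punctuation rules can be sketched outside the rewrite engine; this Python illustration (function and variable names ours, not part of the grammar) joins opening marks rightward and closing or sentence punctuation leftward, one mark at a time, as the rules do.

```python
import re

# simplified stand-ins for the prefix/suffix punctuation classes above
PREFIX = re.compile(r'^[\[({“‘]+$')
SUFFIX = re.compile(r'^[\])}”",;.!?]+$')

def attach_punctuation(tokens):
    toks = list(tokens)
    changed = True
    while changed:
        changed = False
        for i in range(len(toks) - 1):
            # opening mark: attach to the following token
            if PREFIX.match(toks[i]):
                toks[i:i + 2] = [toks[i] + toks[i + 1]]
                changed = True
                break
            # closing mark or sentence punctuation: attach to the preceding one
            if SUFFIX.match(toks[i + 1]):
                toks[i:i + 2] = [toks[i] + toks[i + 1]]
                changed = True
                break
    return toks
```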
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; _fix_me_
;;; i would prefer doing these rules earlier, but as long as i have no way of
;;; re-combining +INPUT and +CONTEXT tokens (see my email to peter of today),
;;; token-level ambiguity cannot be introduced before the prefix and suffix
;;; punctuation rules.  (26-sep-08; oe)
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;
;; now, with +CLASS information available, optionally make any token that is
;; (a) capitalized and not initial or (b) spelled in mixed case (|LinGO|) a
;; proper NE.
;;
;; _fix_me_
;; the ERG lexicon includes a few entries (e.g. titles like |Mr.| and |Jr.|)
;; with capitalized orthography.  currently, capitalized NEs are about the only
;; class of generics that can survive alongside a native entry (in the lexical
;; filtering phase), hence it might make sense to prune unwanted tokens here,
;; even though that means knowledge about the ERG lexicon is applied at token
;; mapping already.  (23-jan-09; oe)
;;
capitalized_name_tmr := add_ne_tmt &
[ +CONTEXT < [ +CLASS alphanumeric & [ +INITIAL -, +CASE capitalized ] ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].

mixed_name_tmr := add_ne_tmt &
[ +CONTEXT < [ +CLASS alphanumeric & [ +CASE mixed ] ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; at this point, we multiply out PoS values on all tokens, where for each
;;; original token as many additional tokens are created (in the same chart
;;; cell) as there are PoS readings.  at this point, we start distinguishing
;;; between tokens that activate native lexical entries (LEs) vs. those that
;;; activate generic LEs.  in the token universe, this distinction is made by
;;; virtue of +ONSET, with unk_onset reserved for generic LEs.  the two sets
;;; do not overlap, i.e.
;;; for a single original token with two PoS readings, we
;;; end up with a total of three new tokens.  the pair of rules below resembles
;;; a recursive function, terminating once the PoS list has been reduced to
;;; a singleton element.  form-based named entities identified earlier avoid
;;; this kind of PoS multiplication because they have already emptied out their
;;; PoS list.
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;
;; even though we originally made sure all tokens had a fully specified +TNT
;; value, intervening rules could have `leaked' PoS information.  if so, once
;; again, fully annul the +TNT value.
;;
tnt_default_tmr := one_one_tmt &
[ +INPUT < [ +FORM #form, +ONSET #onset, +CLASS #class, +PRED #pred,
             +CARG #carg, +TNT [ +TAGS < anti_string, ... > ] ] >,
  +OUTPUT < [ +FORM #form, +ONSET #onset, +CLASS #class, +PRED #pred,
              +CARG #carg, +TNT null_tnt ] > ].

tnt_recurse_tmr := token_mapping_rule &
[ +INPUT < [ +FORM #form, +ONSET anti_onset, +CLASS #class, +PRED #pred,
             +CARG #carg, +ID #id, +FROM #from, +TO #to,
             +TNT [ +TAGS < #tag . #tags & *cons* >,
                    +PRBS < #prb . #prbs & *cons* > ] ] >,
  +OUTPUT < [ +FORM #form, +ONSET unk_onset, +CLASS #class, +PRED #pred,
              +CARG #carg, +ID #id, +FROM #from, +TO #to,
              +TNT [ +TAGS < #tag >, +PRBS < #prb > ] ],
            [ +FORM #form, +CLASS #class, +PRED #pred, +CARG #carg,
              +ID #id, +FROM #from, +TO #to,
              +TNT [ +TAGS #tags, +PRBS #prbs ] ] >,
  +POSITION "O1@I1, O2@I1" ].

tnt_terminate_tmr := token_mapping_rule &
[ +INPUT < [ +FORM #form, +ONSET anti_onset, +CLASS #class, +PRED #pred,
             +CARG #carg, +ID #id, +FROM #from, +TO #to,
             +TNT [ +TAGS < #tag >, +PRBS < #prb > ] ] >,
  +OUTPUT < [ +FORM #form, +ONSET unk_onset, +CLASS #class, +PRED #pred,
              +CARG #carg, +ID #id, +FROM #from, +TO #to,
              +TNT [ +TAGS < #tag >, +PRBS < #prb > ] ],
            [ +FORM #form, +ONSET con_or_voc, +CLASS #class, +PRED #pred,
              +CARG #carg, +ID #id, +FROM #from, +TO #to,
              +TNT null_tnt ] >,
  +POSITION "O1@I1, O2@I1" ].
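The net effect of the recursive explosion can be summarized in a plain-Python sketch (not the rewrite engine; names are illustrative): an n-tag token yields n generic tokens carrying one tag each, plus one native token with no tags at all, so two PoS readings yield three new tokens.

```python
def explode(form, tags):
    # one generic token per PoS reading, flagged for generic LEs (unk_onset)
    generic = [(form, "unk_onset", [t]) for t in tags]
    # plus a single native token with fully annulled PoS information
    native = [(form, "con_or_voc", [])]
    return generic + native
```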
;;;
;;; with singleton PoS readings multiplied out in each chart cell, we can prune
;;; undesirable alternatives, e.g. a foreign word reading when there also is a
;;; common noun.  also, ditch PoS readings with very low probability, and ones
;;; for which no PoS-activated generic entries exist anyway (function words).
;;;
tnt_ditch_unlikely_tmr := token_mapping_rule &
[ +INPUT < [ +TNT.+PRBS < ^0?\.0.*$ > ] >,
  +OUTPUT < > ].

tnt_ditch_function_1_tmr := token_mapping_rule &
[ +INPUT < [ +TNT.+TAGS < ^CC|DT|EX|IN|MD|PDT|POS|PRP\$?|RB[RS]$ > ] >,
  +OUTPUT < > ].

tnt_ditch_function_2_tmr := token_mapping_rule &
[ +INPUT < [ +TNT.+TAGS < ^RP|TO|UH|WDT|WP|WRB$ > ] >,
  +OUTPUT < > ].

tnt_ditch_function_3_tmr := token_mapping_rule &
[ +INPUT < [ +TNT.+TAGS < ^\$|#|``|''|\(|\)|,|\.|:$ > ] >,
  +OUTPUT < > ].

;;
;; _fix_me_
;; experimentally, also ditch PoS information on punctuation-only tokens.  we
;; appear to get noun and adjective readings for n- and m-dashes, which hardly
;; can do us any good.  (24-sep-08; oe)
;;
tnt_ditch_punctuation_tmr := token_mapping_rule &
[ +INPUT < [ +FORM ^[[:punct:]]+$, +TNT.+TAGS *cons* ] >,
  +OUTPUT < > ].

;;;
;;; _fix_me_
;;; should we eventually want to include the PoS probabilities as a feature in
;;; parse selection, this kind of pruning should disappear: a high-probability
;;; FW, say, should not be bullied out by an unlikely NN.  (31-aug-08; oe)
;;;
tnt_filter_dup_fw_tmr := token_mapping_rule &
[ +CONTEXT < [ +TNT.+TAGS < "NN" > ] >,
  +INPUT < [ +TNT.+TAGS < "FW" > ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].

tnt_filter_dup_nnp_tmr := token_mapping_rule &
[ +CONTEXT < [ +TNT.+TAGS < ^FW|NN$ > ] >,
  +INPUT < [ +TNT.+TAGS < "NNP" > ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].

tnt_filter_dup_nnps_tmr := token_mapping_rule &
[ +CONTEXT < [ +TNT.+TAGS < "NNP" > ] >,
  +INPUT < [ +TNT.+TAGS < "NNPS" > ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].
;;
;; _fix_me_
;; the old `posmapping' setting in PET contained the following comment by dan
;; (from sep-07):
;;
;;   Tried doing without the adjective, since TNT appears to mostly guess both
;;   an adjective and a noun, and our generic mass-count noun can almost always
;;   do the work of the adjective.  This would avoid large amounts of spurious
;;   ambiguity for most occurrences of these pairs.  But unfortunately TNT
;;   doesn't always guess both, so we need JJ when it's the only guess.  Maybe
;;   we can effect this with the new token-mapping machinery ...
;;
;; the following rule should have that effect.  (21-jan-09; oe)
;;
tnt_filter_dup_jj_tmr := token_mapping_rule &
[ +CONTEXT < [ +TNT.+TAGS < "NN" > ] >,
  +INPUT < [ +TNT.+TAGS < "JJ" > ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].

;;
;; on all tokens that we expect to activate generic entries, make the +PRED
;; value reflect the orthography and PoS tag.
;;
generic_pred_tmr := token_mapping_rule &
[ +INPUT < [ +FORM #form, +ONSET #onset & unk_onset, +CLASS #class & non_ne,
             +PRED anti_string, +CARG #carg & ^(.+)$,
             +ID #id, +FROM #from, +TO #to,
             +TNT #tnt & [ +TAGS < ^(.+)$ > ] ] >,
  +OUTPUT < [ +FORM #form, +ONSET #onset, +CLASS #class,
              +PRED "_${lc(I1:+CARG:1)}_${lc(I1:+TNT.+TAGS.FIRST:1)}_rel",
              +CARG #carg, +ID #id, +FROM #from, +TO #to, +TNT #tnt ] >,
  +POSITION "O1@I1" ].

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; finally, make all non-generic tokens lower case, for lexical look-up.
;;; as a side effect, the rule may also end up `grounding' very underspecified
;;; tokens (which should not exist, at this point, in principle): if +ONSET,
;;; +CLASS, and +TNT were all unspecific, we end up defaulting their values to
;;; a token that can only activate native lexical entries.
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

downcase_tmr := one_one_form_tmt &
[ +INPUT < [ +FORM ^(.*[[:upper:]].*)$, +ONSET con_or_voc, +CLASS non_ne,
             +TNT null_tnt ] >,
  +OUTPUT < [ +FORM "${lc(I1:+FORM:1)}" ] > ].