;;; -*- Mode: tdl; Coding: utf-8; -*-

;;;
;;; Copyright (c) 2009 -- 2010 Stephan Oepen (oe@ifi.uio.no); 
;;; see `LICENSE' for conditions.
;;;


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; lightweight NEs: form-driven generic entries (formerly `ersatz' entries)
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; email addresses
;;;

;;
;; a local part (the addressee) and |@|, followed by any valid DNS string, with
;; optional angle brackets
;;
;; email_ne_tmr: a single input token whose complete +FORM looks like an email
;; address yields an output token of [ +CLASS email_ne ].  the pattern reads:
;; optional |<|; a local part of one or more alphanumerics, |.|, |_|, or |-|;
;; an |@|; a host of two or more dot-separated labels (alphanumerics, |_|, or
;; |-|); optional |>|.  the two brackets are optional independently of each
;; other, so an unbalanced |<a@b.c| matches too.
;; NOTE(review): `ne_tmt' is defined elsewhere; presumably it carries over all
;; remaining token properties unchanged -- confirm.
email_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM
             ^<?[[:alnum:]._-]+@[[:alnum:]_-]+(\.[[:alnum:]_-]+)+>?$ ] >,
  +OUTPUT < [ +CLASS email_ne ] > ].

;;;
;;; uniform resource locators (URLs)
;;;

;;
;; any valid DNS string, prefixed by `http://' or `https://', with optional
;; angle brackets and an optional path starting with a slash.  the host part
;; requires two or more dot-separated labels of alphanumerics, |_|, or |-|.
;; the `s' in the scheme is optional: with only `http://' allowed, a bare
;; |https://example.com| (no `www' prefix, no angle brackets) was not matched
;; by any of the URL rules here.
;;
url_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^<?https?://[[:alnum:]_-]+(\.[[:alnum:]_-]+)+(/.*)?>?$ ] >,
  +OUTPUT < [ +CLASS url_ne ] > ].

;;
;; any valid DNS string, prefixed by `www', with optional angle brackets
;;
;; url_ne_2_tmr: scheme-less URLs.  the complete +FORM must be an optional |<|,
;; the literal |www|, one or more dot-prefixed labels (alphanumerics, |_|, or
;; |-|), an optional path introduced by |/|, and an optional |>|.  the output
;; token is [ +CLASS url_ne ].
url_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^<?www(\.[[:alnum:]_-]+)+(/.*)?>?$ ] >,
  +OUTPUT < [ +CLASS url_ne ] > ].

;;
;; any valid DNS string, with obligatory angle brackets
;;
;; url_ne_3_tmr: with neither a scheme nor |www| to go by, both angle brackets
;; are required here.  between them: two or more dot-separated labels
;; (alphanumerics, |_|, or |-|) and an optional path introduced by |/|.  the
;; output token is [ +CLASS url_ne ].
url_ne_3_tmr := ne_tmt &
[ +INPUT < [ +FORM ^<[[:alnum:]_-]+(\.[[:alnum:]_-]+)+(/.*)?>$ ] >,
  +OUTPUT < [ +CLASS url_ne ] > ].

;;;
;;; file names
;;;

;;
;; fully-qualified Un*x style, starting with a slash, e.g. |/etc/config|.  
;;
;; _fix_me_
;; we require a minimum of two components, such that |/etc| by itself will not
;; match.  maybe we should allow these too but create an ambiguity here, i.e.
;; output two tokens, one [ CLASS file_ne ], the other [ CLASS non_ne ]?
;;                                                              (19-sep-08; oe)
;;
;; file_ne_tmr: absolute path names.  the +FORM must consist of two or more
;; components, each a |/| followed by one or more alphanumerics, |.|, |_|, or
;; |-|, plus an optional trailing |/|.  the output token is [ +CLASS file_ne ].
file_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(/[[:alnum:]._-]+){2,}/?$ ] >,
  +OUTPUT < [ +CLASS file_ne ] > ].


;;;
;;; time-of-day expressions: |9am|, |11:45pm|, |20:15|
;;;

;;
;; an |am| or |pm| suffix unambiguously indicates a time expression.  we also
;; grab all tokens of the form `H:M' where `H' and `M' are numbers in the right
;; ranges.
;;
;; _fix_me_
;; i wonder about `mix in a ratio of 1:15', which the second rule below would
;; consider a time-of-day expression.  should we approach such cases with more
;; `optional' NE rules, i.e. ones outputting two tokens?  or should we rather
;; introduce an abstraction over `time_ne' and `ratio_ne', such that a single
;; token can activate multiple lexical entries?  once we get regular expression
;; matching for lexical instantiation (peter is working on that), in principle,
;; we could just drop `time_ne_2_tmr', make `time_ne' a sub-type of `ratio_ne',
;; and put the `H:M' regular expression into the generic lexical entry.  with
;; great power comes great responsibility :-).                 (19-sep-08; oe)
;; DPF 27-jun-09 - Let's try the option of abstracting over time_ne and meas_ne
;; (what the ratio rules introduce).
;;                       
;; time_ne_1_tmr: twelve-hour clock times with an obligatory |am| or |pm|
;; suffix (any mix of upper and lower case).  the hour is a single digit with
;; an optional leading zero, or 10 through 12; the optional minutes are 00
;; through 59, separated from the hour by |:| or |.|.  note that an hour of
;; |0| or |00| is accepted.  the output token is [ +CLASS time_ne ].
time_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(0?[0-9]|1[0-2])([:.][0-5][0-9])?([aApP][mM])$ ] >,
  +OUTPUT  < [ +CLASS time_ne ] > ].

;; time_ne_2_tmr: `H:M' tokens without an am/pm suffix.  the hour ranges from
;; 0 to 24 (a leading zero is optional on single digits), the minutes from 00
;; to 59; hence |24:59| matches too.  because such tokens may equally well be
;; ratios, the output class is the underspecified `meas_or_time_ne'.
time_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(0?[0-9]|1[0-9]|2[0-4]):[0-5][0-9]$ ] >,
  +OUTPUT  < [ +CLASS meas_or_time_ne ] > ].

;;
;; to also match |10.30| as a candidate time, there is an underspecification of
;; times and floating point numbers; this is a separate rule to avoid matching
;; |15:30| as a candidate cardinal.
;; DPF 2012-05-27 - Generalized this class to include proper ne's, so we can
;; treat |Ubuntu 8.10| as a compound of two proper names.
;; DPF 2012-09-21 - Changed the case of the single digit before the decimal
;; to exclude 0, consistent with allowing sub-one decimals to appear with
;; singular nouns in measure phrases, as in WSJ |the stock was down 0.3 cent|
;;
;; time_ne_3_tmr: `H.M' tokens, i.e. with a period as the separator.  the hour
;; is 1 through 9 (optionally zero-padded) or 10 through 24, so that |0.30| and
;; |00.30| do not match; the minutes are 00 through 59.  the output class
;; `card_or_time_or_proper_ne' leaves the choice among the readings open.
time_ne_3_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(0?[1-9]|1[0-9]|2[0-4])\.[0-5][0-9]$ ] >,
  +OUTPUT  < [ +CLASS card_or_time_or_proper_ne ] > ].


;;;
;;; dates
;;;

;;
;; _fix_me_
;; things are getting a little murky here: some of the current date formats
;; overlap with some of the identifiers and the fractions and ranges.  to do
;; justice to these ambiguities, we would have to introduce multiple NE tokens.
;; or find ways of underspecification in the lexicon, maybe?    (23-sep-08; oe)
;;

;;
;; US and European variants: |11-01-1957|, |11/01/1957|, |11-01-57|, |57/01/11|
;;
;; _fix_me_
;; i am leaving out |12-2005| and |'06| for now.  they overlap too much with 
;; other patterns, so maybe should be optional rules (like the one for |9/11|
;; and such below, added by dan in late 2011)?                  (23-sep-08; oe)
;;
;; date_ne_1_tmr: three-part numeric dates.  two groups of one or two digits,
;; each followed by |-| or |/|, and then two to four digits.  the two
;; separators need not be the same, and no range check is applied to any of
;; the three numbers.  the output token is [ +CLASS date_ne ].
date_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^([0-9]{1,2}[-/]){2}[0-9]{2,4}$ ] >,
  +OUTPUT < [ +CLASS date_ne ] > ].   

;; date_ne_2_tmr: two-part numeric dates like |9/11|.  the first number is a
;; single digit, optionally preceded by |0| or |1| (i.e. 0 through 19); then
;; |-| or |/|; then one or two digits.  the token is matched in +CONTEXT (not
;; +INPUT), i.e. this is an optional rule: the matched token is not consumed
;; and remains available to later rules (e.g. the fractions).
;; NOTE(review): `add_ne_tmt' is defined elsewhere; presumably it copies the
;; context token into a new one of [ +CLASS date_ne ] -- confirm.
date_ne_2_tmr := add_ne_tmt &
[ +CONTEXT < [ +FORM ^([01]?[0-9][-/])[0-9]{1,2}$ ] >,
  +OUTPUT < [ +CLASS date_ne ] > ].   


;;;
;;; ratios: |1:1000|, |1:100,000|, et al.
;;;

;;
;; we make the conservative assumption that the first element not exceed three
;; digits and not have leading zeros.
;;
;; ratio_ne_1_tmr: plain ratios like |1:1000|.  the first element has one to
;; three digits, the second one or more digits; neither may start with a zero.
;; the output token is [ +CLASS meas_ne ].
ratio_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[1-9][0-9]{0,2}:[1-9][0-9]*$ ] >,
  +OUTPUT < [ +CLASS meas_ne ] > ].   

;; ratio_ne_2_tmr: ratios whose second element uses comma-separated thousands,
;; like |1:100,000|.  the first element has one to three digits and no leading
;; zero; the second starts with one to three digits (no leading zero) followed
;; by zero or more groups of |,| plus exactly three digits.  the output token
;; is [ +CLASS meas_ne ].
ratio_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[1-9][0-9]{0,2}:[1-9][0-9]{0,2}(,[0-9]{3})*$ ] >,
  +OUTPUT < [ +CLASS meas_ne ] > ].   


;;;
;;; fractions: |1/4|, |-1/3|, |51/100|, et al.  since we have considered the
;;; alternative analysis as dates by now (|9/11|), consume the input token at
;;; this point, i.e. block any further (different) analysis of it.  we choose
;;; to run this one prior to splitting, to avoid confusion about the slash; we
;;; will need to return to a similar pattern (if without the ordinals, maybe)
;;; after splitting, however, to do justice to, say, |2 1/2-year|.
;;;
;; fraction_ne_1_tmr: an optional leading |-|; a numerator of one or more
;; digits, optionally followed by |.| and further digits; a |/|; a denominator
;; of one or more digits without a leading zero; and an optional ordinal
;; suffix |th|.  the leading minus is needed for negative fractions such as
;; |-1/3| to be recognized at all, as this rule runs prior to splitting at
;; hyphens.  the output token is [ +CLASS frct_ne ].
fraction_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^-?[0-9]+(\.[0-9]*)?/[1-9][0-9]*(th)?$ ] >,
  +OUTPUT < [ +CLASS frct_ne ] > ].

;;;
;;; measure noun phrases (taking precedence over alphanumeric identifiers)
;;;

;;
;; single token: |25cm| or |+37.3ºC|
;;
;; _fix_me_
;; dan changed this from `meas_ne' to `card_or_meas_ne'; why so? i realize we
;; should record more information in [incr tsdb()] profiles, e.g. which token
;; mapping rules had fired (for each parsing result), which might help finding
;; relevant instances for archeological search like this one.  (27-feb-11; oe)
;;
;; measure_ne_tmr: a number immediately followed by a unit, within one token.
;; the number is an optional sign (|-|, |+|, |±|, or |~|), one or more digits,
;; and optionally |,| or |.| followed by zero or more digits.  the unit is one
;; of: a sequence of one or more of |k|, |K|, |m|, |M|, |g|, |G|, |µ|, |"|, or
;; |'|; one of |cm|, |km|, |mm|, or |nm|; |σ|; |Å|; or a degree sign (|º| or
;; |°|), optionally followed by |C| or |F|.  the output class is the
;; underspecified `card_or_meas_ne'.
measure_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM 
          ^[-+±~]?[0-9]+([,.][0-9]*)?([kKmMgGµ"']+|[ckmn]m|σ|Å|[º°][CF]?)$ ] >,
  +OUTPUT < [ +CLASS card_or_meas_ne ] > ].

;;
;; |1:1000m|
;;
;; ratio_measure_ne_tmr: a ratio (first element one to three digits, second
;; element one or more digits, neither with a leading zero) immediately
;; followed by a unit.  the output token is [ +CLASS meas_ne ].
;; NOTE(review): the unit alternatives here lack |µ| and |Å|, both of which
;; `measure_ne_tmr' accepts -- confirm whether that difference is intended.
ratio_measure_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM 
             ^[1-9][0-9]{0,2}:[1-9][0-9]*([kKmMgG"']+|[ckmn]m|σ|[º°][CF]?)$ ]>,
  +OUTPUT < [ +CLASS meas_ne ] > ].   

;;
;; |US$20| and the like: break into two tokens, to get a bracketing compatible
;; with [US$ [20 billion]] for a string like |US$20 billion|.
;;
;; _fix_me_
;; dan and i concluded that we should introduce a `currency_ne' instead, with
;; a corresponding generic entry, i.e. drop |US$| et al. from the lexicon (at
;; present, some of the entries allowed by this rule are missing, e.g. |C$|).
;; to maintain some parallelism with currency names that can be ordinary words
;; (e.g. `pound'), the rule below should synthesize a PRED value (rather than
;; put a CARG on currency NEs), e.g. `_us$_n_currency_rel'.  another rule will
;; be needed for just |US$| (and in principle |€| and other currency symbols),
;; to also match them as currency NEs, and at that point the `currency+name'
;; lexical filtering rule could be discarded.            (dan & oe, 29-jul-09)
;;
;; currency_measure_ne_tmr: split one token into a currency token and an
;; amount token.  the input +FORM has two relevant capture groups:
;;   1: the currency, i.e. |$| with an optional prefix of |A|, |AU|, |C|,
;;      |CA|, |HK|, |NZ|, or |US|, or else |SKr| or |Skr|;
;;   2: the amount, i.e. one or more digits, optionally followed by |.| or |,|
;;      and zero or more further digits.
;; the input token must be [ +CLASS non_ne ], and so are both output tokens.
;; their +FORM values are capture groups 1 and 2, respectively.  +UW, +PRED,
;; +CARG, and +TNT.+MAIN are shared between the input and both outputs.  the
;; left bracket of the input stays on the first output, the right bracket on
;; the second; the inner edges are set to `bracket_null'.
;; NOTE(review): +HD is passed on to the first output only; the second output
;; leaves it unconstrained -- confirm that this is intended.
currency_measure_ne_tmr := one_two_tmt &
[ +INPUT < [ +FORM ^((?:AU?|CA?|HK|NZ|US)?\$|S[Kk]r)([0-9]+([\.,][0-9]*)?)$,
             +TRAIT [ +UW #uw, +LB #lb, +RB #rb, +HD #hd ], +CLASS non_ne,
             +PRED #pred, +CARG #carg, +TNT.+MAIN #main ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}", +CLASS non_ne,
              +TRAIT [ +UW #uw, +LB #lb, +RB bracket_null, +HD #hd ],
              +PRED #pred, +CARG #carg, +TNT null_tnt & [ +MAIN #main ] ],
	    [ +FORM "${I1:+FORM:2}", +CLASS non_ne,
              +TRAIT [ +UW #uw, +LB bracket_null, +RB #rb ],
              +PRED #pred, +CARG #carg, +TNT null_tnt & [ +MAIN #main ] ] > ].

;;;
;;; decades: |1950s|, |50s|, |1950's|, and |50's|; the first type, at least,
;;; needs to precede the alphanumeric identifiers.
;;;
;; decade_ne_1_tmr: decades without an apostrophe, e.g. |1950s|, |50s|, or
;; |2010s|.  the pattern is an optional two-digit century whose first digit is
;; |1| or |2|, a tens digit, a literal |0|, and |s| or |S|.  centuries used to
;; be limited to those starting in |1|, so that |2000s| or |2010s| fell through
;; to the alphanumeric identifier rules and came out as `proper_ne'.  the
;; output token is [ +CLASS plur_ne ].
decade_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(?:[12][0-9])?[0-9]0[sS]$ ] >,
  +OUTPUT < [ +CLASS plur_ne ] > ].

;;
;; this latter type (plural of decades and, in principle, other names) actually
;; is ambiguous with the possessive, e.g. `the 1950's style'.  thus, we create
;; a token-level ambiguity here.
;;
;; decade_ne_2_tmr: decades written with an apostrophe, e.g. |1950's|, |50's|,
;; or |2010's|, arriving as two tokens.  both tokens are matched in +CONTEXT
;; and +INPUT is empty, i.e. neither token is consumed; the rule only adds one
;; new token alongside, which is what creates the ambiguity with the
;; possessive.
;;   C1: the number, i.e. an optional two-digit century starting in |1| or |2|
;;       (formerly |1| only, which excluded |2010's|), a tens digit, and |0|;
;;   C2: an apostrophe (|'| or |’|) followed by |s| or |S|.
;; both context tokens must be [ +CLASS non_ne ].  the output +FORM is the
;; concatenation of the two matched forms, its +CLASS is `plur_ne', its +ID
;; list runs from the start of the +ID of C1 to the end of the +ID of C2, and
;; +FROM and +TO are taken from C1 and C2, respectively.  +TRAIT, +PRED,
;; +CARG, and +TNT.+MAIN are those of C1.
;; NOTE(review): +POSITION presumably requires C1 to immediately precede C2
;; and the output to span both -- confirm against the chart mapping docs.
decade_ne_2_tmr := token_mapping_rule &
[ +CONTEXT < [ +FORM ^(([12][0-9])?[0-9]0)$, 
               +TRAIT #trait, +CLASS non_ne,
               +ID [ LIST #front, LAST #middle ], +FROM #from,
               +PRED #pred, +CARG #carg, +TNT.+MAIN #main ],
             [ +FORM ^(['’][sS])$, +CLASS non_ne,
               +ID [ LIST #middle, LAST #back ], +TO #to ] >,
  +INPUT  <>,
  +OUTPUT < [ +FORM "${C1:+FORM:1}${C2:+FORM:1}", 
              +TRAIT #trait, +CLASS plur_ne,
	      +ID [ LIST #front, LAST #back ], +FROM #from, +TO #to,
              +PRED #pred, +CARG #carg, +TNT null_tnt & [ +MAIN #main ] ] >,
  +POSITION "C1<C2, O1@C1, O1@C2" ].

;;;
;;; various kinds of identifiers (accumulated throughout the years); some of
;;; these are allowed to include punctuation marks that can (later) lead to
;;; additional token boundaries (i.e. |-| and |/|), hence we need to be fairly
;;; restrictive about our identifier patterns.  in principle, i guess, this is
;;; something that should be adapted specifically for a target domain and type
;;; of text.
;;;
;;; _fix_me_
;;; some of these patterns conflict with others, e.g. `56K data line' should
;;; be a measure NP, not an identifier.  in all-caps strings, `PRE-1980' would
;;; be mis-analyzed too.  i am not really sure what to do about these.
;;;                                                            (24-sep-08; oe)

;;
;; optional leading digits, then one or more letters followed by one or more
;; digits, optionally followed by further alphanumerics, e.g. |ABC123DEF|
;;
;; alphanumeric_identifier_ne_1_tmr: the +FORM must contain a run of letters
;; directly followed by a run of digits; digits may precede that sequence and
;; arbitrary alphanumerics may follow it.  the output token is
;; [ +CLASS proper_ne ].
alphanumeric_identifier_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]*[[:alpha:]]+[0-9]+[[:alnum:]]*$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].

;;
;; a special case: a number, a hyphen, followed by a single letter: |22-b|
;;
;; alphanumeric_identifier_ne_2_tmr: one or more digits, a hyphen, and exactly
;; one letter.  the output token is [ +CLASS proper_ne ].
alphanumeric_identifier_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]+-[[:alpha:]]$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].

;;
;; in a similar spirit, a number followed by letters in parentheses: |22(B)|;
;; note that the closing parentheses will have been tokenized off, PTB-style.
;;
;; alphanumeric_identifier_ne_3_tmr: merge two tokens back into one.  the
;; first input token is one or more digits, an opening parenthesis, one or
;; more letters, and optionally further alphanumerics; the second is exactly
;; |)|.  both must be [ +CLASS non_ne ].  the output +FORM is the first form
;; with the |)| appended; +TRAIT, +PRED, +CARG, and +TNT.+MAIN are taken from
;; the first input token, and the output token is [ +CLASS proper_ne ].
alphanumeric_identifier_ne_3_tmr := two_one_tmt &
[ +INPUT < [ +FORM ^([0-9]+\([[:alpha:]]+[[:alnum:]]*)$, 
             +TRAIT #trait, +CLASS non_ne,
             +PRED #pred, +CARG #carg, +TNT.+MAIN #main ],
           [ +FORM ")", +CLASS non_ne ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1})", 
              +TRAIT #trait, +CLASS proper_ne,
              +PRED #pred, +CARG #carg, +TNT null_tnt & [ +MAIN #main ] ] > ].

;;
;; a number followed by one letter: |22a|
;;
;; alphanumeric_identifier_ne_4_tmr: one or more digits directly followed by
;; exactly one letter.  the output token is [ +CLASS proper_ne ].
alphanumeric_identifier_ne_4_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]+[[:alpha:]]$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].

;;
;; the following are maybe taken from chemistry: |B.25| |IL-10| |IL/10|
;;
;; chemistry_identifier_ne_tmr: optional leading digits, one or more upper
;; case letters, exactly one separator (|-|, |.|, or |/|), optional further
;; upper case letters, and one or more final digits.  the output token is
;; [ +CLASS proper_ne ].
chemistry_identifier_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]*[[:upper:]]+[-./][[:upper:]]*[0-9]+$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].

;;
;; strongly alphanumeric strings (i.e. including digits), with at least two
;; hyphens, e.g. |123-45-6789|.
;;
;; hyphenated_identifier_ne_tmr: the first segment consists of letters and
;; digits and must contain at least one digit; it is followed by two or more
;; segments that each are a hyphen plus one or more alphanumerics.  the token
;; is matched in +CONTEXT (not +INPUT), and the output is
;; [ +CLASS proper_ne ].
;; NOTE(review): `add_ne_tmt' is defined elsewhere; presumably the original
;; token is kept and the `proper_ne' token added alongside -- confirm.
hyphenated_identifier_ne_tmr := add_ne_tmt &
[ +CONTEXT < [ +FORM ^[[:alpha:]]*[0-9]+[[:alpha:]0-9]*(-[[:alnum:]]+){2,}$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].

;;
;; section numbers (and the like): two or more decimal points
;;
;; section_number_ne_tmr: three or more groups of digits separated by periods,
;; e.g. |1.2.3| or |10.4.2.1|.  the output token is [ +CLASS proper_ne ].
section_number_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]+\.[0-9]+(\.[0-9]+)+$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].