;;; -*- Mode: tdl; Coding: utf-8; -*-

;;;
;;; Copyright (c) 2009 -- 2010 Stephan Oepen (oe@ifi.uio.no);
;;; see `LICENSE' for conditions.
;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; lightweight NEs: form-driven generic entries (formerly `ersatz' entries)
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; email addresses
;;;

;;
;; any valid DNS string, prefixed by address, with optional angle brackets
;;
;; NOTE(review): in this copy of the file the pattern had degenerated to |^?$|
;; (the |<?...>| part was lost); the expression below is reconstructed from the
;; comment above and the DNS sub-pattern of `url_ne_3_tmr' --- please confirm
;; against the upstream grammar.
;;
email_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^<?[[:alnum:]._-]+@[[:alnum:]_-]+(\.[[:alnum:]_-]+)+>?$ ] >,
  +OUTPUT < [ +CLASS email_ne ] > ].

;;;
;;; uniform resource locators (URLs)
;;;

;;
;; any valid DNS string, prefixed by `http://', with optional angle brackets
;;
;; NOTE(review): pattern reconstructed (was |^?$| in this copy), see above.
;;
url_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^<?http://[[:alnum:]_-]+(\.[[:alnum:]_-]+)+(/.*)?>?$ ] >,
  +OUTPUT < [ +CLASS url_ne ] > ].

;;
;; any valid DNS string, prefixed by `www', with optional angle brackets
;;
;; NOTE(review): pattern reconstructed (was |^?$| in this copy), see above.
;;
url_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^<?www\.[[:alnum:]_-]+(\.[[:alnum:]_-]+)+(/.*)?>?$ ] >,
  +OUTPUT < [ +CLASS url_ne ] > ].

;;
;; any valid DNS string, with obligatory angle brackets
;;
url_ne_3_tmr := ne_tmt &
[ +INPUT < [ +FORM ^<[[:alnum:]_-]+(\.[[:alnum:]_-]+)+(/.*)?>$ ] >,
  +OUTPUT < [ +CLASS url_ne ] > ].

;;;
;;; file names
;;;

;;
;; fully-qualified Un*x style, starting with a slash, e.g. |/etc/config|.
;;
;; _fix_me_
;; we require a minimum of two components, such that |/etc| by itself will not
;; match.  maybe we should allow these too but create an ambiguity here, i.e.
;; output two tokens, one [ CLASS file_ne ], the other [ CLASS non_ne ]?
;;                                                             (19-sep-08; oe)
;;
file_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(/[[:alnum:]._-]+){2,}/?$ ] >,
  +OUTPUT < [ +CLASS file_ne ] > ].

;;;
;;; time-of-day expressions: |9am|, |11:45pm|, |20:15|
;;;

;;
;; an |am| or |pm| suffix unambiguously indicates a time expression.  we also
;; grab all tokens of the form `H:M' where `H' and `M' are numbers in the right
;; ranges.
;;
;; _fix_me_
;; i wonder about `mix in a ratio of 1:15', which the second rule below would
;; consider a time-of-day expression.
;; should we approach such cases with more
;; `optional' NE rules, i.e. ones outputting two tokens?  or should we rather
;; introduce an abstraction over `time_ne' and `ratio_ne', such that a single
;; token can activate multiple lexical entries?  once we get regular expression
;; matching for lexical instantiation (peter is working on that), in principle,
;; we could just drop `time_ne_2_tmr', make `time_ne' a sub-type of `ratio_ne',
;; and put the `H:M' regular expression into the generic lexical entry.  with
;; great power comes great responsibility :-).                (19-sep-08; oe)
;; DPF 27-jun-09 - Let's try the option of abstracting over time_ne and meas_ne
;; (what the ratio rules introduce).
;;

;;
;; hour (optionally with minutes, separated by |:| or |.|) plus an obligatory
;; |am| or |pm| suffix, in any letter case
;;
time_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(0?[0-9]|1[0-2])([:.][0-5][0-9])?([aApP][mM])$ ] >,
  +OUTPUT < [ +CLASS time_ne ] > ].

;;
;; bare `H:M'; underspecified between a measure (ratio) and a time reading
;;
time_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(0?[0-9]|1[0-9]|2[0-4]):[0-5][0-9]$ ] >,
  +OUTPUT < [ +CLASS meas_or_time_ne ] > ].

;;
;; to also match |10.30| as a candidate time, there is an underspecification of
;; times and floating point numbers; this is a separate rule to avoid matching
;; |15:30| as a candidate cardinal.
;; DPF 2012-05-27 - Generalized this class to include proper ne's, so we can
;; treat |Ubuntu 8.10| as a compound of two proper names.
;; DPF 2012-09-21 - Changed the case of the single digit before the decimal
;; to exclude 0, consistent with allowing sub-one decimals to appear with
;; singular nouns in measure phrases, as in WSJ |the stock was down 0.3 cent|
;;
time_ne_3_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(0?[1-9]|1[0-9]|2[0-4])\.[0-5][0-9]$ ] >,
  +OUTPUT < [ +CLASS card_or_time_or_proper_ne ] > ].

;;;
;;; dates
;;;

;;
;; _fix_me_
;; things are getting a little murky here: some of the current date formats
;; overlap with some of the identifiers and the fractions and ranges.  to do
;; justice to these ambiguities, we would have to introduce multiple NE tokens.
;; or find ways of underspecification in the lexicon, maybe?
;;                                                             (23-sep-08; oe)
;;

;;
;; US and European variants: |11-01-1957|, |11/01/1957|, |11-01-57| |57/01/11|
;;
;; _fix_me_
;; i am leaving out |12-2005| and |'06| for now.  they overlap too much with
;; other patterns, so maybe should be optional rules (like the one for |9/11|
;; and such below, added by dan in late 2011)?                (23-sep-08; oe)
;;
date_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^([0-9]{1,2}[-/]){2}[0-9]{2,4}$ ] >,
  +OUTPUT < [ +CLASS date_ne ] > ].

;;
;; two-part dates like |9/11|: the pattern is matched as +CONTEXT (not +INPUT)
;; on an `add_ne_tmt' rule, i.e. this is the optional variant mentioned above.
;;
date_ne_2_tmr := add_ne_tmt &
[ +CONTEXT < [ +FORM ^([01]?[0-9][-/])[0-9]{1,2}$ ] >,
  +OUTPUT < [ +CLASS date_ne ] > ].

;;;
;;; ratios: |1:1000|, |1:100,000|, et al.
;;;

;;
;; we make the conservative assumption that the first element not exceed three
;; digits and not have leading zeros.
;;
ratio_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[1-9][0-9]{0,2}:[1-9][0-9]*$ ] >,
  +OUTPUT < [ +CLASS meas_ne ] > ].

;;
;; same, with comma-separated groups of three digits in the second element
;;
ratio_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[1-9][0-9]{0,2}:[1-9][0-9]{0,2}(,[0-9]{3})*$ ] >,
  +OUTPUT < [ +CLASS meas_ne ] > ].

;;;
;;; fractions: |1/4|, |-1/3|, |51/100|, et al.  since we have considered the
;;; alternative analysis as dates by now (|9/11|), consume the input token at
;;; this point, i.e. block any further (different) analysis of it.  we choose
;;; to run this one prior to splitting, to avoid confusion about the slash; we
;;; will need to return to a similar pattern (if without the ordinals, maybe)
;;; after splitting, however, to do justice to, say, |2 1/2-year|.
;;;
fraction_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]+(\.[0-9]*)?/[1-9][0-9]*(th)?$ ] >,
  +OUTPUT < [ +CLASS frct_ne ] > ].

;;;
;;; measure noun phrases (taking precedence over alphanumeric identifiers)
;;;

;;
;; single token: |25cm| or |+37.3ºC|
;;
;; _fix_me_
;; dan changed this from `meas_ne' to `card_or_meas_ne'; why so?  i realize we
;; should record more information in [incr tsdb()] profiles, e.g.
;; which token
;; mapping rules had fired (for each parsing result), which might help finding
;; relevant instances for archeological search like this one.  (27-feb-11; oe)
;;
measure_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[-+±~]?[0-9]+([,.][0-9]*)?([kKmMgGµ"']+|[ckmn]m|σ|Å|[º°][CF]?)$ ] >,
  +OUTPUT < [ +CLASS card_or_meas_ne ] > ].

;;
;; |1:1000m|
;;
ratio_measure_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[1-9][0-9]{0,2}:[1-9][0-9]*([kKmMgG"']+|[ckmn]m|σ|[º°][CF]?)$ ] >,
  +OUTPUT < [ +CLASS meas_ne ] > ].

;;
;; |US$20| and the like: break into two tokens, to get a bracketing compatible
;; with [US$ [20 billion]] for a string like |US$20 billion|.
;;
;; _fix_me_
;; dan and i concluded that we should introduce a `currency_ne' instead, with
;; a corresponding generic entry, i.e. drop |US$| et al. from the lexicon (at
;; present, some of the entries allowed by this rule are missing, e.g. |C$|).
;; to maintain some parallelism with currency names that can be ordinary words
;; (e.g. `pound'), the rule below should synthesize a PRED value (rather than
;; put a CARG on currency NEs), e.g. `_us$_n_currency_rel'.  another rule will
;; be needed for just |US$| (and in principle |€| and other currency symbols),
;; to also match them as currency NEs, and at that point the `currency+name'
;; lexical filtering rule could be discarded.           (dan & oe, 29-jul-09)
;;
;; in the pattern below, capture group 1 is the currency prefix (the country
;; code alternation is non-capturing) and group 2 is the amount; these are the
;; groups the two output +FORM values refer to.  the first output token keeps
;; the left bracket (+LB) and +HD of the input and the second keeps its right
;; bracket (+RB); the inner edges are set to `bracket_null'.
;;
currency_measure_ne_tmr := one_two_tmt &
[ +INPUT < [ +FORM ^((?:AU?|CA?|HK|NZ|US)?\$|S[Kk]r)([0-9]+([\.,][0-9]*)?)$,
             +TRAIT [ +UW #uw, +LB #lb, +RB #rb, +HD #hd ],
             +CLASS non_ne,
             +PRED #pred, +CARG #carg,
             +TNT.+MAIN #main ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}",
              +CLASS non_ne,
              +TRAIT [ +UW #uw, +LB #lb, +RB bracket_null, +HD #hd ],
              +PRED #pred, +CARG #carg,
              +TNT null_tnt & [ +MAIN #main ] ],
            [ +FORM "${I1:+FORM:2}",
              +CLASS non_ne,
              +TRAIT [ +UW #uw, +LB bracket_null, +RB #rb ],
              +PRED #pred, +CARG #carg,
              +TNT null_tnt & [ +MAIN #main ] ] > ].
;;;
;;; decades: |1950s|, |50s|, |1950's|, and |50's|; the first type, at least,
;;; needs to precede the alphanumeric identifiers.
;;;
decade_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(?:1[0-9])?[0-9]0[sS]$ ] >,
  +OUTPUT < [ +CLASS plur_ne ] > ].

;;
;; this latter type (plural of decades and, in principle, other names) actually
;; is ambiguous with the possessive, e.g. `the 1950's style'.  thus, we create
;; a token-level ambiguity here.
;;
;; the two tokens are matched as +CONTEXT with an empty +INPUT, and one token
;; is output whose +FORM concatenates the two matched forms and whose +ID, +FROM
;; and +TO are taken from the outer edges of the pair.
;;
;; NOTE(review): in this copy of the file the +POSITION string was truncated
;; after |"C1|; the value below (first context token immediately before the
;; second, output token spanning both) is a reconstruction --- please confirm
;; against the upstream grammar.
;;
decade_ne_2_tmr := token_mapping_rule &
[ +CONTEXT < [ +FORM ^((1[0-9])?[0-9]0)$,
               +TRAIT #trait, +CLASS non_ne,
               +ID [ LIST #front, LAST #middle ], +FROM #from,
               +PRED #pred, +CARG #carg, +TNT.+MAIN #main ],
             [ +FORM ^(['’][sS])$,
               +CLASS non_ne,
               +ID [ LIST #middle, LAST #back ], +TO #to ] >,
  +INPUT <>,
  +OUTPUT < [ +FORM "${C1:+FORM:1}${C2:+FORM:1}",
              +TRAIT #trait, +CLASS plur_ne,
              +ID [ LIST #front, LAST #back ], +FROM #from, +TO #to,
              +PRED #pred, +CARG #carg,
              +TNT null_tnt & [ +MAIN #main ] ] >,
  +POSITION "C1<C2, O1@C1, O1@C2" ].

;;
;; _fix_me_
;; the header and +INPUT pattern of the rule that originally followed here
;; (presumably `alphanumeric_identifier_ne_1_tmr', judging by the numbering of
;; its siblings below) are missing from this copy of the file; only its tail
;;
;;   , +OUTPUT < [ +CLASS proper_ne ] > ].
;;
;; survives.  the rule needs to be restored from the upstream grammar.
;;

;;
;; a special case: a number, a hyphen, followed by a single letter: |22-b|
;;
alphanumeric_identifier_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]+-[[:alpha:]]$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].

;;
;; in a similar spirit, a number followed by letters in parentheses: |22(B)|;
;; note that the closing parentheses will have been tokenized off, PTB-style.
;;
alphanumeric_identifier_ne_3_tmr := two_one_tmt &
[ +INPUT < [ +FORM ^([0-9]+\([[:alpha:]]+[[:alnum:]]*)$,
             +TRAIT #trait, +CLASS non_ne,
             +PRED #pred, +CARG #carg, +TNT.+MAIN #main ],
           [ +FORM ")", +CLASS non_ne ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1})",
              +TRAIT #trait, +CLASS proper_ne,
              +PRED #pred, +CARG #carg,
              +TNT null_tnt & [ +MAIN #main ] ] > ].

;;
;; a number followed by one letter: |22a|
;;
alphanumeric_identifier_ne_4_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]+[[:alpha:]]$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].
;;
;; the following are maybe taken from chemistry: |B.25| |IL-10| |IL/10|
;;
chemistry_identifier_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]*[[:upper:]]+[-./][[:upper:]]*[0-9]+$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].

;;
;; strongly alphanumeric strings (i.e. including digits), with at least two
;; hyphens, e.g. |123-45-6789|.  note that the pattern is matched as +CONTEXT
;; on an `add_ne_tmt' rule, unlike the +INPUT patterns of the `ne_tmt' rules
;; around it.
;;
hyphenated_identifier_ne_tmr := add_ne_tmt &
[ +CONTEXT < [ +FORM ^[[:alpha:]]*[0-9]+[[:alpha:]0-9]*(-[[:alnum:]]+){2,}$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].

;;
;; section numbers (and the like): two or more decimal points
;;
section_number_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]+\.[0-9]+(\.[0-9]+)+$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].