;;; -*- Mode: tdl; Coding: utf-8; -*-
;;;
;;; Copyright (c) 2009 -- 2018 Stephan Oepen (oe@ifi.uio.no);
;;; see `LICENSE' for conditions.
;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; lightweight NEs: form-driven generic entries (formerly `ersatz' entries)
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; email addresses
;;;

;;
;; any valid DNS string, prefixed by address, with optional angle brackets
;;
;; NOTE(review): the +FORM pattern below reads `^?$', which does not match the
;; description above; it looks as if the text between the optional angle
;; brackets was lost at some point -- TODO restore from the upstream grammar.
;;
email_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^?$ ] >,
  +OUTPUT < [ +CLASS email_ne ] > ].

;;;
;;; uniform resource locators (URLs)
;;;

;;
;; any valid DNS string, prefixed by `http://', with optional angle brackets
;;
;; NOTE(review): the +FORM pattern below reads `^?$' and mentions neither
;; `http://' nor a DNS string; presumably truncated -- TODO restore from the
;; upstream grammar.
;;
url_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^?$ ] >,
  +OUTPUT < [ +CLASS url_ne ] > ].

;;
;; any valid DNS string, prefixed by `www', with optional angle brackets
;;
;; NOTE(review): the +FORM pattern below reads `^?$' and does not mention
;; `www'; presumably truncated -- TODO restore from the upstream grammar.
;;
url_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^?$ ] >,
  +OUTPUT < [ +CLASS url_ne ] > ].

;;
;; any valid DNS string, with obligatory angle brackets
;;
url_ne_3_tmr := ne_tmt &
[ +INPUT < [ +FORM ^<[[:alnum:]_-]+(\.[[:alnum:]_-]+)+(/.*)?>$ ] >,
  +OUTPUT < [ +CLASS url_ne ] > ].

;;
;; any valid DNS string, with suffix com|edu|org
;;
url_ne_4_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[[:alnum:]_-]+(\.[[:alnum:]_-]+)*\.(com|edu|org)$ ] >,
  +OUTPUT < [ +CLASS url_ne ] > ].

;;;
;;; file names
;;;

;;
;; fully-qualified Un*x style, starting with a slash, e.g. |/etc/config|.
;;
;; _fix_me_
;; we require a minimum of two components, such that |/etc| by itself will not
;; match.  maybe we should allow these too but create an ambiguity here, i.e.
;; output two tokens, one [ CLASS file_ne ], the other [ CLASS non_ne ]?
;;                                                          (19-sep-08; oe)
;;
file_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(/[[:alnum:]._-]+){2,}/?$ ] >,
  +OUTPUT < [ +CLASS file_ne ] > ].

;;;
;;; time-of-day expressions: |9am|, |11:45pm|, |20:15|
;;;

;;
;; an |am| or |pm| suffix unambiguously indicates a time expression.  we also
;; grab all tokens of the form `H:M' where `H' and `M' are numbers in the right
;; ranges.
;;
;; _fix_me_
;; i wonder about `mix in a ratio of 1:15', which the second rule below would
;; consider a time-of-day expression.  should we approach such cases with more
;; `optional' NE rules, i.e. ones outputting two tokens?  or should we rather
;; introduce an abstraction over `time_ne' and `ratio_ne', such that a single
;; token can activate multiple lexical entries?  once we get regular expression
;; matching for lexical instantiation (peter is working on that), in principle,
;; we could just drop `time_ne_2_tmr', make `time_ne' a sub-type of `ratio_ne',
;; and put the `H:M' regular expression into the generic lexical entry.  with
;; great power comes great responsibility :-).               (19-sep-08; oe)
;; DPF 27-jun-09 - Let's try the option of abstracting over time_ne and meas_ne
;; (what the ratio rules introduce).
;;
;; hour (0-12), optional minutes after |:| or |.|, obligatory am/pm suffix
;;
time_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(0?[0-9]|1[0-2])([:.][0-5][0-9])?([aApP][mM])$ ] >,
  +OUTPUT < [ +CLASS time_ne ] > ].

;;
;; |1:30-6pm| where hyphen-splitting results in |1: 30|: re-join the hour
;; token (ending in a colon) with the following minutes token.
;;
time_ne_1b_tmr := two_one_keep_brackets_tmt &
[ +INPUT < [ +FORM ^(0?[0-9]|1[0-2])[:]$,
             +CLASS non_ne,
             +PRED #pred, +CARG #carg,
             +TNT.+MAIN #main ],
           [ +FORM ^([0-5][0-9])([aApP][mM])?$,
             +CLASS non_ne ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}:${I2:+FORM:1}",
              +CLASS time_ne,
              +PRED #pred, +CARG #carg,
              +TNT null_tnt & [ +MAIN #main ] ] > ].

;;
;; bare `H:M' without a suffix; underspecified between measure and time.
;;
time_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(0?[0-9]|1[0-9]|2[0-4]):[0-5][0-9]$ ] >,
  +OUTPUT < [ +CLASS meas_or_time_ne ] > ].

;;
;; to also match |10.30| as a candidate time, there is an underspecification of
;; times and floating point numbers; this is a separate rule to avoid matching
;; |15:30| as a candidate cardinal.
;; DPF 2012-05-27 - Generalized this class to include proper ne's, so we can
;; treat |Ubuntu 8.10| as a compound of two proper names.
;; DPF 2012-09-21 - Changed the case of the single digit before the decimal
;; to exclude 0, consistent with allowing sub-one decimals to appear with
;; singular nouns in measure phrases, as in WSJ |the stock was down 0.3 cent|
;;
time_ne_3_tmr := basic_ne_tmt &
[ +INPUT < [ +FORM ^(0?[1-9]|1[0-9]|2[0-4])\.[0-5][0-9]$ ] >,
  +OUTPUT < [ +CLASS card_or_time_or_proper_ne,
              +ONSET c-onset ] > ].

;; DPF 2014-07-09 - For token sequences like |3 2/3| we get massive ambiguity
;; with dates, adverbial combinations, etc. that are too expensive to
;; contemplate for now, though strictly speaking these ambiguities are
;; somehow defensible.  FIX someday, but for now, just build a mixed fraction.
;;
;; The variant in the block comment below also accepted a spelled-out |N/M|
;; fraction as the second token; it is kept for reference only.
;;
#|
mixed_fraction_ne_1_tmr := two_one_keep_brackets_tmt &
[ +INPUT < [ +FORM ^([1-9][0-9]*)$,
             +CLASS non_ne,
             +PRED #pred, +CARG #carg,
             +TNT.+MAIN #main ],
           [ +FORM ^(([1-9][0-9]*/[1-9][0-9]*)|½|⅓|¼|⅛|⅔|¾)$,
             +CLASS non_ne ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1} ${I2:+FORM:1}",
              +CLASS frct_ne,
              +PRED #pred, +CARG #carg,
              +TNT.+MAIN #main ] > ].
|#

;;
;; whole number followed by a single-character fraction symbol: |3 ½|
;;
mixed_fraction_ne_1_tmr := two_one_keep_brackets_tmt &
[ +INPUT < [ +FORM ^([1-9][0-9]*)$,
             +CLASS non_ne,
             +PRED #pred, +CARG #carg,
             +TNT.+MAIN #main ],
           [ +FORM ^(½|⅓|¼|⅛|⅔|¾)$,
             +CLASS non_ne ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1} ${I2:+FORM:1}",
              +CLASS frct_ne,
              +PRED #pred, +CARG #carg,
              +TNT.+MAIN #main ] > ].

;;
;; whole number followed by numerator, slash, and denominator as three
;; separate tokens: |3 2 / 3|
;;
mixed_fraction_ne_1b_tmr := four_one_keep_brackets_tmt &
[ +INPUT < [ +FORM ^([1-9][0-9]*)$,
             +CLASS non_ne,
             +PRED #pred, +CARG #carg,
             +TNT.+MAIN #main ],
           [ +FORM ^([1-9][0-9]*)$,
             +CLASS non_ne ],
           [ +FORM ^(/)$,
             +CLASS non_ne ],
           [ +FORM ^([1-9][0-9]*)$,
             +CLASS non_ne ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1} ${I2:+FORM:1}${I3:+FORM:1}${I4:+FORM:1}",
              +CLASS frct_ne,
              +PRED #pred, +CARG #carg,
              +TNT.+MAIN #main ] >,
  +CONTEXT < > ].

;; A few of the more common fraction symbols.  Far from a general solution.
;; FIX someday.
;;
;; single token made of a whole number directly followed by a fraction
;; symbol: |3½|
;;
mixed_fraction_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^([1-9][0-9]*(½|⅓|¼|⅛|⅔|¾))$,
             +CLASS non_ne,
             +PRED #pred, +CARG #carg,
             +TNT.+MAIN #main ] >,
  +OUTPUT < [ +CLASS frct_ne ] > ].

;;;
;;; dates
;;;

;;
;; _fix_me_
;; things are getting a little murky here: some of the current date formats
;; overlap with some of the identifiers and the fractions and ranges.  to do
;; justice to these ambiguities, we would have to introduce multiple NE tokens.
;; or find ways of underspecification in the lexicon, maybe?  (23-sep-08; oe)
;;

;;
;; US and European variants: |11-01-1957|, |11/01/1957|, |11-01-57| |57/01/11|
;;
;; _fix_me_
;; i am leaving out |12-2005| and |'06| for now.  they overlap too much with
;; other patterns, so maybe should be optional rules (like the one for |9/11|
;; and such below, added by dan in late 2011)?               (23-sep-08; oe)
;; DPF 2015-09-02 - Thanks to Paul Haley for correcting this old pattern in
;; date_ne_1_tmr: +INPUT < [ +FORM ^([0-9]{1,2}[-/]){2}[0-9]{2,4}$ ] >,
;; which wrongly allowed a `date' of |0/000|.
;; Also, changed the parent type to add_ne_tmt, so we can also get a fraction
;; analysis of e.g. |3/15|.
;; Likewise for his improvement of date_ne_2_tmr to avoid the `date' |7-0|,
;; allowed by the old pattern:
;; +CONTEXT < [ +FORM ^([01]?[0-9][-/])[0-9]{1,2}$ ] >,
;; DPF 2020-02-12 - These had to be revised since repp now treats the
;; dash and slash as separate tokens.
;;
;; |1/3/84| |1/11/2014| |1-3-84|
;;
;; The single-token versions of the date rules are kept in the block comment
;; below for reference; the multi-token versions after it are the live ones.
;;
#|
date_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]{1,2}[-/][0-9]{1,2}[-/][0-9]{2,4}$ ] >,
  +OUTPUT < [ +CLASS date_ne ] > ].

;; |10/10| |2/15| |12/2015|
date_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]{1,2}[/][0-9]{2,4}$ ] >,
  +OUTPUT < [ +CLASS date_or_fract_ne ] > ].

;; Adds token for 1 or 2 digit month, 1 or 2 digit day of month
;;
date_ne_3_tmr := add_ne_tmt &
[ +CONTEXT < [ +FORM ^(0?[1-9]|1[0-2])[-/](0?[1-9]|[12][0-9]|3[0-1])$ ]>,
  +OUTPUT < [ +CLASS date_ne ] > ].
|#

;;
;; five tokens (number, separator, number, separator, number) fused into one
;; date token; brackets and tagger information are taken from the first token.
;;
date_ne_1_tmr := five_one_tmt &
[ +INPUT < [ +FORM ^([0-9]{1,2})$,
             +CLASS non_ne,
             +TRAIT [ +LB #lb, +RB #rb, +LD #ld, +RD #rd ],
             +PRED #pred, +CARG #carg,
             +TNT.+MAIN #main ],
           [ +FORM ^([–/])$ ],
           [ +FORM ^([0-9]{1,2})$ ],
           [ +FORM ^([–/])$ ],
           [ +FORM ^([0-9]{2,4})$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}${I2:+FORM:1}${I3:+FORM:1}${I4:+FORM:1}${I5:+FORM:1}",
              +CLASS date_ne,
              +TRAIT [ +LB #lb, +RB #rb, +LD #ld, +RD #rd ],
              +PRED #pred, +CARG #carg,
              +TNT null_tnt & [ +MAIN #main ],
              +TRAIT.+UW + ] >,
  +CONTEXT <> ].

;;
;; three tokens (number, slash, number) fused into one token that remains
;; underspecified between a date and a fraction.
;;
date_ne_2_tmr := three_one_tmt &
[ +INPUT < [ +FORM ^([0-9]{1,2})$,
             +CLASS non_ne,
             +TRAIT [ +LB #lb, +RB #rb, +LD #ld, +RD #rd ],
             +PRED #pred, +CARG #carg,
             +TNT.+MAIN #main ],
           [ +FORM ^([/])$ ],
           [ +FORM ^([0-9]{1,4})$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}${I2:+FORM:1}${I3:+FORM:1}",
              +TRAIT [ +LB #lb, +RB #rb, +LD #ld, +RD #rd, +UW + ],
              +PRED #pred, +CARG #carg,
              +TNT null_tnt & [ +MAIN #main ],
              +CLASS date_or_fract_ne ] >,
  +CONTEXT <> ].

;; DPF 2020-02-12 - Not clear what this rule is/was supposed to do, but here
;; is an attempt to convert it to the makeover universe:
;;
;; FIXME(review): the text of this commented-out rule breaks off inside its
;; +POSITION string.  The remainder of date_ne_3_tmr, the end of the block
;; comment, and the head of the rule whose `+OUTPUT' tail follows appear to be
;; missing (presumably a ratio rule, judging from its +CLASS and from the name
;; of the next rule) -- TODO restore all of this from the upstream grammar.
;; The damaged text is kept verbatim inside the block comment, which is closed
;; right after it so that the rules that follow are active.
;;
#|
date_ne_3_tmr := token_mapping_rule &
[ +CONTEXT < [ +FORM ^(0?[1-9]|1[0-2])$,
               +CLASS non_ne,
               +TRAIT [ +LB #lb, +RB #rb ],
               +PRED #pred, +CARG #carg,
               +TNT.+MAIN #main,
               +ONSET #onset,
               +ID #id, +FROM #from ],
             [ +FORM ^([–/])$ ],
             [ +FORM ^(0?[1-9]|[12][0-9]|3[0-1])$,
               +TO #to ] >,
  +INPUT <>,
  +OUTPUT < [ +FORM "${I1:+FORM:1}${I2:+FORM:1}${I3:+FORM:1}",
              +TRAIT [ +LB #lb, +RB #rb, +UW + ],
              +PRED #pred, +CARG #carg,
              +TNT null_tnt & [ +MAIN #main ],
              +ONSET #onset,
              +ID #id, +FROM #from, +TO #to,
              +CLASS date_ne ] >,
  +POSITION "C1, +OUTPUT < [ +CLASS meas_ne ] > ].
|#

;;
;; ratio with up to three digits on either side of the colon, the right-hand
;; side optionally continued by comma-separated groups of three digits
;;
ratio_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[1-9][0-9]{0,2}:[1-9][0-9]{0,2}(,[0-9]{3})*$ ] >,
  +OUTPUT < [ +CLASS meas_ne ] > ].

;;;
;;; fractions: |1/4|, |-1/3|, |51/100|, et al.  since we have considered the
;;; alternative analysis as dates by now (|9/11|), consume the input token at
;;; this point, i.e. block any further (different) analysis of it.  we choose
;;; to run this one prior to splitting, to avoid confusion about the slash; we
;;; will need to return to a similar pattern (if without the ordinals, maybe)
;;; after splitting, however, to do justice to, say, |2 1/2-year|.
;;;
fraction_ne_1_tmr := three_one_tmt &
[ +INPUT < [ +FORM ^([-+±~]?[0-9]+(\.[0-9]*)?)$,
             +CLASS non_ne,
             +TRAIT [ +LB #lb, +RB #rb, +LD #ld, +RD #rd ],
             +PRED #pred, +CARG #carg,
             +TNT.+MAIN #main ],
           [ +FORM ^([/])$ ],
           [ +FORM ^([1-9][0-9]*(th)?[-–]?)$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}${I2:+FORM:1}${I3:+FORM:1}",
              +TRAIT [ +LB #lb, +RB #rb, +LD #ld, +RD #rd, +UW + ],
              +PRED #pred, +CARG #carg,
              +TNT null_tnt & [ +MAIN #main ],
              +CLASS frct_ne ] >,
  +CONTEXT <> ].

;;;
;;; measure noun phrases (taking precedence over alphanumeric identifiers)
;;;

;;
;; single token: |25cm| or |+37.3ºC|
;;
;; _fix_me_
;; dan changed this from `meas_ne' to `card_or_meas_ne'; why so?  i realize we
;; should record more information in [incr tsdb()] profiles, e.g. which token
;; mapping rules had fired (for each parsing result), which might help finding
;; relevant instances for archeological search like this one.  (27-feb-11; oe)
;; Note that this rule constrains the +CLASS value so as to trigger the
;; introduction of a generic lexical entry for measure NPs.
;; DPF 2015-09-04 - Indeed, the motive for the above change is not clear.  Let's
;; try putting it back to meas_ne, and see if we can discover the motive, if
;; it was genuine.
;;
measure_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[-+±~]?[0-9]+([,.][0-9]*)?([kKmMgGµlLCcTt"']+|[ckmn]m|σ|Å|ft|in|mi|yd|lb|[º°℃][CF]?)$ ] >,
  +OUTPUT < [ +CLASS meas_or_proper_ne ] > ].

;;
;; |1:1000m|
;;
ratio_measure_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[1-9][0-9]{0,2}:[1-9][0-9]*([kKmMgG"']+|[ckmn]m|σ|[º°][CF]?)$ ]>,
  +OUTPUT < [ +CLASS meas_ne ] > ].
;;
;; |US$20| and the like: break into two tokens, to get a bracketing compatible
;; with [US$ [20 billion]] for a string like |US$20 billion|.
;;
;; _fix_me_
;; dan and i concluded that we should introduce a `currency_ne' instead, with
;; a corresponding generic entry, i.e. drop |US$| et al. from the lexicon (at
;; present, some of the entries allowed by this rule are missing, e.g. |C$|).
;; to maintain some parallelism with currency names that can be ordinary words
;; (e.g. `pound'), the rule below should synthesize a PRED value (rather than
;; put a CARG on currency NEs), e.g. `_us$_n_currency_rel'.  another rule will
;; be needed for just |US$| (and in principle |€| and other currency symbols),
;; to also match them as currency NEs, and at that point the `currency+name'
;; lexical filtering rule could be discarded.         (dan & oe, 29-jul-09)
;;
;; The first output token is the currency prefix (capture group 1), the second
;; the amount (capture group 2); the bracket features facing the split point
;; are set to bracket_null.
;;
currency_measure_ne_tmr := one_two_tmt &
[ +INPUT < [ +FORM ^((?:AU?|CA?|HK|NZ|US)?\$|S[Kk]r)([0-9]+([\.,][0-9]*)?)$,
             +TRAIT [ +UW #uw,
                      +LB #lb, +RB #rb,
                      +LD #ld, +RD #rd,
                      +HD #hd ],
             +CLASS non_ne,
             +PRED #pred, +CARG #carg,
             +TNT.+MAIN #main ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}",
              +CLASS non_ne,
              +TRAIT [ +UW #uw,
                       +LB #lb, +RB bracket_null,
                       +LD #ld, +RD #rd,
                       +HD #hd ],
              +PRED #pred, +CARG #carg,
              +TNT null_tnt & [ +MAIN #main ] ],
            [ +FORM "${I1:+FORM:2}",
              +CLASS non_ne,
              +TRAIT [ +UW #uw,
                       +LB bracket_null, +RB #rb,
                       +LD #ld, +RD #rd ],
              +PRED #pred, +CARG #carg,
              +TNT null_tnt & [ +MAIN #main ] ] > ].

;;;
;;; decades: |1950s|, |50s|, |1950's|, and |50's|; the first type, at least,
;;; needs to precede the alphanumeric identifiers.
;;;
decade_ne_1_tmr := ne_tmt &
[ +INPUT < [ +FORM ^(?:1[0-9])?[0-9]0[sS]$ ] >,
  +OUTPUT < [ +CLASS plur_ne ] > ].

;;
;; _fix_me_
;; this latter type (plural of decades and, in principle, other names) actually
;; is ambiguous with the possessive (i suspect), e.g. `the 1950's style'.  so,
;; maybe, optional rules for those?                         (24-sep-08; oe)
;;
;; The optional century prefix uses the character class `[12]'; the earlier
;; spelling `[1,2]' also admitted a literal comma, as in |,950|.
;;
;; FIXME(review): the text of this rule breaks off inside its +POSITION string.
;; The remainder of decade_ne_2_tmr and the head of the rule whose `+OUTPUT'
;; tail follows appear to be missing (presumably the first alphanumeric
;; identifier rule, judging from its +CLASS and from the names of the rules
;; after it).  The damaged text is kept verbatim on the last line of the rule;
;; as it stands the string literal is unterminated, so this rule cannot load
;; -- TODO restore from the upstream grammar.
;;
decade_ne_2_tmr := token_mapping_rule &
[ +CONTEXT < [ +FORM ^((mid-)?([12][0-9])?[0-9]0)$,
               +TRAIT #trait,
               +CLASS non_ne,
               +ID [ LIST #front, LAST #middle ],
               +FROM #from,
               +PRED #pred, +CARG #carg,
               +TNT.+MAIN #main ],
             [ +FORM ^(['’][sS])$,
               +CLASS non_ne,
               +ID [ LIST #middle, LAST #back ],
               +TO #to ] >,
  +INPUT <>,
  +OUTPUT < [ +FORM "${C1:+FORM:1}${C2:+FORM:1}",
              +TRAIT #trait & [ +UW + ],
              +CLASS plur_apos_ne,
              +ID [ LIST #front, LAST #back ],
              +FROM #from, +TO #to,
              +PRED #pred, +CARG #carg,
              +TNT null_tnt & [ +MAIN #main ] ] >,
  +POSITION "C1, +OUTPUT < [ +CLASS proper_ne ] > ].

;;
;; a special case: a number, a hyphen, followed by a single letter: |22-b|
;;
alphanumeric_identifier_ne_2_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]+-[[:alpha:]]$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].

;;
;; in a similar spirit, a number followed by letters in parentheses: |22(B)|;
;; note that the closing parentheses will have been tokenized off, PTB-style.
;;
;; DPF 2015-06-02 - It seems that the tokenizer does not currently split off
;; the closing parenthesis for |22(B)| though it does for either |(22)| or
;; |(B)|.  So add a one-one variant of this rule that assumes the presence of
;; the closing paren.
;;
alphanumeric_identifier_ne_3_tmr := two_one_keep_brackets_tmt &
[ +INPUT < [ +FORM ^([0-9]+\([[:alpha:]]+[[:alnum:]]*)$,
             +CLASS non_ne,
             +PRED #pred, +CARG #carg,
             +TNT.+MAIN #main ],
           [ +FORM ")",
             +CLASS non_ne ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1})",
              +CLASS proper_ne,
              +PRED #pred, +CARG #carg,
              +TNT null_tnt & [ +MAIN #main ] ] > ].

;;
;; one-token variant: the closing parenthesis is still attached
;;
alphanumeric_identifier_ne_3b_tmr := ne_tmt &
[ +INPUT < [ +FORM ^([0-9]+\([[:alpha:]]+[[:alnum:]\)]*)$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].

;;
;; a number followed by one letter: |22a|
;;
alphanumeric_identifier_ne_4_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]+[[:alpha:]]$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].
;;
;; the following are maybe taken from chemistry: |B.25| |IL-10| |IL/10|
;;
chemistry_identifier_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]*[[:upper:]]+[-./][[:upper:]]*[0-9]+$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].

;;
;; strongly alphanumeric strings (i.e. including digits), with at least two
;; hyphens, e.g. |123-45-6789|.
;; DPF 2017-11-18 - We also want to analyze |$10-a-share|, which gets tokenized
;; as |$ 10-a-share|, so generalize the +CLASS here from proper_ne to
;; card_or_proper_ne.
;;
hyphenated_identifier_ne_tmr := basic_add_ne_tmt &
[ +CONTEXT < [ +FORM ^[[:alpha:]]*[0-9]+[[:alpha:]0-9]*(-[[:alnum:]]+){2,}$ ] >,
  +OUTPUT < [ +CLASS card_or_proper_ne,
              +ONSET c-onset ] > ].

;;
;; section numbers (and the like): two or more decimal points
;;
section_number_ne_tmr := ne_tmt &
[ +INPUT < [ +FORM ^[0-9]+\.[0-9]+(\.[0-9]+)+$ ] >,
  +OUTPUT < [ +CLASS proper_ne ] > ].