;;; -*- Mode: tdl; Coding: utf-8; -*-
;;;
;;; Copyright (c) 2009 -- 2018 Stephan Oepen (oe@ifi.uio.no);
;;; see ‘LICENSE’ for conditions.
;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; a second pass at lightweight NEs, now that we have further split up tokens
;;; at hyphens and dashes.
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; numerals, including some sub-sets (days of the month or years).  in some of
;;; the rules |1| is deliberately excluded, because (unlike everyone else) it
;;; requires singular agreement.  hence, we assume that all usages of |1| need
;;; be accounted for in the native lexicon, including as hours or days of the
;;; month.
;;;

;;
;; (candidate) days of the month: |2| to |9|, |10| to |29|, |30|, and |31|
;;
;; DPF 2018-03-28 - Distinguish vowel-initial numbers, to exclude
;; |an 7 meter tree| and |a 11 meter tree|
;;
;; The pair below splits that range by onset: the first pattern covers
;; |2|-|7|, |9|, |10|, |12|-|17|, |19|, and |20|-|31| (c-onset); the second
;; covers exactly |8|, |11|, and |18| (v-onset).  Note that only the vocalic
;; pattern also admits a trailing hyphen (|8-|, |11-|, |18-|).
;;
card_or_dom_ne_tmr := basic_ne_tmt &
  [ +INPUT  < [ +FORM ^(([2-7]|9)|1(0|[2-7]|9)|2[0-9]|3[01])$ ] >,
    +OUTPUT < [ +CLASS card_or_dom_ne,
                +ONSET c-onset ] > ].

card_or_dom_voc_ne_tmr := basic_ne_tmt &
  [ +INPUT  < [ +FORM ^(8|8-|11|11-|18|18-)$ ] >,
    +OUTPUT < [ +CLASS card_or_dom_ne,
                +ONSET v-onset ] > ].

;;
;; (candidate) years: |950|, |1805|, |1957|, |2005|, et al.
;;
;; DPF 2012-12-09 - Allow for trailing hyphen (or decimal point) in order to
;; get e.g. |1990-model car|
;; DPF 2018-03-29 - Distinguish con-or-voc onset, but note that we might also
;; want to worry about the alternative pronunciations of clock time as in
;; |an 0900 meeting| pronounced "oh-nine-hundred", but |a 0900 meeting|
;; pronounced |nine o'clock|.
;; DPF 2018-10-02 - Returned to using this pattern only for card or year, so
;; we don't wrongly admit |*the 10 meeting| with clocktime 10:00.  We'll once
;; again use n_-_pn-hour_le for the hours.
;;
;; Consonant-onset years/cardinals: a leading |0|, |2|-|7|, or |9| followed by
;; one to three further digits, or |1| plus one of |0|, |2|-|7|, |9| followed
;; by one to three further digits.  The vocalic counterpart takes |8| plus one
;; to three digits, and four-digit |11xx| and |18xx|.  Both allow one optional
;; trailing |.|, |-|, or |*|.
;;
card_or_year_ne_tmr := basic_ne_tmt &
  [ +INPUT  < [ +FORM ^((0|[2-7]|9)[0-9]?[0-9]?[0-9]|1(0|[2-7]|9)[0-9]?[0-9]?[0-9])(\.|-|\*)?$ ] >,
    +OUTPUT < [ +CLASS card_or_year_ne,
                +ONSET c-onset ] > ].

card_or_year_voc_ne_tmr := basic_ne_tmt &
  [ +INPUT  < [ +FORM ^(8[0-9]?[0-9]?[0-9]|11[0-9]{2}|18[0-9]{2})(\.|-|\*)?$ ] >,
    +OUTPUT < [ +CLASS card_or_year_ne,
                +ONSET v-onset ] > ].

;; abbreviated year: |'93| |'06|
;;
;; DPF 2018-02-22 - Added as presumably unambiguous year names
;;
;; The first digit of the consonantal pattern is any digit but |8|.  The
;; character class is written [0-79]: a comma inside a bracket expression is a
;; literal character, so the earlier spelling [0-7,9] also admitted tokens
;; such as |',5|.
;;
appostrophe_year_ne_tmr := basic_ne_tmt &
  [ +INPUT  < [ +FORM ^['’][0-79][0-9](\.|-|\*)?$ ] >,
    +OUTPUT < [ +CLASS year_ne,
                +ONSET c-onset ] > ].

appostrophe_year_voc_ne_tmr := basic_ne_tmt &
  [ +INPUT  < [ +FORM ^['’]8[0-9](\.|-|\*)?$ ] >,
    +OUTPUT < [ +CLASS year_ne,
                +ONSET v-onset ] > ].

;;
;; any sequence of digits, with optional sign and optional decimal point.  to
;; accept measure phrases like |900-pound|, where at this point |900-| is one
;; token, allow an optional trailing hyphen.
;;
;; The pattern itself takes a single digit |2|-|9| or any multi-digit number
;; not starting with |0|; a bare |1| is not matched.
;;
card_ne_1_tmr := basic_ne_tmt &
  [ +INPUT  < [ +FORM ^[-+±~]?([2-9]|[1-9][0-9][0-9]*)(\.|-)?$ ] >,
    +OUTPUT < [ +CLASS card_ne,
                +ONSET c-onset ] > ].

;;
;; floating point numbers, with optional sign and at least one decimal.  again
;; for measure phrases, allow optional hyphen: |1.5-inch pianist|.
;; DPF 2012-05-27 - Generalized this class to also allow treatment of |8.10|
;; as a proper name, as in |Ubuntu 8.10|.
;;
;; DPF 2012-09-19 - In some contexts such as the WSJ, an expression like
;; |stocks rose 0.3 point| is accepted with a sub-one decimal value and a
;; singular measure noun.  So we split the old rule into two, one for
;; decimal numbers 1.0 or greater (and the negative equivalent), and the
;; other for decimals with a 0 before the decimal.
;; DPF 2014-07-09 - Re 2012-05-27: This is dubious, and too expensive, since
;; we then have to treat every decimal as possibly also an ordinary proper name.
;; So undo, and uniformly treat as numbers.
;; DPF 2015-05-09 - Re 2014-07-09: Well, but to get |the 1.9 release|, we don't
;; want to treat 1.9 as an ordinary cardinal, because we don't want to accept
;; |the three cat|.  So let's try treating these as class card_or_decimal_ne,
;; since we also want (and get) |the 0.9 release|.
;; DPF 2018-10-21 - Improved the regexes so we get vocalic onset for e.g.
;; |1100| or |18,000|.  The awkward outermost disjunction in card_ne_2_tmr
;; is to be sure we get nonvocalic for |a 1.9 release| but vocalic for
;; |an 11.9 release|.  Note that we want (and get) both |a 0.9 release|
;; pronounced "a zero point nine release" and |an 0.9 release| pronounced
;; "an oh point nine release".
;; DPF 2020-05-15 - To avoid costly ambiguity for general cardinals, change
;; output +CLASS from card_or_decimal_ne to card_ne, and leave open the
;; option of distinguishing |*the three cat| from |the 1.9 release| by
;; having the generic cardinal lexical entry (including decimals) be unmarked
;; for LPERIPH, while non-generic numbers always constrain LPERIPH to prevent
;; them from appearing in NP-N compound rule.  However, this still leads to
;; ambiguity for e.g. |35 years|
;; DPF 2020-05-17 - To still get |They lost 0.62 share|, have special decimal
;; rule for |0.x...|
;;

;;
;; superseded variant, kept for reference: its consonantal pattern still
;; admitted a leading |0| in the integer part.
;;
#|
card_ne_2_tmr := basic_ne_tmt &
  [ +INPUT  < [ +FORM ^[-+±~]?(((0|([2-7]|9)|1(0|[2-7]|9))[0-9]*)|1)\.[0-9]+$ ] >,
    +OUTPUT < [ +CLASS card_ne,
                +ONSET c-onset ] > ].

card_voc_ne_2_tmr := basic_ne_tmt &
  [ +INPUT  < [ +FORM ^[-+±~]?(11|18|8[0-9]*)\.[0-9]+$ ] >,
    +OUTPUT < [ +CLASS card_ne,
                +ONSET v-onset ] > ].
|#

;;
;; active decimal rules.  the integer part is either a bare |1|, or starts
;; with |2|-|7|, |9|, or |1| plus one of |0|, |2|-|7|, |9| (c-onset); or it is
;; exactly |11| or |18|, or starts with |8| (v-onset).  at least one digit
;; must follow the decimal point, and no trailing hyphen is admitted here.
;;
card_ne_2_tmr := basic_ne_tmt &
  [ +INPUT  < [ +FORM ^[-+±~]?(((([2-7]|9)|1(0|[2-7]|9))[0-9]*)|1)\.[0-9]+$ ] >,
    +OUTPUT < [ +CLASS card_ne,
                +ONSET c-onset ] > ].

card_voc_ne_2_tmr := basic_ne_tmt &
  [ +INPUT  < [ +FORM ^[-+±~]?(11|18|8[0-9]*)\.[0-9]+$ ] >,
    +OUTPUT < [ +CLASS card_ne,
                +ONSET v-onset ] > ].
;;
;; decimals with a single |0| before the decimal point (optionally signed);
;; these keep the card_or_decimal_ne class.
;;
decimal_ne_2_tmr := basic_ne_tmt &
  [ +INPUT  < [ +FORM ^[-+±~]?0\.[0-9]+$ ] >,
    +OUTPUT < [ +CLASS card_or_decimal_ne,
                +ONSET c-onset ] > ].

;; US-style or German separators, optional sign and decimals: e.g. |23,000.-|
;;
;; We get con/voc agreement when the larger number has one or two digits before
;; the first comma/period, with voc for |8,...|, |11,...|, |18,...|, |8x,...|,
;; and |8xx,...|.
card_ne_4_tmr := basic_ne_tmt &
  [ +INPUT  < [ +FORM ^[-+±]?(([2-7]|9)[0-9]{0,2}|1(0|[2-7]|9|[0-9]{2})?)([,.][0-9]{1,3})+([,.]([0-9]*|-))?$ ] >,
    +OUTPUT < [ +CLASS card_ne,
                +ONSET c-onset ] > ].

card_voc_ne_4_tmr := basic_ne_tmt &
  [ +INPUT  < [ +FORM ^[-+±]?(8[0-9]{0,2}|11|18)([,.][0-9]{1,3})+([,.]([0-9]*|-))?$ ] >,
    +OUTPUT < [ +CLASS card_ne,
                +ONSET v-onset ] > ].

;;
;; ordinals, mostly parallel to the numerals
;;
ord_or_dom_ne_tmr := ne_tmt &
  [ +INPUT  < [ +FORM ^([23]?(1st|2nd|3rd)|(2?[4-9]|1[0-9]|20|30)th)$ ] >,
    +OUTPUT < [ +CLASS ord_or_dom_ne ] > ].

;;
;; general ordinals: optional leading digits, then either |1st|, |2nd|, |3rd|
;; not preceded by a |1| (so |11st| is rejected), or a |th| form ending in
;; |11|, |12|, |13|, or one of |0|, |4|-|9|.  the guard before |1st| et al. is
;; spelled [02-9] rather than [^1]: the negated class also accepted arbitrary
;; non-digit characters, wrongly admitting tokens such as |x1st| or |12a2nd|.
;;
ord_ne_tmr := ne_tmt &
  [ +INPUT  < [ +FORM ^[0-9]*((^|[02-9])(1st|2nd|3rd)|(11|12|13|[04-9])th)$ ] >,
    +OUTPUT < [ +CLASS ord_ne ] > ].

;;
;; fractions and dates, again, this time pulled apart from a hyphenated context
;; such as |9/10-11/12|
;;
#|
fraction_ne_2_tmr := ne_tmt &
  [ +INPUT  < [ +FORM ^[-+±~]?[0-9]+(\.[0-9]*)?/[1-9][0-9]*[-–]?$ ] >,
    +OUTPUT < [ +CLASS date_or_fract_ne ] > ].
|#

;;
;; _fix_me_
;; another pass at the decades, for cases like |mid-1800s|; see ‘ne1.tdl’ for
;; the original rules.  to clean these up, we would need something like the
;; ‘group call’ mechanism in REPP.                             (27-aug-11; oe)
;;
;; NOTE(review): both decade patterns only allow an optional century prefix of
;; the form |1x|, so |90s| and |1800s| match but |2010s| does not -- confirm
;; whether that is intended before widening.
;;
decade_ne_3_tmr := ne_tmt &
  [ +INPUT  < [ +FORM ^(?:1[0-9])?[0-9]0[sS]$ ] >,
    +OUTPUT < [ +CLASS plur_ne ] > ].

;;
;; a bare decade token followed by a separate |'s| (or |’S|, etc.) token: the
;; output +FORM is built from the first capture group of each input form, and
;; +PRED, +CARG, and +TNT.+MAIN are carried over from the first token.
;;
decade_ne_4_tmr := two_one_keep_brackets_tmt &
  [ +INPUT  < [ +FORM ^((?:1[0-9])?[0-9]0)$,
                +CLASS non_ne,
                +PRED #pred,
                +CARG #carg,
                +TNT.+MAIN #main ],
              [ +FORM ^(['’][sS])$,
                +CLASS non_ne ] >,
    +OUTPUT < [ +FORM "${I1:+FORM:1}${I2:+FORM:1}",
                +CLASS plur_ne,
                +PRED #pred,
                +CARG #carg,
                +TNT null_tnt & [ +MAIN #main ] ] > ].