;;; -*- Mode: tdl; Coding: utf-8; -*-
;;;
;;; Copyright (c) 2009 -- 2018 Stephan Oepen (oe@ifi.uio.no);
;;; see `LICENSE' for conditions.
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; now with NEs out of our way, this would be a good time for adjustments to
;;; tokenization: introduce additional token boundaries (e.g. for hyphens and
;;; slashes) and maybe some robustness rules for `sandwiched' punctuation.
;;;
;;; note that, as of 17-jun-09, we treat hyphens and n-dashes alike, i.e. on
;;; the input side either one will lead to re-tokenization, while we output a
;;; normalized form: n-dashes between numbers (three output tokens), hyphens
;;; in all other cases (two tokens, with the hyphen kept at the end of the
;;; first of them).
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; when we split into multiple tokens, it may be desirable to force the
;; resulting token sequence to form a phrase, eventually, i.e. provide the
;; parser with bracketing constraints, say the sequence |⌊(| |1| |-| |3| |)⌋|.
;; to enforce this in the syntax, there could be two features LEFT and RIGHT,
;; to pass up the bracketing property on the left and right periphery of all
;; phrases.  the bracketing (GML) tokens would mark lexical items as [LEFT +]
;; or [RIGHT +]; all non-unary rules would require their first daughter not to
;; be [RIGHT +], and their last daughter not to be [LEFT +].  to match
;; bracketings and discard the RIGHT and LEFT marks, a unary rule goes from
;; [LEFT +, RIGHT +] to [LEFT -, RIGHT -].  come to think of it, for full
;; generality, we should support multiple, nested bracketings.  hence, LEFT
;; and RIGHT actually need to be list-valued: the bracket prefix and suffix
;; rules push onto the right list, while the new unary `matching' rule pops
;; both lists.
;; (4-may; dan & oe) ;; _fix_me_ ;; note that this implies moving to GML 2.0, now using two reserved characters ;; (|⌊| and |⌋|, UniCode U+230a and U+230b) instead of just the former broken ;; vertical bar (|¦|). ;; #| ;; ;; make hyphen a token in its own right between numbers (an n-dash, actually), ;; e.g. |50-60|. otherwise, break at hyphens following alphabetic prefixes, ;; but keep the hyphen on the prefix, e.g. |sub-| |discipline|. ;; DPF 2015-09-04 - Tried accommodating more complex inputs such as the ;; range in |2/3-7/8| for a reading of "two-thirds to seven-eighths", or ;; "Feb.2 through July 8". But can't get this pattern to match: ;; [ +INPUT < [ +FORM ^([+-]?[0-9]+(?:\.[0-9]*)?(/[1-9][0-9]*)?)[–-]([0-9]+(?:\.[0-9]*)?(/[1-9][0-9]*)?)$, ;; FIX someday? numeric_hyphen_tmr := one_three_tmt & [ +INPUT < [ +FORM ^([+-]?[0-9]+(?:\.[0-9]*)?)[–-]([0-9]+(?:\.[0-9]*)?)$, +CLASS non_ne, +TRAIT.+UW #uw, +PRED #pred, +CARG #carg, +TNT #tnt ] >, +OUTPUT < [ +FORM "${I1:+FORM:1}", +TRAIT [ +UW #uw, +LD bracket_nonnull & , +RD bracket_null ], +PRED #pred, +CARG #carg, +TNT #tnt ], [ +FORM "–", +TRAIT [ +UW -, +LD bracket_null, +RD bracket_null ], +PRED #pred, +CARG #carg, +TNT null_tnt ], [ +FORM "${I1:+FORM:2}", +TRAIT [ +UW #uw, +LD bracket_null, +RD bracket_nonnull & ], +PRED #pred, +CARG #carg, +TNT #tnt ] > ]. ;; DPF 2015-03-14 - Added rule for sandwiched multiple hyphens used as ndashes, ;; as in |Kim--the doctor--arose.| or |Kim---the doctor---arose|. ;; Since ACE has already replaced the double-hyphen by ndash before this (for ;; some reason), let's try also doing this three-token split for ndashes, ;; noting that this means we don't get the bracketing constraint for ndashes ;; via the alphabetic-hyphen rules below, but maybe this is right. 
;; alphabetic_multi_hyphen_tmr := one_three_tmt & [ +INPUT < [ +FORM ^([^–-]+)(--|---|–)([[:alnum:]]+)$, +CLASS non_ne, +TRAIT.+UW #uw, +PRED #pred, +CARG #carg, +TNT #tnt & [ +MAIN #main ] ] >, +OUTPUT < [ +FORM "${I1:+FORM:1}", +TRAIT [ +UW #uw, +LD bracket_null, +RD bracket_null ], +PRED #pred, +CARG #carg, +TNT #tnt ], [ +FORM "–", +TRAIT [ +UW #uw, +LD bracket_null, +RD bracket_null ], +PRED #pred, +CARG #carg, +TNT #tnt ], [ +FORM "${I1:+FORM:3}", +TRAIT [ +UW #uw, +LD bracket_null, +RD bracket_null ], +PRED #pred, +CARG #carg, +TNT #tnt ] > ]. ;; sandwiched sentence final period |We arrived.He left.| sandwich_period_tmr := one_three_tmt & [ +INPUT < [ +FORM ^(.+)([\.?!])([[:upper:]][[:alnum:]]*)$, +CLASS non_ne, +TRAIT #trait, +PRED #pred, +CARG #carg, +TNT #tnt ] >, +OUTPUT < [ +FORM "${I1:+FORM:1}", +TRAIT #trait, +PRED #pred, +CARG #carg, +TNT #tnt ], [ +FORM "${I1:+FORM:2}", +PRED #pred, +CARG #carg, +TNT null_tnt, +TRAIT.+UW - ], [ +FORM "${I1:+FORM:3}", +TRAIT #trait, +PRED #pred, +CARG #carg, +TNT #tnt ] > ]. ;; sandwiched comma |The tall,green tree| ;; DPF 2019-06-05 - Accommodate a non-standard (Chinese) comma sandwich_comma_tmr := one_three_tmt & [ +INPUT < [ +FORM ^(.+)[,,、]([[:alpha:]-][[:alnum:]]*)$, +CLASS non_ne, +TRAIT #trait, +PRED #pred, +CARG #carg, +TNT #tnt ] >, +OUTPUT < [ +FORM "${I1:+FORM:1}", +TRAIT #trait, +PRED #pred, +CARG #carg, +TNT #tnt ], [ +FORM "⸴", +PRED #pred, +CARG #carg, +TNT null_tnt, +TRAIT.+UW - ], [ +FORM "${I1:+FORM:2}", +TRAIT #trait, +PRED #pred, +CARG #carg, +TNT #tnt ] > ]. ;; ;; _fix_me_ ;; when we break up tokens, it is not obvious which tag to assign to the first ;; segment. often, especially for unknown words (which most hyphenated tokens ;; are), the PoS value will reflect the suffix. for now, copy over +TNT to the ;; initial segment. if nothing else, names should still work when capitalized, ;; for tokens containing multiple hyphens, the rule will apply from the rear, ;; i.e. 
the final segment is guaranteed to carry the +TNT information. ;; i just re-tooled this rule a little, see whether dan likes it this way? ;; (12-jan-09; oe) ;; DPF 2010-09-14 - One example where this goes wrong is in |Fetchmail-friends| ;; where |Fetchmail-| wrongly inherits "NNS" from |friends|, preventing the ;; construction of the NP-N compound, which requires an uninflected non-head. ;; (To reproduce, choose another unknown name in place of |Fetchmail|, which ;; I've now added to the lexicon.) ;; ;; DPF 2011-02-15 - Stamp POS "NN" on the left member of the split, to allow ;; correct parse for e.g. |mitogen-stimulated|, instead of assigning the left ;; member the POS of the whole ("JJ"). Would be more robust to allow both ;; "NN" and "JJ", but not clear that it's worth the extra ambiguity. ;; ;; _fix_me_ ;; as we add brackets around token sequences split at hyphens, we need to make ;; sure there is only one pair of outermost brackets, as token mapping has no ;; way of predicting the internal syntactic structure of, e.g. |two-year-old|. ;; for now, bifurcate the splitting rule, into a recursive case (that applies ;; when there are additional hyphens, and does not insert brackets) and a base ;; case for the outermost structure. maybe this is the best solution we can ;; find, but i will want to reconsider the question. 
(2-nov-12; oe) ;; alphabetic_hyphen_singleton_tmr := one_three_tmt & [ +INPUT < [ +FORM ^([^–-]+)([–-])([[:alnum:]/⌋]+)$, +CLASS non_ne, +TRAIT [ +UW #uw, +LB #lb, +RB #rb ], +PRED #pred, +CARG #carg, +TNT #tnt & [ +MAIN #main ] ] >, +OUTPUT < [ +FORM "${I1:+FORM:1}", +TRAIT [ +UW #uw, +LB #lb, +RB bracket_null, +LD bracket_nonnull & , +RD bracket_null ], +PRED #pred, +CARG #carg, +TNT [ +MAIN #main, +TAGS < "NN" >, +PRBS < "1.0" > ] ], [ +FORM "${I1:+FORM:2}", +TRAIT [ +UW #uw, +LD bracket_null, +RD bracket_null ], +PRED #pred, +CARG #carg, +TNT #tnt ], [ +FORM "${I1:+FORM:3}", +TRAIT [ +UW #uw, +LB bracket_null, +RB #rb, +LD bracket_null, +RD bracket_nonnull & ], +PRED #pred, +CARG #carg, +TNT #tnt ] > ]. alphabetic_hyphen_initialize_tmr := one_three_tmt & [ +INPUT < [ +FORM ^(.+[–-].+)([–-])([[:alnum:]/⌋]+)$, +CLASS non_ne, +TRAIT [ +UW #uw, +LD #ld, +RD #rd, +LB #lb, +RB #rb ], +PRED #pred, +CARG #carg, +TNT #tnt & [ +MAIN #main ] ] >, +OUTPUT < [ +FORM "${I1:+FORM:1}", +TRAIT [ +UW #uw, +LB #lb, +RB bracket_null, +LD #ld, +RD #rd ], +PRED #pred, +CARG #carg, +TNT [ +MAIN #main, +TAGS < "NN" >, +PRBS < "1.0" > ] ], [ +FORM "${I1:+FORM:2}", +TRAIT [ +UW #uw, +LD bracket_null, +RD bracket_null ], +PRED #pred, +CARG #carg, +TNT #tnt ], [ +FORM "${I1:+FORM:3}", +TRAIT [ +UW #uw, +LB bracket_null, +RB #rb, +LD #ld, +RD bracket_nonnull & ], +PRED #pred, +CARG #carg, +TNT #tnt ] > ]. 
alphabetic_hyphen_recurse_tmr := one_three_tmt & [ +INPUT < [ +FORM ^(.+[–-].+)([–-])([[:alnum:]/⌋]+-?)$, +CLASS non_ne, +TRAIT [ +UW #uw, +LD #ld, +RD #rd, +LB #lb, +RB #rb ], +PRED #pred, +CARG #carg, +TNT #tnt & [ +MAIN #main ] ] >, +OUTPUT < [ +FORM "${I1:+FORM:1}", +TRAIT [ +UW #uw, +LB #lb, +RB bracket_null, +LD #ld, +RD #rd ], +PRED #pred, +CARG #carg, +TNT [ +MAIN #main, +TAGS < "NN" >, +PRBS < "1.0" > ] ], [ +FORM "${I1:+FORM:2}", +TRAIT [ +UW #uw, +LD bracket_null, +RD bracket_null ], +PRED #pred, +CARG #carg, +TNT #tnt ], [ +FORM "${I1:+FORM:3}", +TRAIT [ +UW #uw, +LB bracket_null, +RB #rb, +LD #ld, +RD #rd ], +PRED #pred, +CARG #carg, +TNT #tnt ] > ]. alphabetic_hyphen_terminate_tmr := one_three_tmt & [ +INPUT < [ +FORM ^(.+)([–-])([[:alnum:]/⌋]+-?)$, +CLASS non_ne, +TRAIT [ +UW #uw, +LD #ld, +RD #rd, +LB #lb, +RB #rb ], +PRED #pred, +CARG #carg, +TNT #tnt & [ +MAIN #main ] ] >, +OUTPUT < [ +FORM "${I1:+FORM:1}", +TRAIT [ +UW #uw, +LB #lb, +RB bracket_null, +LD bracket_nonnull & , +RD #rd ], +PRED #pred, +CARG #carg, +TNT [ +MAIN #main, +TAGS < "NN" >, +PRBS < "1.0" > ] ], [ +FORM "${I1:+FORM:2}", +TRAIT [ +UW #uw, +LD bracket_null, +RD bracket_null ], +PRED #pred, +CARG #carg, +TNT #tnt ], [ +FORM "${I1:+FORM:3}", +TRAIT [ +UW #uw, +LB bracket_null, +RB #rb, +LD #ld, +RD #rd ], +PRED #pred, +CARG #carg, +TNT #tnt ] > ]. |# ;; ;; with the new addition of derivational lexical rules, immediately re-attach ;; certain (verbal) prefixes (e.g. |mis-| and |re-|). it is a bit unfortunate ;; that we end up duplicating information from the orthographemic annotation ;; on those rules in token mapping, but i imagine the linguistic arguments for ;; this particular treatment are overwhelming. 
;;
;; DPF 2016-02-06 - The main motive for treating these elements as (attached)
;; prefixes is that most if not all of them can also appear without the hyphen
;; (|counterattack| alongside |counter-attack|), and hence since we have to
;; have derivational rules for the non-hyphen versions, we choose to use those
;; rules for the hyphen versions as well.
;;
;; _fix_me_
;; some prefixes are missing in this rule, notably |co-|; see the comments in
;; `lexrinst.tdl', towards the end of the file.                (17-jun-09; oe)
;; DPF 2015-09-02 - Added |co-| awhile ago, and now also prefix rules for nouns
;; and adjectives
;;
;; _fix_me_
;; but what about capitalized or all upper-case variants?       (4-may-12; oe)
;; DPF 2015-09-02 - Yes, added capitalized variants, but not upper case.
;;
;; Prefixes currently included:
;;   co- counter- cross- mis- out- over- pre- re- self- un- under-
;;
;; DPF 2020-03-25 - Exclude conjunctions, so we can parse
;; |They thinned out - and disappeared|
;; DPF 2020-03-31 - Also allow all-caps, as in |CROSS-BRED|
;;
;; Superseded variant (capitalized but not all-caps prefixes), kept for
;; reference only:
;;
#|
derivational_prefix_tmr := three_one_final_form_trait_tmt &
[ +INPUT < [ +FORM ^((?:[Cc]o(?:unter)?|[Cc]ross|[Mm]is|[Oo]ut|[Oo]ver|[Pp]?[Rr]e|[Ss]elf|[Uu]n(?:der)?))$,
             +TRAIT.+LD #ld ],
           [ +FORM ^([-])$ ],
           [ +FORM ^((?!and\z|or\z|but\z)[[:alnum:]]+)$,
             +TRAIT [ +UW #uw, +RD #rd, +HD #hd ] ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}${I2:+FORM:1}${I3:+FORM:1}",
              +TRAIT [ +UW #uw, +LD #ld, +RD #rd, +HD #hd ] ] > ].
|#

;;
;; Re-attach a derivational prefix: three adjacent input tokens -- one of the
;; listed prefixes, a lone hyphen, and an alphanumeric token that is not
;; |and|, |or|, or |but| -- are merged into one output token whose +FORM is
;; the concatenation of the three input forms.  +LD is copied from the prefix
;; token; +UW, +RD, and +HD are copied from the final token.
;;
;; The prefix pattern accepts lower-case, capitalized, and all-caps spellings.
;; The second letter of |un(der)-| is matched by [Nn], so that |UN-| and
;; |UNDER-| are accepted just like the other all-caps prefixes.
;;
derivational_prefix_tmr := three_one_final_form_trait_tmt &
[ +INPUT < [ +FORM ^((?:[Cc][Oo](?:(unter|UNTER))?|[Cc](ross|ROSS)|[Mm](is|IS)|[Oo](ut|UT)|[Oo](ver|VER)|[Pp]?[Rr][Ee]|[Ss](elf|ELF)|[Uu][Nn](?:(der|DER))?))$,
             +TRAIT.+LD #ld ],
           [ +FORM ^([-])$ ],
           [ +FORM ^((?!and\z|or\z|but\z)[[:alnum:]]+)$,
             +TRAIT [ +UW #uw, +RD #rd, +HD #hd ] ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}${I2:+FORM:1}${I3:+FORM:1}",
              +TRAIT [ +UW #uw, +LD #ld, +RD #rd, +HD #hd ] ] > ].

;;
;; Add brackets to ensure immediate attachment of right punctuation mark.
;; Exclude colon, which also has non-punct lexical entry
;;
;; Two adjacent tokens -- a host made up of alphanumerics and/or the listed
;; punctuation and GML characters, followed by a single closing punctuation
;; mark whose +RD is still empty (bracket_null) -- are rewritten in place.
;; The host gets one more `n' pushed onto the front of its +RD list; the
;; punctuation mark gets an empty +LD and a one-element +RD.  All other
;; properties (+CLASS, +UW, +PRED, +CARG, +TNT, and the host's +LD) are
;; copied unchanged.
;;
;; NOTE(review): the one-element list `<! n !>' below is a reconstruction --
;; the operand after `bracket_nonnull &' was missing, leaving a dangling `&'.
;; `n' is the element this same rule pushes onto the host's list; confirm
;; against the upstream grammar.
;;
punct_suffix_clitic_tmr := two_two_tmt &
[ +INPUT < [ +FORM ^([[:alnum:],\.\?!;”\)\]⌊/⌋]+)$,
             +CLASS #class,
             +TRAIT [ +UW #uw, +LD #ld,
                      +RD [ LIST #rdlist, LAST #rdlast ] ],
             +PRED #pred, +CARG #carg, +TNT #tnt ],
           [ +FORM ^([,\.\?!;”\)\]])$,
             +CLASS #classp,
             +TRAIT [ +UW #uwp, +RD bracket_null ],
             +PRED #predp, +CARG #cargp, +TNT #tntp ]>,
  +OUTPUT < [ +FORM "${I1:+FORM:1}",
              +CLASS #class,
              +TRAIT [ +UW #uw, +LD #ld,
                       +RD bracket_nonnull & [ LIST < n . #rdlist >,
                                               LAST #rdlast ] ],
              +PRED #pred, +CARG #carg, +TNT #tnt ],
            [ +FORM "${I2:+FORM:1}",
              +CLASS #classp,
              +TRAIT [ +UW #uwp, +LD bracket_null,
                       +RD bracket_nonnull & <! n !> ],
              +PRED #predp, +CARG #cargp, +TNT #tntp ] >,
  +CONTEXT < > ].

;;
;; Mirror image of the suffix rule for opening punctuation: a single opening
;; quote, parenthesis, or square bracket whose +LD and +RD are both still
;; empty, followed by a host token.  The punctuation mark gets a one-element
;; +LD (and keeps an empty +RD); the host gets one more `n' pushed onto the
;; front of its +LD list, with its +RD copied unchanged.
;;
;; NOTE(review): as above, `<! n !>' is a reconstruction of an operand that
;; was missing after `bracket_nonnull &'; confirm against the upstream
;; grammar.
;;
punct_prefix_clitic_tmr := two_two_tmt &
[ +INPUT < [ +FORM ^([“\(\[])$,
             +CLASS #classp,
             +TRAIT [ +UW #uwp, +LD bracket_null, +RD bracket_null ],
             +PRED #predp, +CARG #cargp, +TNT #tntp ],
           [ +FORM ^([[:alnum:]“\(\[]+)$,
             +CLASS #class,
             +TRAIT [ +UW #uw,
                      +LD [ LIST #ldlist, LAST #ldlast ], +RD #rd ],
             +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}",
              +CLASS #classp,
              +TRAIT [ +UW #uwp,
                       +LD bracket_nonnull & <! n !>, +RD bracket_null ],
              +PRED #predp, +CARG #cargp, +TNT #tntp ],
            [ +FORM "${I2:+FORM:1}",
              +CLASS #class,
              +TRAIT [ +UW #uw,
                       +LD bracket_nonnull & [ LIST < n . #ldlist >,
                                               LAST #ldlast ],
                       +RD #rd ],
              +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +CONTEXT < > ].

;; DPF 2020-05-02 - Add weak-bracket addition rule that blocks phrase boundary
;; inside of multiwords containing suffix characters that the tokenizer treats
;; as separate, e.g. the |$| in |HK$|.
;; Note that in the limit this is too strict without some notion of a
;; non-breaking space, since we could get input such as ditransitive
;; |They sent the US $1000|.
;; So for now don't include for example |US$|.
;;
;; Bracket a currency prefix together with a following dollar sign: the first
;; token must be exactly |AU|, |C|, |HK|, or |NZ|, the second exactly |$| with
;; an empty +RD (bracket_null).  The first token gets one more `n' pushed onto
;; the front of its +RD list; the |$| token gets an empty +LD and a
;; one-element +RD.  All other properties (+CLASS, +UW, +PRED, +CARG, +TNT,
;; and the first token's +LD) are copied unchanged.
;;
;; NOTE(review): the one-element list `<! n !>' below is a reconstruction --
;; the operand after `bracket_nonnull &' was missing, leaving a dangling `&'.
;; `n' is the element this same rule pushes onto the first token's list;
;; confirm against the upstream grammar.
;;
character_suffix_clitic_tmr := two_two_tmt &
[ +INPUT < [ +FORM ^(AU|C|HK|NZ)$,
             +CLASS #class,
             +TRAIT [ +UW #uw, +LD #ld,
                      +RD [ LIST #rdlist, LAST #rdlast ] ],
             +PRED #pred, +CARG #carg, +TNT #tnt ],
           [ +FORM ^([\$])$,
             +CLASS #classp,
             +TRAIT [ +UW #uwp, +RD bracket_null ],
             +PRED #predp, +CARG #cargp, +TNT #tntp ]>,
  +OUTPUT < [ +FORM "${I1:+FORM:1}",
              +CLASS #class,
              +TRAIT [ +UW #uw, +LD #ld,
                       +RD bracket_nonnull & [ LIST < n . #rdlist >,
                                               LAST #rdlast ] ],
              +PRED #pred, +CARG #carg, +TNT #tnt ],
            [ +FORM "${I2:+FORM:1}",
              +CLASS #classp,
              +TRAIT [ +UW #uwp, +LD bracket_null,
                       +RD bracket_nonnull & <! n !> ],
              +PRED #predp, +CARG #cargp, +TNT #tntp ] >,
  +CONTEXT < > ].