;;; -*- Mode: tdl; Coding: utf-8; -*-
;;;
;;; Copyright (c) 2009 -- 2012 Stephan Oepen (oe@ifi.uio.no);
;;; see `LICENSE' for conditions.
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; now with NEs out of our way, this would be a good time for adjustments to
;;; tokenization: introduce additional token boundaries (e.g. for hyphens and
;;; slashes) and maybe some robustness rules for `sandwiched' punctuation.
;;;
;;; note that, as of 17-jun-09, we treat hyphens and n-dashes alike, i.e. on
;;; the input side either one will lead to re-tokenization, while we output a
;;; normalized form: n-dashes between numbers (three output tokens), hyphens
;;; in all other cases (two tokens, with the hyphen appended to the first of
;;; them).
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; when we split into multiple tokens, it may be desirable to force the
;; resulting token sequence to form a phrase, eventually, i.e. provide the
;; parser with bracketing constraints, say the sequence |⌊(| |1| |-| |3| |)⌋|.
;; to enforce this in the syntax, there could be two features LEFT and RIGHT,
;; to pass up the bracketing property on the left and right periphery of all
;; phrases.  the bracketing (GML) tokens would mark lexical items as [LEFT +]
;; or [RIGHT +]; all non-unary rules would require their first daughter not to
;; be [RIGHT +], and their last daughter not to be [LEFT +].  to match
;; bracketings and discard the RIGHT and LEFT marks, a unary rule goes from
;; [LEFT +, RIGHT +] to [LEFT -, RIGHT -].  come to think of it, for full
;; generality, we should support multiple, nested bracketings.  hence, LEFT
;; and RIGHT actually need to be list-valued: the bracket prefix and suffix
;; rules push onto the right list, while the new unary `matching' rule pops
;; both lists.
;;                                                         (4-may; dan & oe)
;; _fix_me_
;; note that this implies moving to GML 2.0, now using two reserved characters
;; (|⌊| and |⌋|, Unicode U+230a and U+230b) instead of just the former broken
;; vertical bar (|¦|).
;;

;;
;; make hyphen a token in its own right between numbers (an n-dash, actually),
;; e.g. |50-60|.  otherwise, break at hyphens following alphabetic prefixes,
;; but keep the hyphen on the prefix, e.g. |sub-| |discipline|.
;;
;; numeric_hyphen_tmr rewrites one token into three: the first number, an
;; n-dash, and the second number.  the two numbers share +UW, +PRED, +CARG,
;; and +TNT with the input; the n-dash token is [ +UW - ] with `null_tnt'.
;;
numeric_hyphen_tmr := one_three_tmt &
[ +INPUT <
    ;; an optionally signed number, a hyphen or n-dash, and another number
    [ +FORM ^([+-]?[0-9]+(?:\.[0-9]*)?)[–-]([0-9]+(?:\.[0-9]*)?)$,
      +CLASS non_ne,
      +TRAIT [ +UW #uw,
               +LB #lb,
               +RB #rb ],
      +PRED #pred,
      +CARG #carg,
      +TNT #tnt ] >,
  +OUTPUT <
    ;; first number: `ctype' is put in front of the inherited +LB list
    [ +FORM "${I1:+FORM:1}",
      +TRAIT [ +UW #uw,
               +LB < ctype . #lb >,
               +RB <> ],
      +PRED #pred,
      +CARG #carg,
      +TNT #tnt ],
    ;; the separator is always output as an n-dash, with empty bracket lists
    [ +FORM "–",
      +TRAIT [ +UW -,
               +LB <>,
               +RB <> ],
      +PRED #pred,
      +CARG #carg,
      +TNT null_tnt ],
    ;; second number: `ctype' is put in front of the inherited +RB list
    [ +FORM "${I1:+FORM:2}",
      +TRAIT [ +UW #uw,
               +LB <>,
               +RB < ctype . #rb > ],
      +PRED #pred,
      +CARG #carg,
      +TNT #tnt ] > ].

;;
;; _fix_me_
;; when we break up tokens, it is not obvious which tag to assign to the first
;; segment.  often, especially for unknown words (which most hyphenated tokens
;; are), the PoS value will reflect the suffix.  for now, copy over +TNT to the
;; initial segment.  if nothing else, names should still work when capitalized.
;; for tokens containing multiple hyphens, the rule will apply from the rear,
;; i.e. the final segment is guaranteed to carry the +TNT information.
;; i just re-tooled this rule a little, see whether dan likes it this way?
;;                                                          (12-jan-09; oe)
;; DPF 2010-09-14 - One example where this goes wrong is in |Fetchmail-friends|
;; where |Fetchmail-| wrongly inherits "NNS" from |friends|, preventing the
;; construction of the NP-N compound, which requires an uninflected non-head.
;; (To reproduce, choose another unknown name in place of |Fetchmail|, which
;; I've now added to the lexicon.)
;;
;; DPF 2011-02-15 - Stamp POS "NN" on the left member of the split, to allow
;; correct parse for e.g. |mitogen-stimulated|, instead of assigning the left
;; member the POS of the whole ("JJ").  Would be more robust to allow both
;; "NN" and "JJ", but not clear that it's worth the extra ambiguity.
;;
;; _fix_me_
;; as we add brackets around token sequences split at hyphens, we need to make
;; sure there is only one pair of outermost brackets, as token mapping has no
;; way of predicting the internal syntactic structure of, e.g. |two-year-old|.
;; for now, bifurcate the splitting rule, into a recursive case (that applies
;; when there are additional hyphens, and does not insert brackets) and a base
;; case for the outermost structure.  maybe this is the best solution we can
;; find, but i will want to reconsider the question.           (2-nov-12; oe)
;;

;;
;; exactly one hyphen (or n-dash) in the token: split into |prefix-| and the
;; alphanumeric remainder, and put a `ctype' bracket on both outer edges.
;;
alphabetic_hyphen_singleton_tmr := one_two_tmt &
[ +INPUT <
    ;; group 1 must not itself contain a hyphen or n-dash
    [ +FORM ^([^–-]+)[–-]([[:alnum:]]+)$,
      +CLASS non_ne,
      +TRAIT [ +UW #uw,
               +LB #lb,
               +RB #rb ],
      +PRED #pred,
      +CARG #carg,
      +TNT #tnt & [ +MAIN #main ] ] >,
  +OUTPUT <
    ;; left segment keeps the hyphen (normalized to |-|); its +TNT keeps
    ;; +MAIN but carries the single tag "NN" with probability "1.0"
    [ +FORM "${I1:+FORM:1}-",
      +TRAIT [ +UW #uw,
               +LB < ctype . #lb >,
               +RB <> ],
      +PRED #pred,
      +CARG #carg,
      +TNT [ +MAIN #main,
             +TAGS < "NN" >,
             +PRBS < "1.0" > ] ],
    ;; right segment inherits the complete +TNT of the input
    [ +FORM "${I1:+FORM:2}",
      +TRAIT [ +UW #uw,
               +LB <>,
               +RB < ctype . #rb > ],
      +PRED #pred,
      +CARG #carg,
      +TNT #tnt ] > ].

;;
;; two or more hyphens, and the token ends in an alphanumeric segment: split
;; off that final segment and put `ctype' in front of its +RB list.  the first
;; output still contains at least one hyphen besides the one appended here.
;;
;; NOTE(review): unlike the singleton rule, the first output here inherits
;; #rb and the second inherits #lb unchanged (rather than `<>'); if the input
;; arrives with non-empty +LB or +RB, those lists end up on both segments.
;; confirm that this is intended.
;;
alphabetic_hyphen_initialize_tmr := one_two_tmt &
[ +INPUT <
    ;; group 1 is required to contain a hyphen or n-dash of its own
    [ +FORM ^(.+[–-].+)[–-]([[:alnum:]]+)$,
      +CLASS non_ne,
      +TRAIT [ +UW #uw,
               +LB #lb,
               +RB #rb ],
      +PRED #pred,
      +CARG #carg,
      +TNT #tnt & [ +MAIN #main ] ] >,
  +OUTPUT <
    ;; everything up to the last hyphen, plus |-|; tagged "NN" as above
    [ +FORM "${I1:+FORM:1}-",
      +TRAIT [ +UW #uw,
               +LB #lb,
               +RB #rb ],
      +PRED #pred,
      +CARG #carg,
      +TNT [ +MAIN #main,
             +TAGS < "NN" >,
             +PRBS < "1.0" > ] ],
    ;; final segment: full +TNT of the input
    [ +FORM "${I1:+FORM:2}",
      +TRAIT [ +UW #uw,
               +LB #lb,
               +RB < ctype . #rb > ],
      +PRED #pred,
      +CARG #carg,
      +TNT #tnt ] > ].
;;
;; recursive case: the final segment may itself end in a hyphen (the optional
;; |-| in the pattern), so this rule can match forms that end in |-|.  no
;; `ctype' bracket is added here; +LB and +RB are passed through as they are.
;;
;; NOTE(review): both outputs receive the input's #lb and #rb unchanged (the
;; rule alphabetic_hyphen_singleton_tmr instead empties the inner lists); with
;; a non-empty +LB or +RB on the input these lists are replicated on every
;; segment.  confirm that this is intended.
;;
alphabetic_hyphen_recurse_tmr := one_two_tmt &
[ +INPUT <
    ;; group 1 contains a hyphen or n-dash; group 2 may end in a hyphen
    [ +FORM ^(.+[–-].+)[–-]([[:alnum:]]+-?)$,
      +CLASS non_ne,
      +TRAIT [ +UW #uw,
               +LB #lb,
               +RB #rb ],
      +PRED #pred,
      +CARG #carg,
      +TNT #tnt & [ +MAIN #main ] ] >,
  +OUTPUT <
    ;; left part plus |-|; +TNT keeps +MAIN, single tag "NN" at "1.0"
    [ +FORM "${I1:+FORM:1}-",
      +TRAIT [ +UW #uw,
               +LB #lb,
               +RB #rb ],
      +PRED #pred,
      +CARG #carg,
      +TNT [ +MAIN #main,
             +TAGS < "NN" >,
             +PRBS < "1.0" > ] ],
    ;; right part: full +TNT of the input
    [ +FORM "${I1:+FORM:2}",
      +TRAIT [ +UW #uw,
               +LB #lb,
               +RB #rb ],
      +PRED #pred,
      +CARG #carg,
      +TNT #tnt ] > ].

;;
;; base case on the left: split at the last hyphen that is followed by an
;; alphanumeric segment (optionally hyphen-final), and put `ctype' in front of
;; the +LB list of the left output.
;;
;; NOTE(review): as in the recursive rule, #rb on the first output and #lb on
;; the second are copied rather than emptied; confirm that this is intended.
;;
alphabetic_hyphen_terminate_tmr := one_two_tmt &
[ +INPUT <
    [ +FORM ^(.+)[–-]([[:alnum:]]+-?)$,
      +CLASS non_ne,
      +TRAIT [ +UW #uw,
               +LB #lb,
               +RB #rb ],
      +PRED #pred,
      +CARG #carg,
      +TNT #tnt & [ +MAIN #main ] ] >,
  +OUTPUT <
    ;; left part plus |-|; +TNT keeps +MAIN, single tag "NN" at "1.0"
    [ +FORM "${I1:+FORM:1}-",
      +TRAIT [ +UW #uw,
               +LB < ctype . #lb >,
               +RB #rb ],
      +PRED #pred,
      +CARG #carg,
      +TNT [ +MAIN #main,
             +TAGS < "NN" >,
             +PRBS < "1.0" > ] ],
    ;; right part: full +TNT of the input
    [ +FORM "${I1:+FORM:2}",
      +TRAIT [ +UW #uw,
               +LB #lb,
               +RB #rb ],
      +PRED #pred,
      +CARG #carg,
      +TNT #tnt ] > ].

;;
;; with the new addition of derivational lexical rules, immediately re-attach
;; certain (verbal) prefixes (e.g. |mis-| and |re-|).  it is a bit unfortunate
;; that we end up duplicating information from the orthographemic annotation
;; on those rules in token mapping, but i imagine the linguistic arguments for
;; this particular treatment are overwhelming.
;;
;; _fix_me_
;; some prefixes are missing in this rule, notably |co-|; see the comments in
;; `lexrinst.tdl', towards the end of the file.               (17-jun-09; oe)
;;
;; NOTE(review): the pattern below does list |co-| (and |counter-|), so the
;; remark about |co-| may be out of date; confirm before removing it.
;;
;; _fix_me_
;; but what about capitalized or all upper-case variants?      (4-may-12; oe)
;;
;; the output form concatenates the two input forms; +LB comes from the prefix
;; token, while +UW, +RB, and +HD come from the second token.
;;
derivational_prefix_tmr := two_one_final_form_trait_tmt &
[ +INPUT <
    ;; one of |co-|, |counter-|, |mis-|, |re-|, |pre-|, |un-| (lower case only)
    [ +FORM ^((?:co(?:unter)?|mis|p?re|un)-)$,
      +TRAIT.+LB #lb ],
    ;; a purely alphanumeric token immediately following the prefix
    [ +FORM ^([[:alnum:]]+)$,
      +TRAIT [ +UW #uw,
               +RB #rb,
               +HD #hd ] ] >,
  +OUTPUT <
    [ +FORM "${I1:+FORM:1}${I2:+FORM:1}",
      +TRAIT [ +UW #uw,
               +LB #lb,
               +RB #rb,
               +HD #hd ] ] > ].

;;
;; _fix_me_
;; there will be more to do about slashes, no doubt ...
;;                                                          (12-jan-09; oe)
;;
;; split at the last slash that is followed by a purely alphanumeric segment,
;; yielding three tokens: left part, |/|, right part.  both outer tokens share
;; +TRAIT, +PRED, +CARG, and +TNT with the input; the slash token is [ +UW - ]
;; and carries `null_tnt'.
;;
alphabetic_slash_tmr := one_three_tmt &
[ +INPUT <
    [ +FORM ^(.+)/([[:alnum:]]+)$,
      +CLASS non_ne,
      +TRAIT #trait,
      +PRED #pred,
      +CARG #carg,
      +TNT #tnt ] >,
  +OUTPUT <
    ;; everything before the slash
    [ +FORM "${I1:+FORM:1}",
      +TRAIT #trait,
      +PRED #pred,
      +CARG #carg,
      +TNT #tnt ],
    ;; the slash itself
    [ +FORM "/",
      +TRAIT.+UW -,
      +PRED #pred,
      +CARG #carg,
      +TNT null_tnt ],
    ;; the alphanumeric segment after the slash
    [ +FORM "${I1:+FORM:2}",
      +TRAIT #trait,
      +PRED #pred,
      +CARG #carg,
      +TNT #tnt ] > ].