;;; -*- Mode: tdl; Coding: utf-8; -*-
;;;
;;; Copyright (c) 2009 -- 2018 Stephan Oepen (oe@ifi.uio.no);
;;; see `LICENSE' for conditions.
;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; at this point, we multiply out PoS values on all tokens, where for each
;;; original token as many additional tokens are created (in the same chart
;;; cell) as there are PoS readings.  at this point, we start distinguishing
;;; between tokens that activate native lexical entries (LEs), vs. those that
;;; activate generic LEs.  in the token universe, this distinction is made by
;;; virtue of +TRAIT, with generic_trait targeting generic LEs.  the two sets
;;; do not overlap, i.e. for a single original token with two PoS readings, we
;;; end up with a total of three new tokens.  the pair of rules below resembles
;;; a recursive function, terminating once the PoS list has been reduced to
;;; a singleton element.  form-based named entities identified earlier avoid
;;; this kind of PoS multiplication because they have already emptied out their
;;; PoS list.
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;
;; even though we originally made sure all tokens had a fully specified +TNT
;; value, intervening rules could have `leaked' PoS information.  if so, once
;; again, fully annul the +TNT value.
;;
;; the rule fires when the first +TAGS element is an anti_string; the output
;; token keeps +MAIN but otherwise gets a null_tnt value.
;;
tnt_default_tmr := one_one_tmt &
[ +INPUT < [ +FORM #form,
             +TRAIT #trait,
             +CLASS #class,
             +PRED #pred,
             +CARG #carg,
             +TNT [ +MAIN #main,
                    +TAGS < anti_string, ... > ] ] >,
  +OUTPUT < [ +FORM #form,
              +TRAIT #trait,
              +CLASS #class,
              +PRED #pred,
              +CARG #carg,
              +TNT null_tnt & [ +MAIN #main ] ] >,
  +CONTEXT < > ].

;;
;; recursive step: the input still has at least two PoS readings (the rest of
;; both +TAGS and +PRBS is a *cons*).  the first output token takes the first
;; tag and its probability and is marked [ +UW + ]; the second output token
;; keeps the remaining readings, to be processed again.
;;
tnt_recurse_tmr := token_mapping_rule &
[ +INPUT < [ +FORM #form,
             +CLASS #class,
             +TRAIT anti_trait & [ +LB #lb, +RB #rb, +LD #ld, +RD #rd ],
             +PRED #pred,
             +CARG #carg,
             +ID #id,
             +FROM #from,
             +TO #to,
             +TNT [ +MAIN #main,
                    +TAGS < #tag . #tags & *cons* >,
                    +PRBS < #prb . #prbs & *cons* > ] ] >,
  +OUTPUT < [ +FORM #form,
              +TRAIT [ +UW +, +LB #lb, +RB #rb, +LD #ld, +RD #rd ],
              +CLASS #class,
              +PRED #pred,
              +CARG #carg,
              +ID #id,
              +FROM #from,
              +TO #to,
              +TNT [ +MAIN #main,
                     +TAGS < #tag >,
                     +PRBS < #prb > ] ],
            [ +FORM #form,
              +CLASS #class,
              +TRAIT [ +LB #lb, +RB #rb, +LD #ld, +RD #rd ],
              +PRED #pred,
              +CARG #carg,
              +ID #id,
              +FROM #from,
              +TO #to,
              +TNT [ +MAIN #main,
                     +TAGS #tags,
                     +PRBS #prbs ] ] >,
  +CONTEXT < >,
  +POSITION "O1@I1, O2@I1" ].

;;
;; base case: exactly one PoS reading is left.  the first output token keeps
;; that reading and is marked [ +UW + ]; the second output token is marked
;; [ +UW - ] and has its +TNT value emptied out (null_tnt, +MAIN preserved).
;;
tnt_terminate_tmr := token_mapping_rule &
[ +INPUT < [ +FORM #form,
             +TRAIT anti_trait & [ +LB #lb, +RB #rb, +LD #ld, +RD #rd ],
             +CLASS #class,
             +PRED #pred,
             +CARG #carg,
             +ID #id,
             +FROM #from,
             +TO #to,
             +TNT [ +MAIN #main,
                    +TAGS < #tag >,
                    +PRBS < #prb > ] ] >,
  +OUTPUT < [ +FORM #form,
              +TRAIT [ +UW +, +LB #lb, +RB #rb, +LD #ld, +RD #rd ],
              +CLASS #class,
              +PRED #pred,
              +CARG #carg,
              +ID #id,
              +FROM #from,
              +TO #to,
              +TNT [ +MAIN #main,
                     +TAGS < #tag >,
                     +PRBS < #prb > ] ],
            [ +FORM #form,
              +TRAIT [ +UW -, +LB #lb, +RB #rb, +LD #ld, +RD #rd ],
              +CLASS #class,
              +PRED #pred,
              +CARG #carg,
              +ID #id,
              +FROM #from,
              +TO #to,
              +TNT null_tnt & [ +MAIN #main ] ] >,
  +CONTEXT < >,
  +POSITION "O1@I1, O2@I1" ].

;;;
;;; with singleton PoS readings multiplied out in each chart cell, we can prune
;;; undesirable alternatives, e.g. a foreign word reading when there also is a
;;; common noun.  also, ditch PoS readings with very low probability, and ones
;;; for which no PoS-activated generic entries exist anyway (function words).
;;; this final step eases debugging, reducing the size of the token chart.
;;;

;;
;; drop any token whose (single) probability string starts with `0.0' or `.0'.
;;
tnt_ditch_unlikely_tmr := token_mapping_rule &
[ +INPUT < [ +TNT.+PRBS < ^0?\.0.*$ > ] >,
  +OUTPUT < >,
  +CONTEXT < > ].

;;; DPF 2014-04-23 - TnT appears to want to assign the POS tag "CC" to unknown
;;; words ending with a period in mid-sentence, as with |fundraiser.| in
;;; |... Duck Race fundraiser.  3 p.m. at ...|
;;; So let's try treating such unknown words as nouns, for better robustness.
;;;

;;
;; drop tokens whose single PoS tag is one of the function-word tags listed
;; in the pattern; the list is split over two rules.
;;
tnt_ditch_function_1_tmr := token_mapping_rule &
[ +INPUT < [ +TNT.+TAGS < ^DT|EX|IN|MD|PDT|POS|PRP\$?|RB[RS]$ > ] >,
  +OUTPUT < >,
  +CONTEXT < > ].

tnt_ditch_function_2_tmr := token_mapping_rule &
[ +INPUT < [ +TNT.+TAGS < ^RP|TO|WDT|WP|WRB$ > ] >,
  +OUTPUT < >,
  +CONTEXT < > ].

;; DPF 2016-11-23 - See comment below for tnt_ditch_punctuation_tmr.
;;
;tnt_ditch_function_3_tmr := token_mapping_rule &
;[ +INPUT < [ +TNT.+TAGS < ^\$|#|``|''|\(|\)|,|\.|:$ > ] >,
;  +OUTPUT < >,
;  +CONTEXT < > ].

;;
;; _fix_me_
;; experimentally, also ditch PoS information on punctuation-only tokens.  we
;; appear to get noun and adjective readings for n- and m-dashes, which hardly
;; can do us any good.                                         (24-sep-08; oe)
;; DPF 2016-11-23 - At Woodley's urging, commenting out this rule, in favor of
;; robustness, where we add a generic lexical entry for punctuation characters
;; tagged with POS ".", and call on the existing rule for attaching separate
;; punctuation tokens to the preceding or following token.  Intended to fix a
;; number of instances where we were ending up with an incoherent lattice in
;; chart-mapping.  Also commented out tnt_ditch_function_3_tmr above, but not
;; ditch_punctuation.tmr in punctuation.tdl.
;;
;tnt_ditch_punctuation_tmr := token_mapping_rule &
;[ +INPUT < [ +FORM ^[[:punct:]]+$, +TNT.+TAGS *cons* ] >,
;  +OUTPUT < >,
;  +CONTEXT < > ].

;;;
;;; _fix_me_
;;; should we eventually want to include the PoS probabilities as a feature in
;;; parse selection, this kind of pruning should disappear: a high-probability
;;; FW, say, should not be elbowed out by an unlikely NN.      (31-aug-08; oe)
;;;
;;; drop an "FW" token when an "NN" token is present in the same position
;;; (the input is required to be at the position of the context token).
;;;
tnt_filter_dup_fw_tmr := token_mapping_rule &
[ +CONTEXT < [ +TNT.+TAGS < "NN" > ] >,
  +INPUT < [ +TNT.+TAGS < "FW" > ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].

;;
;; [DPF 23-apr-09] words ending in "-ing" can get tagged both as noun and as
;; verb, but since the grammar has gerund rules, drop the noun and keep the
;; verb.
;; _fix_me_
;; is there a reason to prefer the gerund over the vanilla noun?  it means a
;; little extra ambiguity when followed by a PP[of], which the generic gerund
;; optionally picks up as a complement.                       (24-may-09; oe)
;;
;; drop an "NN" token when a "VBG" token is present in the same position.
;;
tnt_filter_dup_vbg_tmr := token_mapping_rule &
[ +CONTEXT < [ +TNT.+TAGS < "VBG" > ] >,
  +INPUT < [ +TNT.+TAGS < "NN" > ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].

;; DPF 2014-06-12 - Constrain INPUT to be non-initial, since we need to keep
;; the NNP if it is the first word of the input, so that it can trigger the
;; special case rule initial_capitalized_name_nnp_tmr.
;;
;; drop a non-initial "NNP" token when an FW or NN token is present in the
;; same position.
;;
tnt_filter_dup_nnp_tmr := token_mapping_rule &
[ +CONTEXT < [ +TNT.+TAGS < ^FW|NN$ > ] >,
  +INPUT < [ +TNT.+TAGS < "NNP" >,
             +CLASS.+INITIAL - ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].

;;
;; _fix_me_
;; the old `posmapping' setting in PET contained the following comment by dan
;; (from sep-07):
;;
;;   Tried doing without the adjective, since TNT appears to mostly guess both
;;   an adjective and a noun, and our generic mass-count noun can almost always
;;   do the work of the adjective.  This would avoid large amounts of spurious
;;   ambiguity for most occurrences of these pairs.  But unfortunately TNT
;;   doesn't always guess both, so we need JJ when it's the only guess.  Maybe
;;   we can effect this with the new token-mapping machinery ...
;;
;; the following rule should have that effect.                (21-jan-09; oe)
;;
;; [DPF 24-mar-09] Unfortunately, this simple rule goes wrong sometimes.  For
;; "the tallest and most unk-word cat" the |unk-word| has to be an adjective,
;; so we can't just throw it away.  We'll try using the probabilities from the
;; tagger for a more sensitive rule.
;;
;; as written, the "JJ" token is only dropped when the co-located "NN" token
;; has a probability string whose first decimal digit is 3 through 9.
;;
tnt_filter_dup_jj_tmr := token_mapping_rule &
[ +CONTEXT < [ +TNT [ +TAGS < "NN" >,
                      +PRBS < ^0?\.[3-9].*$ > ] ] >,
  +INPUT < [ +TNT.+TAGS < "JJ" > ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].
;;
;; there is one case that cuts across the sub-division of generic entries into
;; (a) so-called named entities (triggered from string-level properties) and
;; (b) non-NE generics, triggered on the basis of PoS tags.  the NE rules for
;; names (triggered by capital letters) do not stipulate a named entity in
;; sentence-initial position.  on the other hand, lexical filtering will drop
;; non-NE generics whenever there is a competing native entry.  so, to allow
;; generic names in initial position to survive the filtering, look at the
;; combined string-level and PoS evidence and create an actual NE.  this rule
;; needs to be late in the process, so that we have PoS tags multiplied out.
;; DPF 2020-05-06 - Generalized +INPUT..+CASE from capitalized+lower to just
;; capitalized, because we want to include capitalized+mixed as in |Mao_Zedong|
;; and maybe we don't care about failing to exclude all-caps here.
;; DPF 2020-05-16 - The commented out def preserves the entry for |Liu_Wen|
;; which is dropped by `uniq' token-dropping rule unless +MAIN value is changed
;; as shown.  But we want `uniq' to discard redundant entries for e.g. |W.R.|,
;; so FIX before using makeover for educ.
;;
;; the matching input token is rewritten in place: +CLASS becomes proper_ne
;; and +TNT is emptied out (null_tnt, +MAIN preserved); all other properties
;; are carried over unchanged.
;;
initial_capitalized_name_tmr := token_mapping_rule &
[ +INPUT < [ +FORM #form,
             +CLASS alphanumeric & [ +INITIAL +,
                                     +CASE capitalized ],
             +TRAIT #trait & [ +UW + ],
             +PRED #pred,
             +CARG #carg,
             +ID #id,
             +FROM #from,
             +TO #to,
             +TNT [ +MAIN #main,
                    +TAGS < ^NNP|NN|VBZ|VBG$ > ] ] >,
  +OUTPUT < [ +FORM #form,
              +CLASS proper_ne,
              +TRAIT #trait,
              +PRED #pred,
              +CARG #carg,
              +ID #id,
              +FROM #from,
              +TO #to,
              +TNT null_tnt & [ +MAIN #main ] ] >,
  +CONTEXT < >,
  +POSITION "I1@O1" ].

;; DPF 2017-07-09 - The ACE tagger seems to often land on JJ as a default for
;; an unknown S-initial word, as in |Equitable of New York survives|.  So add
;; a variant of the rule above just for JJ, adding a new token.
;;
;; the JJ token is matched as +CONTEXT (with an empty +INPUT), so it is kept
;; and the proper_ne token in +OUTPUT is added alongside it.
;; NOTE(review): this rule still requires +CASE capitalized+lower, whereas
;; initial_capitalized_name_tmr was generalized to plain capitalized -- confirm
;; whether the difference is intended.
;;
initial_capitalized_name_jj_tmr := token_mapping_rule &
[ +INPUT < >,
  +CONTEXT < [ +FORM #form,
               +CLASS alphanumeric & [ +INITIAL +,
                                       +CASE capitalized+lower ],
               +TRAIT #trait & [ +UW + ],
               +PRED #pred,
               +CARG #carg,
               +ID #id,
               +FROM #from,
               +TO #to,
               +TNT [ +MAIN #main,
                      +TAGS < ^JJ$ > ] ] >,
  +OUTPUT < [ +FORM #form,
              +CLASS proper_ne,
              +TRAIT #trait,
              +PRED #pred,
              +CARG #carg,
              +ID #id,
              +FROM #from,
              +TO #to,
              +TNT null_tnt & [ +MAIN #main ] ] >,
  +POSITION "C1@O1" ].

;;
;; on all tokens that we expect to activate generic entries, make the +PRED
;; value reflect the orthography and PoS tag.
;;
;; the parenthesized groups in the +CARG and +TAGS patterns capture the full
;; strings, which the +PRED template then refers to as capture 1 of each
;; path (the +CARG capture is passed through lc(), presumably lower-casing).
;; the rule only applies while +PRED is still an anti_string.
;;
generic_pred_tmr := token_mapping_rule &
[ +INPUT < [ +FORM #form,
             +TRAIT #trait & [ +UW + ],
             +CLASS #class & non_ne,
             +PRED anti_string,
             +CARG #carg & ^(.+)$,
             +ID #id,
             +FROM #from,
             +TO #to,
             +TNT #tnt & [ +TAGS < ^(.*)$ > ] ] >,
  +OUTPUT < [ +FORM #form,
              +TRAIT #trait,
              +CLASS #class,
              +PRED "_${lc(I1:+CARG:1)}/${I1:+TNT.+TAGS.FIRST:1}_u_unknown_rel",
              +CARG #carg,
              +ID #id,
              +FROM #from,
              +TO #to,
              +TNT #tnt ] >,
  +CONTEXT < >,
  +POSITION "O1@I1" ].