;;; -*- Mode: tdl; Coding: utf-8; -*-
;;;
;;; Copyright (c) 2009 -- 2013 Stephan Oepen (oe@ifi.uio.no);
;;; Copyright (c) 2009 -- 2013 Dan Flickinger (danf@stanford.edu);
;;; see ‘LICENSE’ for conditions.
;;;

tokens := *top* &
[ +LIST *list*,
  +LAST token ].

token_min := *avm*.

token := token_min &
[ +FORM string,
  +CLASS token_class,
  +TRAIT token_trait,
  +PRED predsort,
  +CARG string,
  +ID *diff-list*,
  +FROM string,
  +TO string,
  +TNT tnt,
  +TICK bool ].

token_trait := *top* &
[ +UW bool,
  +IT italics,
  +LB bracket_list,
  +RB bracket_list,
  +LA luk,
  +RA luk,
  +HD token_head ].

anti_trait := token_trait.

native_token_list := *list*.
native_token_cons := native_token_list & *cons* &
[ FIRST.+TRAIT.+UW -,
  REST native_token_list ].
native_token_null := native_token_list & *null*.

native_token_la_list := *list*.
native_token_la_cons := native_token_la_list & *cons* &
[ FIRST.+TRAIT [ +UW -, +LA - ],
  REST native_token_la_list ].
native_token_la_null := native_token_la_list & *null*.

generic_token_list := *list*.
generic_token_cons := generic_token_list & *cons* &
[ FIRST.+TRAIT.+UW +,
  REST generic_token_list ].
generic_token_null := generic_token_list & *null*.

italics := *sort*.
left_italics := italics.
right_italics := italics.
both_italics := italics.

bracket_list := *list*.
bracket_cons := bracket_list & *cons* &
[ FIRST ctype,
  REST bracket_list ].
bracket_null := bracket_list & *null*.
anti_bracket_list := bracket_list.

token_head := *sort* &
[ +LL ctype,
  +TG string ].

;;
;; _fix_me_
;; in token mapping, the original +TNT list (of tags and probabilities) can be
;; rewritten; native tokens, for example, will end up with an empty list, and
;; generic tokens ‘multiply out’ all elements from the input list.  to preserve
;; information about the top-ranked PoS hypotheses in (all) token FSs that end
;; up as part of a derivation (recorded in [incr tsdb()], say), the rules make
;; sure to set (and then leave intact) the value of +TNT.+MAIN.  i am wondering
;; whether it would be possible to reverse the logic of what we do, i.e. leave
;; the original list intact and selectively move active values to another part
;; of the token FS, where lexical entries could look for it.  not quite sure,
;; however, how that would work for the rules that ‘multiply out’ PoS tags and
;; create as many generic tokens as there were elements in the original list.
;;                                                             (18-nov-10; oe)

tnt_main := *top* &
[ +TAG string,
  +PRB string ].

tnt := *top* &
[ +MAIN tnt_main,
  +TAGS *list*,
  +PRBS *list* ].

null_tnt := tnt &
[ +TAGS <>,
  +PRBS <> ].

;;
;; in token mapping, it is useful to have available distinct ‘anti’-strings.
;;
anti_string := string.
non_string := string.

;;;
;;; orthographic classes, used in token mapping and lexical filtering
;;;

token_class := *sort*.
no_class := token_class.

named_entity := token_class.
card_or_time_or_proper_ne := named_entity.
;; For decimal numbers between -1.0 and 1.0, which can show either singular
;; or plural agreement on their measure nouns.
card_or_decimal_ne := named_entity.
card_or_proper_ne := card_or_time_or_proper_ne & card_or_decimal_ne.
proper_ne := card_or_proper_ne.
file_ne := proper_ne.
url_ne := proper_ne.
email_ne := proper_ne.
phone_ne := proper_ne.
card_or_year_ne := named_entity.
card_or_dom_ne := named_entity.
card_or_time_ne := card_or_time_or_proper_ne.
card_or_meas_ne := named_entity.
card_ne := card_or_year_ne & card_or_dom_ne & card_or_time_ne
           & card_or_meas_ne & card_or_proper_ne.
year_ne := card_or_year_ne.
ord_or_dom_ne := named_entity.
ord_ne := ord_or_dom_ne.
frct_ne := named_entity.
decimal_ne := card_or_decimal_ne.
plur_ne := named_entity.
dom_card_ne := card_or_dom_ne.
dom_ord_ne := ord_or_dom_ne.
date_ne := named_entity.
meas_or_time_ne := named_entity.
time_ne := card_or_dom_ne & card_or_time_ne & meas_or_time_ne.
meas_ne := meas_or_time_ne & card_or_meas_ne.
meas_noun_ne := named_entity.

;;
;; the following are modeled after POSIX character classes; most have obvious
;; correspondences in terms of (more elaborate) Unicode character properties.
;; essentially, we cross-classify along three dimensions: (a) the combination
;; of characters used, (b) whether or not the first character is capitalized,
;; and (c) whether or not a token appears utterance-initial.
;;
non_ne := token_class &
[ +INITIAL luk ].

non_alphanumeric := non_ne.
apostrophe := non_alphanumeric.
anti_apostrophe := non_alphanumeric.

alphanumeric := non_ne &
[ +CASE token_case ].
alphabetic := alphanumeric.
numeric := alphanumeric.

;;
;; at least the fourth time that i revise this hierarchy.  ‘capitalized’ or not
;; is a property of the first character (|1A| is not capitalized).  ‘mixed’, on
;; the other hand, is only applicable to tokens with at least two characters.
;; both |aB| and |AbC| are mixed, but |A| or |a| are not.  finally, ‘lower’ and
;; ‘upper’ reflect the full token string, i.e. |Dan| is neither, |1a| is lower,
;; and |A| is upper.
;;
token_case := *sort*.
capitalized := token_case.
non_capitalized := token_case.
mixed := token_case.
non_mixed := token_case.
capitalized+mixed := capitalized & mixed.
capitalized+non_mixed := capitalized & non_mixed.
capitalized+lower := capitalized+non_mixed.
capitalized+upper := capitalized+non_mixed.
non_capitalized+mixed := non_capitalized & mixed.
;;
;; we are making a simplifying assumption here, not distinguishing one-token
;; non-capitalized (which could be called ‘non_capitalized+non_mixed’) from
;; ‘non_capitalized+lower’.  so far, we just never care about the distinction.
;;
non_capitalized+lower := non_capitalized & non_mixed.

chart_mapping_rule := *top* &
[ +CONTEXT *list*,
  +INPUT *list*,
  +OUTPUT *list*,
  +POSITION string ].

;;;
;;; constructing a sensible hierarchy of token mapping rules is not trivial.
;;; there is variation along many dimensions: (a) arity of input and output,
;;; (b) positioning of LHS and RHS rule elements, (c) which token properties
;;; are copied over, and others.
;;;
;;; following is an attempt to sketch some of the more frequent configurations,
;;; but so far there is hardly any use of inheritance here ...
;;;
token_mapping_rule := chart_mapping_rule.

basic_one_one_tmt := token_mapping_rule &
[ +INPUT.FIRST [ +ID #id, +FROM #from, +TO #to ],
  +OUTPUT.FIRST [ +ID #id, +FROM #from, +TO #to ] ].

basic_two_two_tmt := basic_one_one_tmt &
[ +INPUT.REST.FIRST [ +ID #id, +FROM #from, +TO #to ],
  +OUTPUT.REST.FIRST [ +ID #id, +FROM #from, +TO #to ] ].

basic_three_three_tmt := basic_two_two_tmt &
[ +INPUT.REST.REST.FIRST [ +ID #id, +FROM #from, +TO #to ],
  +OUTPUT.REST.REST.FIRST [ +ID #id, +FROM #from, +TO #to ] ].

basic_four_four_tmt := basic_three_three_tmt &
[ +INPUT.REST.REST.REST.FIRST [ +ID #id, +FROM #from, +TO #to ],
  +OUTPUT.REST.REST.REST.FIRST [ +ID #id, +FROM #from, +TO #to ] ].

one_one_tmt := basic_one_one_tmt &
[ +INPUT < [] >,
  +OUTPUT < [] >,
  +POSITION "O1@I1" ].

two_two_tmt := basic_two_two_tmt &
[ +INPUT < [], [] >,
  +OUTPUT < [], [] >,
  +POSITION "I1<I2, O1@I1, O2@I2" ].
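
;;
;; to illustrate how these skeletons are meant to be instantiated (actual rule
;; instances live with the token mapping rules proper, not in this type
;; inventory, and the rule name below is made up for the purpose), a sketch of
;; a one_one_tmt rule only needs to state what it changes: +ID, +FROM, and +TO
;; are identified by basic_one_one_tmt, and +POSITION "O1@I1" anchors the
;; output token to the chart position of the input token.
;;
;;   hypothetical_class_tmt := one_one_tmt &
;;   [ +INPUT < [ +FORM #form, +CLASS no_class, +TRAIT #trait,
;;                +PRED #pred, +CARG #carg, +TNT #tnt ] >,
;;     +OUTPUT < [ +FORM #form, +CLASS alphabetic, +TRAIT #trait,
;;                 +PRED #pred, +CARG #carg, +TNT #tnt ] >,
;;     +CONTEXT <> ].
;;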
basic_one_one_ner_tmt := basic_one_one_tmt &
[ +INPUT < [ +FORM #form, +CLASS #class,
             +PRED #pred, +CARG #carg, +TNT #tnt ] . *list* >,
  +OUTPUT < [ +FORM #form, +CLASS #class, +TRAIT.+UW +,
              +PRED #pred, +CARG #carg, +TNT #tnt ] . *list* >,
  +CONTEXT < > ].

basic_two_two_ner_tmt := basic_one_one_ner_tmt & basic_two_two_tmt &
[ +INPUT < [], [ +FORM #form, +CLASS #class,
                 +PRED #pred, +CARG #carg, +TNT #tnt ] . *list* >,
  +OUTPUT < [], [ +FORM #form, +CLASS #class, +TRAIT.+UW +,
                  +PRED #pred, +CARG #carg, +TNT #tnt ] . *list* > ].

basic_three_three_ner_tmt := basic_two_two_ner_tmt & basic_three_three_tmt &
[ +INPUT < [], [], [ +FORM #form, +CLASS #class,
                     +PRED #pred, +CARG #carg, +TNT #tnt ] . *list* >,
  +OUTPUT < [], [], [ +FORM #form, +CLASS #class, +TRAIT.+UW +,
                      +PRED #pred, +CARG #carg, +TNT #tnt ] . *list* > ].

basic_four_four_ner_tmt := basic_three_three_ner_tmt & basic_four_four_tmt &
[ +INPUT < [], [], [], [ +FORM #form, +CLASS #class,
                         +PRED #pred, +CARG #carg, +TNT #tnt ] . *list* >,
  +OUTPUT < [], [], [], [ +FORM #form, +CLASS #class, +TRAIT.+UW +,
                          +PRED #pred, +CARG #carg, +TNT #tnt ] . *list* > ].

;;;
;;; _fix_me_
;;; need to revisit these, once i decide on which order brackets go onto the
;;; +LB and +RB lists.  in principle, an NE pattern may be surrounded by GML
;;; brackets, which either way we must not lose.                (2-nov-12; oe)
;;;
two_two_ner_tmt := basic_two_two_ner_tmt &
[ +INPUT < [ +TRAIT.+LB bracket_null ], [ +TRAIT.+RB bracket_null ] >,
  +OUTPUT < [ +TRAIT.+RB <> ], [ +TRAIT.+LB <> ] >,
  +POSITION "I1<I2, O1@I1, O2@I2" ].

three_three_ner_tmt := basic_three_three_ner_tmt &
[ +INPUT < [ +TRAIT.+LB bracket_null ], [], [ +TRAIT.+RB bracket_null ] >,
  +OUTPUT < [ +TRAIT.+RB <> ], [], [ +TRAIT.+LB <> ] >,
  +POSITION "I1<I2<I3, O1@I1, O2@I2, O3@I3" ].

four_four_ner_tmt := basic_four_four_ner_tmt &
[ +INPUT < [ +TRAIT.+LB bracket_null ], [], [], [ +TRAIT.+RB bracket_null ] >,
  +OUTPUT < [ +TRAIT.+RB <> ], [], [], [ +TRAIT.+LB <> ] >,
  +POSITION "I1<I2<I3<I4, O1@I1, O2@I2, O3@I3, O4@I4" ].

;;;
;;; in principle, [ +CONTEXT <> ] and [ +CONTEXT *list* ] should amount to the
;;; same thing, viz. no relevant reference to context tokens.  ACE developers
;;; asked the ERG developers to always spell out the empty list, but i would
;;; prefer not to build that expectation into the machinery.  a fully
;;; underspecified list could be construed to allow matching arbitrary tokens
;;; as ‘context’, but since such matches could never be referenced in +OUTPUT
;;; they could not possibly have any effect on the result of rule application.
;;; hence, an engine would be justified in deciding to not even attempt any
;;; matching against an underspecified +CONTEXT (or +INPUT, for that matter),
;;; which is the strategy adopted in PET.  approach ACE developers about this,
;;; one day.                                                   (31-oct-12; oe)
;;;
two_one_tmt := token_mapping_rule &
[ +INPUT < [ +ID [ LIST #front, LAST #middle ], +FROM #from ],
           [ +ID [ LIST #middle, LAST #back ], +TO #to ] >,
  +OUTPUT < [ +ID [ LIST #front, LAST #back ], +FROM #from, +TO #to ] >,
  +CONTEXT <>,
  +POSITION "I1<I2" ].

three_one_tmt := token_mapping_rule &
[ +INPUT < [ +ID [ LIST #front, LAST #middle1 ], +FROM #from ],
           [ +ID [ LIST #middle1, LAST #middle2 ] ],
           [ +ID [ LIST #middle2, LAST #back ], +TO #to ] >,
  +OUTPUT < [ +ID [ LIST #front, LAST #back ], +FROM #from, +TO #to ] >,
  +POSITION "I1<I2<I3" ].

four_one_tmt := token_mapping_rule &
[ +INPUT < [ +ID [ LIST #front, LAST #middle1 ], +FROM #from ],
           [ +ID [ LIST #middle1, LAST #middle2 ] ],
           [ +ID [ LIST #middle2, LAST #middle3 ] ],
           [ +ID [ LIST #middle3, LAST #back ], +TO #to ] >,
  +OUTPUT < [ +ID [ LIST #front, LAST #back ], +FROM #from, +TO #to ] >,
  +POSITION "I1<I2<I3<I4" ].

one_two_tmt := token_mapping_rule &
[ +INPUT < [ +ID #id, +FROM #from, +TO #to ] >,
  +OUTPUT < [ +ID #id, +FROM #from, +TO #to ],
            [ +ID #id, +FROM #from, +TO #to ] >,
  +CONTEXT <>,
  +POSITION "O1<O2" ].

one_three_tmt := token_mapping_rule &
[ +INPUT < [ +ID #id, +FROM #from, +TO #to ] >,
  +OUTPUT < [ +ID #id, +FROM #from, +TO #to ],
            [ +ID #id, +FROM #from, +TO #to ],
            [ +ID #id, +FROM #from, +TO #to ] >,
  +CONTEXT <>,
  +POSITION "O1<O2<O3" ].

one_one_form_tmt := one_one_tmt &
[ +INPUT < [ +CLASS #class, +TRAIT #trait,
             +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +OUTPUT < [ +CLASS #class, +TRAIT #trait,
              +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +CONTEXT <> ].

two_two_trait_tmt := two_two_tmt &
[ +INPUT < [ +FORM #form1, +CLASS #class1,
             +PRED #pred1, +CARG #carg1, +TNT #tnt1 ],
           [ +FORM #form2, +CLASS #class2,
             +PRED #pred2, +CARG #carg2, +TNT #tnt2 ] >,
  +OUTPUT < [ +FORM #form1, +CLASS #class1,
              +PRED #pred1, +CARG #carg1, +TNT #tnt1 ],
            [ +FORM #form2, +CLASS #class2,
              +PRED #pred2, +CARG #carg2, +TNT #tnt2 ] >,
  +CONTEXT <> ].

two_one_initial_form_trait_tmt := two_one_tmt &
[ +INPUT < [ +CLASS #class, +PRED #pred, +CARG #carg, +TNT #tnt ], [] >,
  +OUTPUT < [ +CLASS #class, +PRED #pred, +CARG #carg, +TNT #tnt ] > ].
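
;;
;; to make the n-to-one geometry concrete (a sketch only: the rule name and
;; the +FORM values below are invented, and actual instances are defined with
;; the token mapping rules proper), a sub-type of two_one_initial_form_trait_tmt
;; could concatenate two adjacent tokens into one.  the diff-list encoding of
;; +ID in two_one_tmt is what makes this work: the LIST of the first input is
;; chained to the LIST of the second via #middle, so the single output token
;; carries the identifiers of both inputs, together with the +FROM of the
;; first and the +TO of the second token.
;;
;;   hypothetical_contraction_tmt := two_one_initial_form_trait_tmt &
;;   [ +INPUT < [ +FORM "does" ], [ +FORM "n't" ] >,
;;     +OUTPUT < [ +FORM "doesn't" ] > ].
;;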
two_one_initial_form_tmt := two_one_initial_form_trait_tmt &
[ +INPUT < [ +TRAIT #trait ], [] >,
  +OUTPUT < [ +TRAIT #trait ] > ].

two_one_final_form_trait_tmt := two_one_tmt &
[ +INPUT < [], [ +CLASS #class, +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +OUTPUT < [ +CLASS #class, +PRED #pred, +CARG #carg, +TNT #tnt ] > ].

three_one_initial_form_tmt := three_one_tmt &
[ +INPUT < [ +CLASS #class, +TRAIT #trait,
             +PRED #pred, +CARG #carg, +TNT #tnt ], [], [] >,
  +OUTPUT < [ +CLASS #class, +TRAIT #trait,
              +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +CONTEXT <> ].

three_one_center_form_tmt := three_one_tmt &
[ +INPUT < [], [ +CLASS #class, +TRAIT #trait,
                 +PRED #pred, +CARG #carg, +TNT #tnt ], [] >,
  +OUTPUT < [ +CLASS #class, +TRAIT #trait,
              +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +CONTEXT <> ].

three_one_final_form_tmt := three_one_tmt &
[ +INPUT < [], [], [ +CLASS #class, +TRAIT #trait,
                     +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +OUTPUT < [ +CLASS #class, +TRAIT #trait,
              +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +CONTEXT <> ].

one_two_all_form_tmt := one_two_tmt &
[ +INPUT < [ +CLASS #class, +TRAIT #trait,
             +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +OUTPUT < [ +CLASS #class, +TRAIT #trait,
              +PRED #pred, +CARG #carg, +TNT #tnt ],
            [ +CLASS #class, +TRAIT #trait,
              +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +CONTEXT <> ].

;;;
;;; a few relatively specialized token mapping rule types, for configurations
;;; that are instantiated with non-trivial frequency.
;;;
token_class_null_tnt_tmt := one_one_tmt &
[ +INPUT < [ +FORM #form, +CLASS no_class, +TRAIT #trait,
             +PRED #pred, +CARG #carg, +TNT.+MAIN #main ] >,
  +OUTPUT < [ +FORM #form, +CLASS non_ne, +TRAIT #trait,
              +PRED #pred, +CARG #carg,
              +TNT null_tnt & [ +MAIN #main ] ] >,
  +CONTEXT <> ].

token_class_tmt := one_one_tmt &
[ +INPUT < [ +FORM #form, +CLASS no_class, +TRAIT #trait,
             +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +OUTPUT < [ +FORM #form, +CLASS non_ne, +TRAIT #trait,
              +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +CONTEXT <> ].

token_case_tmt := token_mapping_rule &
[ +INPUT < [ +FORM #form, +CLASS #class, +TRAIT #trait,
             +PRED #pred, +TNT #tnt ] >,
  +OUTPUT < [ +FORM #form, +CLASS #class, +TRAIT #trait,
              +PRED #pred, +TNT #tnt ] > ].

one_one_token_case_tmt := one_one_tmt & token_case_tmt &
[ +CONTEXT <> ].

tick_reset_tmt := one_one_tmt &
[ +INPUT < [ +FORM #form, +CLASS #class, +TRAIT #trait,
             +PRED #pred, +CARG #carg, +TNT #tnt, +TICK + ] >,
  +OUTPUT < [ +FORM #form, +CLASS #class, +TRAIT #trait,
              +PRED #pred, +CARG #carg, +TNT #tnt, +TICK bool ] >,
  +CONTEXT <> ].
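
;;
;; as a sketch of how these specialized types are put to use (the rule name
;; and the output +CLASS below are chosen for illustration only; actual
;; instances are defined with the token mapping rules proper), an instance of
;; token_class_null_tnt_tmt only needs to add its own constraints, say a more
;; specific output token class: the type itself already empties +TAGS and
;; +PRBS via null_tnt, while +MAIN retains the top-ranked PoS hypothesis, as
;; discussed near the top of this file.
;;
;;   hypothetical_alphabetic_tmt := token_class_null_tnt_tmt &
;;   [ +OUTPUT < [ +CLASS alphabetic ] > ].
;;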
;;
;; the following rules are unusual, as they combine +IDs from both the context
;; and input elements; the contexts (punctuation marks) need to remain in the
;; chart until (re-)attached to all adjacent tokens, but eventually they will
;; be purged from the chart.
;;
prefix_punctuation_tmt := token_mapping_rule &
[ +CONTEXT < [ +TRAIT.+LA #la,
               +ID [ LIST #front, LAST #middle ], +FROM #from ] >,
  +INPUT < [ +CLASS #class,
             +TRAIT [ +UW #uw, +LB #lb, +RB #rb, +RA #ra, +HD #hd ],
             +PRED #pred, +CARG #carg,
             +ID [ LIST #middle, LAST #back ], +TO #to, +TNT #tnt ] >,
  +OUTPUT < [ +CLASS #class,
              +TRAIT [ +UW #uw, +LB #lb, +RB #rb,
                       +LA #la, +RA #ra, +HD #hd ],
              +PRED #pred, +CARG #carg,
              +ID [ LIST #front, LAST #back ],
              +FROM #from, +TO #to, +TNT #tnt ] >,
  +POSITION "C1<I1" ].

suffix_punctuation_tmt := token_mapping_rule &
[ +INPUT < [ +CLASS #class,
             +TRAIT [ +UW #uw, +LB #lb, +RB #rb, +LA #la, +HD #hd ],
             +PRED #pred, +CARG #carg,
             +ID [ LIST #front, LAST #middle ], +FROM #from, +TNT #tnt ] >,
  +CONTEXT < [ +TRAIT.+RA #ra,
               +ID [ LIST #middle, LAST #back ], +TO #to ] >,
  +OUTPUT < [ +CLASS #class,
              +TRAIT [ +UW #uw, +LB #lb, +RB #rb,
                       +LA #la, +RA #ra, +HD #hd ],
              +PRED #pred, +CARG #carg,
              +ID [ LIST #front, LAST #back ],
              +FROM #from, +TO #to, +TNT #tnt ] >,
  +POSITION "I1<C1" ].

ne_tmt := one_one_tmt &
[ +INPUT < [ +FORM #form, +CLASS non_ne,
             +TRAIT [ +LB #lb, +RB #rb, +LA #la, +RA #ra ],
             +PRED #pred, +CARG #carg, +TNT.+MAIN #main ] >,
  +OUTPUT < [ +FORM #form, +CLASS named_entity,
              +TRAIT [ +UW +, +LB #lb, +RB #rb, +LA #la, +RA #ra ],
              +PRED #pred, +CARG #carg,
              +TNT null_tnt & [ +MAIN #main ] ] >,
  +CONTEXT <> ].

add_ne_tmt := token_mapping_rule &
[ +CONTEXT < [ +FORM #form, +CLASS non_ne,
               +TRAIT [ +LB #lb, +RB #rb, +LA #la, +RA #ra ],
               +PRED #pred, +CARG #carg,
               +ID #id, +FROM #from, +TO #to, +TNT.+MAIN #main ] >,
  +INPUT <>,
  +OUTPUT < [ +FORM #form, +CLASS named_entity,
              +TRAIT [ +UW +, +LB #lb, +RB #rb, +LA #la, +RA #ra ],
              +PRED #pred, +CARG #carg,
              +ID #id, +FROM #from, +TO #to,
              +TNT null_tnt & [ +MAIN #main ] ] >,
  +POSITION "O1@C1" ].

;;;
;;; Type used initially for moving a bracket leftward, but maybe more uses
;;; will be found.  Essentially makes minor changes to two input tokens,
;;; and adds two near copies of the context token.  The two-and-two are
;;; needed for the left-bracket rule because we propose both a generic and
;;; a native token.
;;;
one_two_four_tmt := token_mapping_rule &
[ +CONTEXT < [ +FORM #form2, +ID #id2, +FROM #from2, +TO #to2,
               +PRED #pred2, +CARG #carg2, +TNT #tnt2 ] >,
  +INPUT < [ +FORM #form1, +CLASS #class1a, +ID #id, +FROM #from, +TO #to,
             +PRED #pred1, +CARG #carg1, +TNT #tnt1a ],
           [ +FORM #form1, +CLASS #class1b, +ID #id, +FROM #from, +TO #to,
             +PRED #pred1, +CARG #carg1, +TNT #tnt1b ] >,
  +OUTPUT < [ +FORM #form1, +CLASS #class1a, +ID #id, +FROM #from, +TO #to,
              +PRED #pred1, +CARG #carg1, +TNT #tnt1a ],
            [ +FORM #form1, +CLASS #class1b, +ID #id, +FROM #from, +TO #to,
              +PRED #pred1, +CARG #carg1, +TNT #tnt1b ],
            [ +FORM #form2, +CLASS #class1a, +ID #id2, +FROM #from2, +TO #to2,
              +PRED #pred2, +CARG #carg2, +TNT #tnt2 ],
            [ +FORM #form2, +CLASS #class1b, +ID #id2, +FROM #from2, +TO #to2,
              +PRED #pred2, +CARG #carg2, +TNT #tnt2 ] >,
  +POSITION "I1