;;; -*- Mode: tdl; Coding: utf-8; -*-

;;;
;;; Copyright (c) 2009 -- 2010 Stephan Oepen (oe@ifi.uio.no);
;;; see `LICENSE' for conditions.
;;;

tokens := *top* &
[ +LIST *list*,
  +LAST token ].

token_min := *avm*.

token := token_min &
[ +FORM string,
  +CLASS token_class,
  +TRAIT token_trait,
  +PRED predsort,
  +CARG string,
  +ID *diff-list*,
  +FROM string,
  +TO string,
  +TNT tnt,
  +STAG tnt ].

token_trait := *top*.
anti_trait := token_trait.
native_trait := token_trait.
generic_trait := token_trait.

native_token_list := *list*.
native_token_cons := native_token_list & *cons* &
[ FIRST.+TRAIT native_trait,
  REST native_token_list ].
native_token_null := native_token_list & *null*.

generic_token_list := *list*.
generic_token_cons := generic_token_list & *cons* &
[ FIRST.+TRAIT generic_trait,
  REST generic_token_list ].
generic_token_null := generic_token_list & *null*.

;;
;; _fix_me_
;; in token mapping, the original +TNT list (of tags and probabilities) can be
;; rewritten; native tokens, for example, will end up with an empty list, and
;; generic tokens `multiply out' all elements from the input list.  to preserve
;; information about the top-ranked PoS hypotheses in (all) token FSs that end
;; up as part of a derivation (recorded in [incr tsdb()], say), the rules make
;; sure to set (and then leave intact) the value of +TNT.+MAIN.  i am wondering
;; whether it would be possible to reverse the logic of what we do, i.e. leave
;; the original list intact and selectively move active values to another part
;; of the token FS, where lexical entries could look for it.  not quite sure,
;; however, how that would work for the rules that `multiply out' PoS tags and
;; create as many generic tokens as there were elements in the original list.
;;                                                             (18-nov-10; oe)
;;
tnt_main := *top* &
[ +TAG string,
  +PRB string ].

tnt := *top* &
[ +MAIN tnt_main,
  +TAGS *list*,
  +PRBS *list* ].

null_tnt := tnt &
[ +TAGS < >,
  +PRBS < > ].

;;
;; in token mapping, it is useful to have available distinct `anti'-strings.
;;
anti_string := string.
non_string := string.

;;;
;;; orthographic classes, used in token mapping and lexical filtering
;;;

token_class := *sort*.
no_class := token_class.

named_entity := token_class.
proper_ne := named_entity.
file_ne := proper_ne.
url_ne := proper_ne.
email_ne := proper_ne.
phone_ne := proper_ne.
card_or_year_ne := named_entity.
card_or_dom_ne := named_entity.
card_or_time_ne := named_entity.
card_ne := card_or_year_ne & card_or_dom_ne & card_or_time_ne.
year_ne := card_or_year_ne.
ord_or_dom_ne := named_entity.
ord_ne := ord_or_dom_ne.
frct_ne := named_entity.
plur_ne := named_entity.
dom_card_ne := card_or_dom_ne.
dom_ord_ne := ord_or_dom_ne.
date_ne := named_entity.
meas_or_time_ne := named_entity.
time_ne := card_or_dom_ne & card_or_time_ne & meas_or_time_ne.
meas_ne := meas_or_time_ne.
meas_noun_ne := named_entity.

;;
;; the following are modeled after POSIX character classes; most have obvious
;; correspondences in terms of (more elaborate) Unicode character properties.
;; essentially, we cross-classify along three dimensions: (a) the combination
;; of characters used, (b) whether or not the first character is capitalized,
;; and (c) whether or not a token appears utterance-initial.
;;
non_ne := token_class &
[ +INITIAL luk ].
non_alphanumeric := non_ne.
apostrophe := non_alphanumeric.
anti_apostrophe := non_alphanumeric.
alphanumeric := non_ne &
[ +CASE token_case ].
alphabetic := alphanumeric.
numeric := alphanumeric.
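
;;
;; purely for orientation, and not part of the type inventory proper: under
;; the classification above, the token fs for an input word like |fifteen|,
;; as delivered by a PoS tagger, might look roughly as follows; the concrete
;; values (word, tag set, character offsets, probabilities) are illustrative
;; assumptions only, and the +CASE values are defined further below.
;;
;;   [ +FORM "fifteen",
;;     +TRAIT native_trait,
;;     +CLASS alphabetic & [ +CASE non_capitalized, +INITIAL - ],
;;     +FROM "10", +TO "17",
;;     +TNT [ +MAIN [ +TAG "CD", +PRB "0.97" ],
;;            +TAGS < "CD", "NN" >,
;;            +PRBS < "0.97", "0.03" > ] ]
;;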
;;
;; in parsing partially bracketed inputs, we introduce special-purpose bracket
;; tokens, e.g. |the ¦[0 parking lot 0]¦ attendant arrived.|  these are parsed
;; in token mapping, using a custom +CLASS value.
;;
bracket := non_ne &
[ +COUNT string ].
left_bracket := bracket.
right_bracket := bracket.

;;
;; at least the fourth time that i revise this hierarchy.  `capitalized' or not
;; is a property of the first character (|1A| is not capitalized).  `mixed', on
;; the other hand, is only applicable to tokens with at least two characters.
;; both |aB| and |AbC| are mixed, but |A| or |a| are not.  finally, `lower' and
;; `upper' reflect the full token string, i.e. |Dan| is neither, |1a| is lower,
;; and |A| is upper.
;;
token_case := *sort*.
capitalized := token_case.
non_capitalized := token_case.
mixed := token_case.
non_mixed := token_case.
capitalized+mixed := capitalized & mixed.
capitalized+non_mixed := capitalized & non_mixed.
capitalized+lower := capitalized+non_mixed.
capitalized+upper := capitalized+non_mixed.
non_capitalized+mixed := non_capitalized & mixed.
non_capitalized+lower := non_capitalized & non_mixed.

chart_mapping_rule := *top* &
[ +CONTEXT *list*,
  +INPUT *list*,
  +OUTPUT *list*,
  +POSITION string ].

;;;
;;; constructing a sensible hierarchy of token mapping rules is not trivial.
;;; there is variation along many dimensions: (a) arity of input and output,
;;; (b) positioning of LHS and RHS rule elements, (c) which token properties
;;; are copied over, and others.
;;;
;;; following is an attempt to sketch some of the more frequent configurations,
;;; but so far there is hardly any use of inheritance here ...
;;;
token_mapping_rule := chart_mapping_rule.

one_one_tmt := token_mapping_rule &
[ +INPUT < [ +ID #id, +FROM #from, +TO #to ] >,
  +OUTPUT < [ +ID #id, +FROM #from, +TO #to ] >,
  +POSITION "O1@I1" ].

two_one_tmt := token_mapping_rule &
[ +INPUT < [ +ID [ LIST #front, LAST #middle ], +FROM #from ],
            [ +ID [ LIST #middle, LAST #back ], +TO #to ] >,
  +OUTPUT < [ +ID [ LIST #front, LAST #back ], +FROM #from, +TO #to ] >,
  +POSITION "I1<I2, O1@I1, O1@I2" ].

three_one_tmt := token_mapping_rule &
[ +INPUT < [ +ID [ LIST #front, LAST #middle1 ], +FROM #from ],
            [ +ID [ LIST #middle1, LAST #middle2 ] ],
            [ +ID [ LIST #middle2, LAST #back ], +TO #to ] >,
  +OUTPUT < [ +ID [ LIST #front, LAST #back ], +FROM #from, +TO #to ] >,
  +POSITION "I1<I2, I2<I3, O1@I1, O1@I3" ].

four_one_tmt := token_mapping_rule &
[ +INPUT < [ +ID [ LIST #front, LAST #middle1 ], +FROM #from ],
            [ +ID [ LIST #middle1, LAST #middle2 ] ],
            [ +ID [ LIST #middle2, LAST #middle3 ] ],
            [ +ID [ LIST #middle3, LAST #back ], +TO #to ] >,
  +OUTPUT < [ +ID [ LIST #front, LAST #back ], +FROM #from, +TO #to ] >,
  +POSITION "I1<I2, I2<I3, I3<I4, O1@I1, O1@I4" ].

one_two_tmt := token_mapping_rule &
[ +INPUT < [ +ID #id, +FROM #from, +TO #to ] >,
  +OUTPUT < [ +ID #id, +FROM #from, +TO #to ],
            [ +ID #id, +FROM #from, +TO #to ] >,
  +POSITION "O1<O2, O1@I1, O2@I1" ].

one_three_tmt := token_mapping_rule &
[ +INPUT < [ +ID #id, +FROM #from, +TO #to ] >,
  +OUTPUT < [ +ID #id, +FROM #from, +TO #to ],
            [ +ID #id, +FROM #from, +TO #to ],
            [ +ID #id, +FROM #from, +TO #to ] >,
  +POSITION "O1<O2, O2<O3, O1@I1, O2@I1, O3@I1" ].

one_one_form_tmt := one_one_tmt &
[ +INPUT < [ +TRAIT #trait, +CLASS #class, +PRED #pred, +CARG #carg,
             +TNT #tnt, +STAG #stag ] >,
  +OUTPUT < [ +TRAIT #trait, +CLASS #class, +PRED #pred, +CARG #carg,
              +TNT #tnt, +STAG #stag ] > ].
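
;;
;; for orientation: in the +POSITION values above, `C', `I', and `O' followed
;; by a number pick out the n-th +CONTEXT, +INPUT, and +OUTPUT element,
;; respectively; roughly, `<' constrains their relative order in the chart,
;; and `@' anchors one element to the chart position of another (an output
;; that covers several inputs is anchored to each of them).  a grammar rule
;; instance built on these types might look roughly as follows; the rule name
;; and +FORM values are hypothetical, purely for illustration.
;;
;;   ellipsis_tmr := one_one_form_tmt &
;;   [ +INPUT < [ +FORM "..." ] >,
;;     +OUTPUT < [ +FORM "…" ] > ].
;;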
two_one_initial_form_tmt := two_one_tmt &
[ +INPUT < [ +TRAIT #trait, +CLASS #class, +PRED #pred, +CARG #carg,
             +TNT #tnt, +STAG #stag ],
           [ ] >,
  +OUTPUT < [ +TRAIT #trait, +CLASS #class, +PRED #pred, +CARG #carg,
              +TNT #tnt, +STAG #stag ] > ].

two_one_final_form_tmt := two_one_tmt &
[ +INPUT < [ ],
           [ +TRAIT #trait, +CLASS #class, +PRED #pred, +CARG #carg,
             +TNT #tnt, +STAG #stag ] >,
  +OUTPUT < [ +TRAIT #trait, +CLASS #class, +PRED #pred, +CARG #carg,
              +TNT #tnt, +STAG #stag ] > ].

three_one_initial_form_tmt := three_one_tmt &
[ +INPUT < [ +TRAIT #trait, +CLASS #class, +PRED #pred, +CARG #carg,
             +TNT #tnt, +STAG #stag ],
           [ ], [ ] >,
  +OUTPUT < [ +TRAIT #trait, +CLASS #class, +PRED #pred, +CARG #carg,
              +TNT #tnt, +STAG #stag ] > ].

three_one_center_form_tmt := three_one_tmt &
[ +INPUT < [ ],
           [ +TRAIT #trait, +CLASS #class, +PRED #pred, +CARG #carg,
             +TNT #tnt, +STAG #stag ],
           [ ] >,
  +OUTPUT < [ +TRAIT #trait, +CLASS #class, +PRED #pred, +CARG #carg,
              +TNT #tnt, +STAG #stag ] > ].

three_one_final_form_tmt := three_one_tmt &
[ +INPUT < [ ], [ ],
           [ +TRAIT #trait, +CLASS #class, +PRED #pred, +CARG #carg,
             +TNT #tnt, +STAG #stag ] >,
  +OUTPUT < [ +TRAIT #trait, +CLASS #class, +PRED #pred, +CARG #carg,
              +TNT #tnt, +STAG #stag ] > ].

one_two_all_form_tmt := one_two_tmt &
[ +INPUT < [ +TRAIT #trait, +CLASS #class, +PRED #pred, +CARG #carg,
             +TNT #tnt, +STAG #stag ] >,
  +OUTPUT < [ +TRAIT #trait, +CLASS #class, +PRED #pred, +CARG #carg,
              +TNT #tnt, +STAG #stag ],
            [ +TRAIT #trait, +CLASS #class, +PRED #pred, +CARG #carg,
              +TNT #tnt, +STAG #stag ] > ].

;;;
;;; a few relatively specialized token mapping rule types, for configurations
;;; that are instantiated with non-trivial frequency.
;;;

token_class_null_tnt_tmt := one_one_tmt &
[ +INPUT < [ +FORM #form, +TRAIT #trait, +CLASS no_class, +PRED #pred,
             +CARG #carg, +TNT.+MAIN #main ] >,
  +OUTPUT < [ +FORM #form, +TRAIT #trait, +CLASS non_ne, +PRED #pred,
              +CARG #carg, +TNT null_tnt & [ +MAIN #main ] ] > ].

token_class_tmt := one_one_tmt &
[ +INPUT < [ +FORM #form, +TRAIT #trait, +CLASS no_class, +PRED #pred,
             +CARG #carg, +TNT #tnt, +STAG #stag ] >,
  +OUTPUT < [ +FORM #form, +TRAIT #trait, +CLASS non_ne, +PRED #pred,
              +CARG #carg, +TNT #tnt, +STAG #stag ] > ].

token_case_tmt := token_mapping_rule &
[ +INPUT < [ +FORM #form, +TRAIT #trait, +CLASS #class, +PRED #pred,
             +TNT #tnt, +STAG #stag ] >,
  +OUTPUT < [ +FORM #form, +TRAIT #trait, +CLASS #class, +PRED #pred,
              +TNT #tnt, +STAG #stag ] > ].

one_one_token_case_tmt := one_one_tmt & token_case_tmt.

;;
;; the following rules are unusual, as they combine +IDs from both the context
;; and input elements; the contexts (punctuation marks) need to remain in the
;; chart until (re-)attached to all adjacent tokens, but eventually they will
;; be purged from the chart.
;;
prefix_punctuation_tmt := token_mapping_rule &
[ +CONTEXT < [ +ID [ LIST #front, LAST #middle ], +FROM #from ] >,
  +INPUT < [ +CLASS #class, +TRAIT #trait, +PRED #pred, +CARG #carg,
             +ID [ LIST #middle, LAST #back ], +TO #to,
             +TNT #tnt, +STAG #stag ] >,
  +OUTPUT < [ +CLASS #class, +TRAIT #trait, +PRED #pred, +CARG #carg,
              +ID [ LIST #front, LAST #back ], +FROM #from, +TO #to,
              +TNT #tnt, +STAG #stag ] >,
  +POSITION "C1<I1, O1@C1, O1@I1" ].

prefix_punctuation_context_tmt := token_mapping_rule &
[ +CONTEXT < [ +ID [ LIST #front, LAST #middle ], +FROM #from ],
             [ +CLASS #class, +TRAIT #trait, +PRED #pred, +CARG #carg,
               +ID [ LIST #middle, LAST #back ], +TO #to,
               +TNT #tnt, +STAG #stag ] >,
  +OUTPUT < [ +CLASS #class, +TRAIT #trait, +PRED #pred, +CARG #carg,
              +ID [ LIST #front, LAST #back ], +FROM #from, +TO #to,
              +TNT #tnt, +STAG #stag ] >,
  +POSITION "C1<C2, O1@C1, O1@C2" ].

suffix_punctuation_tmt := token_mapping_rule &
[ +INPUT < [ +CLASS #class, +TRAIT #trait, +PRED #pred, +CARG #carg,
             +ID [ LIST #front, LAST #middle ], +FROM #from,
             +TNT #tnt, +STAG #stag ] >,
  +CONTEXT < [ +ID [ LIST #middle, LAST #back ], +TO #to ] >,
  +OUTPUT < [ +CLASS #class, +TRAIT #trait, +PRED #pred, +CARG #carg,
              +ID [ LIST #front, LAST #back ], +FROM #from, +TO #to,
              +TNT #tnt, +STAG #stag ] >,
  +POSITION "I1<C1, O1@I1, O1@C1" ].

suffix_punctuation_context_tmt := token_mapping_rule &
[ +CONTEXT < [ +CLASS #class, +TRAIT #trait, +PRED #pred, +CARG #carg,
               +ID [ LIST #front, LAST #middle ], +FROM #from,
               +TNT #tnt, +STAG #stag ],
             [ +ID [ LIST #middle, LAST #back ], +TO #to ] >,
  +OUTPUT < [ +CLASS #class, +TRAIT #trait, +PRED #pred, +CARG #carg,
              +ID [ LIST #front, LAST #back ], +FROM #from, +TO #to,
              +TNT #tnt, +STAG #stag ] >,
  +POSITION "C1<C2, O1@C1, O1@C2" ].

ne_tmt := one_one_tmt &
[ +INPUT < [ +FORM #form, +CLASS non_ne, +PRED #pred, +CARG #carg,
             +TNT.+MAIN #main ] >,
  +OUTPUT < [ +FORM #form, +TRAIT generic_trait, +CLASS named_entity,
              +PRED #pred, +CARG #carg,
              +TNT null_tnt & [ +MAIN #main ] ] > ].

add_ne_tmt := token_mapping_rule &
[ +CONTEXT < [ +FORM #form, +CLASS non_ne, +PRED #pred, +CARG #carg,
               +ID #id, +FROM #from, +TO #to, +TNT.+MAIN #main ] >,
  +INPUT < >,
  +OUTPUT < [ +FORM #form, +TRAIT generic_trait, +CLASS named_entity,
              +PRED #pred, +CARG #carg, +ID #id, +FROM #from, +TO #to,
              +TNT null_tnt & [ +MAIN #main ] ] >,
  +POSITION "O1@C1" ].
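
;;
;; a hypothetical instance, purely for illustration (the rule name and +FORM
;; value are made up; actual grammar rules typically match and rebuild +FORM
;; by means of regular expressions, which is not shown here): a rule that
;; attaches an opening parenthesis to the token it precedes could be built on
;; prefix_punctuation_tmt as follows.
;;
;;   open_paren_tmr := prefix_punctuation_tmt &
;;   [ +CONTEXT < [ +FORM "(" ] > ].
;;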
;;;
;;; lexical filtering rules; not much use of the type hierarchy yet
;;;

lexical_filtering_rule := chart_mapping_rule.
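
;;
;; purely for illustration, a hypothetical subtype (the name and constraints
;; are made up): a filtering rule that discards an input lexical item whenever
;; some context item occupies the same chart cells; actual rules would add
;; grammar-specific constraints on both elements.
;;
;;   duplicate_entry_lfr := lexical_filtering_rule &
;;   [ +CONTEXT < [ ] >,
;;     +INPUT < [ ] >,
;;     +OUTPUT < >,
;;     +POSITION "I1@C1" ].
;;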