;;; -*- Mode: tdl; Coding: utf-8; -*-

word-or-lexrule :+
[ STEM [ FORM #surf, FROM #from, TO #to ],
  SYNSEM [ POLTONE #pol,
           LKEYS.KEYREL [ CFROM #from, CTO #to ] ],
  SUPRA [ TM #tm, LM #lm,
          --TONES #lt, --LENGTHS #ll,
          TONES <! < > !>, LENGTHS <! !> ],
  TOKENS tokens &
         [ +LIST < [ +FORM #surf, +FROM #from,
                     +TM #tm, +LM #lm,
                     +TONES #lt, +LENGTHS #ll ], ... >,
           +LAST.+TO #to ] ].

tokens := *top* &
[ +LIST list,
  +LAST token ].

token_min := avm.

token := token_min &
[ +FORM string,
  +TM diff-list,
  +LM diff-list,
  +TONES diff-list,
  +LENGTHS diff-list,
  +CLASS token_class,
  +TRAIT token_trait,
  +PRED predsort,
  +CARG string,
  +ID diff-list,
  +FROM string,
  +TO string,
  +TNT tnt ].

token_trait := *top*.
anti_trait := token_trait.
native_trait := token_trait.
generic_trait := token_trait.

native_token_list := list.
native_token_cons := native_token_list & cons &
[ FIRST.+TRAIT native_trait,
  REST native_token_list ].
native_token_null := native_token_list & null.

generic_token_list := list.
generic_token_cons := generic_token_list & cons &
[ FIRST.+TRAIT generic_trait,
  REST generic_token_list ].
generic_token_null := generic_token_list & null.

tnt := *top* &
[ +TAGS list,
  +PRBS list ].

null_tnt := tnt &
[ +TAGS < >,
  +PRBS < > ].

;;
;; in token mapping, it is useful to have available distinct `anti'-strings.
;;
anti_string := string.
non_string := string.

;;;
;;; orthographic classes, used in token mapping and lexical filtering
;;;
token_class := sort.
no_class := token_class.

named_entity := token_class.
proper_ne := named_entity.
file_ne := proper_ne.
url_ne := proper_ne.
email_ne := proper_ne.
phone_ne := proper_ne.
card_or_year_ne := named_entity.
card_or_dom_ne := named_entity.
card_or_time_ne := named_entity.
card_ne := card_or_year_ne & card_or_dom_ne & card_or_time_ne.
year_ne := card_or_year_ne.
ord_or_dom_ne := named_entity.
ord_ne := ord_or_dom_ne.
frct_ne := named_entity.
plur_ne := named_entity.
dom_card_ne := card_or_dom_ne.
dom_ord_ne := ord_or_dom_ne.
date_ne := named_entity.
meas_or_time_ne := named_entity.
time_ne := card_or_dom_ne & card_or_time_ne & meas_or_time_ne.
meas_ne := meas_or_time_ne.
meas_noun_ne := named_entity.

;;
;; the following are modeled after POSIX character classes; most have obvious
;; correspondences in terms of (more elaborate) UniCode character properties.
;; essentially, we cross-classify along three dimensions: (a) the combination
;; of characters used, (b) whether or not the first character is capitalized,
;; and (c) whether or not a token appears utterance-initial.
;;
non_ne := token_class &
[ +INITIAL luk ].
non_alphanumeric := non_ne.
alphanumeric := non_ne &
[ +CASE token_case ].
alphabetic := alphanumeric.
numeric := alphanumeric.

;;
;; at least the fourth time that i revise this hierarchy. `capitalized' or not
;; is a property of the first character (|1A| is not capitalized). `mixed', on
;; the other hand, is only applicable to tokens with at least two characters.
;; both |aB| and |AbC| are mixed, but |A| or |a| are not. finally, `lower' and
;; `upper' reflect the full token string, i.e. |Dan| is neither, |1a| is lower,
;; and |A| is upper.
;;
token_case := sort.
capitalized := token_case.
non_capitalized := token_case.
mixed := token_case.
non_mixed := token_case.
capitalized+mixed := capitalized & mixed.
capitalized+non_mixed := capitalized & non_mixed.
capitalized+lower := capitalized+non_mixed.
capitalized+upper := capitalized+non_mixed.
non_capitalized+mixed := non_capitalized & mixed.
non_capitalized+lower := non_capitalized & non_mixed.
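
;;
;; by way of illustration (reusing the examples from the comment above): under
;; this classification, |1a| comes out as alphanumeric & [ +CASE
;; non_capitalized+lower ], |AbC| as alphabetic & [ +CASE capitalized+mixed ],
;; and a lone |A| as alphabetic & [ +CASE capitalized+upper ].
;;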
chart_mapping_rule := *top* &
[ +CONTEXT list,
  +INPUT list,
  +OUTPUT list,
  +POSITION string ].

;;;
;;; constructing a sensible hierarchy of token mapping rules is not trivial.
;;; there is variation along many dimensions: (a) the arity of input and
;;; output, (b) the positioning of LHS and RHS rule elements, (c) which token
;;; properties are copied over, and others.
;;;
;;; the following is an attempt to sketch some of the more frequent
;;; configurations, but so far there is hardly any use of inheritance here ...
;;;
token_mapping_rule := chart_mapping_rule.

one_one_tmt := token_mapping_rule &
[ +INPUT < [ +ID #id, +FROM #from, +TO #to ] >,
  +OUTPUT < [ +ID #id, +FROM #from, +TO #to ] >,
  +POSITION "O1@I1" ].

two_one_tmt := token_mapping_rule &
[ +INPUT < [ +ID [ LIST #front, LAST #middle ], +FROM #from ],
            [ +ID [ LIST #middle, LAST #back ], +TO #to ] >,
  +OUTPUT < [ +ID [ LIST #front, LAST #back ], +FROM #from, +TO #to ] >,
  +POSITION "I1<I2, O1@I1, O1@I2" ].

three_one_tmt := token_mapping_rule &
[ +INPUT < [ +ID [ LIST #front, LAST #middle1 ], +FROM #from ],
            [ +ID [ LIST #middle1, LAST #middle2 ] ],
            [ +ID [ LIST #middle2, LAST #back ], +TO #to ] >,
  +OUTPUT < [ +ID [ LIST #front, LAST #back ], +FROM #from, +TO #to ] >,
  +POSITION "I1<I2<I3, O1@I1, O1@I3" ].

four_one_tmt := token_mapping_rule &
[ +INPUT < [ +ID [ LIST #front, LAST #middle1 ], +FROM #from ],
            [ +ID [ LIST #middle1, LAST #middle2 ] ],
            [ +ID [ LIST #middle2, LAST #middle3 ] ],
            [ +ID [ LIST #middle3, LAST #back ], +TO #to ] >,
  +OUTPUT < [ +ID [ LIST #front, LAST #back ], +FROM #from, +TO #to ] >,
  +POSITION "I1<I2<I3<I4, O1@I1, O1@I4" ].

one_two_tmt := token_mapping_rule &
[ +INPUT < [ +ID #id, +FROM #from, +TO #to ] >,
  +OUTPUT < [ +ID #id, +FROM #from, +TO #to ],
             [ +ID #id, +FROM #from, +TO #to ] >,
  +POSITION "O1<O2, O1@I1, O2@I1" ].

one_three_tmt := token_mapping_rule &
[ +INPUT < [ +ID #id, +FROM #from, +TO #to ] >,
  +OUTPUT < [ +ID #id, +FROM #from, +TO #to ],
             [ +ID #id, +FROM #from, +TO #to ],
             [ +ID #id, +FROM #from, +TO #to ] >,
  +POSITION "O1<O2<O3, O1@I1, O2@I1, O3@I1" ].

one_one_form_tmt := one_one_tmt &
[ +INPUT < [ +TRAIT #trait, +CLASS #class,
             +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +OUTPUT < [ +TRAIT #trait, +CLASS #class,
              +PRED #pred, +CARG #carg, +TNT #tnt ] > ].

two_one_initial_form_tmt := two_one_tmt &
[ +INPUT < [ +TRAIT #trait, +CLASS #class,
             +PRED #pred, +CARG #carg, +TNT #tnt ],
            [ ] >,
  +OUTPUT < [ +TRAIT #trait, +CLASS #class,
              +PRED #pred, +CARG #carg, +TNT #tnt ] > ].

two_one_final_form_tmt := two_one_tmt &
[ +INPUT < [ ],
            [ +TRAIT #trait, +CLASS #class,
              +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +OUTPUT < [ +TRAIT #trait, +CLASS #class,
              +PRED #pred, +CARG #carg, +TNT #tnt ] > ].

three_one_center_form_tmt := three_one_tmt &
[ +INPUT < [ ],
            [ +TRAIT #trait, +CLASS #class,
              +PRED #pred, +CARG #carg, +TNT #tnt ],
            [ ] >,
  +OUTPUT < [ +TRAIT #trait, +CLASS #class,
              +PRED #pred, +CARG #carg, +TNT #tnt ] > ].

three_one_final_form_tmt := three_one_tmt &
[ +INPUT < [ ], [ ],
            [ +TRAIT #trait, +CLASS #class,
              +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +OUTPUT < [ +TRAIT #trait, +CLASS #class,
              +PRED #pred, +CARG #carg, +TNT #tnt ] > ].

one_two_all_form_tmt := one_two_tmt &
[ +INPUT < [ +TRAIT #trait, +CLASS #class,
             +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +OUTPUT < [ +TRAIT #trait, +CLASS #class,
              +PRED #pred, +CARG #carg, +TNT #tnt ],
             [ +TRAIT #trait, +CLASS #class,
               +PRED #pred, +CARG #carg, +TNT #tnt ] > ].

;;;
;;; a few relatively specialized token mapping rule types, for configurations
;;; that are instantiated with non-trivial frequency.
;;;
token_class_tmt := one_one_tmt &
[ +INPUT < [ +FORM #form, +TRAIT #trait, +CLASS no_class,
             +PRED #pred, +CARG #carg, +TNT #tnt ] >,
  +OUTPUT < [ +FORM #form, +TRAIT #trait, +CLASS non_ne,
              +PRED #pred, +CARG #carg, +TNT #tnt ] > ].

token_case_tmt := token_mapping_rule &
[ +INPUT < [ +FORM #form, +TRAIT #trait, +CLASS #class,
             +PRED #pred, +TNT #tnt ] >,
  +OUTPUT < [ +FORM #form, +TRAIT #trait, +CLASS #class,
              +PRED #pred, +TNT #tnt ] > ].

one_one_token_case_tmt := one_one_tmt & token_case_tmt.
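
;;
;; purely for illustration (not part of this file): actual token mapping rules
;; are instances of the types above, defined alongside the grammar's other
;; token mapping rules.  assuming the usual chart mapping convention that
;; string values starting with `^' are interpreted as regular expressions, a
;; hypothetical rule assigning the class of purely alphabetic tokens might
;; look roughly like
;;
;;   alphabetic_class_tmr := token_class_tmt &
;;   [ +INPUT < [ +FORM "^[[:alpha:]]+$" ] >,
;;     +OUTPUT < [ +CLASS alphabetic ] > ].
;;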
;;
;; _fix_me_
;; NE rules force [ +TRAIT generic_trait ], to prevent NE tokens activating a
;; native entry.  there are some digits in the lexicon, hence `4 chairs' could
;; in principle get two analyses.  but i see no reason why we should want
;; that?                                                      (26-sep-08; oe)
;;
ne_tmt := one_one_tmt &
[ +INPUT < [ +FORM #form, +CLASS non_ne, +PRED #pred, +CARG #carg ] >,
  +OUTPUT < [ +FORM #form, +TRAIT generic_trait, +CLASS named_entity,
              +PRED #pred, +CARG #carg, +TNT null_tnt ] > ].

add_ne_tmt := token_mapping_rule &
[ +CONTEXT < [ +FORM #form, +CLASS non_ne, +PRED #pred, +CARG #carg,
               +ID #id, +FROM #from, +TO #to ] >,
  +OUTPUT < [ +FORM #form, +TRAIT generic_trait, +CLASS named_entity,
              +PRED #pred, +CARG #carg,
              +ID #id, +FROM #from, +TO #to, +TNT null_tnt ] >,
  +POSITION "O1@C1" ].

;;;
;;; lexical filtering rules; not much use of the type hierarchy yet
;;;
lexical_filtering_rule := chart_mapping_rule.

;;;
;;; Tone rule types: token mapping rules that record tone and length
;;; information in the +TM, +LM, +TONES, and +LENGTHS features of tokens.
;;;
initialise_tmt := one_one_form_tmt &
[ +INPUT < [ +FORM #form ] >,
  +OUTPUT < [ +FORM #form,
              +TM <! [ ] !>, +LM <! [ ] !>,
              +TONES <! < > !>, +LENGTHS <! < > !> ] > ].

diacritic_norm_tmt := one_one_form_tmt &
[ +INPUT < [ +TM #tm, +LM #lm, +TONES #lt, +LENGTHS #ll ] >,
  +OUTPUT < [ +TM #tm, +LM #lm, +TONES #lt, +LENGTHS #ll ] > ].

tone_tmt := one_one_form_tmt &
[ +INPUT < [ +TM #tm, +LM #lm,
             +TONES.LIST < #tlist, ... >,
             +LENGTHS.LIST < #llist, ... > ] >,
  +OUTPUT < [ +TM #tm, +LM #lm,
              +TONES.LIST < [ REST #tlist ], ... >,
              +LENGTHS.LIST < [ REST #llist ], ... > ] > ].

high_tmt := tone_tmt &
[ +OUTPUT < [ +TONES.LIST < < high, ... >, ... >, +TM <! high !> ] > ].
low_tmt := tone_tmt &
[ +OUTPUT < [ +TONES.LIST < < low, ... >, ... >, +TM <! low !> ] > ].
fall_tmt := tone_tmt &
[ +OUTPUT < [ +TM <! fall !>, +TONES.LIST < < fall, ... >, ... > ] > ].
utone_tmt := tone_tmt &
[ +OUTPUT < [ +TONES.LIST < < utone, ... >, ... > ] > ].

long_tmt := tone_tmt &
[ +OUTPUT < [ +LENGTHS.LIST < < long, ... >, ... >, +LM <! long !> ] > ].
short_tmt := tone_tmt &
[ +OUTPUT < [ +LENGTHS.LIST < < short, ... >, ... > ] > ].
ulen_tmt := tone_tmt &
[ +OUTPUT < [ +LENGTHS.LIST < < ulength, ... >, ... > ] > ].

h_long_tmt := high_tmt & long_tmt.
l_long_tmt := low_tmt & long_tmt.
hl_long_tmt := fall_tmt & long_tmt.
*_long_tmt := utone_tmt & long_tmt.

h_tmt := high_tmt & ulen_tmt.
l_tmt := low_tmt & ulen_tmt.
hl_tmt := fall_tmt & ulen_tmt.
*_tmt := utone_tmt & ulen_tmt.

h_short_tmt := high_tmt & short_tmt.
l_short_tmt := low_tmt & short_tmt.
hl_short_tmt := fall_tmt & short_tmt.
*_short_tmt := utone_tmt & short_tmt.
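
;;
;; again purely for illustration (not part of this file): a tone rule instance
;; would pick its type from the combinations above and key off a diacritic in
;; the surface form.  assuming acute accents mark high tone, and assuming the
;; usual chart mapping conventions for regular expressions and substitutions
;; in string values, a hypothetical high tone rule might look roughly like
;;
;;   high_tone_tmr := h_tmt &
;;   [ +INPUT < [ +FORM "^(.*)á(.*)$" ] >,
;;     +OUTPUT < [ +FORM "${I1:+FORM:1}a${I1:+FORM:2}" ] > ].
;;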