;;; -*- Mode: tdl; Coding: utf-8; indent-tabs-mode: nil; -*-
;;;
;;; upon completion of `lexical parsing' (i.e. application of lexical rules
;;; until a fix-point is reached), we can now filter lexical entries. there is
;;; little point attempting to do that earlier (as PET used to in its original
;;; `-default-les' mode, where generics were only activated where there seemed
;;; to be `gaps' in the _initial_ lexical chart, i.e. after lexical lookup).
;;;
;;; the main problem in this approach is the interaction with orthographemics:
;;; in the initial lexical chart, there will be an edge analysing |UPS| as the
;;; plural or 3sg present tense form of the preposition |up|. it is only once
;;; lexical rules have been processed that we know such hypotheses have turned
;;; out invalid. thus, lexical filtering rules below operate on lexical edges,
;;; lexical entries that have gone through any number of lexical rules, i.e.
;;; everything that would ordinarily feed into syntactic rules.
;;;
;;; initially, our strategy is conservative: whenever there is a native entry,
;;; purge all generic entries in the same chart cell, unless there is a good
;;; reason to keep some. for now, only capitalization is considered a reason,
;;; and even there (i.e. for generic names), certain types of native entries
;;; will filter.
;;;
;;;
;;; throw out entries not licensed by supertag if licensed entries are available
#|
native_stag_null+native_stag_cons := lexical_filtering_rule &
[ +CONTEXT < [LR.LR_INFLECTED +,
              TOKENS.+LIST < [ +TRAIT native_trait,
                               +STAG.+TAGS < [] > ]> ] >,
  +INPUT < [LR.LR_INFLECTED +,
            TOKENS.+LIST < [ +TRAIT native_trait,
                             +STAG.+TAGS < > ]> ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].
|#
;;
;; throw out generic whenever a native entry is available, unless the token is
;; a named entity (which now includes names activated because of mixed case or
;; non-sentence-initial capitalization).
;;
;;
;; the six rules below share one shape: the +INPUT edge carries a generic,
;; non-NE token (+TRAIT generic_trait, +CLASS non_ne), the +CONTEXT edge carries
;; a native token (+TRAIT native_trait) and is LR_INFLECTED +, both edges are
;; constrained to the same category, +OUTPUT is empty, and +POSITION is
;; "I1@C1".  only the category constraint differs from rule to rule.
;;

;; category: SYNSEM.LOC.CAT.HEAD noun-head
noun_generic_non_ne+native_lfr := lexical_filtering_rule &
[ +INPUT < [ TOKENS.+LIST < [ +CLASS non_ne,
                              +TRAIT generic_trait ] >,
             SYNSEM.LOC.CAT.HEAD noun-head ] >,
  +CONTEXT < [ LR.LR_INFLECTED +,
               TOKENS.+LIST < [ +TRAIT native_trait ] >,
               SYNSEM.LOC.CAT.HEAD noun-head ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].

;; category: SYNSEM.LOC.CAT.HEAD mod-adv-head
adv_generic_non_ne+native_lfr := lexical_filtering_rule &
[ +INPUT < [ TOKENS.+LIST < [ +CLASS non_ne,
                              +TRAIT generic_trait ] >,
             SYNSEM.LOC.CAT.HEAD mod-adv-head ] >,
  +CONTEXT < [ LR.LR_INFLECTED +,
               TOKENS.+LIST < [ +TRAIT native_trait ] >,
               SYNSEM.LOC.CAT.HEAD mod-adv-head ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].

;; category: SYNSEM.LOC.CAT.HEAD mod-adj-head
adja_generic_non_ne+native_lfr := lexical_filtering_rule &
[ +INPUT < [ TOKENS.+LIST < [ +CLASS non_ne,
                              +TRAIT generic_trait ] >,
             SYNSEM.LOC.CAT.HEAD mod-adj-head ] >,
  +CONTEXT < [ LR.LR_INFLECTED +,
               TOKENS.+LIST < [ +TRAIT native_trait ] >,
               SYNSEM.LOC.CAT.HEAD mod-adj-head ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].

;; category: both edges are of lexical type adj-non-prd-unflex-le
adj-unflex_generic_non_ne+native_lfr := lexical_filtering_rule &
[ +INPUT < adj-non-prd-unflex-le &
           [ TOKENS.+LIST < [ +CLASS non_ne,
                              +TRAIT generic_trait ] > ] >,
  +CONTEXT < adj-non-prd-unflex-le &
             [ LR.LR_INFLECTED +,
               TOKENS.+LIST < [ +TRAIT native_trait ] > ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].

;; category: SYNSEM.LOC.CAT.HEAD prd-adj-head
adjd_generic_non_ne+native_lfr := lexical_filtering_rule &
[ +INPUT < [ TOKENS.+LIST < [ +CLASS non_ne,
                              +TRAIT generic_trait ] >,
             SYNSEM.LOC.CAT.HEAD prd-adj-head ] >,
  +CONTEXT < [ LR.LR_INFLECTED +,
               TOKENS.+LIST < [ +TRAIT native_trait ] >,
               SYNSEM.LOC.CAT.HEAD prd-adj-head ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].

;; category: SYNSEM verb-synsem
verb_generic_non_ne+native_lfr := lexical_filtering_rule &
[ +INPUT < [ TOKENS.+LIST < [ +CLASS non_ne,
                              +TRAIT generic_trait ] >,
             SYNSEM verb-synsem ] >,
  +CONTEXT < [ LR.LR_INFLECTED +,
               TOKENS.+LIST < [ +TRAIT native_trait ] >,
               SYNSEM verb-synsem ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].

;;
;; a native name, however, should suppress generic names, even NE ones.
;; this is restricted to singular native names, since otherwise we get unwanted
;; blocking for acronyms like |EDS|, given the native name |Ed|.
;;
;; context: native token, ref-noun-head, 3-s-g, KEY.PRED named_rel.
;; input:   generic token (any +CLASS) whose KEY.PRED is named_rel.
;;
proper_ne+name_lfr := lexical_filtering_rule &
[ +CONTEXT < [ TOKENS.+LIST < [ +TRAIT native_trait ]>,
               SYNSEM [ LOC [ CAT.HEAD ref-noun-head,
                              UNIAGR.PNG 3-s-g,
                              CONT.KEY.PRED named_rel]] ] >,
  +INPUT < [ TOKENS.+LIST < [ +TRAIT generic_trait ]>,
             SYNSEM [ LOC.CONT.KEY.PRED named_rel ] ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].

;;
;; mass nouns (both native and generic) also suppress generic names, even
;; NE ones. this reflects what dan calls the `tyranny of mass nouns', i.e.
;; the assumptions that there are no syntactic contexts where a proper name
;; would be needed for coverage (thus glossing over differences in the
;; associated semantics, for improved parsing efficiency).
;;
;; the context places no constraint on +TRAIT, which is what lets both native
;; and generic mass nouns trigger the rule.
;;
mass_noun+name_lfr := lexical_filtering_rule &
[ +CONTEXT < [ SYNSEM mass-noun-synsem &
                      [ LOC [ CAT.HEAD noun-head,
                              UNIAGR [ PNG 3-s-g ] ] ] ] >,
  +INPUT < [ TOKENS.+LIST < [ +TRAIT generic_trait ]>,
             SYNSEM [ LOC.CONT.KEY.PRED named_rel ] ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].

;;
;;
;; avoid analyzing currency symbols (like |US$|), which appear capitalized, as
;; generic names
;;
;; NOTE(review): the context constrains CONT.KEY itself to currency_rel, while
;; the sibling rules constrain CONT.KEY.PRED -- confirm that currency_rel is
;; meant as the type of the KEY relation rather than as a PRED value.
;;
currency+name_lfr := lexical_filtering_rule &
[ +CONTEXT < [ TOKENS.+LIST < [ +TRAIT native_trait ]>,
               SYNSEM [ LOC [CAT.HEAD noun-head,
                             CONT.KEY currency_rel] ] ] >,
  +INPUT < [ TOKENS.+LIST < [ +TRAIT generic_trait ]>,
             SYNSEM [ LOC.CONT.KEY.PRED named_rel ] ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].

;;
;; discard generic names (even NE ones) for |I|, a pronoun that is standardly
;; capitalized.
;;
;; proper_ne+pronoun_lfr := lexical_filtering_rule &
;; [ +CONTEXT < [ SYNSEM [ PHON.ONSET con_or_voc,
;;                         LOC [ CAT.HEAD.CASE nom,
;;                               UNIAGR.PNG.PN 1s ],
;;                         LKEYS.KEYREL.PRED pron_rel ] ] >,
;;   +INPUT < [ SYNSEM [ PHON.ONSET unk_onset,
;;                       LKEYS.KEYREL.PRED named_rel ] ] >,
;;   +OUTPUT < >,
;;   +POSITION "I1@C1" ].
;;; Initial caps are not a good indicator of proper nouns in German
;;; Better trust POS-activated NEs
;;;
;;; context: native token on a noun-head edge.
;;; input:   generic token of +CLASS proper_ne whose +FORM is one upper-case
;;;          letter followed by one or more lower-case letters and at most one
;;;          punctuation character, on an edge whose KEY.PRED is named_rel.
native-noun+ne_name_lfr := lexical_filtering_rule &
[ +CONTEXT < [ TOKENS.+LIST < [ +TRAIT native_trait ]>,
               SYNSEM.LOC.CAT.HEAD noun-head ] >,
  +INPUT < [ TOKENS.+LIST < [ +FORM ^[[:upper:]][[:lower:]]+[[:punct:]]?$,
                              +CLASS proper_ne,
                              +TRAIT generic_trait ]>,
             SYNSEM [ LOC.CONT.KEY.PRED named_rel ] ]>,
  +OUTPUT < >,
  +POSITION "I1@C1" ].

;;
;; a named entity corresponding to a name kills a PoS-activated generic name,
;; unless that is a named entity itself.
;;
;; NOTE(review): as written, +INPUT is the generic edge with ORTH.CLASS
;; named_entity and +CONTEXT is the generic named_rel edge with ORTH.CLASS
;; non_ne.  in the other rules of this file the +INPUT edge is the one that is
;; thrown out, so this rule appears to remove the named entity rather than the
;; non_ne generic name, i.e. the reverse of the sentence above.  possibly
;; deliberate -- confirm which direction is intended.
;;
generic_name+ne_name_lfr := lexical_filtering_rule &
[ +INPUT < [ TOKENS.+LIST < [ +TRAIT generic_trait ]>,
             ORTH.CLASS named_entity ] >,
  +CONTEXT < [ TOKENS.+LIST < [ +TRAIT generic_trait ]>,
               SYNSEM [ LOC.CONT.KEY.PRED named_rel ],
               ORTH.CLASS non_ne ] >,
  +OUTPUT < >,
  +POSITION "I1@C1" ].

#|
;;; Rule does not make much sense: delete all other native entries
;;; in case of particle reading in same cell?
ptkvz_dup_lfr := lexical_filtering_rule &
[ +CONTEXT < [ TOKENS.+LIST < [ +TRAIT native_trait,
                                +TNT.+TAGS < "PTKVZ" > ] > ] >,
  +INPUT < [ TOKENS.+LIST < [ +TRAIT native_trait,
                              +TNT null_tnt,
                              +STAG null_tnt] > ] >,
  +OUTPUT < >,
  +POSITION "I1@C1"].
|#

;;; Generic entries for particle verbs have generic_trait,
;;; others have lic_generic_trait
;;; Mark all particle verbs licensed by a following particle
;;; with lic_generic_trait.
;;;
;;; the rule copies the input token to the output through the #form, #class,
;;; #pred, #id, #from, #to and #tnt reentrancies; the only value that differs
;;; is +TRAIT, which goes from nlic_generic_trait to lic_generic_trait.  the
;;; context is a native token tagged "PTKVZ", the input a non_ne token tagged
;;; "VVFIN".
;;;
;;; NOTE(review): the comment above names generic_trait, but the input below
;;; matches nlic_generic_trait -- confirm which is meant.
;;;
;;; NOTE(review): the tail of this rule looks damaged.  the +POSITION string
;;; is not closed before the text `] > ] >, +OUTPUT < >, +POSITION', which
;;; reads like the end of a second rule whose beginning has been lost.  it is
;;; left exactly as found; restore it from the upstream grammar sources before
;;; loading this file.
licensed_generic_particle_verb_tmr := lexical_filtering_rule &
[+CONTEXT < [ TOKENS.+LIST < [ +TRAIT native_trait,
                               +TNT.+TAGS < "PTKVZ" > ] > ] >,
 +INPUT < [TOKENS.+LIST < [+FORM #form,
                           +TRAIT nlic_generic_trait,
                           +CLASS #class & non_ne,
                           +PRED #pred,
                           +ID #id,
                           +FROM #from,
                           +TO #to,
                           +TNT #tnt & [+TAGS <"VVFIN">] ]>]>,
 +OUTPUT < [TOKENS.+LIST < [+FORM #form,
                            +TRAIT lic_generic_trait,
                            +CLASS #class,
                            +PRED #pred,
                            +ID #id,
                            +FROM #from,
                            +TO #to,
                            +TNT #tnt & [+TAGS <"VVFIN">] ]> ] >,
 +POSITION "I1@O1, I1<] > ] >, +OUTPUT < >, +POSITION ""].
;;
;; generic entries followed by punctuation will typically admit two readings,
;; one of them including the punctuation marks as part of the generic, as e.g.
;; in (sentence-final) |oe@yy.com.| these are rarely (if ever) desirable, so
;; delete edges whose tokens bear final punctuation if they have not undergone
;; punctuation affixation rule(s). and likewise for prefixing punctuation.
;;
;; both rules have only an +INPUT and an empty +OUTPUT: a generic edge whose
;; ORTH.FORM ends (right rule) or starts (left rule) with one of the listed
;; punctuation characters, while its SIGNPUNCT.RPUNCT (resp. LPUNCT) is still
;; no_punct.
;;
;; in the right-hand character class the hyphen must stay in final position so
;; that it is a literal.  placed between |?| and |_| it denotes the range
;; U+003F to U+005F, which also covers |@|, all of |A| to |Z|, |[|, |\|, |]|
;; and |^|, and would filter every generic form ending in an upper-case letter
;; (acronyms, for instance).
;;
generic_right_punct_lfr := lexical_filtering_rule &
[ +INPUT < [ ORTH.FORM ^.+[])}”",;!?_-]$,
             TOKENS.+LIST < [ +TRAIT generic_trait ]>,
             SIGNPUNCT.RPUNCT no_punct ] >,
  +OUTPUT < > ].

generic_left_punct_lfr := lexical_filtering_rule &
[ +INPUT < [ ORTH.FORM ^[[({“‘].+$,
             TOKENS.+LIST < [ +TRAIT generic_trait ]>,
             SIGNPUNCT.LPUNCT no_punct ] >,
  +OUTPUT < > ].