;;; -*- Mode: tdl; Coding: utf-8; -*-

;;;
;;; Copyright (c) 2009 -- 2010 Stephan Oepen (oe@ifi.uio.no); 
;;; see `LICENSE' for conditions.
;;;


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; at this point, we multiply out PoS values on all tokens, where for each
;;; original token as many additional tokens are created (in the same chart
;;; cell) as there are PoS readings.  from here on, we distinguish between
;;; tokens that activate native lexical entries (LEs) and those that activate
;;; generic LEs.  in the token universe, this distinction is made by virtue of
;;; +TRAIT, with generic_trait targeting generic LEs.  the two sets do not
;;; overlap, i.e. for a single original token with two PoS readings, we end up
;;; with a total of three new tokens: one generic token per PoS reading, plus
;;; one token (with its PoS information annulled) for native LEs.  the pair of
;;; rules below resembles a recursive function, terminating once the PoS list
;;; has been reduced to a singleton element.  form-based named entities
;;; identified earlier avoid this kind of PoS multiplication because they have
;;; already emptied out their PoS list.
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;
;; earlier processing made sure all tokens had a fully specified +TNT value,
;; but intervening rules could have `leaked' PoS information.  wherever the
;; first +TAGS element is anti_string, once again annul +TNT in full, i.e.
;; replace it with null_tnt, carrying over only +MAIN.  +FORM, +TRAIT, +CLASS,
;; +PRED, and +CARG are copied to the output unchanged.
;;
tnt_default_tmr := one_one_tmt &
[ +CONTEXT < >,
  +INPUT < [ +TNT [ +MAIN #main, +TAGS < anti_string, ... > ],
             +FORM #form,
             +CLASS #class,
             +TRAIT #trait,
             +PRED #pred,
             +CARG #carg ] >,
  +OUTPUT < [ +TNT null_tnt & [ +MAIN #main ],
              +FORM #form,
              +CLASS #class,
              +TRAIT #trait,
              +PRED #pred,
              +CARG #carg ] > ].

;;
;; recursive case: the input token still carries at least two PoS readings
;; (the tails of +TAGS and +PRBS are *cons*, i.e. non-empty).  the first output
;; is a [ +UW + ] token holding just the first tag and its probability; the
;; second output keeps the remaining tags and probabilities.  both outputs
;; copy all other token properties and the bracket lists from the input.
;;
tnt_recurse_tmr := token_mapping_rule &
[ +CONTEXT < >,
  +INPUT < [ +ID #id, +FROM #from, +TO #to,
             +FORM #form, +CLASS #class,
             +PRED #pred, +CARG #carg,
             +TRAIT anti_trait & [ +LB #left, +RB #right ],
             +TNT [ +MAIN #main,
                    +TAGS < #tag . #moretags & *cons* >,
                    +PRBS < #prb . #moreprbs & *cons* > ] ] >,
  +OUTPUT < [ +ID #id, +FROM #from, +TO #to,
              +FORM #form, +CLASS #class,
              +PRED #pred, +CARG #carg,
              +TRAIT [ +UW +, +LB #left, +RB #right ],
              +TNT [ +MAIN #main, +TAGS < #tag >, +PRBS < #prb > ] ],
            [ +ID #id, +FROM #from, +TO #to,
              +FORM #form, +CLASS #class,
              +PRED #pred, +CARG #carg,
              +TRAIT [ +LB #left, +RB #right ],
              +TNT [ +MAIN #main, +TAGS #moretags, +PRBS #moreprbs ] ] >,
  +POSITION "O1@I1, O2@I1" ].

;;
;; base case: exactly one PoS reading is left on the input token.  the first
;; output is a [ +UW + ] token keeping that reading; the second output is a
;; [ +UW - ] token whose +TNT is null_tnt, with only +MAIN carried over.  both
;; outputs copy all other token properties and the bracket lists.
;;
tnt_terminate_tmr := token_mapping_rule &
[ +CONTEXT < >,
  +INPUT < [ +ID #id, +FROM #from, +TO #to,
             +FORM #form, +CLASS #class,
             +PRED #pred, +CARG #carg,
             +TRAIT anti_trait & [ +LB #left, +RB #right ],
             +TNT [ +MAIN #main, +TAGS < #tag >, +PRBS < #prb > ] ] >,
  +OUTPUT < [ +ID #id, +FROM #from, +TO #to,
              +FORM #form, +CLASS #class,
              +PRED #pred, +CARG #carg,
              +TRAIT [ +UW +, +LB #left, +RB #right ],
              +TNT [ +MAIN #main, +TAGS < #tag >, +PRBS < #prb > ] ],
            [ +ID #id, +FROM #from, +TO #to,
              +FORM #form, +CLASS #class,
              +PRED #pred, +CARG #carg,
              +TRAIT [ +UW -, +LB #left, +RB #right ],
              +TNT null_tnt & [ +MAIN #main ] ] >,
  +POSITION "O1@I1, O2@I1" ].

;;;
;;; with singleton PoS readings multiplied out in each chart cell, we can prune
;;; undesirable alternatives, e.g. a foreign word reading when there also is a
;;; common noun.  also, ditch PoS readings with very low probability, and ones
;;; for which no PoS-activated generic entries exist anyway (function words).
;;; this final step eases debugging, reducing the size of the token chart.
;;;

;;
;; delete tokens whose single PoS probability, read as a string, starts with
;; `0.0' or `.0'.
;;
tnt_ditch_unlikely_tmr := token_mapping_rule &
[ +CONTEXT < >,
  +INPUT < [ +TNT [ +PRBS < ^0?\.0.*$ > ] ] >,
  +OUTPUT < > ].

;;
;; delete tokens whose single PoS tag is one of the alternatives listed in the
;; pattern: CC, DT, EX, IN, MD, PDT, POS, PRP (optionally followed by `$'),
;; RBR, or RBS.
;;
tnt_ditch_function_1_tmr := token_mapping_rule &
[ +CONTEXT < >,
  +INPUT < [ +TNT [ +TAGS < ^CC|DT|EX|IN|MD|PDT|POS|PRP\$?|RB[RS]$ > ] ] >,
  +OUTPUT < > ].

;;
;; delete tokens whose single PoS tag is one of the alternatives listed in the
;; pattern: RP, TO, UH, WDT, WP, or WRB.
;;
tnt_ditch_function_2_tmr := token_mapping_rule &
[ +CONTEXT < >,
  +INPUT < [ +TNT [ +TAGS < ^RP|TO|UH|WDT|WP|WRB$ > ] ] >,
  +OUTPUT < > ].

;;
;; delete tokens whose single PoS tag is one of the symbol or punctuation
;; alternatives listed in the pattern: a dollar sign, a hash sign, paired
;; backticks, paired apostrophes, an opening or closing parenthesis, a comma,
;; a period, or a colon.
;;
tnt_ditch_function_3_tmr := token_mapping_rule &
[ +INPUT < [ +TNT.+TAGS < ^\$|#|``|''|\(|\)|,|\.|:$ > ] >,
  +OUTPUT < >,
  +CONTEXT < > ].

;;
;; _fix_me_
;; experimentally, also ditch PoS information on punctuation-only tokens.  we
;; appear to get noun and adjective readings for n- and m-dashes, which hardly
;; can do us any good.                                         (24-sep-08; oe)
;;
;; the rule has an empty +OUTPUT, i.e. it deletes every token whose +FORM is
;; made up of [[:punct:]] characters only and whose +TAGS list is non-empty.
tnt_ditch_punctuation_tmr := token_mapping_rule &
[ +CONTEXT < >,
  +INPUT < [ +FORM ^[[:punct:]]+$, +TNT [ +TAGS *cons* ] ] >,
  +OUTPUT < > ].

;;;
;;; _fix_me_
;;; should we eventually want to include the PoS probabilities as a feature in
;;; parse selection, this kind of pruning should disappear: a high-probability
;;; FW, say, should not be elbowed out by an unlikely NN.      (31-aug-08; oe)
;;;
;; delete a token tagged "FW" when a co-located token (+POSITION "I1@C1") is
;; tagged "NN".
tnt_filter_dup_fw_tmr := token_mapping_rule &
[ +INPUT < [ +TNT [ +TAGS < "FW" > ] ] >,
  +OUTPUT < >,
  +CONTEXT < [ +TNT [ +TAGS < "NN" > ] ] >,
  +POSITION "I1@C1" ].

;; 
;; [DPF 23-apr-09] words ending in "-ing" can get tagged both as noun and as
;; verb, but since the grammar has gerund rules, drop the noun and keep the 
;; verb.
;; _fix_me_
;; is there a reason to prefer the gerund over the vanilla noun?  it means a
;; little extra ambiguity when followed by a PP[of], which the generic gerund
;; optionally picks up as a complement.                         (24-may-09; oe)
;;
;; delete a token tagged "NN" when a co-located token (+POSITION "I1@C1") is
;; tagged "VBG".
tnt_filter_dup_vbg_tmr := token_mapping_rule &
[ +INPUT < [ +TNT [ +TAGS < "NN" > ] ] >,
  +OUTPUT < >,
  +CONTEXT < [ +TNT [ +TAGS < "VBG" > ] ] >,
  +POSITION "I1@C1" ].

;;
;; delete a token tagged "NNP" when a co-located token (+POSITION "I1@C1") is
;; tagged "FW" or "NN".
;;
tnt_filter_dup_nnp_tmr := token_mapping_rule &
[ +INPUT < [ +TNT [ +TAGS < "NNP" > ] ] >,
  +OUTPUT < >,
  +CONTEXT < [ +TNT [ +TAGS < ^FW|NN$ > ] ] >,
  +POSITION "I1@C1" ].


;;
;; _fix_me_
;; the old `posmapping' setting in PET contained the following comment by dan
;; (from sep-07):
;; 
;;   Tried doing without the adjective, since TNT appears to mostly guess both
;;   an adjective and a noun, and our generic mass-count noun can almost always
;;   do the work of the adjective.  This would avoid large amounts of spurious
;;   ambiguity for most occurrences of these pairs.  But unfortunately TNT 
;;   doesn't always guess both, so we need JJ when it's the only guess.  Maybe
;;   we can effect this with the new token-mapping machinery ...
;;
;; the following rule should have that effect.                 (21-jan-09; oe)
;;
;; [DPF 24-mar-09]  Unfortunately, this simple rule goes wrong sometimes.  For 
;; "the tallest and most unk-word cat" the |unk-word| has to be an adjective,
;; so we can't just throw it away.  We'll try using the probabilities from the
;; tagger for a more sensitive rule.
;;
;; delete a token tagged "JJ" when a co-located token (+POSITION "I1@C1") is
;; tagged "NN" and has a probability string that starts with `0.' or `.'
;; followed by a digit in the range 3 to 9.
tnt_filter_dup_jj_tmr := token_mapping_rule &
[ +INPUT < [ +TNT [ +TAGS < "JJ" > ] ] >,
  +OUTPUT < >,
  +CONTEXT < [ +TNT [ +PRBS < ^0?\.[3-9].*$ >, +TAGS < "NN" > ] ] >,
  +POSITION "I1@C1" ].

;;
;; there is one case that cuts across the sub-division of generic entries into
;; (a) so-called named entities (triggered from string-level properties) and
;; (b) non-NE generics, triggered on the basis of PoS tags.  the NE rules for
;; names (triggered by capital letters) do not stipulate a named entity in
;; sentence-initial position.  on the other hand, lexical filtering will drop
;; non-NE generics whenever there is a competing native entry.  so, to allow
;; generic names in initial position to survive the filtering, look at the
;; combined string-level and PoS evidence and create an actual NE.  this rule
;; needs to be late in the process, so that we have PoS tags multiplied out.
;;
;; rewrite a [ +UW + ] token tagged "NNP" whose +CLASS is alphanumeric with
;; [ +INITIAL +, +CASE capitalized+lower ]: the output token has +CLASS
;; proper_ne and a null_tnt +TNT (keeping only +MAIN); all remaining token
;; properties are copied over unchanged.
initial_capitalized_name_nnp_tmr := token_mapping_rule &
[ +CONTEXT < >,
  +INPUT < [ +ID #id, +FROM #from, +TO #to,
             +FORM #form, +PRED #pred, +CARG #carg,
             +TRAIT #trait & [ +UW + ],
             +CLASS alphanumeric & [ +INITIAL +, +CASE capitalized+lower ],
             +TNT [ +MAIN #main, +TAGS < "NNP" > ] ] >,
  +OUTPUT < [ +ID #id, +FROM #from, +TO #to,
              +FORM #form, +PRED #pred, +CARG #carg,
              +TRAIT #trait,
              +CLASS proper_ne,
              +TNT null_tnt & [ +MAIN #main ] ] >,
  +POSITION "I1@O1" ].

;;
;; on all tokens that we expect to activate generic entries, make the +PRED
;; value reflect the orthography and PoS tag.
;;
;; the rule applies to [ +UW + ] tokens with a non_ne +CLASS, an anti_string
;; +PRED, a non-empty +CARG, and a single-element +TAGS list.  the patterns on
;; +CARG and on the tag each consist of one capture group spanning the whole
;; value; the output +PRED string interpolates the +CARG capture (wrapped in
;; lc(), presumably lower-casing it -- to be confirmed against the engine
;; documentation) and the tag capture, followed by `_u_unknown_rel'.  all
;; other token properties, including +TNT, are copied over unchanged.
;;
generic_pred_tmr := token_mapping_rule &  
[ +INPUT < [ +FORM #form,
             +TRAIT #trait & [ +UW + ], +CLASS #class & non_ne, 
             +PRED anti_string, +CARG #carg & ^(.+)$,
             +ID #id, +FROM #from, +TO #to,
             +TNT #tnt & [ +TAGS < ^(.*)$ > ] ] >,
  +OUTPUT < [ +FORM #form,
              +TRAIT #trait, +CLASS #class,
              +PRED 
              "_${lc(I1:+CARG:1)}/${I1:+TNT.+TAGS.FIRST:1}_u_unknown_rel",
              +CARG #carg,
              +ID #id, +FROM #from, +TO #to,
              +TNT #tnt ] >,
  +CONTEXT < >,
  +POSITION "O1@I1" ].

;;; DPF 2012-11-05 - Added rule to accommodate multi-word proper names
;;; where right-most token has a hyphen and thus a right bracket, here
;;; moving the left bracket leftward one token.  The rule is awkward
;;; because capitalized tokens come in two flavors: a generic proper name
;;; and a capitalized native token, and we have a lexical filtering rule
;;; that eliminates a generic proper given a native proper.
;;; So we take as input both of these, followed by a hyphenated proper_ne 
;;; (in +CONTEXT), and we produce four new tokens, two copying the input 
;;; tokens but now with left brackets, and two copying the context token 
;;; but erasing the left brackets.  This lets us get all three of the 
;;; following:
;;; (1) |Hong Kong-based| where |Hong| is a native proper name
;;; (2) |Los Gatos-based| where |Los| is unknown
;;; (3) |New York-based| where |York| is unknown
;;;
#|
name_name_hyphen_tmr := one_two_four_tmt &
[ +CONTEXT < [ +FORM ^[[:alpha:]]+[–-]$, 
	       +CLASS proper_ne ,
	       +TRAIT [ +LB < ctype . #lb >, +RB #rb ] ] >,
  +INPUT < [ +CLASS proper_ne,
             +TRAIT [ +UW #uw1 & +, +RB #rb ], +TICK - ],
	   [ +CLASS.+CASE capitalized,
             +TRAIT [ +UW #uw2 & -, +RB #rb ], +TICK - ] >,
  +OUTPUT < [ +TRAIT [ +UW #uw1, +LB < ctype . #lb >, +RB <> ], +TICK + ],
	    [ +TRAIT [ +UW #uw2, +LB < ctype . #lb >, +RB <> ], +TICK + ],
            [ +TRAIT [ +UW #uw1, +LB <>, +RB #rb ] ],
	    [ +TRAIT [ +UW #uw2, +LB <>, +RB #rb ] ] > ].
|#