;;; -*- Mode: tdl; Coding: utf-8; -*-

;;;
;;; Copyright (c) 2009 -- 2018 Stephan Oepen (oe@ifi.uio.no); 
;;; see `LICENSE' for conditions.
;;;


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; re-combine punctuation marks with adjacent tokens, based on directionality
;;; of punctuation marks, e.g. opening vs. closing quotes and brackets.  doing
;;; one such re-combination at a time is sufficient, as each rewrite rule will
;;; apply as many times as it possibly can, seeing its own output from earlier
;;; applications.
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;
;; but first, preserve the current (non-punctuated) form in +CARG, for later
;; reference, e.g. in constructing +PRED values for generics.  NE rules have
;; done this already, hence make sure to not overwrite existing +CARGs.
;;
;;
;; rewrite a single token whose +CARG is still anti_string: the output token
;; carries the input +FORM as its +CARG; +FORM, +TRAIT, +CLASS, +PRED, and
;; +TNT are passed through by re-entrancy.  no +CONTEXT is required.
;; NOTE(review): anti_string presumably marks a +CARG not yet set by the NE
;; rules; its definition is not visible in this file --- confirm.
;;
default_carg_tmr := one_one_tmt &
[ +CONTEXT < >,
  +INPUT < [ +CARG anti_string,
             +FORM #surface, +PRED #pred,
             +CLASS #class, +TRAIT #trait, +TNT #tnt ] >,
  +OUTPUT < [ +CARG #surface,
              +FORM #surface, +PRED #pred,
              +CLASS #class, +TRAIT #trait, +TNT #tnt ] > ].

;;
;; _fix_me_
;; when re-attaching pre- or suffix punctuation to NEs, we should find a way of
;; forcing the application of corresponding punctuation rules eventually.  as
;; things are now, an NE with adjacent punctuation creates spurious ambiguity:
;; |oe@yy.com.| is matched as an NE prior to re-attaching the period.  when the
;; token with the trailing period is sent through the morphology, two analyses
;; are created, one with, another without a `punct_period' expectation.  both
;; succeed, as there is no testing against a lexical stem with the generic LE.
;; for NEs, at least, i think one could work around this by adding properties 
;; to each token, +PRFX and +SFFX, say, each a list of strings.  in the case of
;; |oe@yy.com.|, the suffix punctuation rule would add to the +SFFX front, say:
;; [ +SFFX < "." > ].  the corresponding orthographemic rules would then have
;; to `pop' the list (to make things simpler, non-generic tokens could leave 
;; +SFFX underspecified), and at some point (syntactic rules, for example), an
;; empty +SFFX (and +PRFX ) would be the pre-requisite to any further rule
;; applications.  --- discuss this with dan one day.            (8-feb-09; oe)
;;

;;
;; both in prefix and suffix punctuation, we need to protect against `clusters'
;; of punctuation marks, e.g. for |(“foo”)|.  if |“| were to attach to |foo|,
;; followed by |(| attaching to |“|, then we are left with two partially over-
;; lapping tokens, but there is no way of arriving at |(“foo| any longer.  the
;; safe-guarding against this in the +INPUT seems a little baroque, but i fail
;; to come up with an easier solution.
;;
;; straight quotes are treated separately because they can attach both to the
;; left and to the right; to prevent fragmented token lattices, this means we
;; also need to preserve the original token to which a straight quote attaches
;; in the chart.  initial straight quotes, however, need extra care, to avoid
;; creating ambiguous start or end nodes in the lattice; essentially, they are
;; left to a specialized version of the general prefix punctuation rule.
;;
;; _fix_me_
;; there is a remaining issue in the treatment of straight quotes, related to
;; other prefix or suffix punctuation to the left or right of a straight quote:
;; if we are to avoid punctuation clusters, we need to enable attachment of
;; straight quotes both before /and/ after regular `affix' punctuation, say in
;; a case like [( "] |Kim , " )|.                               (14-feb-12; oe)
;;

;;
;; non-initial straight quote (|"| or |'|, +CLASS.+INITIAL -) as a prefix: C1
;; is the quote, C2 a token whose form contains at least one character other
;; than an opening bracket or quote.  the output concatenates both forms, takes
;; +FROM and the front of +ID from the quote, +TO and the back of +ID from C2,
;; and copies +CLASS, +TRAIT, +PRED, +CARG, and +TNT from C2.  +INPUT is empty,
;; i.e. both matched tokens are only referenced as context.
;; NOTE(review): +POSITION is read here as `C1 immediately precedes C2, and
;; the output spans both' --- confirm against the chart mapping documentation.
;;
prefix_straight_quote_non_initial_tmr := token_mapping_rule &
[ +POSITION "C1<C2, O1@C1, O1@C2",
  +INPUT < >,
  +CONTEXT < [ +FORM ^(["'])$, +FROM #start, +CLASS.+INITIAL -,
               +ID [ LIST #head, LAST #mid ] ],
             [ +FORM ^(.*[^[({“"‘'].*)$, +TO #end,
               +ID [ LIST #mid, LAST #tail ],
               +CARG #carg, +PRED #pred, +TRAIT #trait, +CLASS #class,
               +TNT #tnt ] >,
  +OUTPUT < [ +FORM "${C1:+FORM:1}${C2:+FORM:1}",
              +FROM #start, +TO #end,
              +ID [ LIST #head, LAST #tail ],
              +CARG #carg, +PRED #pred, +TRAIT #trait, +CLASS #class,
              +TNT #tnt ] > ].

;;
;; initial straight quote (+CLASS.+INITIAL +): the quote is matched as context
;; and its form is prepended to the +INPUT token, which must contain at least
;; one character other than an opening bracket or quote.  everything beyond
;; the +FORM values is inherited from prefix_punctuation_tmt (defined
;; elsewhere).
;;
prefix_straight_quote_initial_tmr := prefix_punctuation_tmt &
[ +INPUT < [ +FORM ^(.*[^[({“"‘'].*)$ ] >,
  +CONTEXT < [ +CLASS.+INITIAL +, +FORM ^(["'])$ ] >,
  +OUTPUT < [ +FORM "${C1:+FORM:1}${I1:+FORM:1}" ] > ].

;;
;; general prefix punctuation: a single opening bracket, asterisk, or opening
;; directional quote in the context is prepended to the +INPUT token, provided
;; that token contains at least one character other than an opening bracket or
;; quote (the guard against punctuation clusters).
;; NOTE(review): |*| is accepted as a prefix mark in +CONTEXT but is not part
;; of the guard class in +INPUT; confirm whether that asymmetry is intended.
;;
prefix_punctuation_tmr := prefix_punctuation_tmt &
[ +INPUT < [ +FORM ^(.*[^[({“"‘'].*)$ ] >,
  +CONTEXT < [ +FORM ^([[({\*“‘])$ ] >,
  +OUTPUT < [ +FORM "${C1:+FORM:1}${I1:+FORM:1}" ] > ].

;;
;; _fix_me_
;; there is a special case in suffix punctuation: an apostrophe following a
;; token ending in |s| could be a possessive marker (which should remain a 
;; token in its own right), or could be a closing single quote (which should
;; be suffixed onto the preceding token).  in principle, the same is true for 
;; double closing quotes (or double primes), but the `inches' measure unit, 
;; maybe, will have been detected during NE recognition earlier.
;;                                                              (20-mar-10; oe)
;; _fix_me_
;; one might wonder about cases like: [the] |factor(s)'s| [importance], but at
;; least today i am not enough of a perfectionist for that.     (23-mar-10; oe)
;;
;;
;; apostrophe (|’| or |'|) after a token ending in |s| or |S|: keep both
;; readings.  the apostrophe token (I1, +CLASS anti_apostrophe) is rewritten
;; into two outputs: O1 is the host form with the apostrophe suffixed, taking
;; +CLASS, +TRAIT, +PRED, +CARG, +TNT, and +FROM from the host (C1) and +TO
;; from the apostrophe; O2 is the apostrophe itself, unchanged except that its
;; +CLASS becomes apostrophe.  since this rule requires anti_apostrophe on its
;; input, the re-classified O2 cannot feed this rule again.
;; NOTE(review): +POSITION is read here as `C1 immediately precedes I1, O1
;; spans both, O2 sits where I1 was' --- confirm against the chart mapping
;; documentation.
;;
ambiguous_apostrophe_tmr := token_mapping_rule &
[ +POSITION "C1<I1, O1@C1, O1@I1, O2@I1",
  +CONTEXT < [ +FORM ^(.*[sS])$, +FROM #hfrom,
               +ID [ LIST #front, LAST #middle ],
               +CARG #hcarg, +PRED #hpred, +TRAIT #htrait, +CLASS #hclass,
               +TNT #htnt ] >,
  +INPUT < [ +FORM #mark & ^([’'])$,
             +CLASS anti_apostrophe,
             +FROM #mfrom, +TO #mto,
             +ID #mid & [ LIST #middle, LAST #back ],
             +CARG #mcarg, +PRED #mpred, +TRAIT #mtrait ] >,
  +OUTPUT < [ +FORM "${C1:+FORM:1}${I1:+FORM:1}",
              +FROM #hfrom, +TO #mto,
              +ID [ LIST #front, LAST #back ],
              +CARG #hcarg, +PRED #hpred, +TRAIT #htrait, +CLASS #hclass,
              +TNT #htnt ],
            [ +FORM #mark,
              +CLASS apostrophe,
              +FROM #mfrom, +TO #mto,
              +ID #mid & [ LIST #middle, LAST #back ],
              +CARG #mcarg, +PRED #mpred, +TRAIT #mtrait ] > ].

;;
;; unambiguous case: a typographic apostrophe |’| following a token that does
;; not end in |s| or |S| is suffixed onto that token.  only |’| is matched
;; here, not the straight |'|.
;;
suffix_apostrophe_tmr := suffix_punctuation_tmt &
[ +CONTEXT < [ +FORM "’" ] >,
  +INPUT < [ +FORM ^(.*[^sS])$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}’" ] > ].

;;
;; at this point, an apostrophe may have attached to a suffix punctuation mark
;; (e.g. from |‘Abrams,’|), hence optionally allow it in the +CONTEXT coda.
;; _fix_me_
;; but of course single quotes can occur anywhere in suffix clusters, as for
;; example in [“those] |‘chairs,’”| [she said.].               (19-feb-13; oe)
;; DPF 2017-09-06 - Added "raised" comma character |⸴| used to signal a 
;; sandwiched comma, so we can choose to report its use for error correction.
;;
;;
;; general suffix punctuation: a closing bracket, closing double quote, comma,
;; raised comma, semicolon, period, exclamation or question mark (optionally
;; followed by |’|) in the context is appended to the +INPUT token.
;;
;; the +INPUT pattern is the guard against punctuation clusters: the host must
;; contain at least one character that is not itself suffix punctuation.  the
;; raised comma |⸴| is accepted as a suffix mark in +CONTEXT and therefore has
;; to be excluded from the guard class too; otherwise a bare |⸴| token could
;; serve as host for a following mark, yielding overlapping partial clusters.
;;
suffix_punctuation_tmr := suffix_punctuation_tmt &
[ +INPUT < [ +FORM ^(.*[^])}”",⸴;.!?].*)$ ] >,
  +CONTEXT < [ +FORM ^([])}”,⸴;.!?]’?)$ ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}${C1:+FORM:1}" ] > ].

;;
;; straight quote (|"| or |'|) as a suffix: C1 is a host not ending in |s| or
;; |S|, C2 the quote, and C3 an unconstrained token after the quote.  the
;; output concatenates host and quote, taking +CLASS, +TRAIT, +PRED, +CARG,
;; +TNT, +FROM, and the front of +ID from the host, +TO and the back of +ID
;; from the quote.  +INPUT is empty, i.e. all matched tokens are only
;; referenced as context.
;; NOTE(review): requiring C3 presumably keeps this rule from applying to a
;; quote that is the last token --- confirm.
;;
suffix_straight_quote_tmr := token_mapping_rule &
[ +POSITION "C1<C2, C2<C3, O1@C1, O1@C2",
  +INPUT < >,
  +CONTEXT < [ +FORM ^(.*[^sS])$, +FROM #start,
               +ID [ LIST #head, LAST #mid ],
               +CARG #carg, +PRED #pred, +TRAIT #trait, +CLASS #class,
               +TNT #tnt ],
             [ +FORM ^(["'])$, +TO #end,
               +ID [ LIST #mid, LAST #tail ] ],
             [ ] >,
  +OUTPUT < [ +FORM "${C1:+FORM:1}${C2:+FORM:1}",
              +FROM #start, +TO #end,
              +ID [ LIST #head, LAST #tail ],
              +CARG #carg, +PRED #pred, +TRAIT #trait, +CLASS #class,
              +TNT #tnt ] > ].

;;
;; delete any remaining token of +CLASS anti_apostrophe whose form is an
;; apostrophe (|’| or |'|), optionally preceded by opening brackets or quotes
;; and optionally followed by closing punctuation.  the +OUTPUT list is empty,
;; so nothing replaces the deleted token.
;;
ditch_apostrophe_tmr := token_mapping_rule &
[ +CONTEXT < >,
  +INPUT < [ +CLASS anti_apostrophe,
             +FORM ^[[({“"‘']*[’'][])}”",;.!?]*$ ] >,
  +OUTPUT < > ].


;; DPF 2016-11-23 - See comment in pos.tdl for tnt_ditch_punctuation_tmr.
;; But apparently can't just drop this rule, since it then leaves all
;; sentence-final periods as separate tokens, and confuses the parse chart.
;; DPF 2017-09-16 - For punctuation that can't get attached to a neighboring
;; token, as with sentence-final opening double quotes in |like “|, ACE
;; reasonably enough complains about a "post reduction gap" since we leave an
;; empty cell with nothing spanning it.  So let's try replacing such a stranded
;; token with a token giving rise to a generic lexical entry for punctuation,
;; where we check for stranded ones by seeing for the prefix puncts whether
;; any other token has the same start position +FROM (indicating that there 
;; was a successful application of the prefix-punct rule), and similarly with 
;; the end position +TO for suffix puncts.
;; Unfortunately, this does not quite work yet, since the ACE treebanker 
;; somehow still treats the opening double quote as distinct in e.g. 
;; wsj03c:20351065 |"We were oversold| even though interactively all seems well.
;; So leave as before for now, and live with `post reduction gap' error
;; for e.g. wlb03:121002340050 |questions like “| where the opening
;; double quote is sentence-final (presumably from wrong sentence-splitting)
;; and thus stranded.
;;
;;
;; delete stand-alone punctuation tokens: either one opening bracket or quote,
;; or a run of one or more closing punctuation marks optionally followed by
;; |’|.  the +OUTPUT list is empty, so nothing replaces the deleted token.
;;
ditch_punctuation_tmr := token_mapping_rule &
[ +CONTEXT < >,
  +INPUT < [ +FORM ^([[({“‘"]|[])}”",⸴;.!?]+’?)$ ] >,
  +OUTPUT < > ].

;;
;; two similar rules, converting (some) directional GML tokens into affixes
;;
;;
;; opening markup |⌊/|, optionally preceded by opening brackets or quotes, is
;; prepended to a following non-empty token of +CLASS non_ne; +CLASS and +TNT
;; are carried over from the input token.
;;
prefix_markup_tmr := basic_prefix_punctuation_tmt &
[ +INPUT < [ +FORM ^(.+)$,
             +TNT #tnt, +CLASS #class & non_ne ] >,
  +CONTEXT < [ +FORM ^([[({“‘"']*⌊/)$ ] >,
  +OUTPUT < [ +FORM "${C1:+FORM:1}${I1:+FORM:1}",
              +TNT #tnt, +CLASS #class ] > ].

;;
;; named-entity variant of the prefix markup rule: opening markup |⌊/|,
;; optionally preceded by opening brackets or quotes, is prepended to a token
;; of +CLASS named_entity; only +CLASS is carried over explicitly here.
;;
prefix_markup_ne_tmr := basic_prefix_punctuation_tmt &
[ +INPUT < [ +FORM ^(.+)$,
             +CLASS named_entity & #class ] >,
  +CONTEXT < [ +FORM ^([[({“‘"']*⌊/)$ ] >,
  +OUTPUT < [ +FORM "${C1:+FORM:1}${I1:+FORM:1}",
              +CLASS #class ] > ].

;;
;; closing markup |/⌋|, optionally followed by closing punctuation, is appended
;; to a preceding non-empty token of +CLASS non_ne; +CLASS and +TNT are carried
;; over from the input token.
;;
suffix_markup_tmr := basic_suffix_punctuation_tmt &
[ +CONTEXT < [ +FORM ^(/⌋[])}”"’',⸴;.!?]*)$ ] >,
  +INPUT < [ +FORM ^(.+)$,
             +TNT #tnt, +CLASS #class & non_ne ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}${C1:+FORM:1}",
              +TNT #tnt, +CLASS #class ] > ].

;;
;; named-entity variant of the suffix markup rule: closing markup |/⌋|,
;; optionally followed by closing punctuation, is appended to a token of +CLASS
;; named_entity; +CLASS and +TNT are carried over from the input token.
;; NOTE(review): prefix_markup_ne_tmr does not carry +TNT explicitly while this
;; rule does; confirm whether the difference is intended.
;;
suffix_markup_ne_tmr := basic_suffix_punctuation_tmt &
[ +CONTEXT < [ +FORM ^(/⌋[])}”"’',⸴;.!?]*)$ ] >,
  +INPUT < [ +FORM ^(.+)$,
             +TNT #tnt, +CLASS named_entity & #class ] >,
  +OUTPUT < [ +FORM "${I1:+FORM:1}${C1:+FORM:1}",
              +TNT #tnt, +CLASS #class ] > ].

;;
;; delete any remaining markup token: |⌊/| or |/⌋|, optionally preceded by
;; opening brackets or quotes and optionally followed by closing punctuation.
;; the +OUTPUT list is empty, so nothing replaces the deleted token.
;;
ditch_markup_tmr := token_mapping_rule &
[ +CONTEXT < >,
  +INPUT < [ +FORM ^[[({“‘"']*(⌊/|/⌋)[])}”"’',⸴;.!?]*$ ] >,
  +OUTPUT < > ].