;;; -*- Mode: Lisp; Coding: utf-8 -*- ;;; HaG (Hausa Grammar) ;;; Based on Matrix user-fns.lsp ;;; Post-generation remapping of tones (BC) (in-package :lkb) ;;; ;;; identify characters that can form words; all other characters will create ;;; word boundaries and later be suppressed in tokenization. ;;; (defun alphanumeric-or-extended-p (c) (and (graphic-char-p c) (not (member c *punctuation-characters*)))) ;;; ;;; determine surface order of constituents in rule: returns list of paths into ;;; feature structure of rule, i.e. (nil (args first) (args rest first)) for a ;;; binary rule, where the first list element is the path to the mother node of ;;; the rule. ;;; (defun establish-linear-precedence (rule) (let ((daughters (loop for args = (existing-dag-at-end-of rule '(args)) then (existing-dag-at-end-of args *list-tail*) for daughter = (when args (get-value-at-end-of args *list-head*)) for path = (list 'args) then (append path *list-tail*) while (and daughter (not (eq daughter 'no-way-through))) collect (append path *list-head*)))) (if (null daughters) (cerror "Ignore it" "Rule without daughters") (cons nil daughters)))) ;;; ;;; detect rules that have orthographemic variation associated to them; those ;;; who do should only be applied within the morphology system; for the time ;;; being use value of NEEDS-AFFIX feature, though it would be nicer to rely ;;; on a type distinction of lexical rules or re-entrancy of ORTH. ;;; (defun spelling-change-rule-p (rule) (let ((affix (get-dag-value (tdfs-indef (rule-full-fs rule)) 'needs-affix))) (and affix (bool-value-true affix)))) ;;; ;;; create feature structure representation of orthography value for insertion ;;; into the output structure of inflectional rules; somewhat more complicated ;;; than one might expect because of treatment for multi-word elements. ;;; (defun make-orth-tdfs (orth) (let ((unifs nil) (tmp-orth-path *orth-path*)) (loop for orth-value in (split-into-words orth) do (let ((opath (create-path-from-feature-list (append tmp-orth-path *list-head*)))) (push (make-unification :lhs opath :rhs (make-u-value :type orth-value)) unifs) (setq tmp-orth-path (append tmp-orth-path *list-tail*)))) (let ((indef (process-unifications unifs))) (when indef (setf indef (create-wffs indef)) (make-tdfs :indef indef))))) ;;; ;;; assign priorities to parser tasks and lexical entries ;;; ;;; ERB 2008-03-12 rule-priory has to return a value for every ;;; rule or the mmt system fails. (defun rule-priority (rule) (case (rule-id rule) (subj 1000) (t 0))) (defun gen-rule-priority (rule) (rule-priority rule)) (defun lex-priority (mrec) (declare (ignore mrec)) 800) (defun gen-lex-priority (fs) (declare (ignore fs)) 800) ;;; ;;; determine path and file names for lexicon and leaf type cache files. ;;; (defun set-temporary-lexicon-filenames nil (let* ((version (or (find-symbol "*GRAMMAR-VERSION*" :common-lisp-user) (and (find-package :lkb) (find-symbol "*GRAMMAR-VERSION*" :lkb)))) (prefix (if (and version (boundp version)) (remove-if-not #'alphanumericp (symbol-value version)) "lexicon"))) (setf *psorts-temp-file* (make-pathname :name prefix :directory (pathname-directory (lkb-tmp-dir)))) (setf *psorts-temp-index-file* (make-pathname :name (concatenate 'string prefix ".idx") :directory (pathname-directory (lkb-tmp-dir)))) (setf *leaf-temp-file* (make-pathname :name (concatenate 'string prefix ".lfs") :directory (pathname-directory (lkb-tmp-dir)))))) (defun bool-value-true (fs) (and fs (let ((fs-type (type-of-fs fs))) (eql fs-type '+)))) (defun bool-value-false (fs) (and fs (let ((fs-type (type-of-fs fs))) (eql fs-type '-)))) (defun gen-extract-surface (edge &optional (initialp t) &key cliticp stream) (if stream (let ((daughters (edge-children edge))) (if daughters (loop for daughter in daughters for foo = initialp then nil do (and (not (get-dag-value (get-dag-value (get-dag-value (get-dag-value (tdfs-indef (edge-dag edge)) 'SUPRA) 'LEN) 'LIST) 'FIRST) ) (progn (setf supra (unify-dags (tdfs-indef (ltype-tdfs (get-type-entry 'supra_reent))) (get-dag-value (tdfs-indef (edge-dag daughter)) 'SUPRA) ) ) (setf tones (get-dag-value (get-dag-value supra 'TONE) 'LIST) ) (setf lengths (get-dag-value (get-dag-value supra 'LEN) 'LIST) ) (setf rulename (unify-get-type (tdfs-indef (edge-dag daughter)))) ) ) (setf cliticp (gen-extract-surface daughter foo :cliticp cliticp :stream stream)) #+:logon finally #+:logon (setf (edge-lnk edge) (mrs::combine-lnks (edge-lnk (first daughters)) (edge-lnk (first (last daughters)))))) (let* ((entry (get-lex-entry-from-id (first (edge-lex-ids edge)))) (orth (format nil "~{~a~^ ~}" (lex-entry-orth entry))) ;; ;; ;; (orth (if (ppcre::scan "man$" orth) ;; (subseq orth 0 (- (length orth) 3)) ;; orth)) (tdfs (and entry (lex-entry-full-fs entry))) (type (and tdfs (type-of-fs (tdfs-indef tdfs)))) (string (string-downcase (copy-seq (first (edge-leaves edge))))) ;;; Map Tonal annotation to diacritics (string (loop do (setf string (praf string rulename)) (setf string (tonelen string lengths tones)) (setf tones nil) (setf lengths nil) (if (and (boundp '*hag-demo*) *hag-demo*) (return (composite-tone string)) (return string) ;;(tidy-tone string)) ) )) ;; ;; _fix_me_ ;; maybe we could be more courageous and just search for .orth. ;; as a sub-sequence of .string., starting at position .prefix. ;; (22-dec-06; oe) (prefix (loop for c across string while (member c '(#\( #\" #\') :test #'char=) count 1)) (suffix (min (length string) (+ prefix (length orth)))) (suffix (when (string-equal orth string :start2 prefix :end2 suffix) suffix)) (rawp (and suffix (loop for c across orth thereis (upper-case-p c)))) (capitalizep (ignore-errors (loop for match in '(proper-noun-lex ) thereis (or (eq type match) (subtype-p type match))))) (cliticp (or cliticp (and (> (length string) 0) (char= (char string 0) #\'))))) (if rawp (setf string (concatenate 'string (subseq string 0 prefix) orth (subseq string suffix))) (when capitalizep (loop with spacep = t for i from 0 to (- (length string) 1) for c = (schar string i) when (char= c #\Space) do (setf spacep t) else when (char= c #\_) do (setf spacep t) (setf (schar string i) #\Space) else do (when (and spacep (alphanumericp c)) (setf (schar string i) (char-upcase c))) (setf spacep nil)))) (when (and (> (length string) 1) (char= (char string 0) #\_) (upper-case-p (char string 1))) (setf string (subseq string 1))) (when (and initialp (alphanumericp (schar string 0))) (setf (schar string 0) (char-upcase (schar string 0)))) (unless (or initialp cliticp) (format stream " ")) (let (#+:logon (start (file-position stream))) (loop with hyphenp for c across string unless (and hyphenp (char= c #\space)) do (write-char c stream) when (char= c #\-) do (setf hyphenp t) else do (setf hyphenp nil)) #+:logon (setf (edge-lnk edge) (list :characters start (file-position stream)))) ;; ;; finally, inform the caller as to whether we output something that ;; inhibits intervening space (e.g. `mid-July'). ;; (unless (string= orth "") (member (schar orth (- (length orth) 1)) '(#\-) :test #'char=))))) (let ((stream (make-string-output-stream))) (gen-extract-surface edge initialp :stream stream) (get-output-stream-string stream)))) (eval-when #+:ansi-eval-when (:load-toplevel :compile-toplevel :execute) #-:ansi-eval-when (load eval compile) (setf *gen-extract-surface-hook* 'gen-extract-surface)) (defun praf (string rulename) (if (eql rulename 'DO-PRON-IRULE) (setf string (ppcre::regex-replace "^(.*)(ka|ki|shi|ta|mu|ku|su)$" string "\\1 \\2")) ) string ) (defun tonelen (string lengths tones) (if (setf len (get-dag-value lengths 'FIRST)) (progn (or (setf tone (get-dag-value tones 'FIRST)) (setf tone (get-dag-value (possibly-new-constraint-of (unify-get-type (unify-dags (create-typed-dag 'CONS) tones ))) 'FIRST ) )) (setf string (map-tone-tfs string (type-of-fs tone) (type-of-fs len))) (if (get-dag-value lengths 'REST) (if (get-dag-value tones 'REST) (setf string (tonelen string (get-dag-value lengths 'REST) (get-dag-value tones 'REST))) (setf string (tonelen string (get-dag-value lengths 'REST) (get-dag-value (possibly-new-constraint-of (unify-get-type (unify-dags (create-typed-dag 'CONS) tones ))) 'REST ) ) ) ) ) ) ) string ) (defun map-tone-tfs (string tone len) ;;; FIXME: tone pretty printing currently presupposes chart packing ;;; Does not really hurt, mais quand-même... (setf vow (ppcre::regex-replace "^.*?(ai|au|[aeiou])[^aeiou]*$" string "\\1")) (if (eql len lkb::'LONG) (cond ((eql tone lkb::'HIGH) (case (intern vow) (|a| (setf string (ppcre::regex-replace "^(.*)([a])([^aeiou]*)$" string "\\1áá\\3"))) (|e| (setf string (ppcre::regex-replace "^(.*)([e])([^aeiou]*)$" string "\\1éé\\3"))) (|i| (setf string (ppcre::regex-replace "^(.*)([i])([^aeiou]*)$" string "\\1íí\\3"))) (|o| (setf string (ppcre::regex-replace "^(.*)([o])([^aeiou]*)$" string "\\1óó\\3"))) (|u| (setf string (ppcre::regex-replace "^(.*)([u])([^aeiou]*)$" string "\\1úú\\3"))) ) ) ((eql tone lkb::'LOW) (case (intern vow) (|a| (setf string (ppcre::regex-replace "^(.*)([a])([^aeiou]*)$" string "\\1àà\\3"))) (|e| (setf string (ppcre::regex-replace "^(.*)([e])([^aeiou]*)$" string "\\1èè\\3"))) (|i| (setf string (ppcre::regex-replace "^(.*)([i])([^aeiou]*)$" string "\\1ìì\\3"))) (|o| (setf string (ppcre::regex-replace "^(.*)([o])([^aeiou]*)$" string "\\1òò\\3"))) (|u| (setf string (ppcre::regex-replace "^(.*)([u])([^aeiou]*)$" string "\\1ùù\\3"))) ) ) ((eql tone lkb::'FALL) (case (intern vow) (|a| (setf string (ppcre::regex-replace "^(.*)([a])([^aeiou]*)$" string "\\1áà\\3"))) (|e| (setf string (ppcre::regex-replace "^(.*)([e])([^aeiou]*)$" string "\\1éè\\3"))) (|i| (setf string (ppcre::regex-replace "^(.*)([i])([^aeiou]*)$" string "\\1íì\\3"))) (|o| (setf string (ppcre::regex-replace "^(.*)([o])([^aeiou]*)$" string "\\1óò\\3"))) (|u| (setf string (ppcre::regex-replace "^(.*)([u])([^aeiou]*)$" string "\\1úù\\3"))) ) ) ) (cond ((eql tone lkb::'HIGH) (case (intern vow) (|ai| (setf string (ppcre::regex-replace "^(.*)(ai)([^aeiou]*)$" string "\\1áí\\3"))) (|au| (setf string (ppcre::regex-replace "^(.*)(au)([^aeiou]*)$" string "\\1áú\\3"))) (|a| (setf string (ppcre::regex-replace "^(.*)([a])([^aeiou]*)$" string "\\1á\\3"))) (|e| (setf string (ppcre::regex-replace "^(.*)([e])([^aeiou]*)$" string "\\1é\\3"))) (|i| (setf string (ppcre::regex-replace "^(.*)([i])([^aeiou]*)$" string "\\1í\\3"))) (|o| (setf string (ppcre::regex-replace "^(.*)([o])([^aeiou]*)$" string "\\1ó\\3"))) (|u| (setf string (ppcre::regex-replace "^(.*)([u])([^aeiou]*)$" string "\\1ú\\3"))) ) ) ((eql tone lkb::'LOW) (case (intern vow) (|ai| (setf string (ppcre::regex-replace "^(.*)(ai)([^aeiou]*)$" string "\\1àì\\3"))) (|au| (setf string (ppcre::regex-replace "^(.*)(au)([^aeiou]*)$" string "\\1àù\\3"))) (|a| (setf string (ppcre::regex-replace "^(.*)([a])([^aeiou]*)$" string "\\1à\\3"))) (|e| (setf string (ppcre::regex-replace "^(.*)([e])([^aeiou]*)$" string "\\1è\\3"))) (|i| (setf string (ppcre::regex-replace "^(.*)([i])([^aeiou]*)$" string "\\1ì\\3"))) (|o| (setf string (ppcre::regex-replace "^(.*)([o])([^aeiou]*)$" string "\\1ò\\3"))) (|u| (setf string (ppcre::regex-replace "^(.*)([u])([^aeiou]*)$" string "\\1ù\\3"))) ) ) ((eql tone lkb::'FALL) (case (intern vow) (|ai| (setf string (ppcre::regex-replace "^(.*)(ai)([^aeiou]*)$" string "\\1áì\\3"))) (|au| (setf string (ppcre::regex-replace "^(.*)(au)([^aeiou]*)$" string "\\1áù\\3"))) (|a| (setf string (ppcre::regex-replace "^(.*)([a])([^aeiou]*)$" string "\\1â\\3"))) (|e| (setf string (ppcre::regex-replace "^(.*)([e])([^aeiou]*)$" string "\\1ê\\3"))) (|i| (setf string (ppcre::regex-replace "^(.*)([i])([^aeiou]*)$" string "\\1î\\3"))) (|o| (setf string (ppcre::regex-replace "^(.*)([o])([^aeiou]*)$" string "\\1ô\\3"))) (|u| (setf string (ppcre::regex-replace "^(.*)([u])([^aeiou]*)$" string "\\1û\\3"))) ) ) ) ) string ) (defun map-tone (string) (setf string (ppcre::regex-replace "^([^aeiou]*)(a)(.*)_h:$" string "\\1áá\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(e)(.*)_h:$" string "\\1éé\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(i)(.*)_h:$" string "\\1íí\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(o)(.*)_h:$" string "\\1óó\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(u)(.*)_h:$" string "\\1úú\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(a)(.*)_l:$" string "\\1àà\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(e)(.*)_l:$" string "\\1èè\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(i)(.*)_l:$" string "\\1ìì\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(o)(.*)_l:$" string "\\1òò\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(u)(.*)_l:$" string "\\1ùù\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(au)(.*)_h$" string "\\1áú\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(ai)(.*)_h$" string "\\1áí\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(a)(.*)_h$" string "\\1á\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(e)(.*)_h$" string "\\1é\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(i)(.*)_h$" string "\\1í\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(o)(.*)_h$" string "\\1ó\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(u)(.*)_h$" string "\\1ú\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(a)(.*)_l$" string "\\1à\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(e)(.*)_l$" string "\\1è\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(i)(.*)_l$" string "\\1ì\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(o)(.*)_l$" string "\\1ò\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(u)(.*)_l$" string "\\1ù\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(a)(.*)_hl:$" string "\\1áà\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(e)(.*)_hl:$" string "\\1éè\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(i)(.*)_hl:$" string "\\1íì\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(o)(.*)_hl:$" string "\\1óò\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(u)(.*)_hl:$" string "\\1úù\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(au)(.*)_hl$" string "\\1áù\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(ai)(.*)_hl$" string "\\1áì\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(a)(.*)_hl$" string "\\1â\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(e)(.*)_hl$" string "\\1ê\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(i)(.*)_hl$" string "\\1î\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(o)(.*)_hl$" string "\\1ô\\3")) (setf string (ppcre::regex-replace "^([^aeiou]*)(u)(.*)_hl$" string "\\1û\\3")) ) (defun tidy-tone (string) (setf string (ppcre::regex-replace-all "(áà)" string "âa")) (setf string (ppcre::regex-replace-all "(áù)" string "âu")) (setf string (ppcre::regex-replace-all "(áì)" string "âi")) (setf string (ppcre::regex-replace-all "(éè)" string "êe")) (setf string (ppcre::regex-replace-all "(íì)" string "îi")) (setf string (ppcre::regex-replace-all "(óò)" string "ôo")) (setf string (ppcre::regex-replace-all "(úù)" string "ûu")) (setf string (ppcre::regex-replace-all "(áá)" string "aa")) (setf string (ppcre::regex-replace-all "(éé)" string "ee")) (setf string (ppcre::regex-replace-all "(íí)" string "ii")) (setf string (ppcre::regex-replace-all "(óó)" string "oo")) (setf string (ppcre::regex-replace-all "(úú)" string "uu")) (setf string (ppcre::regex-replace-all "(àà)" string "àa")) (setf string (ppcre::regex-replace-all "(àì)" string "ài")) (setf string (ppcre::regex-replace-all "(àù)" string "àu")) (setf string (ppcre::regex-replace-all "(èè)" string "èe")) (setf string (ppcre::regex-replace-all "(ìì)" string "ìi")) (setf string (ppcre::regex-replace-all "(òò)" string "òo")) (setf string (ppcre::regex-replace-all "(ùù)" string "ùu")) (setf string (ppcre::regex-replace-all "(á)" string "a")) (setf string (ppcre::regex-replace-all "(é)" string "e")) (setf string (ppcre::regex-replace-all "(í)" string "i")) (setf string (ppcre::regex-replace-all "(ó)" string "o")) (setf string (ppcre::regex-replace-all "(ú)" string "u")) ) (defun composite-tone (string) (setf string (ppcre::regex-replace-all "(áà)" string "ā̂")) (setf string (ppcre::regex-replace-all "(áù)" string "âu")) (setf string (ppcre::regex-replace-all "(áì)" string "âi")) (setf string (ppcre::regex-replace-all "(éè)" string "ē̂")) (setf string (ppcre::regex-replace-all "(íì)" string "ī̂")) (setf string (ppcre::regex-replace-all "(óò)" string "ō̂")) (setf string (ppcre::regex-replace-all "(úù)" string "ū̂")) (setf string (ppcre::regex-replace-all "(áá)" string "ā")) (setf string (ppcre::regex-replace-all "(éé)" string "ē")) (setf string (ppcre::regex-replace-all "(íí)" string "ī")) (setf string (ppcre::regex-replace-all "(óó)" string "ō")) (setf string (ppcre::regex-replace-all "(úú)" string "ū")) (setf string (ppcre::regex-replace-all "(àà)" string "ā̀")) (setf string (ppcre::regex-replace-all "(àì)" string "ài")) (setf string (ppcre::regex-replace-all "(àù)" string "àu")) (setf string (ppcre::regex-replace-all "(èè)" string "ḕ")) (setf string (ppcre::regex-replace-all "(ìì)" string "ī̀")) (setf string (ppcre::regex-replace-all "(òò)" string "ṑ")) (setf string (ppcre::regex-replace-all "(ùù)" string "ū̀")) (setf string (ppcre::regex-replace-all "(à)" string "à")) (setf string (ppcre::regex-replace-all "(è)" string "è")) (setf string (ppcre::regex-replace-all "(ì)" string "ì")) (setf string (ppcre::regex-replace-all "(ò)" string "ò")) (setf string (ppcre::regex-replace-all "(ù)" string "ù")) (setf string (ppcre::regex-replace-all "(â)" string "â")) (setf string (ppcre::regex-replace-all "(ê)" string "ê")) (setf string (ppcre::regex-replace-all "(î)" string "î")) (setf string (ppcre::regex-replace-all "(ô)" string "ô")) (setf string (ppcre::regex-replace-all "(û)" string "û")) (setf string (ppcre::regex-replace-all "(á)" string "a")) (setf string (ppcre::regex-replace-all "(é)" string "e")) (setf string (ppcre::regex-replace-all "(í)" string "i")) (setf string (ppcre::regex-replace-all "(ó)" string "o")) (setf string (ppcre::regex-replace-all "(ú)" string "u")) ) (defun instantiate-generic-lexical-entry (gle surface pred &optional carg) (let ((tdfs (copy-tdfs-elements (lex-entry-full-fs (if (gle-p gle) (gle-le gle) gle)))) (spath (if carg '(SYNSEM LKEYS KEYREL CARG) '(SYNSEM LKEYS KEYREL PRED)))) (loop with dag = (tdfs-indef tdfs) for path in (list '(ORTH FIRST) spath) for foo = (existing-dag-at-end-of dag path) when foo do (setf (dag-type foo) *string-type*)) (let* ((surface (or #+:logon (case (gle-id gle) (guess_n_gle (format nil "/~a/" surface)) (decade_gle (format nil "~as" surface))) surface)) (unifications (list (make-unification :lhs (create-path-from-feature-list (append *orth-path* *list-head*)) :rhs (make-u-value :type surface)) (make-unification :lhs (create-path-from-feature-list (append *orth-path* *list-tail*)) :rhs (make-u-value :type *empty-list-type*)) (make-unification :lhs (create-path-from-feature-list spath) :rhs (make-u-value :type (or carg pred))))) (indef (process-unifications unifications)) (indef (and indef (create-wffs indef))) (overlay (and indef (make-tdfs :indef indef)))) (values (when overlay (with-unification-context (ignore) (let ((foo (yadu tdfs overlay))) (when foo (copy-tdfs-elements foo))))) surface)))) (defun extract-strings-from-parse-record nil (loop for edge in *parse-record* collect (extract-string-from-p-edge edge))) (defun extract-string-from-p-edge (edge) (or (edge-string edge) (let ((string (cond ((fboundp *gen-extract-surface-hook*) (funcall *gen-extract-surface-hook* edge)) (t (g-edge-leaves edge))))) (setf (edge-string edge) string)))) ;;;(defun find-infl-pos (unifs orths sense-id) ;;; (declare (ignore unifs orths sense-id)) ;;; nil)