(in-package :lkb) ;;; ;;; identify characters that can form words; all other characters will create ;;; word boundaries and later be suppressed in tokenization. ;;; (defun alphanumeric-or-extended-p (c) (and (graphic-char-p c) (not (member c *punctuation-characters*)))) ;;; ;;; determine surface order of constituents in rule: returns list of paths into ;;; feature structure of rule, i.e. (nil (args first) (args rest first)) for a ;;; binary rule, where the first list element is the path to the mother node of ;;; the rule. ;;; (defun establish-linear-precedence (rule) (let ((daughters (loop for args = (existing-dag-at-end-of rule '(args)) then (existing-dag-at-end-of args *list-tail*) for daughter = (when args (get-value-at-end-of args *list-head*)) for path = (list 'args) then (append path *list-tail*) while (and daughter (not (eq daughter 'no-way-through))) collect (append path *list-head*)))) (if (null daughters) (cerror "Ignore it" "Rule without daughters") (cons nil daughters)))) ;;; ;;; detect rules that have orthographemic variation associated to them; those ;;; who do should only be applied within the morphology system; for the time ;;; being use value of NEEDS-AFFIX feature, though it would be nicer to rely ;;; on a type distinction of lexical rules or re-entrancy of ORTH. ;;; (defun spelling-change-rule-p (rule) (let ((affix (get-dag-value (tdfs-indef (rule-full-fs rule)) 'needs-affix))) (and affix (bool-value-true affix)))) ;;; ;;; create feature structure representation of orthography value for insertion ;;; into the output structure of inflectional rules; somewhat more complicated ;;; than one might expect because of treatment for multi-word elements. ;;; (defun make-unknown-word-sense-unifications (word-string &optional stem) ;;; this assumes we always treat unknown words as proper names ;;; uncomment the *unknown-word-types* in globals.lsp ;;; to activate this (when word-string (list (make-unification :lhs (create-path-from-feature-list '(MORPH LIST FIRST STEM FIRST)) :rhs (make-u-value :type (or stem word-string))) (make-unification :lhs (create-path-from-feature-list '(MORPH LIST FIRST STEM REST)) :rhs (make-u-value :type 'lkb::*null*)) (make-unification :lhs (create-path-from-feature-list '(SYNSEM LKEYS KEYTAG)) :rhs (make-u-value :type (string-downcase word-string)))))) (defun instantiate-generic-lexical-entry (gle surface &optional (carg surface)) (let ((tdfs (copy-tdfs-elements (lex-entry-full-fs (if (gle-p gle) (gle-le gle) gle))))) (loop with dag = (tdfs-indef tdfs) for path in '((MORPH LIST FIRST STEM FIRST) (SYNSEM LKEYS KEYTAG)) for foo = (existing-dag-at-end-of dag path) do (setf (dag-type foo) *string-type*)) (let* ((surface(or #+:logon (case (gle-id gle) (guess_n_gle (format nil "/~a/" surface)) (decade_gle (format nil "~aer" surface))) surface)) (unifications (list (make-unification :lhs (create-path-from-feature-list (append *orth-path* *list-head*)) :rhs (make-u-value :type surface)) (make-unification :lhs (create-path-from-feature-list (append *orth-path* *list-tail*)) :rhs (make-u-value :type *empty-list-type*)) (make-unification :lhs (create-path-from-feature-list '(SYNSEM LKEYS KEYTAG)) :rhs (make-u-value :type carg)))) (indef (process-unifications unifications)) (indef (and indef (create-wffs indef))) (overlay (and indef (make-tdfs :indef indef)))) (values (when overlay (with-unification-context (ignore) (let ((foo (yadu tdfs overlay))) (when foo (copy-tdfs-elements foo))))) surface)))) ;;; from erg (defun instantiate-generic-lexical-entry (gle surface &optional pred carg) (let ((tdfs (copy-tdfs-elements (lex-entry-full-fs (if (gle-p gle) (gle-le gle) gle)))) (spath (if carg '(SYNSEM LKEYS KEYTAG) (and pred '(SYNSEM LKEYS KEYREL))))) (loop with dag = (tdfs-indef tdfs) for path in (append '((MORPH LIST FIRST STEM FIRST)) (list spath)) for foo = (existing-dag-at-end-of dag path) when foo do (setf (dag-type foo) *string-type*)) (let* ((surface (or #+:logon (case (gle-id gle) (guess_n_gle (format nil "/~a/" surface)) (decade_gle (format nil "~aer" surface))) surface)) (unifications (append (list (make-unification :lhs (create-path-from-feature-list (append *orth-path* *list-head*)) :rhs (make-u-value :type surface)) (make-unification :lhs (create-path-from-feature-list (append *orth-path* *list-tail*)) :rhs (make-u-value :type *empty-list-type*))) (when spath (list (make-unification :lhs (create-path-from-feature-list spath) :rhs (make-u-value :type (or carg pred))))))) (indef (process-unifications unifications)) (indef (and indef (create-wffs indef))) (overlay (and indef (make-tdfs :indef indef)))) (values (when overlay (with-unification-context (ignore) (let ((foo (yadu tdfs overlay))) (when foo (copy-tdfs-elements foo))))) surface)))) (defun make-orth-tdfs (orth) (let ((unifs nil) (tmp-orth-path *orth-path*)) (loop for orth-value in (split-into-words orth) do (let ((opath (create-path-from-feature-list (append tmp-orth-path *list-head*)))) (push (make-unification :lhs opath :rhs (make-u-value :type orth-value)) unifs) (setq tmp-orth-path (append tmp-orth-path *list-tail*)))) (let ((indef (process-unifications unifs))) (when indef (setf indef (create-wffs indef)) (make-tdfs :indef indef))))) (defun set-temporary-lexicon-filenames nil (let* ((version (or (find-symbol "*GRAMMAR-VERSION*" :common-lisp-user) (and (find-package :lkb) (find-symbol "*GRAMMAR-VERSION*" :lkb)))) (prefix (if (and version (boundp version)) (remove-if-not #'alphanumericp (symbol-value version)) "lexicon"))) (setf *psorts-temp-file* (make-pathname :name (concatenate 'string prefix ".lex") :host (pathname-host (lkb-tmp-dir)) :device (pathname-device (lkb-tmp-dir)) :directory (pathname-directory (lkb-tmp-dir)))) (setf *psorts-temp-index-file* (make-pathname :name (concatenate 'string prefix ".idx") :host (pathname-host (lkb-tmp-dir)) :device (pathname-device (lkb-tmp-dir)) :directory (pathname-directory (lkb-tmp-dir)))) (setf *leaf-temp-file* (make-pathname :name (concatenate 'string prefix ".lts") :host (pathname-host (lkb-tmp-dir)) :device (pathname-device (lkb-tmp-dir)) :directory (pathname-directory (lkb-tmp-dir)))) (setf *predicates-temp-file* (make-pathname :name (concatenate 'string prefix ".ric") :host (pathname-host (lkb-tmp-dir)) :device (pathname-device (lkb-tmp-dir)) :directory (pathname-directory (lkb-tmp-dir)))) (setf *semantics-temp-file* (make-pathname :name (concatenate 'string prefix ".stc") :host (pathname-host (lkb-tmp-dir)) :device (pathname-device (lkb-tmp-dir)) :directory (pathname-directory (lkb-tmp-dir)))))) (defun bool-value-true (fs) (and fs (let ((fs-type (type-of-fs fs))) (eql fs-type '+)))) (defun bool-value-false (fs) (and fs (let ((fs-type (type-of-fs fs))) (eql fs-type '-)))) #| (defun gen-extract-surface (edge &optional (initialp t) &key stream) (if stream (let ((daughters (edge-children edge))) (if daughters (loop for daughter in daughters for foo = initialp then nil append (gen-extract-surface daughter foo :stream stream)) (let* ((entry (get-lex-entry-from-id (first (edge-lex-ids edge)))) (tdfs (and entry (lex-entry-full-fs entry))) (type (and tdfs (type-of-fs (tdfs-indef tdfs)))) (string (string-downcase (copy-seq (first (edge-leaves edge))))) (capitalizep (ignore-errors (loop for match in '(proper-noun-le noun-and-proper-name-le proper-noun-name-le proper-noun-det-le anrede-title-noun-le title-noun-le anrede-form-noun-le count-title-noun-le deverbal-noun-le count-noun-le count-noun-mass-unit-le adj-count-noun-le count-noun-t-le inf-count-noun-le currency-noun-le unknown-currency-noun-le unit-noun-le unknown-date-noun-le unknown-percent-noun-le pp-noun-le scomp-noun-le int-clause-noun-le mass-noun-le mass-pp-noun-le relational-noun-le month-noun-le month-mod-noun-le day-noun-le time-mod-noun-le time-mod-noun-ty-le cardyear-mod-noun-ty-le special-count-noun-le special-mass-noun-le letter-noun-le ) thereis (or (eq type match) (subtype-p type match))))) (cliticp (and (> (length string) 0) (char= (char string 0) #\')))) (when capitalizep (loop with spacep = t for i from 0 to (- (length string) 1) for c = (schar string i) when (char= c #\Space) do (setf spacep t) else when (char= c #\_) do (setf spacep t) (setf (schar string i) #\Space) else do (when (and spacep (alphanumericp c)) (setf (schar string i) (char-upcase c))) (setf spacep nil))) (when (and initialp (alphanumericp (schar string 0))) (setf (schar string 0) (char-upcase (schar string 0)))) (format stream "~@[ ~*~]~a" (and (not initialp) (not cliticp)) string)))) (let ((stream (make-string-output-stream))) (gen-extract-surface edge initialp :stream stream) (get-output-stream-string stream)))) |# ;;; ;;; the following temporary expedient attempts to get capitalization more right ;;; than we used to do in generator outputs. still, for acronyms like `IBM' or ;;; complex names including lower case elements, i see no alternative to using ;;; STEM to spell out the actual (canonical) surface form. that would seem to ;;; require that we re-view assumptions about capitalization across the lexicon ;;; et al. but the LKB should probably do that one day! (30-aug-05; oe) ;;; --- as of late, the ERG lexicon actually contains (some) STEM values that ;;; reflect canonical capitalization; the modified code below will now try to ;;; either (a) respect the orthography from the lexicon, as long as it contains ;;; at least one upper-case letter and is string-equal() to the inflected form ;;; (which tends to be true for proper names at least :-) or (b) invoke the old ;;; heuristics to try and guess appropriate capitalization. still not quite a ;;; perfect solution, but to do better i now think the morphology would have to ;;; stop upcasing things as soon as one of the inflectional rules applies. ;;; (18-dec-06; oe) ;;; (defun gen-extract-surface (edge &optional (initialp t) &key cliticp stream) (if stream (let ((daughters (edge-children edge))) (if daughters (loop for daughter in daughters for foo = initialp then nil do (setf cliticp (gen-extract-surface daughter foo :cliticp cliticp :stream stream)) #+:logon finally #+:logon (setf (edge-lnk edge) (mrs::combine-lnks (edge-lnk (first daughters)) (edge-lnk (first (last daughters)))))) (let* ((entry (get-lex-entry-from-id (first (edge-lex-ids edge)))) (orth (format nil "~{~a~^ ~}" (lex-entry-orth entry))) ;; ;; need to fix-up irregular cases like `Englishmen' manually :-{ ;; (orth (if (ppcre::scan "man$" orth) (subseq orth 0 (- (length orth) 3)) orth)) (tdfs (and entry (lex-entry-full-fs entry))) (type (and tdfs (type-of-fs (tdfs-indef tdfs)))) (string (string-downcase (copy-seq (first (edge-leaves edge))))) ;; ;; _fix_me_ ;; maybe we could be more courageous and just search for .orth. ;; as a sub-sequence of .string., starting at position .prefix. ;; (22-dec-06; oe) (prefix (loop for c across string while (member c '(#\( #\" #\') :test #'char=) count 1)) (suffix (min (length string) (+ prefix (length orth)))) (suffix (when (string-equal orth string :start2 prefix :end2 suffix) suffix)) (rawp (and suffix (loop for c across orth thereis (upper-case-p c)))) (capitalizep (ignore-errors (loop for match in '(proper-noun-le noun-and-proper-name-le proper-noun-name-le proper-noun-det-le anrede-title-noun-le title-noun-le anrede-form-noun-le count-title-noun-le deverbal-noun-le count-noun-le count-noun-mass-unit-le adj-count-noun-le count-noun-t-le inf-count-noun-le currency-noun-le unknown-currency-noun-le unit-noun-le unknown-date-noun-le unknown-percent-noun-le pp-noun-le scomp-noun-le int-clause-noun-le mass-noun-le mass-pp-noun-le relational-noun-le month-noun-le month-mod-noun-le day-noun-le time-mod-noun-le time-mod-noun-ty-le cardyear-mod-noun-ty-le special-count-noun-le special-mass-noun-le letter-noun-le) thereis (or (eq type match) (subtype-p type match))))) (cliticp (or cliticp (and (> (length string) 0) (char= (char string 0) #\'))))) (if rawp (setf string (concatenate 'string (subseq string 0 prefix) orth (subseq string suffix))) (when capitalizep (loop with spacep = t for i from 0 to (- (length string) 1) for c = (schar string i) when (char= c #\Space) do (setf spacep t) else when (char= c #\_) do (setf spacep t) (setf (schar string i) #\Space) else do (when (and spacep (alphanumericp c)) (setf (schar string i) (char-upcase c))) (setf spacep nil)))) (when (and (> (length string) 1) (char= (char string 0) #\_) (upper-case-p (char string 1))) (setf string (subseq string 1))) (when (and initialp (alphanumericp (schar string 0))) (setf (schar string 0) (char-upcase (schar string 0)))) (unless (or initialp cliticp) (format stream " ")) (let (#+:logon (start (file-position stream))) (loop with hyphenp for c across string unless (and hyphenp (char= c #\space)) do (write-char c stream) when (char= c #\-) do (setf hyphenp t) else do (setf hyphenp nil)) #+:logon (setf (edge-lnk edge) (list :characters start (file-position stream)))) ;; ;; finally, inform the caller as to whether we output something that ;; inhibits intervening space (e.g. `mid-July'). ;; (unless (string= orth "") (member (schar orth (- (length orth) 1)) '(#\-) :test #'char=))))) (let ((stream (make-string-output-stream))) (gen-extract-surface edge initialp :stream stream) (get-output-stream-string stream)))) (eval-when #+:ansi-eval-when (:load-toplevel :compile-toplevel :execute) #-:ansi-eval-when (load eval compile) (setf *gen-extract-surface-hook* 'gen-extract-surface)) (in-package "MT") #+:null (defun gg-construct-semi () (let semi (mt:construct-semi) (with-open-file (stream (format nil "~a/dfki/gg/core.smi" (getenv "LOGONROOT")) :direction :output :if-exists :supersede) (mt::print-semi semi :format :compact :stream stream)) ) )