;;; Copyright (c) 1998-2022 John Carroll, Ann Copestake, Robert Malouf, Stephan Oepen ;;; see LICENSE for conditions (in-package :lkb) ;;; Creating and running the `quick check' (QC). For motivation, algorithms and empirical ;;; results see Malouf, Carroll & Copestake (2000). As well as FS unification failures, ;;; subsumption failures are also included (this addition post-dates the original article) ;;; allowing quick check to be used also as a subsumption pre-filter. ;;; The set of quickcheck paths are held in the parameter *check-paths* which is set in one ;;; of the grammar files. *check-paths* is used to construct the value of *check-paths-tree* ;;; which is what is used internally (defvar *check-paths-tree* nil) #+:sbcl (declaim (sb-ext:always-bound *check-paths-tree*)) (defvar *fail-paths-and-unifs*) (defvar *fail-paths*) (deftype qc-index () '(unsigned-byte 8)) (declaim (type qc-index *check-path-count*)) ; help compiler and defend against bad values (defstruct qcleaf (index 0 :type qc-index :read-only t) (spec nil :type (or hash-table symbol-type-name) :read-only t)) #+:sbcl (declaim (sb-ext:freeze-type qcleaf)) ;;; Wrapping `with-check-path-list-collection' around some call to parse or generate ;;; gathers statistics on feature paths that fail in unification and subsumption, computes ;;; from them a set of quickcheck paths, and writes these to the specified file. Any ;;; existing set of quickcheck paths is left untouched. Example calls below ;;; ;;; NB quickcheck is turned off during quickcheck path computation, so parsing / generation ;;; could be slow and pre-filtering of parsing tasks will be weaker - potentially causing ;;; the parse agenda to exceed reasonable size limit. #| (with-check-path-list-collection "/tmp/checkpaths.lsp" (do-parse-tty "Devito manages a programmer Abrams interviewed and Browne hired")) (with-check-path-list-collection "/tmp/checkpaths.lsp" (parse-sentences "~/Documents/grammars/erg/lkb/checkpaths.items" t)) (with-check-path-list-collection "/tmp/checkpaths.lsp" (generate-from-mrs-file "~/Documents/grammars/erg/tsdb/gold/mrs/result" t)) |# (defmacro with-check-path-list-collection (&whole whole output-file &body forms) `(let ((*recording-fail-paths-p* t) (*fail-paths-and-unifs* nil) (.completedp. nil)) (unwind-protect (multiple-value-prog1 ;; disable existing quickcheck, compute full forest and unpack at most 1 result (let ((*check-path-count* 0) (*check-paths-tree* nil) (*first-only-p* 1) (*gen-first-only-p* 1) (*unpacking-scoring-hook* (constantly 0.0)) (*show-parse-p* nil)) ,@forms) (setq .completedp. t)) (when .completedp. (format t "~%Extracting paths...") (force-output) (let ((check-paths (check-path-convert (extract-check-paths *fail-paths-and-unifs*)))) (with-open-file (str ,output-file :direction :output :if-exists :supersede :if-does-not-exist :create) (with-standard-io-syntax (with-package (:lkb) (format str "#|~%Check paths created from execution of~% ~S~%with grammar ~A on ~A~%|#~%" ',whole (get-grammar-version) (current-time :long t)) (format str "(CL:IN-PACKAGE #:LKB)~%") (format str "(DEFPARAMETER *CHECK-PATHS*~% '(~{~S~^~% ~}))~%" check-paths))))) (format t "~&Wrote file ~A~%" (truename ,output-file)))))) (defun call-with-fail-paths-recording (f dag1 dag2 &rest args) ;; called from a unification / subsumption /etc function when *recording-fail-paths-p* ;; is non-nil - calls f on dag1, dag2 and any extra args, then accumulates fail paths ;; + unification pairs after calls to record-fail-path (declare (dynamic-extent args)) (let* ((*fail-paths* nil) (res (multiple-value-list (apply f dag1 dag2 args)))) (if *fail-paths* (loop with paths = (loop for path in *fail-paths* when (and (existing-dag-at-end-of dag1 path) (existing-dag-at-end-of dag2 path)) collect path) with npaths = (length paths) for path in paths for item = (assoc path *fail-paths-and-unifs* :test #'equal) do (unless item (setq item (cons path (make-hash-table))) (setq *fail-paths-and-unifs* (nconc *fail-paths-and-unifs* (list item)))) ; found later => less frequent (setf (gethash *unify-generation* (cdr item)) npaths) finally (return nil)) ; indicate overall failure since there were fail-paths (values-list res)))) (defun record-fail-path (path) ;; we need all fresh conses when recording path since it was likely stack-allocated (push (reverse path) *fail-paths*)) ;;; Take a list of paths, each paired with a set whose elements are integers ;;; representing unifications and subsumptions that failed, and order the paths ;;; so that the first k paths cover as many of these failures as possible. ;;; This is an instance of the maximum coverage problem (Cormen et al. 2009 ;;; Introduction to Algorithms, 3rd Edition, 35.3). We use the standard greedy search, ;;; which seems to be the only practical approach since at this point k is not fixed ;;; (it can be changed at grammar load time via the parameter *check-path-count*). (defun extract-check-paths (fail-paths-and-unifs) (labels ((greedy-failure-cover (fail-paths-and-unifs) (when fail-paths-and-unifs (let ((max-item ;; path+set accounting for largest number of unification/subsumption failures (reduce #'(lambda (x y) (if (> (hash-table-count (cdr x)) (hash-table-count (cdr y))) x y)) fail-paths-and-unifs))) ;; erase each such failure u from all remaining paths p; each u is associated with ;; a count n of the number of paths it's in, which allows us to stop looking for u ;; as soon as we've found n paths where u is present - giving a substantial speed-up (cons (cons (car max-item) (hash-table-count (cdr max-item))) (let ((rest-items (remove max-item fail-paths-and-unifs :test #'eq))) (maphash #'(lambda (u n) (decf n) ; discount the occurrence of u in max-item (loop for (p . unifs) in rest-items while (> n 0) do (when (remhash u unifs) (decf n)))) (cdr max-item)) ;; repeat, dropping any paths whose failures have all been erased (greedy-failure-cover (remove-if #'(lambda (x) (zerop (hash-table-count (cdr x)))) rest-items)))))))) (greedy-failure-cover ;; an initial sort on set size massively speeds up erasing failures (sort fail-paths-and-unifs #'> :key #'(lambda (x) (hash-table-count (cdr x))))))) #| (defun extract-check-paths (fail-paths-and-unifs) ;; naive version which just returns paths in order of decreasing number of fails (sort (mapcar #'(lambda (item) (cons (car item) (hash-table-count (cdr item)))) fail-paths-and-unifs) #'> :key #'cdr)) |# ;;; Interactive interface (defun interactive-create-check-paths nil (let* ((test-file (ask-user-for-existing-pathname "Checkpaths sample file?")) (output-file (and test-file (ask-user-for-new-pathname "Checkpaths output file?")))) (when (and test-file output-file) (with-check-path-list-collection output-file (parse-sentences test-file t)) (format t "~%Grammar loading script should contain:~ ~%(lkb-load-lisp (this-directory) t)~%")))) ;;; Compute *check-paths-tree* - the 'recipe' for computing a quickcheck ;;; vector that's specific to this particular grammar version. ;;; Called after type hierarchy has been processed and constraints expanded. ;;; First used later on in the grammar loading process to fill the ;;; daughters-restricted field of rules. Each qc vector element corresponds to a ;;; feature path, and can contain one of 3 things: nil, a type name (a symbol), or an ;;; embedding of the type with respect to a portion of the type hierarchy (represented ;;; as a small integer). ;;; ;;; JAC 4-Dec-20: improved the latter case, so that only each leaf type descendant of ;;; the maximal type for the feature path has a bit set. This approach is essentially ;;; the 'compact encoding' in Ait-Kaci et al.'s 1989 paper 'Efficient Implementation ;;; of Lattice Operations' (section 5.1). Logical AND-ing two such integers tells us ;;; whether there is a greatest common subtype, and we can also check subsumption ;;; with LOGAND / LOGANDC1. Here, a qc vector element is nil if that path does not ;;; exist in the FS, or if the type at the end of the path is the maximal type for ;;; that path. Integer embedding paths can be compared far more quickly than type name ;;; paths, so paths are re-sorted with all of the former preceding all of the latter. ;;; ;;; NB Encoding only descendants that are leaves gives a big saving on bits. It still ;;; allows us to correctly determine whether two input types have a greatest common ;;; subtype (although we can't actually identify that subtype). However, for type ;;; subsumption we get false positive results in cases where two distinct input types ;;; have the same leaf descendents; in such cases the types appear to be the same, and ;;; therefore mutually subsume each other - but in fact the only possibilities are ;;; subsumption in one direction or no subsumption relationship at all. This issue could ;;; be fixed by allocating an extra bit to resolve each such ambiguity, but here it's ;;; not worth it. ;;; ;;; The embedding representation gives type unification and subsumption in very few ;;; machine instructions; specifically there are no table lookups, as is required in any ;;; approach based on memoisation. It's of particular benefit in the LKB since it also ;;; avoids indirections to get from type name symbols to corresponding integer indices. (defparameter *print-qc-encodings* nil "if set, information is printed about quickcheck vector elements and their encodings") (defconstant +qc-integer-width+ (1+ (integer-length most-positive-fixnum))) ; both +ve and -ve (defun optimise-check-unif-paths () (setq *check-paths-tree* nil) (when (find :vanilla *features*) (return-from optimise-check-unif-paths nil)) (let ((paths-and-freqs (loop with warned = nil for path-and-freq in *check-paths* when (and (or (and (consp path-and-freq) ; defensive since potentially user-editable (listp (car path-and-freq)) (null (cdr (last (car path-and-freq)))) (every #'symbolp (car path-and-freq)) (typep (cdr path-and-freq) '(integer 1 *))) (progn (format t "~%WARNING: Invalid item ~A in *check-paths* - ignoring it" path-and-freq) nil)) (loop for feat in (car path-and-freq) always (or (maximal-type-of feat) ; feature known? (progn (unless (member feat warned :test #'eq) (format t "~%WARNING: *check-paths* contains unknown feature ~A - ignoring path(s) concerned" feat) (push feat warned)) nil)))) collect path-and-freq))) (let ((paths-and-specs (let ((*type-representations* nil)) (declare (special *type-representations*)) (loop for (path . freq) in paths-and-freqs repeat *check-path-count* collect (optimise-check-unif-path path freq))))) (setq paths-and-specs (stable-sort paths-and-specs #'(lambda (x y) (and (not (symbolp x)) (symbolp y))) ; prioritise integer embeddings :key #'cdr)) (loop for (path . spec) in paths-and-specs for n from 0 do (setq *check-paths-tree* (add-path-to-tree path (make-qcleaf :index n :spec spec) *check-paths-tree*))) t))) (defun add-path-to-tree (p v tree) (if p (let ((branch (find (car p) tree :key #'(lambda (x) (and (consp x) (car x)))))) (if branch (progn (setf (cdr branch) (add-path-to-tree (cdr p) v (cdr branch))) tree) (append tree (list (reduce #'list p :from-end t :initial-value v))))) (cons v tree))) (defun optimise-check-unif-path (path freq) (declare (special *type-representations*) (ignore freq)) (cons path (let ((type (if (null path) *toptype* (minimal-type-for (car (last path)))))) (when *print-qc-encodings* (format t "~&~:A has value of type ~A~%" path type)) (or (getf *type-representations* type) (setf (getf *type-representations* type) (let* ((subs (retrieve-descendants type)) (bit-subs (select-types-getting-bits type subs))) (if bit-subs ;; for type and its subtypes, compute a mapping from ty to an integer ;; with a 1 bit for each of ty's subtypes that are in bit-subs (loop with nsubs = (length subs) with map = (make-hash-table :test #'eq :size (floor (* nsubs 1.5))) for ty in (cons (get-type-entry type) subs) for val = nil then (loop with i = 0 for td in (cons ty (ltype-descendants ty)) for pos = (position td bit-subs) do (etypecase pos (null) ((eql #.(1- +qc-integer-width+)) (setq i (logior i most-negative-fixnum))) ((integer 0 #.(- +qc-integer-width+ 2)) (setq i (dpb 1 (byte 1 pos) i)))) finally (return i)) do (setf (gethash (ltype-name ty) map) val) finally (return map)) type))))))) (defun select-types-getting-bits (type subs) (if (string-type-p type) (progn (when *print-qc-encodings* (format t "~& ~A may take a string value so cannot use bit encoding~%" type)) nil) (let* ((leaves/1-child (remove-if-not #'(lambda (ty) (<= (length (ltype-daughters ty)) 1)) subs)) (leaves (remove-if-not #'(lambda (ty) (null (ltype-daughters ty))) subs)) (res (loop for cand in (list leaves/1-child leaves) when (<= (length cand) +qc-integer-width+) return cand))) (when *print-qc-encodings* (format t "~& ~A has ~D subtypes (~D leaves/1-child, ~D leaves) - ~ ~:[too many for bit encoding~;encoding in ~D bits~]~%" type (length subs) (length leaves/1-child) (length leaves) res (length res))) res))) ;;; Compute quickcheck vector for a dag, and check two such vectors for (unification) ;;; compatibility or subsumption (declaim (inline restrict-fs-type-representation)) (defun restrict-fs-type-representation (type spec) ;; compute the qc representation of a grammar type according to spec argument: this ;; is either the type itself, nil (if it's the maximal type for the value of the ;; feature concerned), or an integer embedding of the type (covering only the part ;; of the hierarchy below that maximal type, looked up in a hash table) (etypecase spec (symbol-type-name (if (eq type spec) ; maximal type for feature concerned? nil type)) (hash-table (gethash type spec)))) (defun restrict-fs (fs) (let ((vals (make-array *check-path-count* :initial-element nil))) (labels ((traverse-qctree (tree d) (declare (type dag d) (inline get-dag-value)) (dolist (branch tree) (etypecase branch (cons (let ((v (get-dag-value d (car branch)))) (when v (traverse-qctree (cdr branch) v)))) (qcleaf (setf (svref vals (qcleaf-index branch)) (restrict-fs-type-representation (type-of-fs d) (qcleaf-spec branch)))))))) (traverse-qctree *check-paths-tree* fs) vals))) (defmacro type-bit-representation-p (x) `(typep ,x 'fixnum)) (defun restrictors-compatible-p (daughter-restricted child-restricted) (declare (simple-vector daughter-restricted child-restricted)) (loop for dt across daughter-restricted for ct across child-restricted always (cond ((or (eq dt ct) (null dt) (null ct))) ((and (type-bit-representation-p dt) (type-bit-representation-p ct)) ;; fixnum (bit) encodings (not (zerop (logand dt ct)))) (t ;; type name symbols / strings (greatest-common-subtype dt ct))))) (defun restrictors-subsuming-p (restricted1 restricted2 genp) ;; check whether the corresponding pairs of types in the restrictors are related by ;; subsumption in one direction and/or the other; also if genp=t, whether one type in the ;; pair subsumes the other in either direction (so there could possibly be a dag that ;; generalises those the restrictors were derived from - in the limited sense used by the ;; parser) (declare (simple-vector restricted1 restricted2)) (loop with forwardp = t ; t1 subsumes or is equal to t2 and backwardp = t ; vice versa for t1 across restricted1 for t2 across restricted2 do (flet ((not-forward () (if (or backwardp genp) (setq forwardp nil) (return nil))) (not-backward () (if (or forwardp genp) (setq backwardp nil) (return nil)))) (cond ((eq t1 t2)) ((null t1) (not-backward)) ; since t2 must be non-nil ((null t2) (not-forward)) ((and (type-bit-representation-p t1) (type-bit-representation-p t2)) ;; fixnum (bit) encodings (unless (= t1 t2) ; compiler can elide if fixnums satisfy not-eq => not-= (let ((gcs (logand t1 t2))) (cond ((= gcs t1) (not-forward)) ((= gcs t2) (not-backward)) (t (return nil)))))) ; !!! prevents any LCS generalisation on this path (t ;; type name symbols / strings (let ((gcs (greatest-common-subtype t1 t2))) ; eq case checked already (cond ((eq gcs t1) (not-forward)) ((eq gcs t2) (not-backward)) (t (return nil))))))) ; !!! ditto LCS generalisation finally (return (values forwardp backwardp genp)))) ;;; Compute quickcheck vector for a dag while inside scope of a set of unifications ;;; e.g. during an attempt to apply a grammar rule (defun unify-restrict-fs (fs) ;; fs assumed to have already been dereferenced (let ((vals (make-array *check-path-count* :initial-element nil))) (labels ((unify-traverse-qctree (tree d) (declare (type dag d) (inline unify-get-dag-value)) (dolist (branch tree) (etypecase branch (cons (let ((v (unify-get-dag-value d (car branch)))) (when v (unify-traverse-qctree (cdr branch) (deref-dag v))))) (qcleaf (setf (svref vals (qcleaf-index branch)) (restrict-fs-type-representation (unify-get-type d) (qcleaf-spec branch)))))))) (unify-traverse-qctree *check-paths-tree* fs) vals))) (defun x-restrict-and-compatible-p (fs child-restricted) ;; Deprecated. ;; Not on a critical execution path, so turned into a no-op. A complete version would ;; include most of unify-restrict-fs and restrictors-compatible-p but with ;; (svref child-restricted (qcleaf-index branch)). Also it could only sensibly check ;; types in the order they occur in *check-paths-tree* rather than in quickcheck vector (declare (ignore fs child-restricted)) t)