;;;-*- Mode: Lisp; Package: CONSTRAINT-GRAMMAR-PARSER -*- (in-package :cgp) ;; see fsa::resolve() for string matching (defmethod token-class ((sentence-class (eql 'regexp-sentence))) *token-class*) ;; convenience function (defun filter (string &key (regexp *regexp-parser*) transduce-p) (let ((*sentence-class* 'regexp-sentence) (*token-class* 'regexp-token) (*cg* (gethash "nbo" *cg-table*))) (disambiguate-from-string string :cg *cg* :stream *standard-output* :print-function (lambda (sentence &key stream &allow-other-keys) (print-matches (setf *sentence* (regexp-filter-sentence *cg* sentence :regexp regexp :transduce-p transduce-p)) :stream stream :expand-tokens-p nil :transduce-p transduce-p))))) #+test (filter "Hun kommer fra Universitetet i Bonn, Det lille kaffekompaniet.") (defmethod print-matches ((sentence regexp-sentence) &rest rest &key stream transduce-p &allow-other-keys) (let ((first-p t)) (dolist (match (reverse (matches sentence))) (if first-p (setf first-p nil) (progn (write-line "---------------------------------------------------------------------" stream) (terpri stream))) (if transduce-p (dolist (token+nf (caddr match)) (destructuring-bind (token . new-features) token+nf (unless (find 'ignore new-features) (apply #'print-token token :additional-features new-features rest)))) (labels ((walk (token last) (cond ((null token) nil) ((eq token last) (apply #'print-token token rest)) (t (apply #'print-token token rest) (walk (token-next token) last))))) (walk (car match) (cadr match)))))) sentence) (defmethod regexp-filter-sentence ((cg ne-constraint-grammar) (sentence regexp-sentence) &key transduce-p (regexp (regexp-parser cg)) (use-lc-features-p t) (no-overlap-p t)) (if transduce-p (transduce-boolean-list-regexp sentence (first-token sentence) regexp :use-lc-features-p use-lc-features-p :no-overlap-p no-overlap-p :maximal-p t) (match-boolean-list-regexp sentence (first-token sentence) regexp :use-lc-features-p use-lc-features-p :no-overlap-p no-overlap-p))) (defun boolean-list-delta-get (token state delta &key use-lc-features-p) (let ((relation (fsa::relation-get state delta)) (features-list (or (and use-lc-features-p (append (token-features token) (lc-features token))) (token-features token))) (result ())) (when relation (dolist (lemma.feature-vector features-list) (when lemma.feature-vector (let* ((features (cons (car lemma.feature-vector) (code-features (cdr lemma.feature-vector)))) (states (fsa::relation-subsumed-get features relation))) ;; STATE ist a list of state id + new features list (cp. fsa::KEY-SUBSUME) (dolist (state states) (pushnew (append state (list features)) result :test #'equal))))) result))) (defmethod transduce-boolean-list-regexp ((sentence regexp-sentence) start-token (fr feature-regexp) &key use-lc-features-p ;; look only at phrases beginning with UC first-uppercase-p (ignore-non-string-tokens-p t) (no-overlap-p t) (maximal-p t)) (let ((dfa (fsa::regexp-dfa fr))) (cond ((null start-token) sentence) ((not (and (stringp (token-value start-token)) (or (not first-uppercase-p) (first-uppercase-p (token-value start-token))))) (transduce-boolean-list-regexp sentence (token-next start-token) fr :use-lc-features-p use-lc-features-p :no-overlap-p no-overlap-p)) (t (let ((delta (fsa:fsa-delta dfa)) (start-state (fsa:fsa-start-state dfa)) (longest-match-token nil)) (labels ((walk (state token) (collecting (let ((new-states (boolean-list-delta-get token state delta :use-lc-features-p use-lc-features-p))) (dolist (new-state+morph new-states) (destructuring-bind (new-state new-features . morph) new-state+morph (declare (ignore morph)) (let ((final-p (fsa:set-member-p new-state (fsa:fsa-final-states dfa)))) (when (and final-p (token< longest-match-token token)) (setf longest-match-token token) (collect (list (cons token new-features)))) (when (or (not final-p) maximal-p) (let ((chain (if ignore-non-string-tokens-p (when-let (next-token (next-str-token token)) (walk new-state next-token)) (when (token-next token) (walk new-state (token-next token)))))) (cond ((null chain) nil) ((find 'ignore new-features) (collect-append chain)) (t (collect (cons (cons token new-features) chain))))))) #+old (cond ((not (fsa:set-member-p new-state (fsa:fsa-final-states dfa))) (let ((chain (if ignore-non-string-tokens-p (when-let (next-token (next-str-token token)) (walk new-state next-token)) (when (token-next token) (walk new-state (token-next token)))))) (cond ((null chain) nil) ((find 'ignore new-features) (collect-append chain)) (t (collect (cons (cons token new-features) chain)))))) ((token< longest-match-token token) (setf longest-match-token token) (collect (list (cons token new-features)))) (t ;; match, but not longest nil)))))))) ;; fetch longest match (labels ((walk-result (branch) (cond ((null branch) nil) ((eq (caar branch) longest-match-token) (list (car branch))) (t (block find (mapc (lambda (sub-branch) (when-let (chain (walk-result sub-branch)) (return-from find (cons (car branch) chain)))) (cdr branch))))))) (let* ((match (block find (mapc (lambda (sub-branch) (when-let (chain (walk-result sub-branch)) (return-from find chain))) (walk start-state start-token)))) (first (caar match)) (last (caar (last match)))) (cond (match (map-tokens sentence (lambda (token) (setf (match token) (cond ((eq token first) (if (eq token last) :whole-match :match-start)) ((eq token last) :match-end) (t :match)))) :start start-token :walk-function (lambda (token) (unless (eq token last) (token-next token)))) (push (list first last match nil nil) (matches sentence)) (transduce-boolean-list-regexp sentence (if no-overlap-p (token-next longest-match-token) (token-next start-token)) fr :use-lc-features-p use-lc-features-p :no-overlap-p no-overlap-p)) (t (transduce-boolean-list-regexp sentence (token-next start-token) fr :use-lc-features-p use-lc-features-p :no-overlap-p no-overlap-p))))) sentence)))))) (defmethod match-boolean-list-regexp ((sentence regexp-sentence) start-token (fr feature-regexp) &key (sentence-start-token start-token) use-lc-features-p ;; look only at phrases beginning with UC first-uppercase-p (ignore-non-string-tokens-p t) (no-overlap-p t)) #+debug(print (cons :start-token start-token)) (let ((dfa (fsa::regexp-dfa fr))) (cond ((null start-token) sentence) ((and (stringp (token-value start-token)) first-uppercase-p (eq start-token sentence-start-token)) (match-boolean-list-regexp sentence (token-next start-token) fr :sentence-start-token (when (and sentence-start-token (or (quote-token-p start-token) (stroke-token-p start-token))) (token-next start-token)) :use-lc-features-p use-lc-features-p :ignore-non-string-tokens-p ignore-non-string-tokens-p :first-uppercase-p first-uppercase-p :no-overlap-p no-overlap-p)) ((and (stringp (token-value start-token)) (or (not first-uppercase-p) (first-uppercase-p (token-value start-token)))) (let ((delta (fsa:fsa-delta dfa)) (start-state (fsa:fsa-start-state dfa)) (longest-match-token nil) (head-token nil) (head-token-features nil)) (labels ((walk (state token head head-features) (let ((new-states (boolean-list-delta-get token state delta :use-lc-features-p use-lc-features-p))) (dolist (new-state+morph new-states) (destructuring-bind (new-state new-features . morph) new-state+morph (declare (ignore morph)) ;; new for logon, buggy!! #+ignore (dolist (f new-features) (unless (eq f 'head) (dolist (pair (token-features token)) (set-feature (cdr pair) f)))) (let ((head (if (find 'head new-features) token head)) (head-features (if (find 'head new-features) (remove 'head new-features) head-features))) (when (and (fsa:set-member-p new-state (fsa:fsa-final-states dfa)) (token< longest-match-token token)) #+debug(print (list :match start-token token)) (setf longest-match-token token head-token head head-token-features head-features)) (if ignore-non-string-tokens-p (when-let (token (next-str-token token)) (walk new-state token head head-features)) (when (token-next token) (walk new-state (token-next token) head))))))))) (walk start-state start-token nil nil) (cond (longest-match-token #+debug (print (list :start-token start-token :longest-match-token longest-match-token)) (map-tokens sentence (lambda (token) (setf (match token) (cond ((eq token start-token) (if (eq token longest-match-token) :whole-match :match-start)) ((eq token longest-match-token) :match-end) (t :match)))) :start start-token :walk-function (lambda (token) (unless (eq token longest-match-token) (token-next token)))) #+debug(print (list :match start-token longest-match-token)) (push (list start-token longest-match-token head-token nil head-token-features) (matches sentence)) (match-boolean-list-regexp sentence (if no-overlap-p (token-next longest-match-token) (token-next start-token)) fr :sentence-start-token nil :use-lc-features-p use-lc-features-p :ignore-non-string-tokens-p ignore-non-string-tokens-p :first-uppercase-p first-uppercase-p :no-overlap-p no-overlap-p)) (t (match-boolean-list-regexp sentence (token-next start-token) fr :sentence-start-token nil :use-lc-features-p use-lc-features-p :ignore-non-string-tokens-p ignore-non-string-tokens-p :first-uppercase-p first-uppercase-p :no-overlap-p no-overlap-p))) sentence))) (t (match-boolean-list-regexp sentence (token-next start-token) fr :sentence-start-token (when sentence-start-token (token-next start-token)) :use-lc-features-p use-lc-features-p :ignore-non-string-tokens-p ignore-non-string-tokens-p :first-uppercase-p first-uppercase-p :no-overlap-p no-overlap-p))))) #+test (defmethod analyze-boolean-list-regexp ((sentence list) (fsa fsa::boolean-list-dfa)) (let ((delta (fsa:fsa-delta fsa)) (start-state (fsa:fsa-start-state fsa)) (length (length sentence)) (result nil)) (labels ((walk (state pos list) (let* ((token (nth pos sentence)) (new-states (boolean-list-delta-get token state delta))) (dolist (new-state+morph new-states) (let ((new-state (car new-state+morph))) (when (fsa:set-member-p new-state (fsa:fsa-final-states fsa)) (setf result token)) (unless (>= pos (1- length)) (walk new-state (+ pos 1) (cons token list))))) (unless (= pos (1- length)) (walk state (1+ pos) list))))) (walk start-state 0 nil) result))) :eof