;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp -*- (in-package :cgp) ;; translates from CG features to Norgram features (defun ner-norgram-features (word cg-features) #+debug(print (list :word word)) #+debug(morph-server::print-norgram-morphology word) (collecting (dolist (reading cg-features) (when reading (collect (cons (car reading) (sort (collecting (let ((features (code-features (cdr reading)))) #+debug(let ((*package* (find-package :cgp))) (print (list :features features))) (dolist (f features) (if (string-equal f "subst") (unless (find "prop" features :test #'string-equal) (collect "+Prop")) (collect-append (cdr (find f *name-features-mapping-table* :test #'string-equal :key #'car))))))) (lambda (f1 f2) (< (or (position f1 morph-server::*bm-morph-feature-vector* :test #'string=) 0) (or (position f2 morph-server::*bm-morph-feature-vector* :test #'string=) 0)))))))))) (defmethod get-ner-token ((token token) &key (stream *standard-output*) concat-token additional-features (print-lc-features t) expand-tokens-p &allow-other-keys) #+debug(print (list :token token :match (match token) :exp (token-expansion token))) (let ((value (effective-token-value token)) (features (remove-feature-inclusions (or (token-features token) (when concat-token (token-features concat-token))))) (concat-features (when concat-token (token-features concat-token))) (lc-features (when print-lc-features (or (lc-features token) (when concat-token (lc-features concat-token))))) (attributes (token-attributes token))) #+debug(print (list :token token :features features :concat-features concat-features :lc-features lc-features)) (cond ((not (stringp value)) nil) ((not expand-tokens-p) (list (token-value token) nil (or (morph-server::cg-to-norgram-features value features) (ner-norgram-features value features)))) (t #+debug(print (list (token-value token) (match token) (or concat-features features) (morph-server::cg-to-norgram-features value (or concat-features features)) (ner-norgram-features value (or concat-features features)))) (list (token-value token) (match token) (or (morph-server::cg-to-norgram-features value (or concat-features features)) (ner-norgram-features value (or concat-features features)))))))) #+test (print (named-entity-tokenize "Grønolen fjellgård, Beito, 50 senger, 8 leiligheter, tlf.: 61 35")) (defun named-entity-tokenize (string &key preparse-p) (let* ((*cg* (gethash "nbo" *cg-table*)) (cgp::*tag-as-multi-word-expression* nil) (cgp::*merge-hyphenated-words-p* nil) (cgp::*no-compounds-p* (not preparse-p)) (prev-dig-morph nil)) (collecting (labels ((ner-tokenize-sentence (sentence &rest rest &key (expand-tokens-p t) &allow-other-keys) (labels ((walk (token concat-token) #+debug(when token (print (list :pos (list (token-stream-position token) token)))) #+debug(print (list token (token-expansion token) concat-token)) (cond ((null token) nil) ((and expand-tokens-p (token-expansion token) ;; avoid nesting (not concat-token)) (do ((ex-token (car (token-expansion token)) (token-next ex-token))) ((eq ex-token (cdr (token-expansion token))) (walk ex-token token)) (walk ex-token token)) (walk (token-next token) nil)) (t ;; fixme: do the below special things for preparse mode, too ;;(describe token) (let ((pos (token-stream-position token)) (word-morph (apply #'get-ner-token token :expand-tokens-p expand-tokens-p :concat-token concat-token rest))) #+debug(print (list :word-morph word-morph :prev-dig-morph prev-dig-morph)) (cond ((or preparse-p (not (stringp (car word-morph)))) (collect (cons pos word-morph))) ((not (find-if-not (lambda (c) (find c "1234567890-")) (car word-morph))) (labels ((split-dig (dig start) (let ((h-pos (position #\- dig :start start))) (cond (h-pos (unless (= h-pos start) (collect (list pos (subseq dig start h-pos) nil nil))) (collect (list (+ pos h-pos) "-" nil nil)) (unless (= h-pos (1- (length dig))) (split-dig dig (1+ h-pos)))) (prev-dig-morph (unless (caddr prev-dig-morph) (setf (caddr prev-dig-morph) :dig-match-start)) (collect (setf prev-dig-morph (list (+ pos start) (subseq dig start) (if (eq token (last-token sentence)) :dig-match-end :match) nil)))) (t (collect (setf prev-dig-morph (list (+ pos start) (subseq dig start) nil nil)))))))) (split-dig (car word-morph) 0))) (t (case (caddr prev-dig-morph) (:match (setf (caddr prev-dig-morph) :dig-match-end)) (:dig-match-start (setf (caddr prev-dig-morph) nil))) (setf prev-dig-morph nil) (cond ;; make this a general thing? ((find (car word-morph) '("m.o.h") :test #'string=) (collect (list pos "m" nil nil)) (collect (list (+ pos 2) "o.h" nil nil))) ((find (car word-morph) '("moh") :test #'string=) (collect (list pos "m" nil nil)) (collect (list (+ pos 1) "o.h" nil nil))) ((find (car word-morph) '("m.o.h.") :test #'string=) (collect (list pos "m" nil nil)) (collect (list (+ pos 2) "o.h." nil nil))) ((find (car word-morph) '("moh.") :test #'string=) (collect (list pos "m" nil nil)) (collect (list (+ pos 1) "o.h." nil nil))) #+test ;; not sure why this is here ((and (> (length (car word-morph)) 2) (char= (last-char (car word-morph)) #\-)) (collect (list* pos (subseq (car word-morph) 0 (1- (length (car word-morph)))) (cdr word-morph))) (collect (list* (+ pos (1- (length (car word-morph)))) "-" (case (cadr word-morph) ((:match-start :match) :match) (otherwise nil)) (cddr word-morph)))) (t (collect (cons pos word-morph))))))) (unless (or (eq token (last-token sentence)) concat-token) (walk (token-next token) nil)))))) (walk (first-token sentence) nil) (case (cadr prev-dig-morph) (:match (setf (cadr prev-dig-morph) :dig-match-end)) (:dig-match-start (setf (cadr prev-dig-morph) nil)))) sentence)) (let ((*tag-from-gazetteer* t) (*sentence-initial-lc-features-p* t)) (disambiguate-from-string string :cg *cg* :tagging-niveau :syntactic-named-entity-disambiguation-logon ;;:total-disambiguate-p t :print-function #'ner-tokenize-sentence)))))) ;; derived from cgp::*inflectional-features-mapping-table* (defparameter *name-features-mapping-table* '(("prop" "+Prop") ("adj") ;; only together with ("" "+PastPart") ("be" "+Def") ("ent" "+Sg") ("fl" "+Pl") ("m/f" "+MF") ("ub" "+Indef") ("nøyt" "+Neut") ("" "+PresPart" "+MFN") ("akk" "+Acc") ("fem" "+Fem") ("imp" "+Impv") ("" "+SForm") ("inf" "+Infin") ("pres" "+Pres") ("pass" "+SForm") ("komp" "+Comp" "+MFN" "+NoDef" "+SP") ("mask" "+Masc") ("nom" "+Nom") ("perf-part" "+PastPart") ("pos" "+Pos") ("pret" "+Past") ("sup" "+Sup" "+MFN" "+Def" "+SP"))) #+test (print (named-entity-tokenize "Vi er på Universitetet i Ljubljana og på Universitetet i Bergen.")) :eof