;;;   -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp -*-

(in-package :cgp)

;; translates from CG features to Norgram features
(defun ner-norgram-features (word cg-features)
  #+debug(print (list :word word))
  #+debug(morph-server::print-norgram-morphology word)
  (collecting
    (dolist (reading cg-features)
      (when reading
	(collect
	 (cons (car reading)
	       (sort
		(collecting
		  (let ((features (code-features (cdr reading))))
		    #+debug(let ((*package* (find-package :cgp)))
			     (print (list :features features)))
		    (dolist (f features)
		      (if (string-equal f "subst")
			  (unless (find "prop" features :test #'string-equal)
			    (collect "+Prop"))
			  (collect-append (cdr (find f *name-features-mapping-table* :test #'string-equal :key #'car)))))))
		(lambda (f1 f2)
		  (< (or (position f1 morph-server::*bm-morph-feature-vector* :test #'string=) 0)
		     (or (position f2 morph-server::*bm-morph-feature-vector* :test #'string=) 0))))))))))

(defmethod get-ner-token ((token token)
			  &key (stream *standard-output*)
			  concat-token
			  additional-features
			  (print-lc-features t)
			  expand-tokens-p
			  &allow-other-keys)
  #+debug(print (list :token token :match (match token) :exp (token-expansion token)))
  (let ((value (effective-token-value token))
	(features (remove-feature-inclusions (or (token-features token)
						 (when concat-token (token-features concat-token)))))
	(concat-features (when concat-token (token-features concat-token)))
	(lc-features (when print-lc-features (or (lc-features token)
						 (when concat-token (lc-features concat-token)))))
	(attributes (token-attributes token)))
    #+debug(print (list :token token :features features :concat-features concat-features :lc-features lc-features))
    (cond ((not (stringp value))
 	   nil)
	  ((not expand-tokens-p)
	   (list (token-value token)
		 nil
		 (or (morph-server::cg-to-norgram-features value features)
		     (ner-norgram-features value features))))
	  (t
	   #+debug(print (list (token-value token) (match token) 
			       (or concat-features features)
			       (morph-server::cg-to-norgram-features value (or concat-features features))
			       (ner-norgram-features value (or concat-features features))))
	   (list (token-value token) (match token)
		 (or (morph-server::cg-to-norgram-features value (or concat-features features))
		     (ner-norgram-features value (or concat-features features))))))))

#+test
(print (named-entity-tokenize "Grønolen fjellgård, Beito, 50 senger, 8 leiligheter, tlf.: 61 35"))

(defun named-entity-tokenize (string &key preparse-p)
  (let* ((*cg* (gethash "nbo" *cg-table*))
	 (cgp::*tag-as-multi-word-expression* nil)
	 (cgp::*merge-hyphenated-words-p* nil)
	 (cgp::*no-compounds-p* (not preparse-p))
	 (prev-dig-morph nil))
    (collecting
      (labels ((ner-tokenize-sentence (sentence &rest rest
						&key (expand-tokens-p t)
						&allow-other-keys)
		 (labels ((walk (token concat-token)
			    #+debug(when token (print (list :pos (list (token-stream-position token) token))))
			    #+debug(print (list token (token-expansion token) concat-token))
			    (cond ((null token)
				   nil)
				  ((and expand-tokens-p (token-expansion token)
					;; avoid nesting
					(not concat-token))
				   (do ((ex-token (car (token-expansion token)) (token-next ex-token)))
				       ((eq ex-token (cdr (token-expansion token)))
					(walk ex-token token))
				     (walk ex-token token))
				   (walk (token-next token) nil))
				  (t ;; fixme: do the below special things for preparse mode, too
				   ;;(describe token)
				   (let ((pos (token-stream-position token))
					 (word-morph (apply #'get-ner-token token
							    :expand-tokens-p expand-tokens-p
							    :concat-token concat-token rest)))
				     #+debug(print (list :word-morph word-morph :prev-dig-morph prev-dig-morph))
				     (cond ((or preparse-p
						(not (stringp (car word-morph))))
					    (collect (cons pos word-morph)))
					   ((not (find-if-not (lambda (c) (find c "1234567890-")) (car word-morph)))
					    (labels ((split-dig (dig start)
						       (let ((h-pos (position #\- dig :start start)))
							 (cond (h-pos
								(unless (= h-pos start)
								  (collect (list pos (subseq dig start h-pos) nil nil)))
								(collect (list (+ pos h-pos) "-" nil nil))
								(unless (=  h-pos (1- (length dig)))
								  (split-dig dig (1+ h-pos))))
							       (prev-dig-morph
								(unless (caddr prev-dig-morph)
								  (setf (caddr prev-dig-morph) :dig-match-start))
								(collect (setf prev-dig-morph (list (+ pos start) (subseq dig start)
												    (if (eq token (last-token sentence))
													:dig-match-end
													:match)
												    nil))))
							       (t
								(collect (setf prev-dig-morph (list (+ pos start) (subseq dig start) nil nil))))))))
					      (split-dig (car word-morph) 0)))
					   (t
					    (case (caddr prev-dig-morph)
					      (:match
					       (setf (caddr prev-dig-morph) :dig-match-end))
					      (:dig-match-start
					       (setf (caddr prev-dig-morph) nil)))
					    (setf prev-dig-morph nil)
					    (cond
					      ;; make this a general thing?
					      ((find (car word-morph) '("m.o.h") :test #'string=)
					       (collect (list pos "m" nil nil))
					       (collect (list (+ pos 2) "o.h" nil nil)))
					      ((find (car word-morph) '("moh") :test #'string=)
					       (collect (list pos "m" nil nil))
					       (collect (list (+ pos 1) "o.h" nil nil)))
					      ((find (car word-morph) '("m.o.h.") :test #'string=)
					       (collect (list pos "m" nil nil))
					       (collect (list (+ pos 2) "o.h." nil nil)))
					      ((find (car word-morph) '("moh.") :test #'string=)
					       (collect (list pos "m" nil nil))
					       (collect (list (+ pos 1) "o.h." nil nil)))
					      #+test ;; not sure why this is here
					      ((and (> (length (car word-morph)) 2)
						    (char= (last-char (car word-morph)) #\-))
					       (collect (list* pos
							       (subseq (car word-morph) 0 (1- (length (car word-morph))))
							       (cdr word-morph)))
					       (collect (list* (+ pos (1- (length (car word-morph))))
							       "-"
							       (case (cadr word-morph)
								 ((:match-start :match) :match)
								 (otherwise nil))
							       (cddr word-morph))))
					      (t
					       (collect (cons pos word-morph)))))))
				   (unless (or (eq token (last-token sentence)) concat-token)
				     (walk (token-next token) nil))))))
		   (walk (first-token sentence) nil)
		   (case (cadr prev-dig-morph)
		     (:match
		      (setf (cadr prev-dig-morph) :dig-match-end))
		     (:dig-match-start
		      (setf (cadr prev-dig-morph) nil))))
		 sentence))
	(let ((*tag-from-gazetteer* t)
	      (*sentence-initial-lc-features-p* t))
	  (disambiguate-from-string string
				    :cg *cg*
				    :tagging-niveau :syntactic-named-entity-disambiguation-logon
				    ;;:total-disambiguate-p t
				    :print-function #'ner-tokenize-sentence))))))

;; derived from cgp::*inflectional-features-mapping-table*
(defparameter *name-features-mapping-table*
  '(("prop" "+Prop")
    ("adj") ;; only together with <perf-part>
    ("<perf-part>" "+PastPart")
    ("be" "+Def")
    ("ent" "+Sg")
    ("fl" "+Pl")
    ("m/f" "+MF")
    ("ub" "+Indef")
    ("nøyt" "+Neut")
    ("<pres-part>" "+PresPart" "+MFN")
    ("akk" "+Acc")
    ("fem" "+Fem")
    ("imp" "+Impv")
    ("<s-verb>" "+SForm")
    ("inf" "+Infin")
    ("pres" "+Pres")
    ("pass" "+SForm")
    ("komp" "+Comp" "+MFN" "+NoDef" "+SP")
    ("mask" "+Masc")
    ("nom" "+Nom")
    ("perf-part" "+PastPart")
    ("pos" "+Pos")
    ("pret" "+Past")
    ("sup" "+Sup" "+MFN" "+Def" "+SP")))

#+test
(print (named-entity-tokenize "Vi er på Universitetet i Ljubljana og på Universitetet i Bergen."))

:eof