;; Copyright (c) 2006
;;;   Ben Waldron;
;;; see `LICENSE' for conditions.

;;
;; code to convert SAF XML into SAF object
;;

;;
;; TODO: complete move to :saf namespace
;;

(in-package :saf)

(defun saf-lxml-to-saf-object (lxml)
  (unless (member (lxml::lxml-elt-name lxml) '(:|saf| :|smaf|))
    (error "smaf/saf element expected as body"))
  (let* ((saf-attributes
	  (lxml::lxml-elt-attributes lxml))
	 (lxml (cdr lxml))
	 (text
	  (if (eq :|text| (lxml::lxml-elt-name (car lxml)))
	      (pop lxml)))
	 (olacStr (string (lxml::lxml-elt-name (car lxml)))) 
	 (olac 
	  (if (or (string= "olac" olacStr) ;;S-xml
		  (string= "olac:olac" olacStr)) ;; pxml
	      (pop lxml))))
;	  (if (eq :|olac:olac| (lxml::lxml-elt-name (car lxml)))
;	      (pop lxml))))
    (make-saf
     :meta (get-saf-meta saf-attributes :olac olac :text text)
     :lattice (get-saf-lattice lxml))))

;; fix_me: get rid of global *config*
(defun xml-to-saf-object (xml &key (dir "~") (config (config)))
  (let ((*dir* dir))
    (saf::instantiate-l-content
     (lxml-to-saf-object (lxml::xml-to-lxml xml))
     config)
    ))

(defun lxml-to-saf-object (lxml)
  (cond
   ((null lxml)
    (error "empty xml body"))
   (t
    (saf-lxml-to-saf-object lxml))))

(defun get-saf-meta (saf-attributes &key olac text)
  (let ((doc
	 (second (member :|document| saf-attributes))))
    (if doc
	(unless (eq :absolute (car (pathname-directory (pathname doc))))
	  (setf doc
	    (merge-pathnames doc (make-pathname :directory *dir*)))))
    (make-saf-meta
     :document doc
     :addressing (get-addressing saf-attributes)
     :olac (get-olac-meta olac)
     :text text)))

(defun get-addressing (saf-attribs)
  (let ((addr (second (member :|addressing| saf-attribs))))
    (if addr
	(intern (string-downcase addr) :keyword)
      :|char|)))

(defun make-feat (x)
  (intern (string x) :keyword))

(defun get-olac-meta (lxml-olac)
  (loop
      for e in (lxml::lxml-elts lxml-olac)
      collect
	(make-saf-fv :feature (make-feat (lxml::lxml-elt-name e))
		     :value (lxml::lxml-elt-text-content e))))

#+:null
(defun make-saf-lattice-from-sequence (lxml &key init final)
  (let ((*saf-v* -1)
	nodes edges)
    (setf init (or init (format nil "v~a" (incf *saf-v*))))
    (loop
	with source = init
	with target
	for e = (pop lxml)
	while e
	do
	  (if (and (null lxml) final)
	      (setf target final)
	    (setf target (format nil "v~a" (incf *saf-v*))))
	  (when (null lxml)
	    (setf final target))
	  (push
	   (case (lxml::lxml-elt-name e)
	     (|annot|
	      (lxml-annot-to-edge e :source source :target target))
	     ;; special case
	     (|token|
	      (lxml-token-to-edge e :source source :target target))
	     ;; special case
	     (|wordForm|
	      (lxml-wordform-to-edge e :source source :target target))
	     ;; special case
	     (|sentence|
	      (lxml-sentence-to-edge e :source source :target target))
	     (t
	      (error "unhandled saf edge type: ~a" (lxml::lxml-elt-name e))))
	   edges)
	  (setf source target))
    (loop 
	for e in edges
	do
	  (pushnew (saf-edge-source e) nodes :test #'string=)
	  (pushnew (saf-edge-target e) nodes :test #'string=))
    (make-saf-lattice
     :start-node init
     :end-node final
     :nodes nodes
     :edges edges)))

(defun get-saf-lattice (lxml)
  (cond
   ((member (lxml::lxml-elt-name (car lxml)) '(:|fsm| :|lattice|));; SAF / SMAF
    (when (cdr lxml)
      (error "no elements expected after fsm/lattice"))
    (get-saf-lattice-from-fsm (car lxml)))
   (t
    (error "malformed lattice: ~a" lxml)
    )))

(defun get-saf-lattice-from-fsm (lxml-fsm)
  (let* ((fsm-attributes (lxml::lxml-elt-attributes lxml-fsm))
	 (nodes 
	  (loop for x in (lxml::lxml-elt-elts lxml-fsm :|state|) 
	      collect (lxml::lxml-elt-attr x :|id|)))
	 (token-edges 
	  (loop for e in (lxml::lxml-elt-elts lxml-fsm :|token|)
	      collect (lxml-token-to-edge e)))
	 (annot-edges 
	  (append
	   (loop for e in (lxml::lxml-elt-elts lxml-fsm :|annot|);; SAF
	       for e2 = (lxml-annot-to-edge e)
	       when e2
	       collect e2)
	   (loop for e in (lxml::lxml-elt-elts lxml-fsm :|edge|);; SMAF
	       for e2 = (lxml-annot-to-edge e)
	       when e2
	       collect e2)
	   ))
	 (wordform-edges 
	  (loop for e in (lxml::lxml-elt-elts lxml-fsm :|wordForm|) ;;shorthand
	       for e2 = (lxml-wordform-to-edge e)
	       when e2
	       collect e2)
	      
	      
	 ; collect (lxml-wordform-to-edge e))
	 )
	 (sentence-edges 
	  (loop for e in (lxml::lxml-elt-elts lxml-fsm :|sentence|) ;;shorthand
	      collect (lxml-sentence-to-edge e)))
	 (all-edges (append token-edges annot-edges 
			    wordform-edges sentence-edges))
	 (start-node (second (member :|init| fsm-attributes)))
	 (end-node (second (member :|final| fsm-attributes)))
	 )
    ;; fix missing details
    ;; ensure nodes is complete
    (loop for e in all-edges
	do
	  (pushnew (saf-edge-source e) nodes :test #'string=)
	  (pushnew (saf-edge-target e) nodes :test #'string=))
    (unless (and start-node end-node)
      ;; attempt to guess, based on string< order
      (format t "~%;;; WARNING: final/init nodes should both be specified (will guess based on string< order)")
      (loop 
	  with max
	  with min
	  for n in nodes
	  do
	    (when (or (null min) (string< n min)) 
	      (setf min n))
	    (when (or (null max) (string> n max)) 
	      (setf max n))
	  finally
	    (unless start-node 
	      (format t "~%;;; using init node = '~a'" min)
	      (setf start-node min))
	    (unless end-node 
	      (format t "~%;;; using final node = '~a'" max)	      
	      (setf end-node max))))
    (when
	(check-saf-lattice-consistency start-node end-node nodes all-edges)
      (make-saf-lattice 
       :start-node start-node
       :end-node end-node
       :nodes nodes
       :edges all-edges)
      )))

;; when set, clobber edges will destroy competing non-clobber edges/paths
(defvar *clobber-p* nil)

(defun postprocess-lattice (lattice)
  ;; apply clobber edges
  (if *clobber-p*
      (clobber lattice)
    lattice))

(defun get-smaf-lattice-size (saf)
  ;; num nodes minus 1, or zero
  (let ((n (get-smaf-lattice-node-count saf)))
    (if (zerop n)
	0 ;; ??
      (- n 1))))

(defun get-smaf-lattice-node-count (saf)
  ;; number of unique nodes
  (loop
      with nodes
      with lattice = (saf-lattice saf)
      with edges = (and lattice (saf-lattice-edges lattice))
      for e in edges
      for source = (saf-edge-source e)
      for target = (saf-edge-target e)
      do
	(pushnew source nodes :test 'string=)
	(pushnew target nodes :test 'string=)
      finally
	(return (length nodes))))

(defun check-saf-lattice-consistency (start-node end-node nodes edges)
  (declare (ignore start-node end-node nodes))
  (let ((consistent t))
    ;; check for duplicate edge ids
    (loop
	for e in edges
	for id = (saf-edge-id e)
	if (member id ids :test 'string=)
	do (format t "~%;;; WARNING: invalid lattice input (duplicate id: ~S)" id)
	   (setf consistent nil)
	else
	collect id into ids)
    consistent))

(defun lxml-state-to-node (lxml-state)
  (lxml::lxml-elt-attr lxml-state :|id|))

(defun saf-type (str)
  (intern str :keyword))

;; call lxml-annot-to-edge instead?
(defun lxml-token-to-edge (lxml-token &key type source target)
  (make-saf-edge
   :id (lxml::lxml-elt-attr lxml-token :|id|)
   :type (saf-type (or type :|token|))
   :source (or source (lxml::lxml-elt-attr lxml-token :|source|))
   :target (or target (lxml::lxml-elt-attr lxml-token :|target|))
   :from (lxml::lxml-elt-attr lxml-token :|from|)
   :to (lxml::lxml-elt-attr lxml-token :|to|)
   :content (lxml::lxml-elt-attr lxml-token :|value|)))

(defparameter *HOOK-lxml-rmrs-to-mrs-fn* #'(lambda (x) (declare (ignore x))))

(defun lxml-annot-to-edge (lxml-annot &key type source target)
  (let* ((id (lxml::lxml-elt-attr lxml-annot :|id|))
	 (fs-list (lxml::lxml-elt-elts lxml-annot :|fs|))
	 (slots (lxml::lxml-elt-elts lxml-annot :|slot|))
	 (rmrs (lxml::lxml-elt-elts lxml-annot :|rmrs|))
	 (fs (if (cdr fs-list)
		 (error "max 1 fs element allowed in wordform")
	       (car fs-list)))
	 (content 
	  (or 
	   (lxml::lxml-elt-text-content2 lxml-annot) ;; simple content embedded
	   (lxml::lxml-elt-attr lxml-annot :|value|) ;; simple content value attr
	   (append ;; complex content
	    (lxml-fs-content-to-fs fs) 
	    (lxml-slots-to-fs slots)
	    (funcall *HOOK-lxml-rmrs-to-mrs-fn* rmrs)
;	    #+:mrs (lxml-rmrs-to-rmrs rmrs)
	    )
	   ))
	 (source (or source (lxml::lxml-elt-attr lxml-annot :|source|)))
	 (target (or target (lxml::lxml-elt-attr lxml-annot :|target|)))
	 )
    (unless content
      (format t "~&WARNING: no/unknown content for SMAF edge '~a'" id))
    (unless source
      (format t "~&WARNING: missing source for SMAF edge '~a'" id))
    (unless target
      (format t "~&WARNING: missing target for SMAF edge '~a'" id))
    (when (string= source target)
      (format t "~&ERROR: identical 'source' and 'target' on SMAF edge '~a'" id)
      (return-from lxml-annot-to-edge nil))
    (make-saf-edge
     :id id
     :type (saf-type (or type (lxml::lxml-elt-attr lxml-annot :|type|)))
     :source source
     :target target
     :deps (split-str-on-spc (lxml::lxml-elt-attr lxml-annot :|deps|))
     :content content
     :from (or (lxml::lxml-elt-attr lxml-annot :|from|) 
	       (lxml::lxml-elt-attr lxml-annot :|cfrom|))
     :to (or
	  (lxml::lxml-elt-attr lxml-annot :|to|)
	  (lxml::lxml-elt-attr lxml-annot :|cto|))
     )))

(defun split-str-on-spc (str)
  (mapcar #'car (split-on-spc str)))

;; return list of (WORD-STRING FROM TO)
;; where FROM, TO are char offsets
(defun split-on-spc (preprocessed-string)
  (remove 
   ""
   (loop 
       with c-list = (coerce preprocessed-string 'list)
       with c-list-word
       with from = 0
       for c in c-list
       for i from 1 to (length c-list) 
       if (char= c #\Space) collect (list (coerce (nreverse c-list-word) 'string) from (1- i)) into words
       and do (setf from i)
       and do (setf c-list-word nil)
       else do (push c c-list-word)
       finally 
	 (return (append words
			 (list (list (coerce (nreverse c-list-word) 'string)
				     from i)))))
   :key #'car
   :test #'string=))

;; special case
(defun lxml-wordform-to-edge (lxml-wordform &key source target)
  (lxml-annot-to-edge lxml-wordform :type :|wordForm|
		      :source source :target target))

;; special case
(defun lxml-sentence-to-edge (lxml-sentence &key source target)
  (lxml-token-to-edge lxml-sentence :type :|sentence|
		      :source source :target target))

(defun lxml-fs-content-to-fs (lxml)
  (cond
   ((null lxml)
    nil)
   ((stringp lxml) ;; shorthand
    lxml)   
   ((eq (lxml::lxml-elt-name lxml) :|fs|)
    (loop for f in (lxml::lxml-elt-elts lxml :|f|)
	collect (make-saf-fv
		 :feature (make-feat (lxml::lxml-elt-attr f :|name|))
		 :value (lxml-fs-content-to-fs (first (lxml::lxml-elt-contents f))))))
   ((eq (lxml::lxml-elt-name lxml) :|binary|)
    :binary-ignored)
   ((eq (lxml::lxml-elt-name lxml) :|symbol|)
    :symbol-ignored)
   ((eq (lxml::lxml-elt-name lxml) :|numeric|)
    :numeric-ignored)
   ((eq (lxml::lxml-elt-name lxml) :|string|)
    (let ((str (first (lxml::lxml-elt-contents lxml))))
      (unless (stringp str)
	(error "string expected"))
      str))))

(defun lxml-slots-to-fs (lxml-slots)
  (loop
      for s in lxml-slots
      for feat = (lxml::lxml-elt-attr s :|name|)
      for val = (first (lxml::lxml-elt-contents s))
      for val-str = (if (stringp val)
			val
		      (error "string expected"))
      collect 
	(make-saf-fv :feature (make-feat feat)
		     :value val-str)))

(defun saf-fs-path-value (path fs)
  (cond
   ((null fs)
    nil)
   ((null path)
    fs)
   ((listp fs)
    (saf-fs-path-value 
     (cdr path)
     (saf-fs-feature-value2 fs (car path))))))

(defun saf-fv-value! (x)
  (saf-fv-value x))

(defun saf-edge-l-content! (x)
  (saf-edge-l-content x))

;; extract gMap.* features
;; OUT: (:feat . "val")*
(defun get-gmap-unifs (l-content)
  (loop 
      for fv in l-content
      for feat = (saf-fv-feature fv)
      for feat-str = (string feat)
      for val = (saf-fv-value fv)
      when (and
	    (> (length feat-str) 4)
	    (string= (subseq feat-str 0 5)
		    "gMap."))
      collect
	(cons
	 (intern (subseq feat-str 5) :keyword)
	 val)))
	 
(defun read-file-to-string (filename &key (numchars -1))
  (coerce 
   (with-open-file (ifile filename
		    :direction :input)
     (loop
	 with i = 0
	 for c = (read-char ifile nil)
	 while (and c (not (= i numchars)))
	 collect c
	 do 
	   (incf i)))
   'string))

(defun char-map-add-x (point)
  (if point
      (format nil "~a" (+ (or *char-map-add-offset* 0) (point-to-char-point point :|char|)))))

(defun saf-fs-partial-tree-2-list-partial-tree (fs)
  (if (null fs)
      nil
    (let* ((first (saf-fs-path-value '(:first) fs))
	   (rule2 (saf-fs-path-value '(:rule) first)) 
	   (str2 (saf-fs-path-value '(:str) first))
	   (rule (if (stringp rule2)
		     (intern rule2)
		   (error "string expected for saf-fs 'rule': ~a" rule2)))
	   (str (if (stringp str2)
		     (intern str2)
		   (error "string expected for saf-fs 'str': ~a" str2)))
	   (rest (saf-fs-path-value '(:rest) fs)))
      (cons (list rule str) (saf-fs-partial-tree-2-list-partial-tree rest)))))


(defun saf-num-lattice-nodes (saf)
  (length (saf-lattice-nodes (saf-lattice saf))))

(defun sort-edges-by-from (edges &key addressing)
  (sort (copy-list edges)
	'<
	:key (lambda (x)
	       (or (point-to-char-point (saf-edge-from x) addressing) 0))
	))

;;
;; point schemes:
;; - char
;; - xpoint
;; - line
;;

(defun x-span (text from to addressing)
  (let ((cfrom (point-to-char-point from addressing))
	(cto (point-to-char-point to addressing)))
    (cond
     ((eq :|char| addressing)
      (and cfrom cto
	   (subseq text cfrom cto)))
     ((eq :|xpoint| addressing)
      (error "addressing scheme 'xpoint' not implemented"))
     (t
      (error "unknown addressing scheme '~a'" addressing)))))

;;
;; clobber code
;;

;; add 1-paths from node-z to agenda
(defun update-paths-x2y-agenda (node-z agenda &key lattice)
  (unless lattice
    (error "missing LATTICE argument"))
  (loop
      for edge in (get-edges-source node-z :lattice lattice)
      for source = (saf-edge-source edge)
      for target = (saf-edge-target edge)
      do (pushnew (cons source target) agenda
		  :test #'equalp))
  agenda)

;; returns array of forward pointers for paths node-x to node-y
(defun get-paths-x2y (node-x node-y &key lattice)
  (unless lattice
    (error "missing LATTICE argument"))
  (let* (;; create hash to store paths from x
	 (paths-from-x (make-hash-table :test #'equalp))
	 ;; initialise agenda
	 agenda)
    (setf agenda (update-paths-x2y-agenda node-x nil :lattice lattice))
    (unless (equalp node-x node-y)
      ;; process agenda items...
      (loop 
	  with processed = nil
	  for item = (pop agenda)
	  for source = (car item)
	  for target = (cdr item)
	  while agenda
		;; next item
	  unless (member item processed :test #'equalp)
	  do
	    ;(format t "~&item ~a" item)
	    ;; update array
	    (setf (gethash target paths-from-x)
	      (cons source
		    (gethash target paths-from-x)))
	    (unless (equalp target node-y)
	      ;; no loops, so no need to look further
	      (setf agenda
		(update-paths-x2y-agenda target agenda :lattice lattice)))
	    (push item processed)))
    ;; pick out result
    paths-from-x))

;; return edge set taken from all paths node-x to node-y
(defun get-edges-x2y (node-x node-y &key lattice)
  (unless lattice
    (error "missing LATTICE argument"))
  (loop
      with hash = (get-paths-x2y node-x node-y :lattice lattice)
      for target being each hash-key in hash
      for sources = (gethash target hash)
      append
	(loop for source in sources
	    append (get-edges-source-target source target :lattice lattice))))

;; paths of exactly length len
#+:null
(defun annot-paths (annot lattice &key len)
  (cond
   ((zerop len))
   ((= 1 len)
    (list (list annot)))
   (t
    (loop
	with next-node = (saf-edge-target annot)
	for next-annot in (get-edges-source next-node :lattice lattice)
	append
	  (loop 
	      for path in (annot-paths next-annot lattice :len (1- len))
	      collect (push annot path))))))

;; return edges from source node to target node
(defun get-edges-source-target (source target &key lattice)
  (unless lattice
    (error "missing LATTICE argument"))
  ;; FIXME: inefficient
  (loop 
      for edge in (saf-lattice-edges lattice)
      for source1 = (saf-edge-source edge)
      for target1 = (saf-edge-target edge)
      when (and (equalp source1 source)
		(equalp target1 target))
      collect edge))

;; when set, clobber rules enabled
(defvar *warning-clobber* nil)

;; attempt to read int from "clobber" field
;; if fail, 0
(defun get-clobber-level (edge)
  (let* ((l-content (saf-edge-l-content edge))
	 (clobber (saf-fs-feature-value2 l-content :|clobber|)))
    (or 
     (and (null clobber) 0) ;; in case we have null val
     (parse-integer clobber :junk-allowed t) 
     0)))
  

;; for each clobber edge, remove all edge with a lower clobber level between
;; source and target nodes according to clobebr level (see below)
(defun clobber (lattice)
  (let* ((edges (saf-lattice-edges lattice))
	 (clobber-edges
	  (loop 
	      for edge in edges
	      unless (zerop (get-clobber-level edge))
	      collect edge)))
    (loop
	for clobber-edge in clobber-edges
	for clobber-level = (get-clobber-level clobber-edge)
	for source = (saf-edge-source clobber-edge)		    
	for target = (saf-edge-target clobber-edge)
	for edges = (get-edges-x2y source target :lattice lattice)
	for clobbered-edges =
	  ;; FIXME: inefficient
	  (loop
	      for edge in edges
	      for clobber-level2 = (get-clobber-level edge)
	      when (or
		    ;; clobber level positive: clobber if abs STRICTLY greater
		    (and (> clobber-level 0)
			 (> (abs clobber-level) (abs clobber-level2)))
		    ;; clobber level negative: clobber if abs EQUAL OR greater
		    (and (< clobber-level 0)
			 (>= (abs clobber-level) (abs clobber-level2))))
	      do
		(if *warning-clobber*
		    (format t "~%;;; WARNING: edge ~a clobbered" (saf-edge-id edge)))
	      and
	      collect edge)
	do
	  (setf (saf-lattice-edges lattice)
	    (set-difference (saf-lattice-edges lattice) clobbered-edges))))
    lattice)