;;; -*- Mode: lisp; Syntax: ansi-common-lisp; Package: cgp; Base: 10; Readtable: augmented-readtable -*- ;; ;; Copyright (C) Paul Meurer 2004-2006. All rights reserved. ;; paul.meurer@aksis.uib.no ;; Aksis, University of Bergen ;; ;; SOAP interface to the Oslo-Bergen tagger ;; (in-package :cgp) (use-package :net.xmp.soap) #+test (setf *soap-server-debug* t) #+test (setf *soap-client-debug* t) #+test (net.aserve::debug-on :xmit) #+test (net.aserve::debug-off :xmit) ;; This form ensures that XSD is a nickname of the NET.XMP.SCHEMA ;; package: (defpackage :net.xmp.schema (:use) (:nicknames :xs :xsd)) (defpackage :net.xmp.schema-instance (:use) (:nicknames :xsi)) (defpackage :net.xmp.soap.none (:use) (:nicknames :none)) (defpackage :net.xmp.soap.envelope (:use) (:nicknames :env)) (defpackage :net.xmp.soap.encoding (:use) (:nicknames :enc)) (defpackage :net.xmp.soap.cgp (:use) (:nicknames :cgps) (:export |tagText| |tagTextBase64| |sResponse| |sResponseBase64| |taggedXML|)) (define-soap-element nil 'cgps:|tagText| '(:complex (:seq (:element :|language| xsd:|string|) (:element :|version| xsd:|string|) (:element :|niveau| xsd:|string|) (:element :|text| xsd:|string|) (:element :|xml| xsd:|string|) ;;enc:|base64|) (:element :|encoding| xsd:|string|) (:element :|input-format| xsd:|string|) (:element :|output-format| xsd:|string|) (:element :|feature-filter| (:complex (:seq* (:element :|f| xsd:|string|)))) (:element :|total-disambiguate| xsd:|boolean|) ) :action "ACLSOAP" :namespaces (nil (:cgps "cgps" "http://www.aksis.uib.no/cgp")))) (define-soap-element nil 'cgps:|tagTextBase64| '(:complex (:seq (:element :|language| xsd:|string|) (:element :|version| xsd:|string|) (:element :|niveau| xsd:|string|) (:element :|text| xsd:|string|) (:element :|xml| xsd:|string|) ;;enc:|base64|) (:element :|encoding| xsd:|string|) (:element :|input-format| xsd:|string|) (:element :|output-format| xsd:|string|) (:element :|feature-filter| (:complex (:seq* (:element :|f| xsd:|string|)))) (:element :|total-disambiguate| xsd:|boolean|) (:element :|include-elements| (:complex (:seq* (:element :|element| xsd:|string|)))) (:element :|exclude-elements| (:complex (:seq* (:element :|element| xsd:|string|)))) (:element :|sentence-delimiter-elements| (:complex (:seq* (:element :|element| xsd:|string|)))) (:element :|in-sentence-elements| (:complex (:seq* (:element :|element| xsd:|string|)))) ) :action "ACLSOAP" :namespaces (nil (:cgps "cgps" "http://www.aksis.uib.no/cgp")))) #+test (cgp-tag-server :port 8091) #+test (print (cgp-soap-parse "Jeg har fått et stort fly og du skal gi meg en brannbil." :total-disambiguate-p t :port 8019 ;;:format :xml :niveau :snp :encoding :utf-8 :feature-filter *bredt-part-of-speech* :output-format #+test :word-lemma-tags-nps-xml :word-lemma-nps-tags)) #+test (trace net.aserve.client:do-http-request) ;; fix for strange bug in built application #+ignore (defmethod net.uri:uri= ((uri1 net.uri:uri) (uri2 string)) (net.uri:uri= uri1 (net.uri:uri uri2))) (define-soap-element nil 'cgps:|sResponse| '(:simple xsd:|string| :namespaces (nil (:cgps "cgps" "http://www.aksis.uib.no/cgp")))) (define-soap-element nil 'cgps:|sResponseBase64| '(:simple xsd:|string| ;; xsd:|base64Binary| ;;xsd:|string| :namespaces (nil (:cgps "cgps" "http://www.aksis.uib.no/cgp")))) #+test (net.aserve::shutdown) #+test (print (net.xmp::xmp-normal-nsd '(:cgp "cgp" "http://www.aksis.uib.no/cgp") t)) (define-namespace-map :cgp-namespaces nil '(:cgps "cgps" "http://www.aksis.uib.no/cgp")) ;; toplevel function in the stand-alone cgp-server app. (defun cl-user::start-local-tag-server () (let ((port (or (when (> (sys::command-line-argument-count) 1) (parse-integer (sys::command-line-argument 1) :junk-allowed t)) 8000)) (local-p (when (> (sys::command-line-argument-count) 2) (sys::command-line-argument 2)))) (cgp-tag-server :port port :start-web-server-p t :local-p local-p))) (defun cgp-tag-server (&key port default-port start-web-server-p local-p (server-action "ACLSOAP") (method-action :default) (client-action server-action)) (declare (ignore client-action)) (let ((path "/cgp-soap") (host (when local-p "localhost"))) (when start-web-server-p (start :port port :host host)) (unless port (setf port (if (and *wserver* (wserver-socket *wserver*)) (socket:local-port (wserver-socket *wserver*)) default-port))) (format t "~&cgp-server listening on port ~d; relative URI: ~a~%" port path) (let ((s (soap-message-server :start `(:port ,port :host ,host) :enable :start :publish `(:path ,path) :action server-action :lisp-package :keyword :base-dns :soap :message-dns :cgp-namespaces))) (soap-export-method s 'cgps:|tagTextBase64| (list :|text| :|xml| :|language| :|encoding| :|version| :|niveau| :|total-disambiguate| :|feature-filter| :|input-format| :|output-format| :|include-elements| :|exclude-elements| :|sentence-delimiter-elements| :|in-sentence-elements|) :lisp-name 'cgp-tag-text :action method-action :return '(:element cgps:|sResponseBase64| #+orig enc:|base64Binary| ;;|taggedXML| #-ignore xsd:|string|)) (soap-export-method s 'cgps:|tagText| (list :|text| :|xml| :|language| :|encoding| :|version| :|niveau| :|total-disambiguate| :|feature-filter| :|input-format| :|output-format|) :lisp-name 'cgp-tag-text :action method-action :return '(:element cgps:|sResponse| |taggedXML| #+ignore xsd:|string|))))) (define-soap-type nil 'cgps:|taggedXML| #-orig 'xsd::|string| ;;'enc:|base64| #+test '(:COMPLEX (:SEQ* (:any)) :namespaces (nil (:cgps "cgps" "http://www.aksis.uib.no/cgp")))) #+test (defmethod soap-encode-object :before (conn name type data) (print (list :conn conn :name name :type type :data data))) #+test (defmethod soap-encode-object (conn name (type (eql 'xsd:|string|)) data) (print :fifi) data #+test (print (list :conn conn :name name :type type :data data))) #+test (princ (cgp-tag-text :|text| *test-string* :|language| "nbo" :|niveau| "np" :|total-disambiguate| t :|output-format| :word-lemma-tags-xml ;;:word-tags ;;:word-lemma-tags-nps-xml )) #+test (cgp-tag-server) #+test (cgp-tag-server :port 8005) #+test (cgp-tag-server :default-port 8005) #+test (net.uri:uri= "http://decentius.aksis.uib.no:8000/cgp-soap" "http://decentius.aksis.uib.no:8000/cgp-soap") #+test (trace net.uri:uri=) (defun cgp-soap-parse (&key xml string (format :base64) (language "nbo") (encoding "iso-8859-1") (host "decentius.aksis.uib.no") (niveau :md) (output-format :cg) (input-format :text) (port 8055) feature-filter #+ignore syntactic-feature-filter total-disambiguate-p include-elements exclude-elements sentence-delimiter-elements in-sentence-elements) #+debug(print (list :xml xml :string string)) (let* ((conn (soap-message-client :lisp-package :keyword :url (format nil "http://~a:~d/cgp-soap" host port))) (response (call-soap-method conn (if (eq format :base64) 'cgps:|tagTextBase64| 'cgps:|tagText|) :|text| string :|xml| xml :|language| language :|encoding| encoding :|niveau| niveau :|feature-filter| (collecting (map nil (lambda (f) (collect :|f|) (collect (let ((f (subst-substrings (string-downcase f) '("&" "&" "<" "<" ">" ">")))) (if (equal encoding "iso-8859-1") (utf-8-encode f) f)))) feature-filter)) :|total-disambiguate| total-disambiguate-p :|input-format| input-format :|output-format| output-format :|include-elements| (loop for elt in include-elements collect :|element| collect elt) :|exclude-elements| (loop for elt in exclude-elements collect :|element| collect elt) :|sentence-delimiter-elements| (loop for elt in sentence-delimiter-elements collect :|element| collect elt) :|in-sentence-elements| (loop for elt in in-sentence-elements collect :|element| collect elt)))) (case (car response) (cgps:|sResponse| (cdr response)) (cgps:|sResponseBase64| (cadr response)) (env:|Fault| (error "~a: ~s" (cadr (assoc :|faultcode| (cdr response))) (cadr (assoc :|faultstring| (cdr response))))) (otherwise response)))) (defparameter *bredt-part-of-speech* #(subst ;; 0 adj ;; 1 verb ;; 2 adv ;; 3 prep ;; 4 interj ;; 5 det kvant ;; 6 det dem ;; 7 det ;; 8 pron poss ;; 9 pron pers ;; 10 pron refl ;; 11 pron ;; 12 inf-merke ;; 13 sbu ;; 14 fork ;; 15 foreign ;; 16 ukjent ;; 17 ent fl be ub mask fem nøyt ;; 18 ;; 19 clb ;; 20 konj ;; 21 ;; 22 ;; 23 @subj @obj @s-pred @iv @fv @adv @det> @adv> @adj> @Det var en gang et skip som gikk til lands." :total-disambiguate-p t ;;:host "hilarion.aksis.uib.no" :port 8098 :host "cc.uio.no" :port 8080 :niveau :snp :feature-filter *bredt-part-of-speech* :output-format :word-lemma-tags-nps-xml) ;;stream )) #+test (princ (cgp-tag-text :|text| *test-string* :|language| "nbo" :|niveau| "snp" :|total-disambiguate| t :|output-format| :word-lemma-tags-nps-xml ;;:word-tags )) #+copy (disambiguate-stream 'xml-tokenizer in-stream :tokenizer-initargs (if (probe-file param-file) (parse-tagger-parameters param-file) '(:sentence-delimiter-elements (:|s| :|S|) :include-path (:|text| :|TEXT|) :exclude-path ())) :tagging-niveau (if (find "synt" path-components :test #'equal) :syntactic-disambiguation :morphological-disambiguation) :cg (if (find "nny" path-components :test #'equal) (gethash "nny" *cg-table*) (gethash "nbo" *cg-table*)) :print-function (lambda (s &key token-print-fn) (declare (ignore token-print-fn)) (print-sentence-xml s :print-rules nil :print-sentence-elts-p t ;;nil :word-elt :|word| :fresh-line-before-word-p t :expand-tokens-p t :print-lc-features print-lc-features :stream out-stream))) #+test (let ((xml (with-output-to-string (stream) #m(text #L(dotimes (i 2) #m((loll :id 1) "Jonas & leker med flyet sitt.") #m((fuff :id 2) "Han leker ikke med dukker."))) #+test #m(no-text #s(make-string 5743 :initial-element #\a))))) (print (cgp-soap-parse :xml xml :total-disambiguate-p t :port 8098 ;8019 :host "hilarion.aksis.uib.no" :niveau :snp :feature-filter *bredt-part-of-speech* :input-format :xml :output-format :word-tags ;;:word-lemma-tags-xml :sentence-delimiter-elements nil ;; '("s") :include-elements '("text") :exclude-elements '("comment") ))) ;; function to debug: print-sentence-xml() in cgp-www.lisp #+test (let ((xml "

Det er et hus. Huset er stort.

" ;;"

Det er et hus. Huset er stort.

" ;;"

Det er et hus. Huset er stort.

" )) (print (cgp-soap-parse :xml xml :total-disambiguate-p t :port 8061 ;8019 :host "hilarion.aksis.uib.no" :niveau :none :feature-filter *bredt-part-of-speech* :input-format :xml :output-format :word-tags ;;:word-lemma-tags-xml :sentence-delimiter-elements nil ;; '("s") :in-sentence-elements '("emph") :include-elements '("p") :exclude-elements '("comment") ))) ;; test big file #+test (let ((xml (with-output-to-string (stream) (with-file-lines (line "/usr/local/cwb/corpora/enpc/fict/nbo/BV1.xml") (write-line line stream))))) (print (cgp-soap-parse :xml xml :total-disambiguate-p t :port 8098 ;; 8019 :host "hilarion.aksis.uib.no" :niveau :none ;; :snp :feature-filter *bredt-part-of-speech* :encoding "iso-8859-1" :input-format :xml :output-format :word-tags ;;:word-lemma-tags-xml :sentence-delimiter-elements'("s") :include-elements '("text") :exclude-elements '()))) #+test (let ((xml (with-output-to-string (stream) #m(text #L(dotimes (i 2) #m((s :id 1) "Jonas & leker med eselet sitt.") #m((s :id 2) "Han leker ikke med dukker.")))))) (print (cgp-tag-text :|xml| xml :|niveau| :snp :|input-format| :xml :|sentence-delimiter-elements| '((:element "s")) :|include-elements| '((:element"text"))))) (defun cgp-tag-text (&rest rest &key |text| |xml| |language| (|input-format| "text") |version| |niveau| |total-disambiguate| (|encoding| "utf-8") |feature-filter| |output-format| |include-elements| |exclude-elements| |sentence-delimiter-elements| |in-sentence-elements|) #+debug(print (list :rest rest)) (let ((tagging-niveau (ecase (intern (string-upcase (string |niveau|)) :keyword) (:none :none) (:mt :multi-tagging) (:md :morphological-disambiguation) (:sm :syntactic-mapping) (:sd :syntactic-disambiguation) (:sdr :syntactic-disambiguation-regexp) ((:neo :neol) :named-entity-recognition-only) (:mnd :named-entity-disambiguation) (:snm :syntactic-named-entity-mapping) (:sdnm :syntactic-disambiguation-named-entity-mapping) ((:snd :sndl) :syntactic-named-entity-disambiguation) (:te :term-extraction) (:np :np-recognition) (:snp :sd-np-recognition))) (stack nil)) (ecase (intern (string-upcase |input-format|) :keyword) (:xml (with-output-to-string (out-stream) (with-input-from-string (in-stream |xml|) (disambiguate-stream 'xml-tokenizer in-stream :resolve-entities '(:except "amp" "lt" "gt") :tokenizer-initargs (if nil ;;(probe-file param-file) (parse-tagger-parameters param-file) `(:sentence-delimiter-elements ,(mapcar (lambda (elt) (intern (cadr elt) :keyword)) |sentence-delimiter-elements|) :include-path ,(mapcar (lambda (elt) (intern (cadr elt) :keyword)) |include-elements|) :exclude-path ,(mapcar (lambda (elt) (intern (cadr elt) :keyword)) |exclude-elements|) :in-sentence-elements ,(mapcar (lambda (elt) (intern (cadr elt) :keyword)) |in-sentence-elements|) :encoding ,(when |encoding| (intern (string-upcase |encoding|) :keyword)))) :tagging-niveau tagging-niveau :cg (or (gethash (or |language| |version|) cgp::*cg-table*) cgp::*nbo-cg*) :feature-filter (loop for f in |feature-filter| collect (if t;(equal |encoding| utf-8) (utf-8-decode (subst-substrings (reduce (lambda (x y) (concat x y)) (cdr f)) '("&" "&" "<" "<" ">" ">"))) (subst-substrings (reduce (lambda (x y) (concat x y)) (cdr f)) '("&" "&" "<" "<" ">" ">")))) :total-disambiguate-p |total-disambiguate| :print-function (lambda (s &key token-print-fn) (declare (ignore token-print-fn)) (setf stack (print-sentence-xml s :print-rules nil :print-sentence-elts-p (null |sentence-delimiter-elements|) :word-elt :|word| :fresh-line-before-word-p t :expand-tokens-p t :substitutions '("&" "&" "<" "<" ">" ">") ;;:print-lc-features print-lc-features :stream out-stream :stack stack))))))) (:text (with-output-to-string (stream) (let ((*package* (find-package :cgp)) (np-count -1) (word-count -1)) (cgp:disambiguate-from-string (ecase (intern (string-upcase |encoding|) :keyword) (:iso-8859-1 |text|) (:utf-8 (utf-8-decode |text|))) :stream stream :cg (or (gethash (or |language| |version|) cgp::*cg-table*) cgp::*nbo-cg*) :print-function ;;#'cgp::print-sentence (lambda (sentence &rest rest) (apply #'cgp::print-sentence sentence :expand-tokens-p :te rest)) :tagging-niveau tagging-niveau :feature-filter (loop for f in |feature-filter| collect (utf-8-decode (subst-substrings (reduce (lambda (x y) (concat x y)) (cdr f)) '("&" "&" "<" "<" ">" ">")))) :total-disambiguate-p |total-disambiguate| :token-print-fn (case (intern (string-upcase (string |output-format|)) :keyword) (:word-lemma-tags (lambda (token &rest rest &key stream &allow-other-keys) (let ((fl (find-if-not #'null (token-features token)))) (format stream "~a \"~a\" ~(~{~a~^ ~}~)~%" (token-value token) (or (car fl) "") (when fl (code-features (cdr fl))) )))) (:word-tags (lambda (token &rest rest &key stream &allow-other-keys) (when (find token (matches (token-chain token)) :key #'car) (write-line ">" stream)) (let ((fl (find-if-not #'null (token-features token)))) (format stream "~a ~(~{~a~^ ~}~)~%" (token-value token) (when fl (code-features (cdr fl))) )) (when (find token (matches (token-chain token)) :key #'cadr) (write-line "<" stream)))) (:word-lemma-tags-xml (lambda (token &rest rest &key stream &allow-other-keys) (cond ((eq (token-value token) :newline) #m(newline/)) (t (let ((fl (find-if-not #'null (token-features token)))) #m((word :id #s(format nil "w~d" (incf word-count)) :lemma #s(when fl (car fl)) :features #s(format nil "~(~{~a~^ ~}~)" (when fl (code-features (cdr fl))))) #s(token-value token))))))) (:word-lemma-tags-nps-xml (lambda (token &rest rest &key stream &allow-other-keys) (cond ((eq (token-value token) :newline) #m(newline/)) (t (when (find token (matches (token-chain token)) :key #'car) (format stream "~%" (incf np-count))) (let ((fl (find-if-not #'null (token-features token)))) #m((word :id #s(format nil "w~d" (incf word-count)) :lemma #s(when fl (car fl)) :features #s(format nil "~(~{~a~^ ~}~)" (when fl (code-features (cdr fl))))) #s(token-value token))) (when (find token (matches (token-chain token)) :key #'cadr) (write-line "" stream)))))) (otherwise (lambda (token &rest rest) (apply #'print-token token :print-rules nil rest))))))))))) #+test (let ((xml (with-output-to-string (stream) #m(text ((s :id 1) "Jonas leker med bilene sine.") ((s :id 2) "Han leker ikke med dukker."))))) (print (cgp-tag-text :|xml| xml :|language| "nbo" :|input-format| "xml" :|niveau| :none :|feature-filter| '((:|f| "subst") (:|f| "cn"))))) #+test (let ((xml #+test(make-string 5743 :initial-element #\a) (with-output-to-string (stream) #m(text #L(dotimes (i 2) #m((s :id 1) "Jonas > leker med bilene sine.") #m((s :id 2) "Han leker ikke med dukker.")))))) (print (cgp-soap-parse :xml xml :total-disambiguate-p t :port 8019;8090 :host "rigus.aksis.uib.no" ;;:host "decentius.aksis.uib.no" :niveau :snp :feature-filter *bredt-part-of-speech* :input-format :xml :output-format :word-tags ;;:word-lemma-tags-xml ))) #+test (print (excl::base64-string-to-string (print (delete #\newline (excl::string-to-base64-string (make-string 15744 :initial-element #\a)))))) #+test (let ((text "Jonas leker med bilene sine. & Han leker ikke med dukker.")) (print (cgp-soap-parse :string text :total-disambiguate-p t :port 8091;8090 ;;:host "hilarion.aksis.uib.no" ;;:host "michael.aksis.uib.no" :host "decentius.aksis.uib.no" :niveau :snp :feature-filter *bredt-part-of-speech* ;;:input-format :xml :output-format :word-tags ;;:word-lemma-tags-xml ))) #+test (let ((text "Jonas leker med bilene sine. & Han leker ikke med dukker.")) (print (cgp-soap-parse :string text :total-disambiguate-p t :port 8092;8090 ;;:host "hilarion.aksis.uib.no" ;;:host "michael.aksis.uib.no" :host "decentius.aksis.uib.no" :niveau :md :encoding "utf-8" :feature-filter *bredt-part-of-speech* ;;:input-format :xml :output-format :word-lemma-nps-tags ))) #+test (print (cgp-tag-text :|text| "Jonas leker med bilene sine." :|language| "nbo" :|input-format| "text" :|niveau| :mt :|feature-filter| '((:|f| "subst")))) :eof