;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; Readtable: augmented-readtable -*- ;; ;; Copyright (C) Paul Meurer 1999 - 2004. All rights reserved. ;; paul.meurer@aksis.uib.no ;; Aksis, University of Bergen ;; (in-package :cgp) (setf (logical-pathname-translations "oslo-tekster") '(("**;*.*" "/home/kristinh/tekster/**/*.*"))) (setf (logical-pathname-translations "till-tekster") '(("**;*.*" "/home/till/tekster/**/*.*"))) (setf (logical-pathname-translations "helgeh-tekster") '(("**;*.*" "/home/helgeh/tekster/**/*.*"))) #+old (defparameter *oslo-corpus-dirs* '("oslo-corpus:bm;aviser;txt;*.renset" "oslo-corpus:bm;sakprosa;txt;*.renset" "oslo-corpus:bm;skj-litt;txt;*.renset")) #+old (defparameter *oslo-corpus-dirs* '("tekster:**;*.xml" "tekster:**;*.txt" "tekster:**;*.renset")) (defparameter *corpus-dirs* '(:oslo ("oslo-tekster:**;*.xml" "oslo-tekster:**;*.txt" "oslo-tekster:**;*.renset") :till ("till-tekster:**;*.xml" "till-tekster:**;*.txt") :helgeh ("helgeh-tekster:**;*.xml" "helgeh-tekster:**;*.txt"))) #+test (print (directory "tekster:**;*.txt")) #+test (print (namestring #p"oslo-corpus:bm;gaga.lolo")) #+test (print (pathname-directory #p"oslo-corpus:bm;fifi;gaga.lolo")) (defun map-corpus-dirs (fun corpus-dirs) (dolist (dir corpus-dirs) (dolist (in-file (directory dir)) (let ((out-file (concat (namestring in-file) ".syn"))) (unless (probe-file out-file) (funcall fun in-file out-file)))))) #+test (print (merge-pathnames "fifi.lolo" #p"/home/kristinh/tekster/v01_no/turist/11.no.txt")) #+test (map-corpus-dirs (lambda (in-file out-file dir) (print (list in-file out-file))) *oslo-corpus-dirs*) ;; (defparameter *break-tagging-p* nil) (defparameter *tag-processes* (list :oslo nil :till nil :helgeh nil)) (defparameter *log-strings* (list :oslo "" :till "" :helgeh "")) #+test (with-open-file (stream "/home/kristinh/tekster/xml/tagger-parameters.xml" :direction :output :if-exists :supersede) #m(?xml :version "1.0" :encoding "utf-8" :standalone "yes") #m(TAGGER-PARAMETERS (SENTENCE-DELIMITER-ELEMENTS (ELEMENT "s") (ELEMENT "S")) (INCLUDE-PATH (ELEMENT "text") (ELEMENT "TEXT")) (EXCLUDE-PATH))) #+test (with-open-file (stream "/home/kristinh/tekster/xml/tagger-parameters.xml" :direction :output :if-exists :supersede) #m(tagger-parameters (sentence-delimiter-elements) (include-path (element "rota")) (exclude-path))) #+test ((:TAGGER-PARAMETERS) ((:SENTENCE-DELIMITER-ELEMENTS) ((:ELEMENT) "s") ((:ELEMENT) "S")) ((:INCLUDE-PATH) ((:ELEMENT) "text") ((:ELEMENT) "TEXT")) ((:EXCLUDE-PATH))) #+test (print (parse-tagger-parameters #p"/home/kristinh/tekster/xml/tagger-parameters.xml")) #+test (print (parse-tagger-parameters #p"/home/paul/temp.xml")) (defun parse-tagger-parameters (file) (unless (find-grammar "xml-parser") (lxml::compile-xml-parser)) (let* ((xml (cadar (zebu::xml-file-parser (print file) :grammar (find-grammar "xml-parser") :verbose nil))) (parameters ())) (dolist (params (cdr xml)) (when (listp params) (destructuring-bind ((key) . values) params (setf (getf parameters (intern (string-upcase key) :keyword)) (mapcar (lambda (val-elt) (intern (print (cadr val-elt)) :keyword)) values))))) (print parameters))) ;; main routine (defun start-corpus-tagger (&key user print-lc-features) (unless (getf *tag-processes* user) (setf (getf *tag-processes* user) (mp:process-run-function "tag-corpus" (lambda () (let ((*package* (find-package :cgp))) (setf (getf *log-strings* user) "") (setf (getf *log-strings* user) (format nil "Starting at ~a.~%" (now))) (map-corpus-dirs (lambda (in-file out-file) (let ((param-file (merge-pathnames "tagger-parameters.xml" in-file))) (unless (equal param-file in-file) (handler-case (progn (format *standard-output* "~%
Tagging ~a ... " in-file) (setf (getf *log-strings* user) (concat (getf *log-strings* user) (format nil "~%
Tagging ~a ... " in-file))) (let* ((path-components (pathname-directory in-file)) (*package* :cgp)) (if (find "xml" path-components :test #'equal) (with-open-file (in-stream in-file :direction :input) (with-open-file (out-stream (concat out-file ".tmp") :direction :output :if-exists :supersede :if-does-not-exist :create) (disambiguate-stream 'xml-tokenizer in-stream :tokenizer-initargs (if (probe-file param-file) (parse-tagger-parameters param-file) '(:sentence-delimiter-elements (:|s| :|S|) :include-path (:|text| :|TEXT|) :exclude-path ())) :tagging-niveau (if (find "synt" path-components :test #'equal) :syntactic-disambiguation :morphological-disambiguation) :cg (if (find "nny" path-components :test #'equal) (gethash "nny" *cg-table*) (gethash "nbo" *cg-table*)) :print-function (lambda (s &key token-print-fn) (declare (ignore token-print-fn)) (print-sentence-xml s :print-rules nil :print-sentence-elts-p t ;;nil :word-elt :|word| :fresh-line-before-word-p t :expand-tokens-p t :print-lc-features print-lc-features :stream out-stream))))) (disambiguate-file in-file (concat out-file ".tmp") :tagging-niveau :syntactic-disambiguation :cg (if (find "nny" path-components :test #'equal) (gethash "nny" *cg-table*) (gethash "nbo" *cg-table*)) #+ignore(newest-cg :nbo) :print-function (lambda (s &key stream token-print-fn) (declare (ignore token-print-fn)) (print-sentence s :stream stream :print-features t :print-rules nil :print-attributes t :print-lc-features print-lc-features :print-special-tokens-p nil :expand-tokens-p nil))))) (rename-file (concat out-file ".tmp") out-file) (setf (getf *log-strings* user) (concat (getf *log-strings* user) (format nil "done (~a)." (now))))) (error (cond) (setf (getf *log-strings* user) (concat (getf *log-strings* user) (format nil "~%
Error: ~a
" cond)))))))) (getf *corpus-dirs* user))) (stop-corpus-tagger :user user)))))) (defun stop-corpus-tagger (&key user) (when (getf *tag-processes* user) (mp::process-kill (getf *tag-processes* user)) (setf (getf *tag-processes* user) nil) (with-open-file (log-stream (ecase user (:till "till-tekster:tagger.log") (:oslo "oslo-tekster:tagger.log") (:helgeh "helgeh-tekster:tagger.log") ) :direction :output :if-exists :append :if-does-not-exist :create) (write-string (getf *log-strings* user) log-stream)) (setf (getf *log-strings* user) ""))) #+test (start-corpus-tagger :user :till) #+test (print (getf *tag-processes* :till)) #+test (stop-corpus-tagger :user :till) (defmethod tag-corpus-page ((request http-request) entity &key user-key) (with-html-response (request entity stream (user update stop start print-lc-features)) (let ((user-key (or user-key (intern (string-upcase user) :keyword)))) (cond (update nil) (stop (stop-corpus-tagger :user user-key)) (start (start-corpus-tagger :user user-key :print-lc-features print-lc-features))) #m(html (head (title "Tagg korpuset")) (body ((form:method "post") (input/ :type "hidden" :name "user" :value #S(string-downcase (symbol-name user-key))) (p (input/ :type "submit" :name "start" :value "Start taggingen")) (p (input/ :type "submit" :name "stop" :value "Avbryt taggingen")) (p (input/ :type "submit" :name "update" :value "Oppdater")) (p ((input :type "checkbox" :name "print-lc-features") "legg til LC-trekk"))) (br/) (nobr #L(write-string (getf *log-strings* user-key) stream))))))) (publish :path "/cl/cgp/tag-oslo-corpus.html" :content-type "text/html" :authorizer *authorizer* :function (lambda (request entity) (tag-corpus-page request entity :user-key :oslo))) (publish :path "/cl/cgp/tag-till-corpus.html" :content-type "text/html" :authorizer *authorizer* :function (lambda (request entity) (tag-corpus-page request entity :user-key :till))) (publish :path "/cl/cgp/tag-helgeh-corpus.html" :content-type "text/html" :authorizer *authorizer* :function (lambda (request entity) (tag-corpus-page request entity :user-key :helgeh))) #+test (print (probe-file "lisp:projects;cgp;texts;vaartl-1.renset")) #+test (print (probe-file "/Users/paul/lisp/projects/cgp/texts/vaartl-1.renset")) #+test (print (directory "lisp:projects;cgp;texts;vaartl_*.renset")) ;;; EOF