;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; -*- ;; ;; Copyright (C) Paul Meurer 1999, 2000. All rights reserved. ;; paul.meurer@hit.uib.no ;; HIT-centre, University of Bergen ;; ;; Version 0.9 ;; ;; Constraint Grammar Parser, code for testing ;;------------------------------------------------------------------------------------- ;; TO DO: ;; ;;------------------------------------------------------------------------------------- (in-package "CGP") ;(resolve-set *cg* 'ikke-@s-pred-@obj nil) #+obsolete? (defun disambiguate-stream (stream &optional (out-stream *standard-output*)) "Toplevel disambiguation function with output to *standard-output*" (let ((sentence-count 0) (error-count 0) (lingsoft-error-count 0)) (with-stream-sentences (sentence stream) (incf sentence-count) #+ignore (when (and (not (eq out-stream *standard-output*)) (zerop (mod sentence-count 10))) (format *standard-output* "Sentences: ~4d, words: ~5d~%" sentence-count word-count)) (let ((word-count 0) (d-sentence (disambiguate sentence))) (loop for cohort across d-sentence when (stringp (caar cohort)) do (incf word-count) (prin1 (caar cohort) out-stream) (dolist (rule-id (reverse (cdar cohort))) (write-char #\Space out-stream) (write-string rule-id out-stream)) (terpri out-stream) (let ((error-p nil) (lingsoft-error-p nil) (sentence-errors ())) (dolist (reading (cdr cohort)) (when (and reading (not (eq (car reading) '<<<))) (when (and *report-uncorrect-readings-p* (not (eq out-stream *standard-output*))) (cond ((find ' reading) (when (find (cons word-count (car reading)) (gethash sentence-count *uncorrect-readings*) :test #'equal) (push (format nil "<< ~3d << ~4d: ~a" lingsoft-error-count sentence-count reading) sentence-errors) (setf lingsoft-error-p t))) ((not (find (cons word-count (car reading)) (gethash sentence-count *uncorrect-readings*) :test #'equal)) (push (format nil "** ~3d >> ~4d: ~a" error-count sentence-count reading) sentence-errors) (setf error-p t)) (t (setf error-p t lingsoft-error-p t)))) (write-char #\Tab out-stream) (dolist (feature reading) (cond ((stringp feature) (format out-stream "~s " feature)) ((integerp feature) (format out-stream "~d " feature)) (t (if (listp feature) (format out-stream "(~{~s~^ ~})" feature) (write-string (string-downcase (string feature)) out-stream)) (write-char #\Space out-stream)))) (write-char #\Newline out-stream))) (cond ((and lingsoft-error-p (not error-p)) (mapc (lambda (line) (write-line line *standard-output*)) (reverse sentence-errors)) (incf lingsoft-error-count)) ((and error-p (not lingsoft-error-p)) (mapc (lambda (line) (write-line line *standard-output*)) (reverse sentence-errors)) (incf error-count))) (setf sentence-errors ()))))))) #+sgml-entities (defun %convert-string (str) (let ((length (length str))) (labels ((convert (pos result) (if (= pos length) result (let ((start pos) (entity nil)) (loop for char = (char str pos) do (setf entity (and (not (<= (the fixnum #.(char-code #\A)) (the fixnum (char-code char)) (the fixnum #.(char-code #\z)))) (char/= char #\<) (char/= char #\>) (gethash char *sgml-char-entity-table*))) (incf pos) until (or (= pos length) entity)) (cond ((not entity) (concat result (subseq str start))) ((> pos start) (convert pos (concat result (subseq str start (1- pos)) entity))) (t (convert pos (concat result entity)))))))) (convert 0 "")))) #+old (defun %nsubstitute-numbers (list) (nsubstitute '\1 1 (nsubstitute '\2 2 (nsubstitute '\3 3 list)))) ;; change name! (defun %nsubstitute-numbers (list) (cons (car list) (mapcar (lambda (elt) (case elt (1 '\1) (2 '\2) (3 '\3) (otherwise (intern (string-upcase (%convert-string (symbol-name elt))) :cgp)))) (cdr list)))) (defvar *tagged-file*) (defun strip-quotes (word) (string-trim $quote word)) ;(convert-string "prøv-line" :to :sgml) (defun find-differences (file tagged-file) (declare (optimize speed)) (declare (fixnum word-count sentence-count)) (let ((*tagger* *nbo-tagger*)) (setf *tagged-file* tagged-file) (with-open-file (tagged-stream tagged-file :direction :input) (let ((tagged-line-count 0) (prev-line (read-line tagged-stream nil)) (accumulated-words ()) (accumulated-tagged-words ())) (when prev-line (with-open-file (stream file :direction :input) (map-sentences stream (lambda (sentence) (let ((difference-p nil)) (map-tokens (disambiguate (initialize-sentence-array sentence) (list #'disambiguate-sentence #'h-disambiguate-sentence #'disambiguate-sentence)) (lambda (token) ;(setf prev-line (%fix-quote prev-line)) (loop with not-in-sync-p = nil do (let* ((word (substitute #\" #\„ (token-value token))) (tagged-word (remove-stars (string-left-trim "$" (subseq prev-line 2 (position #\> prev-line)))))) (cond ((or (string-equal (strip-quotes word) (strip-quotes tagged-word)) ;(and (string= word "´") (string= tagged-word "\"")) ;(and (string= word "ª") (string= tagged-word "\"")) (and accumulated-words (string= (apply #'concatenate 'string (reverse (cons word accumulated-words))) tagged-word)) (and accumulated-tagged-words (string= (apply #'concatenate 'string (reverse (cons tagged-word accumulated-tagged-words))) word))) (let ((cohorts (loop do (setf prev-line (read-line tagged-stream nil)) when (null prev-line) do (return-from find-differences) until (char= (char prev-line 0) #\") collect (%nsubstitute-numbers (read-from-string (u:concat "(" (%fix-quote prev-line) ")"))))) (features (token-features token))) (dolist (f-list features) (unless (or (null f-list) (find-if (lambda (cohort) (and (string-equal (strip-quotes (remove-stars (car cohort))) (strip-quotes (car f-list))) #+bit-vectors (setf (sbit (cdr f-list) (feature-code '<<<)) 0) #+bit-vectors (equal (cdr f-list) #+ignore(cdr cohort) #-ignore (apply #'encode-features (cdr cohort))) #-bit-vectors (loop for c in (cdr cohort) always (or (find c '()) (find c (cdr f-list)))) #-bit-vectors (loop for f in (cdr f-list) always (or (find f '(<<<)) (find f (cdr cohort)))))) cohorts #+ignore features)) (push (format nil "#~a##" tagged-line-count) (token-used-rules token)) (unless (string= word "seg") ; *** (setf difference-p t)))) (dolist (cohort cohorts) (incf tagged-line-count) (unless (find-if (lambda (f-list) (and (string-equal (strip-quotes (car f-list)) (strip-quotes (remove-stars (car cohort)))) #+bit-vectors (setf (sbit (cdr f-list) (feature-code '<<<)) 0) #+bit-vectors (equal (cdr f-list) (apply #'encode-features (cdr cohort))) #-bit-vectors (loop for c in (cdr cohort) always (or (find c '()) (find c (cdr f-list)))) #-bit-vectors (loop for f in (cdr f-list) always (or (find f '(<<<)) (find f (cdr cohort)))))) features) (push (format nil "#~a#" tagged-line-count) (token-used-rules token)) (unless (string= word "seg") ; *** (setf difference-p t)))) (setf accumulated-words () accumulated-tagged-words () not-in-sync-p nil) (incf tagged-line-count))) ((string= word "|") ;; probably a wrong sentence terminator (push (format nil "#~a#!!#" tagged-line-count) (token-used-rules token)) (setf difference-p t)) ((string= word ".") ;; probably a wrong sentence terminator (push (format nil "#~a#!!#" tagged-line-count) (token-used-rules token)) (setf difference-p t)) ((string= tagged-word "|") ;; probably a wrong sentence terminator ;; forward to next word (loop do (setf prev-line (read-line tagged-stream nil)) when (null prev-line) do (return-from find-differences) do (incf tagged-line-count) until (char= (char prev-line 0) #\")) (push (format nil "#~a#??#" tagged-line-count) (token-used-rules token)) (setf difference-p t not-in-sync-p t)) ((string= tagged-word ".") ;; probably a wrong sentence terminator ;; forward to next word (loop do (setf prev-line (read-line tagged-stream nil)) when (null prev-line) do (return-from find-differences) do (incf tagged-line-count) until (char= (char prev-line 0) #\")) (push (format nil "#~a#!!#" tagged-line-count) (token-used-rules token)) (setf difference-p t not-in-sync-p t)) ((and (< (length word) (length tagged-word)) (zerop (search (apply #'concatenate 'string (append accumulated-words (list word))) tagged-word))) (push (format nil "#~a#?#" tagged-line-count) (token-used-rules token)) (push word accumulated-words) (setf difference-p t)) ((and (< (length tagged-word) (length word)) (zerop (search (apply #'concatenate 'string (append accumulated-tagged-words (list tagged-word))) word))) ;; forward to next word (loop do (setf prev-line (read-line tagged-stream nil)) when (null prev-line) do (return-from find-differences) do (incf tagged-line-count) until (char= (char prev-line 0) #\")) (push (format nil "#~a#?#" tagged-line-count) (token-used-rules token)) (push tagged-word accumulated-tagged-words) (setf difference-p t not-in-sync-p t)) (t (print (list word tagged-word)) (break) (return-from find-differences :not-in-sync)))) while not-in-sync-p))) (when difference-p (print-sentence sentence))))))))))) #+test (d " Forholdet mellom amerikanske myndigheter, representert ved DEA, og myndigheter i transittlandene har ofte vært anstrengt.") #+test (d "Vi vil vite mer om stjernestoffet vi er laget av.") #+test (find-differences "projects:cgp;training;delkorp.txt" "projects:cgp;training;delkorpbm.dis") #+test (find-differences "projects:cgp;training;delkorp-1.txt" "projects:cgp;training;delkorpbm-1.dis") #+test (find-differences "projects:cgp;training;test.txt" "projects:cgp;training;test.dis") #+test (with-open-file (stream "projects:cgp;multi-tagger;texts;delkorp.txt") (map-sentences stream (lambda (s) (print-sentence s)))) #+test (d " - Vi blir stadig mistenkeliggjort, mens virkeligheten er at vi driver en legitim virksomhet av meget stor betydning for landet.") #+test (d "Regjeringen må ta et kollektivt ansvar for å berge næringen nå i ellevte time. Dette forutsetter et sterkt engasjement særlig fra statsministeren, næringsministeren og fra handelsministerens side. \"Reguleringspolitikken øker vår råvareorientering og fører til redusert verdiskapning og sysselsetting i distriktene.\" /*BILLEDTEKST:*/ POTENSIAL: Havbruksnæringen er en næring med et enormt potensial for økt sysselsetting og verdiskapning i distriktene og betydelige eksportinntekter for landet, skriver Øystein Steiro. Foto: KATRINE NORDLI ") #| (mt "DR. KATRINE NORDLI PÅ BARNE-TV OG VOKSEN-TV") (mt "Katrine Nordli") (mt "Olav Trygge Storvik") (mt "OLAV TRYGGE STORVIK") (mt "Øystein Steiro") (mt "ØYSTEIN TRYGGE STEIGRAVE") (mt "Steigrave") |# #+test (d " Hjertesukk fra administrerende direktør i Kongsberg-gruppen: Frustrert over mistenkeliggjørelsen Forsvarsindustrien er frustrert. ") #+test (d " Hjertesukk fra administrerende direktør i Kongsberg-gruppen: Frustrert over mistenkeliggjørelsen Forsvarsindustrien er frustrert over en stadig mistenkeliggjørelse fra politisk hold og i mediene. OLAV TRYGGE STORVIK Det er administrerende direktør i Kongsberg-gruppen Jan T. Jørgensen som gir uttrykk for dette i en samtale med Aftenposten.") #+test (d " Forsvarsindustrien er frustrert over en stadig mistenkeliggjørelse fra politisk hold og i mediene. OLAV TRYGGE STORVIK Det er administrerende direktør i Kongsberg-gruppen Jan T. Jørgensen som gir uttrykk for dette i en samtale med Aftenposten.") #+test (d "TOPPSJEF: Administrerende direktør Jan T. Jørgensen i Kongsberg-gruppen. KOMPETANSE: Kongsberg-gruppen har utviklet en etterspurt spisskompetanse innen sjømilitære missiler. Foto: KONGSBERG-GRUPPEN ROMFART: Kunnskaper ervervet under utviklingen av den militære Penguin-raketten har gitt Kongsberg-gruppen kontrakt om levering av deler til den franske Ariane-raketten, viktig i europeisk romfart. Foto: KONGSBERG-GRUPPEN En havbruksnæring i politisk krise ØYSTEIN STEIRO Det er de uforutsigelige politiske rammebetingelsene som utgjør det største hinderet for ytterligere investeringer og vekst i havbruksnæringen, hevder Øystein Steiro i dagens kronikk.") #+test (mt "Men han er en ekstraprisbelønnet forsker.") ;;; benchmarking #+test (progn (cl-user::gc) (time (let ((*show-used-rules* nil)) (setf *check-constraint-count* 0) (disambiguate-from-string " - BŒde vi og andre konserner s¿ker strategiske partnere internasjonalt, men slikt samarbeide forplikter. Vi mŒ ogsŒ kunne levere den dagen vi vinner en kontrakt, sier J¿rgensen. For at strategiske industriallianser av denne typen ikke skal bli en umulighet i fremtiden, ¿nsker han Œ komme i dialog med myndighetene om regelverk og rammebetingelser." :print-function (lambda (&rest rest) (declare (ignore rest))) ;:tagging-niveau :multi-tagging ;:syntactic-disambiguation ;:multi-tagging ;:syntactic-mapping :cg *nbo-cg*)))) ; *nbo-cg* ;(setf *check-constraint-count* 0) ;; 50 words: ;; 6.639 seconds, 3,380,712 bytes, G4/450, 31.8.2000 ;; after memoizing of parsed position operators: ;; 5.361 seconds, 2,093,640 bytes ;; 2.151 seconds, 2,104,128 bytes without syntactic mapping and disambiguation ;; 23.09.2000 ;; 1.714 seconds, 2,169,672 bytes, new rules ; 20523 invocations of constraint-satisfied-p ;;(0.825 seconds, 1,562,752 bytes for multitagging alone) ;; 1.597 seconds, 2,354,072 bytes ;; 1.433 seconds, 1,586,976 bytes ;; 1.362 seconds, 1,572,120 bytes (nested hash tables for rules lookup) ;; 1.258 seconds, 1,566,056 bytes (better DO-DISAMBIGUATION-RULES macro) ;; 0.673 seconds, 255,976 bytes !! (better code-vector -> features conversion) ;;(0.250 seconds, 252,656 bytes !! for multitagging alone) ;; 25.09.2000 ;; 0.612 seconds, 218,384 bytes (simplified nets) ;;(0.204 seconds, 214,768 bytes for multitagging alone) ;; 0.584 seconds, 218,072 bytes (bit -> sbit) ;;(0.170 seconds, 214,768 bytes for multitagging alone) ;; 0.464 seconds, 235,344 bytes with bit-vectors ;;(0.180 seconds, 232,024 bytes for multitagging) ;; 0.426 seconds, 193,856 bytes, net coded as bit-vector ;; 0.392 seconds, 195,520 bytes, 17419 invocations of constraint-satisfied-p ;; 03.11.2000 ;; 1.917 seconds, 196,520 bytes syntactic disambiguation! ;; 1,813 seconds, 197,552 bytes ;; 0.782 seconds, 196,968 bytes after tree-coding of rules! 3.5 times faster! ;;(0.139 seconds multitagging) ;; 0.777 seconds, 196,912 bytes ;; 0.749 seconds, 196,912 bytes ;; 27.2.2001 ;; 0.412 seconds mt + d , constraints checked: 13075 ;; tree storage: ;; 0.514 seconds (0.133 seconds GC), constraints checked: 15381, 1,715,976 bytes of memory allocated. ;; (mt: 0.159 seconds) ;; 0.481 seconds (0.123 seconds GC), constraints checked: 13932, 1,461,408 bytes of memory allocated. ;; reordering of constraints: ; constraint count: 13075, 15381 optimized ;-) ; #+test (disambiguate-from-string "Kva er det som tyt og aldri teier i villan hei? - Bekken han tyt og aldri teier på langan lei." ;:print-function (lambda (&rest rest) (declare (ignore rest))) ;:tagging-niveau :multi-tagging :cg *nny-cg*) #+ignore (time (dotimes (i 10000) (make-array *code-vector-length* :element-type 'bit :initial-element 0))) #+ignore (time (let ((bit-vector (make-array *code-vector-length* :element-type 'bit :initial-element 0)) (null-vector (make-array *code-vector-length* :element-type 'bit :initial-element 0))) (dotimes (i 10000) (setf bit-vector (bit-and bit-vector null-vector bit-vector))))) #+ignore (let ((bit-vector (make-array *code-vector-length* :element-type 'bit :initial-element 0)) (null-vector (make-array *code-vector-length* :element-type 'bit :initial-element 0))) (setf (sbit bit-vector 1) 1) (setf bit-vector (bit-and bit-vector null-vector bit-vector)) bit-vector) #+ignore (let ((bit-vector #*111)) (ccl::bit-boole ccl::boole-clr bit-vector bit-vector bit-vector)) #+test (mt " - Både vi og andre konserner søker strategiske partnere internasjonalt, men slikt samarbeide forplikter. Vi må også kunne levere den dagen vi vinner en kontrakt, sier Jørgensen. For at strategiske industriallianser av denne typen ikke skal bli en umulighet i fremtiden, ønsker han å komme i dialog med myndighetene om regelverk og rammebetingelser." ) #+test (let ((*cg* *nbo-cg*) (*check-constraint-count* 0) (*tried-rules-count* 0) (*used-rules-count* 0)) (time (disambiguate-file "projects:cgp;training;delkorp.txt" "projects:cgp;training;delkorp-nbo-morph.dis" :tagging-niveau :morphological-disambiguation) #+ignore (disambiguate-file "projects:cgp;training;delkorp.txt" "projects:cgp;training;delkorp-nbo-mult.dis" :tagging-niveau :multi-tagging)) (print *check-constraint-count*) (print *tried-rules-count*) (print *used-rules-count*)) ;; 3897.570 seconds (= 1:04:57 h), 3,632,037,640 bytes ;; 105,284,230 ;; 15,389,001 ;; 83,029 ;; ca. 38 min ;; ca. 26 min ;; 1506.506 s = ca. 25.0 min ;; 403.289 s = ca. 6.7 min ;; 25.2.2001 (G4 450MHz) ;; 1640.885 s = ca. 27.3 min ;; 1653.900 s = ca. 27.5 min, 408,419,088 bytes ;; 1637.878 s ;; 23.4.2001 Allegro6.0/Linux/1GHz PentiumIII/512MB Ram ; cpu time (total) 671,610 msec (00:11:11.610) user, 3,440 msec system ;; 23.4.2001 CMUCL 18c/Linux/1GHz PentiumIII/512MB Ram ;; 734.03 seconds of real time ;; 713.23004 seconds of user run time ;; 15.45 seconds of system run time ;; [Run times include 132.06 seconds GC run time] ;; 0 page faults and ;; 5457519480 bytes consed. ;; 10.1.2002 Allegro6.0/Linux/2GHz PentiumIV ;; cpu time (non-gc) 594,450 msec (00:09:54.450) user, 580 msec system ;; cpu time (gc) 5,240 msec user, 0 msec system ;; cpu time (total) 599,690 msec (00:09:59.690) user, 580 msec system ;; real time 658,095 msec (00:10:58.095) ;; space allocation: ;; 122,019,340 cons cells, 2,512,610,008 other bytes, 91784 static bytes #+test (let ((*check-constraint-count* 0) (*tried-rules-count* 0) (*used-rules-count* 0)) (disambiguate-from-string " - Både vi og andre konserner søker strategiske partnere internasjonalt, men slikt samarbeide forplikter. Vi må også kunne levere den dagen vi vinner en kontrakt, sier Jørgensen. For at strategiske industriallianser av denne typen ikke skal bli en umulighet i fremtiden, ønsker han å komme i dialog med myndighetene om regelverk og rammebetingelser." :print-function (lambda (&rest rest) (declare (ignore rest)))) (print *check-constraint-count*) (print *tried-rules-count*) *used-rules-count* )