;;;-*- Mode: Lisp; Package: STRING-NET -*- ;; Copyright (C) Paul Meurer 2000 - 2004. All rights reserved. ;; paul.meurer@aksis.uib.no ;; Aksis, UNIFOB, University of Bergen ;; Very efficient (multi)index compression using minimal finite state ;; networks, Garsia-Wachs optimal tree character compression and ;; variable length pointer compression with relative addressing ;;------------------------------------------------------------------------------------- ;; TO DO: ;; - translate description ;; - give it its own package ;; - make storing/loading code platform independent ;; ;;------------------------------------------------------------------------------------- #|| Essential functions: add-string (defmethod add-string ((net active-string-net) string) "adds a string to an active net which has not been minimized" minimize-net (defmethod minimize-net ((net active-string-net) &optional node-list (count 1)) "recursively compresses from right, results in a minimal network (directed acyclic graph)" calculate-gw-compression-tree (defmethod calculate-gw-compression-tree ((net active-string-net)) compress-net (defmethod compress-net ((net active-string-net) &key (iterate t)) store-net (defmethod store-net ((net vector-string-net) file) load-string-net (defun load-string-net (file) match-string (defmethod match-string ((net vector-string-net) string) print-strings -> write-strings (defmethod print-strings ((net active-string-net) &optional (stream *standard-output*)) count-strings (defmethod count-strings ((net vector-string-net)) map-strings (defmethod map-strings ((net vector-string-net) fn &optional (start 0)) "generates all paths through the subnetwork starting in start" nmap-strings (defmethod nmap-strings ((net vector-string-net) fn &optional (start 0) string) "generate all paths through the subnetwork starting in start. String is not copied." map-string-values (defmethod map-string-values ((net vector-string-net) string fn &optional (string-end-marker #\:)) "FN is a function of one argument which is called sucessively on all values of string in NET, or on T if there is no value (??)" nmap-string-values (defmethod nmap-string-values ((net vector-string-net) string fn &optional (string-end-marker #\:)) "FN is a function of one argument which is called sucessively on all values of string in NET, or on T if there is no value (??)" nmap-string+array (defmethod nmap-string+array ((net vector-string-net) fn string array start split-char) nmap-string+array-values (defmethod nmap-string+array-values ((net vector-string-net) string fn %string %array &optional (string-end-marker #\:)) nmap-string+string+array (defmethod nmap-string+string+array ((net vector-string-net) fn string0 string1 array start split-char) ||# (in-package "STRING-NET") (eval-when (:load-toplevel :compile-toplevel :execute) (pushnew :garsia-wachs *features*)) ;(setf *features* (delete :garsia-wachs *features*)) (defclass string-net () ()) ;; uncompressed net where each node is a list of (char . nodes) or NIL as end marker (defclass list-string-net (string-net) ((list-net :initform () :initarg :list-net :accessor list-net) (length-marked-p :initform nil) #|;; new (compression-tree :initform nil :accessor compression-tree) (integer-compression-tree :initform nil :accessor integer-compression-tree) (arcs :initform nil :accessor net-arcs) ;; byte vector (nodes :initform nil :accessor net-nodes-vector)|#)) (defun node-equal (pointers node1 node2) (and (= (length node1) (length node2)) (eq (car node1) (car node2)) (loop for (c1 a1) on (cdr node1) by #'cddr and (c2 a2) on (cdr node2) by #'cddr always (and (= c1 c2) (= (dereference pointers a1) (dereference pointers a2)))))) (defmethod print-strings ((net string-net) &optional (stream *standard-output*)) (nmap-strings net (lambda (v) (print v stream)))) ;;(print (string-values (cgp::fullforms cgp::*nbo-tagger*) "gæ")) #+test (defun test (str) (print str) (loop for i from 0 to (1- (length str)) do (setf (char str i) (code-char (unix-to-mac-char-code ;; translate-char-code (char-code (char str i)))))) (print str) (print (string-values (cgp::fullforms cgp::*nbo-tagger*) str))) #-ecl (defmethod string-values ((net string-net) string &optional (string-end-marker #\:)) (collecting (map-string-values net string (lambda (v) (collect (copy-seq v))) string-end-marker))) #+ecl (defmethod string-values ((net string-net) string &optional (string-end-marker #\:)) (let ((values ())) (map-string-values net string (lambda (v) (push (copy-seq v) values)) string-end-marker) (nreverse values))) (defmethod match-string ((net list-string-net) string) (declare (optimize (safety 0) (space 0) (speed 3))) (with-slots (list-net) net (let ((length (length string))) (declare (fixnum length)) (labels ((walk (pos nodes) (declare (fixnum pos n)) (let ((sub-node (find (restore-char (char string pos)) nodes :key #'car))) (cond ((null sub-node) nil) ((< pos (the fixnum (1- length))) (walk (the fixnum (1+ pos)) (cdr sub-node))) ((member nil (cdr sub-node)) ; end node marker t) (t (values nil (the fixnum (1+ pos)))))))) (walk 0 list-net))))) ;; subnet with STRING as prefix (defmethod string-subnet ((net list-string-net) string) (declare (optimize (safety 0) (space 0) (speed 3))) (with-slots (list-net) net (let ((length (length string))) (declare (fixnum length)) (labels ((walk (pos nodes) (declare (fixnum pos n)) (let ((sub-node (find (restore-char (char string pos)) nodes :key #'car))) (cond ((null sub-node) nil) ((< pos (the fixnum (1- length))) (walk (the fixnum (1+ pos)) (cdr sub-node))) (t sub-node))))) (walk 0 list-net))))) (defmethod count-strings ((net list-string-net)) (declare (optimize (safety 0) (space 0) (speed 3))) (let ((count 0)) (declare (fixnum count)) (labels ((walk (nodes) (loop for sub-node in nodes do (if sub-node (walk (cdr sub-node)) (incf count))))) (walk (list-net net)) count))) (defmethod nmap-strings ((net list-string-net) fn &optional start-node string (translate-p t) end-char) (declare (optimize (safety 0) (space 0) (speed 3))) (with-slots (list-net) net (let ((string (or string (make-array 0 :element-type #-mcl 'character #+mcl 'base-character :adjustable t :fill-pointer t)))) (declare (string string)) (labels ((walk (branches) (loop for sub-node in branches do (if sub-node (destructuring-bind (c . branches) sub-node (cond ((eq c end-char) (funcall fn string)) (t (vector-push-extend (if translate-p (translate-char c) c) string) (walk branches) (decf (fill-pointer string))))) (funcall fn string))))) (walk (or (cdr start-node) list-net)))))) (defmethod nmap-string-values ((net list-string-net) string fn &optional (string-end-marker #\:) start-node (restore-p t) (translate-p t)) "FN is a function of one argument which is called sucessively on all values (= subnets) of string in NET, or on T if there is no value (??)" (declare (optimize (speed 3) (safety 0)) (string string)) (with-slots (list-net) net (let ((length (length string))) (declare (fixnum length)) ;; first recognize the string, end state is string-end-marker (labels ((walk (pos branches) (declare (fixnum pos)) (when branches (if (= pos length) (when-let (sub-node (find string-end-marker branches :key #'car)) (nmap-strings net fn sub-node nil translate-p)) (when-let (sub-node (find (if restore-p (restore-char (char string pos)) (char string pos)) branches :key #'car)) (walk (the fixnum (1+ pos)) (cdr sub-node))))))) (walk 0 (or (cdr start-node) list-net)))))) ;; obs: no restore-char! (defmethod nmap-string-suffixes ((net list-string-net) string fn &optional start-node) "FN is a function of one argument which is called sucessively on all values (= subnets) of string in NET, or on T if there is no value (??)" (declare (optimize (speed 3) (safety 0)) (string string)) (with-slots (list-net) net (let ((length (length string))) (declare (fixnum length)) ;; first recognize the string, then map over the subnet (labels ((walk (pos branches) (declare (fixnum pos)) (when branches (if (= pos length) (dolist (sub-node branches) (nmap-strings net fn sub-node)) (when-let (sub-node (find (char string pos) branches :key #'car)) (walk (the fixnum (1+ pos)) (cdr sub-node))))))) (walk 0 (or (cdr start-node) list-net)))))) (defmethod nmap-string+array ((net list-string-net) fn string array start-node split-char) (declare (optimize (safety 0) (space 0) (speed 3))) (labels ((walk-string (branches) (dolist (sub-node branches) (if sub-node (destructuring-bind (c . branches) sub-node (cond ((char= c split-char) (walk-array branches)) (t (vector-push-extend (translate-char c) string) (walk-string branches) (decf (fill-pointer string))))) (funcall fn string array)))) (walk-array (branches) (dolist (sub-node branches) (if sub-node (destructuring-bind (c . branches) sub-node (cond #+ignore ;; this seems to be a bug ((char= c split-char) (walk-array branches)) (t (vector-push-extend (char-code c) array) (walk-array branches) (decf (fill-pointer array))))) (funcall fn string array))))) (walk-string (cdr start-node)))) (defmethod nmap-string+array-values ((net list-string-net) string fn %string %array &optional (string-end-marker #\:)) (declare (optimize (speed 3) (safety 0)) (string string)) (with-slots (list-net) net (let ((length (length string))) (declare (fixnum length)) ;; first recognize the string, end state is string-end-marker (labels ((walk (pos branches) (declare (fixnum pos)) (when branches (if (= pos length) (when-let (sub-node (find string-end-marker branches :key #'car)) (nmap-string+array net fn %string %array sub-node string-end-marker)) (when-let (sub-node (find (restore-char (char string pos)) branches :key #'car)) (walk (the fixnum (1+ pos)) (cdr sub-node))))))) (walk 0 list-net))))) ;; compression (defun bit-set-p (array pos bit-pos) (declare (fixnum pos bit-pos)) (let ((set-p (logbitp bit-pos (the fixnum (aref array pos))))) (if (= bit-pos 7) (values set-p (the fixnum (1+ pos)) 0) (values set-p pos (the fixnum (1+ bit-pos)))))) ;;; ------------------------------- storing ---------------------------------- (defun translate-char-code (code) #+(or :pcl :unix) (mac-to-unix-char-code code) #+mcl code #+(or :win32 :mswindows) (mac-to-win-char-code code)) (defun restore-char-code (code) #+(or :pcl :unix) (unix-to-mac-char-code code) #+mcl code #+(or :win32 :mswindows) (win-to-mac-char-code code)) (defun translate-char (char) #+mcl char #-mcl (if (eq char :end) char (code-char (translate-char-code (char-code char))))) (defun restore-char (char) #+mcl char #-mcl (code-char (restore-char-code (char-code char)))) ;;; ----------------------- right relative compression -------------------------- (defun string-difference (str1 str2 &optional (back-marker #\#)) (let ((common (string/= str1 str2))) (cond ((null common) ; means they are equal "") ((= common (length str1)) (subseq str2 common)) ((= common (length str2)) (let ((str (make-string 2))) (setf (char str 0) back-marker (char str 1) (code-char (- (length str1) common))) str)) (t (let ((str (make-string 2))) (declare (dynamic-extent str)) (setf (char str 0) back-marker (char str 1) (code-char (- (length str1) common))) (concatenate 'string str (subseq str2 common))))))) ;(string-difference "asdfr" "asdfy") (defun compress-string (string base &optional (back-marker #\#)) "compresses string relative to base string" (string-difference base string back-marker)) (defun decompress-string (code-str base &optional (back-marker #\#)) (declare (optimize (speed 3) (safety 0)) (string code-str base)) (cond ((string= code-str "") base) ((char= (char code-str 0) back-marker) (let ((back (char-code (char code-str 1))) (add (subseq code-str 2))) (concatenate 'string (subseq base 0 (- (length base) back)) add))) (t (concatenate 'string base code-str)))) ;;; -------------------------- testing, benchmarks ------------------------- #|| (reset-net *net*) (mapc (lambda (string) (add-string *net* string)) '("abcd" "abce" "abgd" "abge" "b")) (minimize-net *net*) #+copy (defmacro with-stream-lines ((stream line) &body body) `(let (,line) ;(locally (declare (speed 3) (safety 0))) (loop do (setf ,line (read-line ,stream nil :eof)) until (eq ,line :eof) do ,@body))) #+copy (defmacro with-file-lines ((line path) &body body) (let ((stream (gensym))) `(with-open-file (,stream ,path) (with-stream-lines (,stream ,line) ,@body)))) ; most-positive-fixnum ; 536870911 ;(log most-positive-fixnum 2) ; 29.0 ;;;; MCL/G4: ;; 1000 strings: 20s, 9 iterations, 800 states, 1472 arcs ;; 2000 strings: 46s, 9 iterations, 1652 states, 3032 arcs ;; 24876 strings: 1043s, 12 iterations, 15035 states, 31417 arcs ;;;; MCL/G4 new code: ;; 1000 strings: 2s, 10 iterations, 787 states, 1455 arcs ;; 2000 strings: 5s, 10 iterations, 1614 states, 2982 arcs ;; 24876 strings: 95s, 13 iterations, 13737 states, 29231 arcs ;;;; MCL/G4 very new code: ;; 1000 strings: 0.1s, 10 iterations, 786 states, 1454 arcs ;; 2000 strings: 0.2s, 10 iterations, 1605 states, 2970 arcs ;; 24876 strings: 13.8s, 13 iterations, 13599 states, 29081 arcs ;; 45407 strings: 44.9s, 14 iterations, 23109 states, 50761 arcs ;;;; MCL/G4 very very new code (hashtables) (without compression): ;; 1000 strings: 0.1s, 10 iterations, 786 states, 1454 arcs ;; 2000 strings: 0.2s, 10 iterations, 1605 states, 2970 arcs ;; 24876 strings: 3.4s, 13 iterations, 13599 states, 29081 arcs ;; 45407 strings: 6.5s, 14 iterations, 23109 states, 50761 arcs ;; 288096 strings: 86.5s, 21 iterations, 59656 states, 144175 arcs woww!! ;;;; MCL/G4 very very new code (with full compression): ;; 1000 strings: 0.4s, 10 iterations, 786 states, 1454 arcs ;; 2000 strings: 0.9s, 10 iterations, 1605 states, 2970 arcs ;; 24876 strings: 9.8s, 13 iterations, 13599 states, 29081 arcs ;; string.text: 18.2s / 21,1 ;; 288096 strings: ??s, 21 iterations, 59656 states, 144175 arcs ;; (3,925,294 bytes, simply compressed: 959,499 bytes = states x 4 + arcs x 5) ;;;; MCL/G4 with Garsia-Wachs compression: ;; 24876 strings: 9.2s, 13 iterations, size: [78307] (without: size: [83444]) ;;;; Allegro/NT new code: ;; 1000 strings: 5s, 10 iterations, 78 states, 1455 arcs ;; 2000 strings: 13s, 10 iterations, 1614 states, 2982 arcs ;; 24876 strings: ??s, 13 iterations, 13737 states, 29231 arcs ;;;; Allegro/NT very very new code: ;; 1000 strings: 0.9s, 10 iterations, 78 states, 1455 arcs ;; 2000 strings: 3.1s, 10 iterations, 1614 states, 2982 arcs ;; 24876 strings: 370s (!), 13 iterations, 13737 states, 29231 arcs ;;;; Allegro/NT very very new code (vector-push-extend fixed): ;; 1000 strings: 0.4s, 10 iterations, 78 states, 1455 arcs (= *delta* 10000) ;; 2000 strings: 0.8s, 10 iterations, 1614 states, 2982 arcs ;; 24876 strings: 11.8s, 13 iterations, 13737 states, 29231 arcs ;; 45407 strings: 26.5s, 14 iterations, 23109 states, 50761 arcs ;;;; LispWorks/NT new code: ;; 1000 strings: 7s, 10 iterations, 787 states, 1455 arcs ;; 2000 strings: 15s, 10 iterations, 1614 states, 2982 arcs ;; 24876 strings: ??s, 13 iterations, 13737 states, 29231 arcs (print-strings *net1*) (store-net *net* "projects:cgp;string-net;words.net") (defparameter *net1* (load-string-net "projects:cgp;string-net;words.net")) (defparameter *nbo* (load-string-net "projects:cgp;nets;nbo-code-lexicon.net")) (defparameter *nbo-abb* (load-string-net "projects:cgp;nets;nbo-abbreviations.net")) (time (defparameter *nbo-abb-list* (make-list-string-net *nbo-abb*))) (time (count-strings *nbo*)) ;; 18 sec (time (count-strings *nbo-list*)) ;; 0.68 sec (print-strings *nbo-abb*) (time (dotimes (i 10) (count-strings *net*))) ; 404 ms with Garsia-Wachs compression ; 519 ms without (defparameter *net* (make-instance 'active-string-net)) (time (let ((count 0)) (setf *net* (make-instance 'active-string-net)) (block add (with-file-lines (line ;;"projects:string-net;words1.text" ;;"projects:string-net;test.text" "/usr/share/dict/words" ) (add-string *net* line) (incf count) (when (zerop (mod count 10000)) (print count)) (when nil ;(>= count 1000) (return-from add)))) (print count) (minimize-net *net*) ;(count-strings *net*) (calculate-gw-compression-tree *net*) (compress-net *net* :iterate t))) (time (let ((count 0)) (setf *net* (make-instance 'active-string-net)) (block add (with-file-lines (line "projects:cgp;string-net;elhuyar-lemma.txt") (add-string *net* line) (incf count) (when (zerop (mod count 10000)) (print count)) (when nil ;(>= count 1000) (return-from add)))) (print count) (minimize-net *net*) ;(count-strings *net*) (calculate-gw-compression-tree *net*) (compress-net *net* :iterate t))) ; compressing ... [156742] ; compressing ... [172235] ; compressing ... [173873] ; compressing ... [173964] ; compressing ... [173967] ; compressing (5) ... [149312] ; compressing ... [3398] (pushnew :absolute-addresses *features*) (setf *features* (delete :absolute-addresses *features*)) ;(match-string "concubine") (time (minimize-net)) (count-nodes *net*) (print-strings *net*) (count-strings *net*) (loop for i across *pointers* do (print-node i)) (dotimes (i (fill-pointer *nodes*)) do (print-node i)) ||# ;; Garsia-Wachs compression ;; conses like crazy (defmethod frequency-list ((net string-net)) (with-slots (arcs compression-vector) net (let ((table (make-hash-table)) (f-list ())) (labels ((walk (pos) (multiple-value-bind (arc-list) (decode-arcs arcs compression-vector pos nil) (loop for (c a) on arc-list by #'cddr do (incf (gethash c table 0)) (walk a))))) (walk 0)) (maphash (lambda (c w) (push (cons c w) f-list)) table) (sort f-list #'> :key #'cdr)))) ;(count-strings *bm-lexicon*) ; (/ 13337872.0 5652128.0) #+test (frequency-list *bm-lexicon*) ;;; list-string-net (defmethod add-string ((net list-string-net) string) "adds a string to a string net tree" (with-slots (list-net) net (let ((length (length string)) (lnet (cons nil list-net))) (labels ((add-char (pos node) (declare (fixnum pos n)) (if (= pos length) (pushnew nil (cdr node)) ; end marker (let* ((char (char string pos)) (sub-node (find char (cdr node) :key #'car))) (cond ((null sub-node) (let ((new-node (list char))) (setf (cdr node) (append (cdr node) (list new-node))) #+test (push new-node (cdr node)) (add-char (the fixnum (1+ pos)) new-node))) (t (add-char (the fixnum (1+ pos)) sub-node))))))) (add-char 0 lnet) (setf list-net (cdr lnet)) string)))) (defmethod remove-branch ((net list-string-net) prefix) "removes a branch from a string net tree" (with-slots (list-net) net (let ((length (length prefix)) (lnet (cons nil list-net))) (labels ((remove-node (pos node) (let* ((char (char prefix pos)) (sub-node (find char (cdr node) :key #'car))) ;;(print (list char sub-node)) (cond ((null sub-node) ;; prefix not in node nil) ((or (= (1+ pos) length) (remove-node (1+ pos) sub-node)) (cond ((cddr node) (setf (cdr node) (remove sub-node (cdr node))) nil) ((= pos 0) (setf (cdr lnet) nil)) (t ;; only branch; recurse back t))) (t nil))))) (remove-node 0 lnet) (setf list-net (cdr lnet)))))) (defmethod map-tree-nodes ((net list-string-net) fun) (with-slots (list-net) net (labels ((walk (node) (funcall fun node) (mapc #'walk (cdr node)))) (mapc #'walk list-net)))) (defun node< (node1 node2) (cond ((null node1) nil) ((null node2) t) ((char> (car node1) (car node2)) t) ((char< (car node1) (car node2)) nil) (t (let ((len1 (length node1)) (len2 (length node2))) (cond ((< len1 len2) t) ((> len1 len2) nil) (t (loop for n1 in (cdr node1) and n2 in (cdr node2) unless (eq n1 n2) return (node< n1 n2)))))))) (defun pnode< (node1 node2) (cond ((null node1) nil) ((null node2) t) ;; necessary?? ((> (caar node1) (caar node2)) t) ;; necessary?? ((< (caar node1) (caar node2)) nil) ((char> (cdar node1) (cdar node2)) t) ((char< (cdar node1) (cdar node2)) nil) (t (let ((len1 (length node1)) (len2 (length node2))) (cond ((< len1 len2) t) ((> len1 len2) nil) (t (loop for n1 in (sorted-branch node1) and n2 in (sorted-branch node2) unless (eq n1 n2) ;;(pnode-set-equal n1 n2) ;; (eq n1 n2) ***** return (pnode< n1 n2)) #+old (loop for n1 in (cdr node1) and n2 in (cdr node2) unless (pnode-set-equal n1 n2) ;; (eq n1 n2) ***** return (pnode< n1 n2)))))))) ;; rename! (defun spnode< (node1 node2) (cond ((eq node1 node2) :equal) ((null node1) nil) ((null node2) t) ;; necessary?? ((> (abs (caar node1)) (abs (caar node2))) t) ;; necessary?? ((< (abs (caar node1)) (abs (caar node2))) nil) ((char> (cdar node1) (cdar node2)) t) ((char< (cdar node1) (cdar node2)) nil) (t (let ((len1 (length node1)) (len2 (length node2))) (cond ((< len1 len2) t) ((> len1 len2) nil) (t (loop for n1 in (sorted-branch node1) and n2 in (sorted-branch node2) unless (eq n1 n2) ;;(pnode-set-equal n1 n2) ;; (eq n1 n2) ***** return (spnode< n1 n2) finally (return :equal)))))))) (defun sorted-branch (node) (sort (copy-seq (cdr node)) (lambda (node1 node2) (cond ((null node1) nil) ((null node2) t) ((char> (cdar node1) (cdar node2)) t) ((char< (cdar node1) (cdar node2)) nil))))) (defmethod sort-net ((net list-string-net) sp) (stable-sort sp #'node<)) (defclass partial-string-net (list-string-net) ()) (defmethod sort-net ((net partial-string-net) sp) (stable-sort sp #'pnode<)) (defun node-set-equal (set1 set2) (and (= (length set1) (length set2)) (loop for elt in set1 always (member elt set2)))) (defun pnode-set-equal (node1 node2) (or (and (null node1) (null node2)) (and node1 node2 (= (length node1) (length node2)) (char= (cdar node1) (cdar node2)) (loop for elt in (cdr node1) always (member elt (cdr node2)))))) #-ecl (defmethod minimize-net ((net list-string-net) &optional %node-list (count 1)) "recursively compresses from right, results in a minimal network (directed acyclic graph)" (with-slots (list-net) net ;; first, sort the net (format t "~%sorting ...") (let ((prev-node t) (changed-p nil) (changed-nodes (make-hash-table)) (node-list (or (sort-net net %node-list) (sort-net net (collecting (map-tree-nodes net (lambda (node) #+ignore (when (member nil (cdr node)) (collect node)) #-ignore (unless (or (null node) (find-if-not #'null (cdr node))) (collect node))))))))) ;; then, unify common subnets at a given level, starting with final nodes (format t "~%minimizing (~d) ... [~d]" count (length node-list)) (loop for node in node-list do (cond ((eq prev-node t) (setf prev-node node)) ;; since the nodes are sorted, we need only compare ;; adjacent nodes for equality ((node-set-equal prev-node node) ;; in case they are equal, they can be unified ;;(unless (consp (car node)) ;; don't do it twice! (setf (car node) prev-node) (setf (gethash prev-node changed-nodes) t) (setf changed-p t)) (t ;; if not, try next node (setf prev-node node)))) (let ((new-changed-nodes ())) (labels ((walk (node branch) (let ((changed-p nil)) (loop for children on branch do (cond ((consp (caar children)) (setf (car children) (caar children) changed-p t)) (t (when (gethash (car children) changed-nodes) (setf changed-p t))))) (when (and node changed-p) (push node new-changed-nodes)) ;; necessary also when changed-p = T! (mapc (lambda (child-node) (walk child-node (cdr child-node))) branch)))) (when changed-p ;; new-changed-nodes (walk nil list-net) (minimize-net net new-changed-nodes (1+ count)))))))) (defmethod count-nodes ((net list-string-net)) (declare (optimize (safety 0) (space 0) (speed 3))) (let ((node-table (make-hash-table)) (count 0)) (declare (fixnum count)) (labels ((walk (nodes) (incf count) (loop for sub-node in nodes do (unless (gethash sub-node node-table) (setf (gethash sub-node node-table) t) (when sub-node (walk (cdr sub-node))))))) (walk (list-net net)) count))) (defmethod count-conses ((net list-string-net)) (declare (optimize (safety 0) (space 0) (speed 3))) (let ((node-table (make-hash-table)) (count 1)) (declare (fixnum count)) (labels ((walk (nodes) (loop for sub-node in nodes do (when sub-node (incf count)) (unless (gethash sub-node node-table) (setf (gethash sub-node node-table) t) (when sub-node (incf count) (walk (cdr sub-node))))))) (walk (list-net net)) count))) #|| (defparameter *net* nil) (time (let ((count 0)) (setf *net* (make-instance 'active-string-net)) (block add (with-file-lines (line "projects:dictionaries;elhuyar-lemma.txt") (add-string *net* line) (incf count) (when (zerop (mod count 10000)) (print count)) (when (>= count 100) (return-from add)))) (print count) (minimize-net *net*) ;(count-strings *net*) (calculate-gw-compression-tree *net*) (compress-net *net* :iterate t))) (defparameter *list-net* nil) (time (let ((count 0)) (setf *list-net* (make-instance 'list-string-net)) (block add (with-file-lines (line "projects:dictionaries;elhuyar-lemma.txt") (add-string *list-net* line) (incf count) (when (zerop (mod count 10000)) (print count)) (when (>= count 100) (return-from add)))) (print count) (minimize-net *list-net*) ;(count-strings *list-net*) #+ignore (calculate-gw-compression-tree *list-net*) #+ignore (compress-net *list-net* :iterate t))) (count-strings *list-net*) (count-strings *net*) (count-nodes *net*) (count-nodes *list-net*) (print-strings *net*) (print-strings *list-net*) ||# (defclass feature-string-net (list-string-net) ()) (defun fs-node< (node1 node2) (cond ((null node1) nil) ((null node2) t) ((and (characterp (car node1)) (integerp (car node2))) t) ((and (characterp (car node2)) (integerp (car node1))) nil) ((and (integerp (car node2)) (integerp (car node1)) (> (car node1) (car node2))) t) ((and (integerp (car node2)) (integerp (car node1)) (< (car node1) (car node2))) nil) ((and (characterp (car node1)) (characterp (car node2)) (char> (car node1) (car node2))) t) ((and (characterp (car node1)) (characterp (car node2)) (char< (car node1) (car node2))) nil) (t (let ((len1 (length node1)) (len2 (length node2))) (cond ((< len1 len2) t) ((> len1 len2) nil) (t (loop for n1 in (cdr node1) and n2 in (cdr node2) unless (eq n1 n2) return (fs-node< n1 n2)))))))) (defmethod sort-net ((net feature-string-net) sp) (stable-sort sp #'fs-node<)) (defmethod add-string-and-features ((net feature-string-net) string features) "adds a string to a string net" (with-slots (list-net) net (let ((length (length string)) (lnet (cons nil list-net))) (labels ((add-char (pos node) (declare (fixnum pos n)) (if (= pos length) node ;;(pushnew nil (cdr node)) ; end marker (let* ((char (char string pos)) (sub-node (find char (cdr node) :key #'car))) (cond ((null sub-node) (let ((new-node (list char))) (push new-node (cdr node)) (add-char (the fixnum (1+ pos)) new-node))) (t (add-char (the fixnum (1+ pos)) sub-node)))))) (add-features (features node) (if (null features) (pushnew nil (cdr node)) ; end marker (let* ((feature (car features)) (sub-node (find feature (cdr node) :key #'car))) (declare (fixnum feature)) (cond ((null sub-node) (let ((new-node (list feature))) (push new-node (cdr node)) (add-features (cdr features) new-node))) (t (add-features (cdr features) sub-node))))))) (add-features (sort features #'<) (add-char 0 lnet)) (setf list-net (cdr lnet)) string)))) ;;; test (defmethod add-partial-net ((net partial-string-net) string partial-net) "adds a string to a string net" (with-slots (list-net) net (let ((length (length string)) (lnet (cons nil list-net))) (labels ((add-char (pos node) (declare (fixnum pos n)) (if (= pos length) (dolist (sub-net partial-net) (pushnew sub-net (cdr node))) ; end marker (let* ((char (char string pos)) (sub-node (find char (cdr node) :key #'car))) (cond ((null sub-node) (let ((new-node (list char))) (push new-node (cdr node)) (add-char (the fixnum (1+ pos)) new-node))) (t (add-char (the fixnum (1+ pos)) sub-node))))))) (add-char 0 lnet) (setf list-net (cdr lnet)) string)))) ;; much smarter ... (defmethod map-net-nodes ((net list-string-net) fun) (with-slots (list-net) net (labels ((walk (node) (unless (or (null node) (minusp (caar node))) (funcall fun node) (setf (caar node) (- (caar node))) (mapc #'walk (cdr node))))) (mapc #'walk list-net)) ;; restore count (labels ((walk (node) (when (and node (minusp (caar node))) (setf (caar node) (- (caar node))) (mapc #'walk (cdr node))))) (mapc #'walk list-net)))) #+test (defmethod map-net-nodes ((net partial-string-net) fun) (let ((traversed-nodes (make-hash-table))) (with-slots (list-net) net (labels ((walk (node) (unless (gethash (caar node) traversed-nodes) (print (caar node)) (setf (gethash (caar node) traversed-nodes) (make-hash-table))) (let ((nodes-table (gethash (caar node) traversed-nodes))) (unless (gethash node nodes-table) (setf (gethash node nodes-table) t) (funcall fun node) (mapc #'walk (cdr node)))))) (mapc #'walk list-net))))) #+naive-test (defmethod map-net-nodes ((net list-string-net) fun) (let ((traversed-nodes ())) (with-slots (list-net) net (labels ((walk (node) (unless (find node traversed-nodes) (push node traversed-nodes) (funcall fun node) (mapc #'walk (cdr node))))) (mapc #'walk list-net))))) #+obsolete (defmethod map-double-net-nodes ((net list-string-net) fun) (let ((traversed-nodes (make-hash-table))) (with-slots (list-net) net (labels ((walk (node) (when (zerop (gethash node traversed-nodes 0)) (incf (gethash node traversed-nodes 0)) (funcall fun node) (mapc #'walk (cdr node))))) (mapc #'walk list-net))))) (defmethod mark-subnet-length ((net partial-string-net)) (with-slots (list-net length-marked-p) net (labels ((mark (node) (cond ((null node) 0) ((consp (car node)) (caar node)) (t (let ((length (1+ (if (cdr node) (reduce #'max (cdr node) :key #'mark) ;; ???? (progn (print node) 0))))) (setf (car node) (cons length (car node))) length))))) (unless length-marked-p (mapc #'mark list-net) (setf length-marked-p t)) nil))) (defmethod remove-subnet-length-marking ((net partial-string-net)) (with-slots (list-net length-marked-p) net (labels ((unmark (node) (cond ((null node) 0) ((consp (car node)) (setf (car node) (cdar node)) (mapc #'unmark (cdr node))) (t nil)))) (mapc #'unmark list-net) (setf length-marked-p nil) nil))) (defmethod minimize-net ((net partial-string-net) &optional %node-list (level 1)) "recursively compresses from right, results in a minimal network (directed acyclic graph)" (declare (ignore %node-list)) (with-slots (list-net length-marked-p) net (unless length-marked-p (format t "~%marking ...") (mark-subnet-length net)) ;; first, sort the net (format t "~%sorting ... ") (let ((replace-nodes (make-hash-table)) (node-list (make-skip-list #'spnode<))) ;; collect nodes of equal level (map-net-nodes net (lambda (node) (when (and node (= (abs (caar node)) level)) (let ((prev-node (skip-list-get node node-list))) (if prev-node (setf (gethash node replace-nodes) prev-node) (setf (skip-list-get node node-list) node)))))) ;;(setf node-list (sort-net net node-list)) ;; then, unify common subnets at LEVEL (format t "minimizing (~d) ... [~d]" level (skip-list-length node-list)) (cond ((not (skip-list-empty-p node-list)) (format t " (nodes: ~d, " (hash-table-count replace-nodes)) (let ((replace-count 0)) (labels ((walk (node branch) (cond ((null node) nil) ((or (eq node t) (>= (abs (caar node)) (1+ level))) (unless (and (consp node) (minusp (caar node))) (when (consp node) (setf (caar node) (- (caar node)))) (loop for children on branch do (when-let (replace-node (gethash (car children) replace-nodes)) (unless (eq (car children) replace-node) (incf replace-count) (setf (car children) replace-node)))) (when (or (eq node t) (> (abs (caar node)) (1+ level))) (mapc (lambda (child-node) (walk child-node (cdr child-node))) branch)))) (t nil))) (restore (node branch) (cond ((null node) nil) ((or (eq node t) (>= (abs (caar node)) (1+ level))) (when (or (eq node t) (minusp (caar node))) (when (consp node) (setf (caar node) (- (caar node)))) (when (or (eq node t) (> (abs (caar node)) (1+ level))) (mapc (lambda (child-node) (restore child-node (cdr child-node))) branch)))) (t nil)))) (walk t list-net) (restore t list-net) (format t " replaced: ~d) " replace-count) (minimize-net net nil (1+ level))))) (t (remove-subnet-length-marking net)))))) ;; obs: no restore-char! (defmethod map-string-values ((net list-string-net) string fn &optional (string-end-marker #\:) (start 0) first-char) "FN is a function of one argument which is called sucessively on all values of string in NET, or on T if there is no value (??)" (declare (optimize (speed 3) (safety 0)) (string string)) (let ((length (length string))) (declare (fixnum length)) (with-slots (list-net) net ;; first recognize the string, end state is string-end-marker (labels ((walk (pos branches first-char) (loop for sub-node in branches do (when sub-node (destructuring-bind (c . branches) sub-node (cond (first-char (when (char= c first-char) (walk pos branches nil))) ((= pos length) (when (or (null string-end-marker) (char= c string-end-marker)) (nmap-strings net fn sub-node))) ((char= c (char string pos)) (walk (1+ pos) branches nil)))))))) (walk start list-net first-char))))) #| (defparameter *basque-noun-list-net* nil) (defparameter *basque-noun-partial-net* nil) (time (let ((count 0) (net (make-instance 'list-string-net))) (block add (with-file-lines (line "projects:dictionaries;elhuyar-lemma.txt") (let* ((lemma+pos (string-parse line :whitespace ":/"))) (destructuring-bind (lemma . pos-list) lemma+pos (dolist (pos pos-list) (cond ((string= pos "iz.") (map-basque-case-forms lemma (lambda (lemma+word+features) (destructuring-bind (word lemma . features) lemma+word+features (declare (ignore lemma features)) (add-string net word) (incf count) (when (zerop (mod count 10000)) (print (list count word))))))) (t nil))))))) (print count) (minimize-net net) (setf *basque-noun-list-net* net))) ;;(write-string-net *basque-noun-list-net* "projects:string-net;basque-nouns.net") ;; took 1,004,698 milliseconds (1004.698 seconds) to run. 387,593,632 bytes of memory allocated. ;; 1130.004 seconds (time (let ((count 0) (a-net (make-instance 'partial-string-net)) (e-net (make-instance 'partial-string-net)) (i-net (make-instance 'partial-string-net)) (o-net (make-instance 'partial-string-net)) (u-net (make-instance 'partial-string-net)) (C-net (make-instance 'partial-string-net)) (net (make-instance 'partial-string-net))) ;; a (map-basque-case-forms "a" (lambda (lemma+word+features) (let ((sfx (car lemma+word+features))) (add-string a-net sfx)))) (minimize-net a-net) ;; V (map-basque-case-forms "e" (lambda (lemma+word+features) (let ((sfx (car lemma+word+features))) (add-string e-net sfx)))) (minimize-net e-net) ;; V (map-basque-case-forms "i" (lambda (lemma+word+features) (let ((sfx (car lemma+word+features))) (add-string i-net sfx)))) (minimize-net i-net) ;; V (map-basque-case-forms "o" (lambda (lemma+word+features) (let ((sfx (car lemma+word+features))) (add-string o-net sfx)))) (minimize-net o-net) ;; V (map-basque-case-forms "u" (lambda (lemma+word+features) (let ((sfx (car lemma+word+features))) (add-string u-net sfx)))) (minimize-net u-net) ;; C (map-basque-case-forms "k" (lambda (lemma+word+features) (let ((sfx (car lemma+word+features))) (add-string c-net (subseq sfx 1))))) (minimize-net c-net) (with-file-lines (line "projects:dictionaries;elhuyar-lemma.txt" ;;"projects:cgp;string-net;elhuyar-lemma-a.lisp" ) (let* ((lemma+pos (string-parse line :whitespace ":/"))) (destructuring-bind (lemma . pos-list) lemma+pos (dolist (pos pos-list) (when (string= pos "iz.") (when (zerop (mod (incf count) 1000)) (print (list count lemma))) (case (last-char lemma) (#\a (add-partial-net net (concat (subseq lemma 0 (1- (length lemma))) ":") (list-net a-net))) (#\e (add-partial-net net (concat (subseq lemma 0 (1- (length lemma))) ":") (list-net e-net))) (#\i (add-partial-net net (concat (subseq lemma 0 (1- (length lemma))) ":") (list-net i-net))) (#\o (add-partial-net net (concat (subseq lemma 0 (1- (length lemma))) ":") (list-net o-net))) (#\u (add-partial-net net (concat (subseq lemma 0 (1- (length lemma))) ":") (list-net u-net))) (otherwise (add-partial-net net (concat lemma ":") (list-net c-net))))))))) (print count) (minimize-net net) (setf *basque-noun-partial-net* net))) ;; 779.986 seconds ; 56.225 seconds (setf *basque-noun-partial-net* (unify-string-nets (list *basque-noun-partial-net*) :epsilon-chars ":")) (count-strings *basque-noun-list-net*) ;; 2802524 ; 2794935 (count-strings *basque-noun-partial-net*) ;; 2794935 (count-nodes *basque-noun-list-net*) ;; 23743 ; 25782 (count-nodes *basque-noun-partial-net-test*) ;; 26295 ; 24426 (print-strings *basque-noun-list-net*) (print-strings *basque-noun-partial-net*) (print-strings *basque-noun-partial-net-test*) (nmap-strings *basque-noun-partial-net-test* (lambda (string) (unless (match-string *basque-noun-list-net* string) (print string)))) (nmap-strings *basque-noun-list-net* (lambda (string) (unless (match-string *basque-noun-partial-net-test* string) (print string)))) (%match-string *basque-noun-partial-net* "zurrut:erantz") ;; nil (%match-string *basque-noun-partial-net-test* "zurrut:erantz") ;; nil !! (match-string *basque-noun-list-net* "zurrut:erantz") ;; t (defmethod %match-string ((net list-string-net) string) (with-slots (list-net) net (let ((length (length string))) (declare (fixnum length)) (labels ((walk (pos nodes) (let ((sub-node (find (char string pos) nodes :key #'car))) (cond ((null sub-node) nil) ((< pos (1- length)) (walk (1+ pos) (cdr sub-node))) ((member nil (cdr sub-node)) ; end node marker t) (t (values nil (1+ pos))))))) (walk 0 list-net))))) (list-net *basque-noun-partial-net*) (defparameter *basque-noun-partial-net-test* nil) (let ((count 0) (a-net (make-instance 'list-string-net)) (e-net (make-instance 'list-string-net)) (i-net (make-instance 'list-string-net)) (o-net (make-instance 'list-string-net)) (u-net (make-instance 'list-string-net)) (C-net (make-instance 'list-string-net)) (net (make-instance 'partial-string-net))) ;; a (map-basque-case-forms "a" (lambda (lemma+word+features) (let ((sfx (car lemma+word+features))) (add-string a-net sfx)))) (minimize-net a-net) ;; V (map-basque-case-forms "e" (lambda (lemma+word+features) (let ((sfx (car lemma+word+features))) (add-string e-net sfx)))) (minimize-net e-net) ;; V (map-basque-case-forms "i" (lambda (lemma+word+features) (let ((sfx (car lemma+word+features))) (add-string i-net sfx)))) (minimize-net i-net) ;; V (map-basque-case-forms "o" (lambda (lemma+word+features) (let ((sfx (car lemma+word+features))) (add-string o-net sfx)))) (minimize-net o-net) ;; V (map-basque-case-forms "u" (lambda (lemma+word+features) (let ((sfx (car lemma+word+features))) (add-string u-net sfx)))) (minimize-net u-net) ;; C (map-basque-case-forms "k" (lambda (lemma+word+features) (let ((sfx (car lemma+word+features))) (add-string c-net (subseq sfx 1))))) (minimize-net c-net) (with-file-lines (line "projects:dictionaries;elhuyar-lemma.txt" ;;"projects:cgp;string-net;elhuyar-lemma-a.lisp" ) (let* ((lemma+pos (string-parse line :whitespace ":/"))) (destructuring-bind (lemma . pos-list) lemma+pos (dolist (pos pos-list) (when (string= pos "iz.") (when (zerop (mod (incf count) 1000)) (print (list count lemma))) (case (last-char lemma) (#\a (add-partial-net net (concat (subseq lemma 0 (1- (length lemma))) ":") (list-net a-net))) (#\e (add-partial-net net (concat (subseq lemma 0 (1- (length lemma))) ":") (list-net e-net))) (#\i (add-partial-net net (concat (subseq lemma 0 (1- (length lemma))) ":") (list-net i-net))) (#\o (add-partial-net net (concat (subseq lemma 0 (1- (length lemma))) ":") (list-net o-net))) (#\u (add-partial-net net (concat (subseq lemma 0 (1- (length lemma))) ":") (list-net u-net))) (otherwise (add-partial-net net (concat lemma ":") (list-net c-net))))))))) (print count) ;;(minimize-net net) (setf *basque-noun-partial-net-test* net)) (minimize-net *basque-noun-partial-net-test*) (defparameter *basque-noun-test-table* (make-hash-table :test #'equal)) (with-file-lines (line "projects:dictionaries;elhuyar-lemma.txt") (let* ((lemma+pos (string-parse line :whitespace ":/"))) (destructuring-bind (lemma . pos-list) lemma+pos (dolist (pos pos-list) (when (string= pos "iz.") (if (find (last-char lemma) "aeiou") (pushnew (last-char lemma) (gethash (subseq lemma 0 (1- (length lemma))) *basque-noun-test-table*)) (pushnew :consonant (gethash lemma *basque-noun-test-table*)))))))) (defparameter *suffix-nets* (make-hash-table :test #'equal)) (defun get-suffix-net (endings) (or (gethash endings *suffix-nets*) (let ((suffix-net (make-instance 'list-string-net))) (dolist (ending (print endings)) (map-basque-case-forms (ecase ending (#\a "a") (#\e "e") (#\i "i") (#\o "o") (#\u "u") (:consonant "k")) (lambda (lemma+word+features) (let ((sfx (car lemma+word+features))) (add-string suffix-net (if (eq ending :consonant) (subseq sfx 1) sfx)))))) (minimize-net suffix-net) (setf (gethash endings *suffix-nets*) suffix-net)))) (print-strings (get-suffix-net '(:consonant))) (print-strings *basque-noun-partial-net-test*) (defparameter *consonant-net* nil) (match-string *basque-noun-partial-net-test* "zuzun:dako") (match-string *basque-noun-list-net* "zuzun:dako") (let ((net (make-instance 'partial-string-net)) (count 0)) (clrhash *suffix-nets*) (maphash (lambda (stem endings) (let ((suffix-net (list-net (get-suffix-net endings)))) (incf count) (add-partial-net net (concat stem ":") suffix-net))) *basque-noun-test-table*) (print count) ;;(minimize-net net) (setf *basque-noun-partial-net-test* net)) ;; 262.771 seconds ; 209.930 seconds ; 252.585 ; 52.189 (Time (minimize-net *basque-noun-partial-net-test*)) (count-strings *basque-noun-list-net*) ;; 2802524 (count-strings *basque-noun-partial-net-test*) ;; 2802524 (count-nodes *basque-noun-list-net*) ;; 23743 (count-nodes *basque-noun-partial-net-test*) ;; 26295 ; 24426 ; 115383 ; 24419 ; 24948 ; 23743!!!!! (print-strings *basque-noun-partial-net-test*) |# (defun set-eq (set1 set2) (and (= (length set1) (length set2)) (loop for elt in set1 always (member elt set2)))) (defun intersection-node (node-sets nodes) (dolist (node (car node-sets)) (when (and (loop for set in (cdr node-sets) always (member node set)) (set-eq (car node) nodes)) (return-from intersection-node node)))) (defmethod remove-node-set-marking ((net partial-string-net)) (with-slots (list-net) net (let ((node-table (make-hash-table))) (labels ((unmark (node) (unless (gethash node node-table) (setf (gethash node node-table) t) (cond ((null node) nil) ((listp (car node)) (setf (car node) (caaar node)) (loop for children on (cdr node) when (and (consp (caar children)) (null (caaar children))) do (setf (car children) nil)) (mapc #'unmark (cdr node))) (t nil))))) (mapc #'unmark list-net) nil)))) (defun unify-string-nets (string-nets &key epsilon-chars (iterate-p t)) (declare (optimize (safety 0) (space 0) (speed 3))) (labels ((remove-epsilons (branch &optional nodes) (cond ((null branch) nodes) ((find (caar branch) epsilon-chars) (remove-epsilons (cdr branch) (append (if iterate-p (remove-epsilons (cdar branch) nil) (cdar branch)) nodes))) (t (remove-epsilons (cdr branch) (cons (car branch) nodes))))) (node-sort (label1 label2) (cond ((null label1) nil) ((null label2) t) (t (char< label1 label2))))) (let ((count 0) (nodes (sort (mapcan (lambda (net) (remove-epsilons (list-net net))) string-nets) #'node-sort :key #'car)) (nodelist-table (make-hash-table)) (top-node (list :top))) (labels ((unify (new-node node-list) (when node-list (let* ((label (caar node-list)) ;; nodes whose union defines the new node (equal-label-nodes (loop for node = (car node-list) while (and node-list (if label (eq (car node) label) (null node))) collect node ;;(or node null-node) do (setf node-list (cdr node-list)))) ;; the new node (old-p t) (unified-child-node (or (intersection-node (mapcar (lambda (node) (gethash node nodelist-table)) equal-label-nodes) equal-label-nodes) (setf old-p nil) (let ((intersection-node (list equal-label-nodes))) (dolist (node equal-label-nodes) (push intersection-node (gethash node nodelist-table))) intersection-node)))) (when (zerop (mod (incf count) 10000)) (print count)) (unless old-p (when label (unify unified-child-node (sort (mapcan (lambda (node) (remove-epsilons (cdr node))) equal-label-nodes) #'node-sort :key #'car)))) (push unified-child-node (cdr new-node)) (unify new-node node-list))))) (unify top-node nodes) (let ((unified-net (make-instance 'partial-string-net :list-net (cdr top-node)))) (remove-node-set-marking unified-net) (minimize-net unified-net) unified-net))))) #|| (defparameter *minimal-georgian-noun-net* (unify-string-nets (list dict::*georgian-noun-net*) :epsilon-chars ":")) (remove-node-set-marking *minimal-georgian-noun-net*) (count-nodes *minimal-georgian-noun-net*) ;; 64.464 ; 64108 (count-strings *minimal-georgian-noun-net*) ;; 217.374.744 (of 218.394.710) (print-strings *minimal-georgian-noun-net*) (minimize-net *minimal-georgian-noun-net*) (let ((count 0)) (nmap-strings dict::*georgian-noun-net* (lambda (string) (when (zerop (mod (incf count) 100000)) (print (list string count))) (unless (match-string *minimal-georgian-noun-net* (remove #\: string)) (print string))))) #+test (let ((list '((1) (2 4 3) (4 5)))) (mapcan #'identity list) ) ||# (defmethod calculate-gw-compression-tree ((net list-string-net)) (with-slots (list-net) net (let ((weight-table (make-hash-table)) (weight-list ()) (node-table (make-hash-table))) (labels ((walk (node) (cond ((null node) #+ignore (incf (gethash :end weight-table 0))) ((gethash node node-table) nil) (t (setf (gethash node node-table) node) (incf (gethash (car node) weight-table 0)) (incf (gethash (code-char u::$max-character-code) weight-table 0)) (mapc #'walk (cdr node)))))) (incf (gethash #\Null weight-table 0)) ;; added 5.2.2002 (mapc #'walk list-net)) (maphash (lambda (code freq) (push (cons code freq) weight-list)) weight-table) (make-instance 'u::optimal-tree :weights weight-list)))) (defmethod vectorize-string-net ((net list-string-net)) (with-slots (list-net) net (let ((node-id -1) (node-table (make-hash-table)) (node-vector (make-array 0 :adjustable t :fill-pointer t))) (labels ((walk (node) (let ((subnode-vector (make-array (length node) :initial-element nil)) (id (incf node-id))) (vector-push-extend subnode-vector node-vector) (setf (aref subnode-vector 0) (char-code (car node))) (loop for sub-node in (cdr node) for i from 1 do (setf (aref subnode-vector i) (if sub-node (or (gethash sub-node node-table) (setf (gethash sub-node node-table) (walk sub-node))) 0))) id))) (walk (cons #\Null list-net)) node-vector)))) ;; arc structure: ;; 1. absolute addressing: ;; 1 bit endp, (c-code, 5 bits pointer size, node pointer)* ;; 2. relative addressing (delta = abs(pointer - prev-pointer)): ;; 1 bit endp, (c-code, 5 bits delta size, delta, 1 bit sign)* (defun %compressed-node-size (compression-tree label-state-pairs #-absolute-addresses node-pos) (let ((size 1) ;; endp bit n-size) (dolist (arc label-state-pairs) (when arc (destructuring-bind (label . state) arc (let ((c-bw (u::gw-compress-char compression-tree (char-code label))) (delta #+no-absolute-addresses (abs (- node-pos state)) #-no-absolute-addresses state)) (incf size (length c-bw)) (setf n-size (1- (ceiling (log (1+ delta) 2)))) (incf size 5) (incf size n-size) #+no-absolute-addresses (incf size 1))))) (let ((end-bw (u::gw-compress-char compression-tree u::$max-character-code))) (incf size (length end-bw))) size)) (defun %write-compressed-node (stream compression-tree endp label-state-pairs #+no-absolute-addresses node-pos) (let ((byte (if endp 1 0))) (loop with bit-pos = 1 and n-size for (label . state) in label-state-pairs for c-bw = (u::gw-compress-char compression-tree label) do (let ((delta #+no-absolute-addresses (abs (- node-pos state)) #-no-absolute-addresses state) #+no-absolute-addresses (sign (if (< node-pos state) 0 1))) (multiple-value-setq (byte bit-pos) ;; use SET-BITS-FROM-BW (set-bits array byte bit-pos (u::%bw-code c-bw) (length c-bw))) (setf n-size (1- (ceiling (log (1+ delta) 2)))) (multiple-value-setq (byte bit-pos) (set-bits array byte bit-pos n-size 5)) (multiple-value-setq (byte bit-pos) (set-bits array byte bit-pos (- delta (ash 2 (1- n-size))) n-size)) #+no-absolute-addresses (multiple-value-setq (byte bit-pos) (set-bits array byte bit-pos sign 1))) finally (let ((end-bw (u::gw-compress-char compression-tree u::$max-character-code))) (setf byte (set-bits array byte bit-pos (u::%bw-code end-bw) (length end-bw))))) (vector-push-extend byte array #+allegro *delta*)) array) (defun %set-bits (array n size) (loop for i from (- 1 size) to 0 do (vector-push-extend (logand 1 (ash n i)) array)) array) (defun %write-bits (stream n size) (loop for i from (- 1 size) to 0 do (u::%write-bit (logand 1 (ash n i)) stream))) #+ignore (defun %get-bits (array pos size &optional add-max-bit-p) (declare (optimize (speed 3) (safety 0)) (fixnum pos size)) (let ((val (if add-max-bit-p (the fixnum (ash 1 (the fixnum size))) 0))) (declare (fixnum val)) (dotimes (i size) (declare (fixnum i)) (when (= (the fixnum (sbit array (the fixnum pos))) 1) (setf val (the fixnum (+ val (the fixnum (ash 1 i)))))) (setf pos (the fixnum (1+ pos)))) ;; *** fix for bug in byte-code #+bug (when (and (zerop size) (zerop (the fixnum (mod pos 8)))) (print "bug") (setf pos (the fixnum (+ pos 8)))) (values val pos))) (defun %get-bits (array pos size &optional add-max-bit-p) (declare (optimize (speed 3) (safety 0)) (fixnum pos size)) (let ((val (if add-max-bit-p 1 0))) (declare (fixnum val)) (dotimes (i size) (setf val (+ (ash val 1) (aref array pos))) (incf pos)) (values val pos))) #+ignore (defun %read-bits (stream size &optional add-max-bit-p) (declare (optimize (speed 3) (safety 0)) (fixnum size)) (let ((val (if add-max-bit-p 1 0))) (declare (fixnum val)) (dotimes (i size) (setf val (+ (the fixnum (ash val 1)) (the fixnum (u::read-bit stream))))) val)) ;; node structure: ;; x bits gw-compressed label ;; 1 bit end-p ;; 3 bits length of arc-count ;; x bits arc-count ;; arc-count times: ;; 5 bits pointer length ;; x bits pointer (defun compress-node (array compression-tree code+pointers #+rel-addresses node-pos) (destructuring-bind (code . pointers) code+pointers ;; encode node label (loop for bit across (u::gw-compress-char compression-tree code) do (vector-push-extend bit array)) (cond ((null (car pointers)) (vector-push-extend 1 array) (setf pointers (cdr pointers))) (t (vector-push-extend 0 array))) ;; encode length of arc-count (let* ((arc-count (length pointers)) (n-size (ceiling (log (1+ arc-count) 2)))) (%set-bits array n-size 3) (%set-bits array arc-count #+ignore(- arc-count (ash 2 (1- n-size))) n-size)) ;; encode node arc pointers (loop with n-size for pointer in pointers do (let ((delta #+rel-addresses (abs (- node-pos pointer)) #-rel-addresses pointer) #+rel-addresses (sign (if (< node-pos pointer) 0 1))) (setf n-size (1- (ceiling (log (1+ delta) 2)))) (%set-bits array n-size 5) (%set-bits array (- delta (ash 2 (1- n-size))) n-size) #+rel-addresses (%set-bits array sign 1)))) array) #+test (with-open-file (stream "projects:string-net;test.net" :direction :output :if-exists :supersede :element-type 'bit) (dotimes (i 16) (write-byte 1 stream) (write-byte 0 stream) (write-byte 1 stream) (write-byte 1 stream) )) (defmethod close-bit-stream ((bit-stream u::buffered-bit-stream)) (with-slots (stream u::buffer position) bit-stream (inspect bit-stream) (loop until (= (incf position) 8) do (setf u::buffer (ash u::buffer 1))) (write-byte u::buffer stream) (setf position 0 u::buffer 0))) #+copy (defmethod write-bit ((bit-stream buffered-bit-stream) bit) (with-slots (stream buffer position) bit-stream (setf buffer (ash buffer 1)) (when (= bit 1) (incf buffer 1)) (when (= (incf position) 8) (write-byte buffer stream) (setf position 0 buffer 0)))) (defparameter *rel-addresses* t) (defmethod write-compressed-node ((stream stream) compression-tree code+pointers) (let ((node-pos 0)) (destructuring-bind (code . pointers) code+pointers ;; encode node label (loop for bit across (u::gw-compress-char compression-tree code) do (incf node-pos) (u::%write-bit bit stream)) (incf node-pos) (cond ((null (car pointers)) (u::%write-bit 1 stream) (setf pointers (cdr pointers))) (t (u::%write-bit 0 stream))) ;; encode length of arc-count (let* ((arc-count (length pointers)) (n-size (ceiling (log (1+ arc-count) 2)))) (incf node-pos (+ 3 n-size)) (%write-bits stream n-size 3) (%write-bits stream arc-count n-size)) ;; encode node arc pointers (dolist (pointer pointers) (let ((n-size (1- (ceiling (log (1+ pointer) 2))))) (incf node-pos (+ 5 n-size)) (%write-bits stream n-size 5) (%write-bits stream (- pointer (ash 2 (1- n-size))) n-size)))) node-pos)) ;; does not actually compress the node, only calculates the size of the compressed node (defun compressed-node-size (compression-tree code+pointers) (let ((size 0)) (destructuring-bind (code . pointers) code+pointers ;; encode node label (incf size (1+ (length (u::gw-compress-char compression-tree code)))) (when (null (car pointers)) (setf pointers (cdr pointers))) ;; encode length of arc-count (incf size (+ (ceiling (log (1+ (length pointers)) 2)) 3)) ;; encode node arc pointers (loop for pointer in pointers do (incf size (+ (ceiling (log (1+ pointer) 2)) 4)))) size)) (defun read-decompress-node (stream compression-tree) (collecting (collect (u::gw-decompress-bit-stream-char compression-tree stream)) (when (= 1 (u::read-bit stream)) (collect nil)) (let ((length (u::read-bits stream (u::read-bits stream 3)))) (dotimes (i length) (collect (u::read-bits stream (u::read-bits stream 5) t)))))) #-ecl (defun decompress-node (array compression-tree pos) (collecting (multiple-value-bind (char pos) (u::%gw-decompress-bv-char compression-tree array pos) (collect char) (let ((bit (aref array pos))) (incf pos) (when (= 1 bit) (collect nil))) (let (size length) (multiple-value-setq (size pos) (%get-bits array pos 3)) (multiple-value-setq (length pos) (%get-bits array pos size)) (dotimes (i length) (let (val) (multiple-value-setq (size pos) (%get-bits array pos 5)) (multiple-value-setq (val pos) (%get-bits array pos size t)) (collect val))))))) #+test (calculate-gw-compression-tree *georgian-pronoun-net*) #+test (let ((bv (make-array 0 :element-type 'bit :adjustable t :fill-pointer 0)) (gw-tree (compression-tree *georgian-pronoun-net*))) (compress-node bv gw-tree (list (char-code #\a) nil 1 4 44 99 3 33333333 444444444)) (decompress-node bv (u::gw-tree gw-tree) 0)) #+test (let ((bv (make-array 0 :element-type 'bit :adjustable t :fill-pointer 0))) (%set-bits bv 14 20)) (defmethod compress-net ((net list-string-net) &key &allow-other-keys) (let ((compression-tree (calculate-gw-compression-tree net))) (format t "Garsia-Wachs compression tree calculated.") (with-slots (nodes list-net) net (let ((node-table (make-hash-table)) (prev-pointer-table (make-hash-table)) (pointer-table (make-hash-table)) (node-id -1) (root (cons #\Null list-net)) (prev-size 0) (size 0)) (labels ((walk (node build-bv-p) (setf (gethash node pointer-table) (if build-bv-p (length nodes) size)) (let ((arcs (cons (char-code (car node)) (mapcar (lambda (branch) (when branch (or (gethash branch node-table) (setf (gethash branch node-table) (or (gethash branch prev-pointer-table) (incf node-id)))))) (cdr node))))) (if build-bv-p (compress-node nodes compression-tree arcs) (incf size (compressed-node-size compression-tree arcs))) (mapc (lambda (sub-node) (unless (or (null sub-node) (gethash sub-node pointer-table)) (walk sub-node build-bv-p))) (cdr node))))) (loop do (setf (gethash root node-table) 0) (walk root nil) (clrhash node-table) (rotatef prev-pointer-table pointer-table) (clrhash pointer-table) while (/= prev-size (print size)) do (setf prev-size size size 0)) (setf nodes (make-array size :element-type 'bit :adjustable t :fill-pointer 0) #+ignore-yet(make-array size :element-type 'bit)) (walk root t) nodes))))) (defmethod write-compressed-net ((net list-string-net) compression-tree stream) (with-slots (list-net) net (let ((node-table (make-hash-table)) (prev-pointer-table (make-hash-table)) (pointer-table (make-hash-table)) (node-id -1) (root (cons #\Null list-net)) (prev-size 0) (size 0)) (labels ((normalize-arcs (arcs) (if (or (null (car arcs)) (not (member nil arcs))) arcs (cons nil (remove nil arcs)))) (walk (node write-p) (setf (gethash node pointer-table) size) (let ((arcs (cons (char-code (car node)) (mapcar (lambda (branch) (when branch (or (gethash branch node-table) (setf (gethash branch node-table) (or (gethash branch prev-pointer-table) (incf node-id)))))) (normalize-arcs (cdr node)))))) (incf size (if write-p (write-compressed-node stream compression-tree arcs) (compressed-node-size compression-tree arcs))) (mapc (lambda (sub-node) (unless (or (null sub-node) (gethash sub-node pointer-table)) (walk sub-node write-p))) (cdr node))))) (loop do (setf (gethash root node-table) 0) (walk root nil) (clrhash node-table) (rotatef prev-pointer-table pointer-table) (clrhash pointer-table) while (/= prev-size (print size)) do (setf prev-size size size 0)) (setf size 0) (walk root t) size)))) ;; new #+ignore (defmethod %write-compressed-net ((net list-string-net) stream) (with-slots (list-net compression-tree) net (let ((node-table (make-hash-table)) (prev-pointer-table (make-hash-table)) (pointer-table (make-hash-table)) (node-id -1) ;;(root (cons #\Null list-net)) (prev-size 0) (size 0) (count 0)) (labels ((normalize-arcs (arcs) (if (or (null (car arcs)) (not (member nil arcs))) arcs (cons nil (remove nil arcs)))) (walk (arcs write-p) (setf (gethash arcs pointer-table) size) (let* ((end-p nil) (arcs (collecting (mapc (lambda (arc) (if arc (collect (destructuring-bind (char . arcs) arc (cons char (or (gethash arcs node-table) (setf (gethash arcs node-table) (or (gethash arcs prev-pointer-table) (incf node-id))))))) (setf end-p t))) (normalize-arcs arcs))))) (when arcs (incf count)) (incf size (if write-p (%write-compressed-node stream compression-tree end-p arcs) (%compressed-node-size compression-tree arcs 0)))) ;;(when (eq arcs list-net) (print arcs)) (mapc (lambda (sub-node) (unless (or (null sub-node) (gethash (cdr sub-node) pointer-table)) (walk (cdr sub-node) write-p))) arcs))) (loop do (setf (gethash list-net node-table) 0) (walk list-net nil) (clrhash node-table) (rotatef prev-pointer-table pointer-table) (clrhash pointer-table) (print (list :count count)) while (/= prev-size (print size)) do (setf prev-size size size 0 count 0)) (print size) (setf size 0) ;;(walk list-net t) size)))) #+test (u::flatten-binary-tree (u::gw-tree (compression-tree *nbo-list-net*))) #+test (with-open-file (stream "projects:string-net;test1.btree" :direction :output :if-exists :supersede :element-type 'bit) (u::write-binary-tree (u::gw-tree (compression-tree *nbo-list-net*)) stream)) #+test (with-open-file (stream "projects:string-net;test1.btree" :element-type 'unsigned-byte) (u::read-binary-tree stream)) #+test (with-open-file (stream "projects:string-net;nbo.net" :element-type 'unsigned-byte) (u::read-binary-tree stream)) (defmethod write-string-net ((net string-net) (path string)) (write-string-net net (pathname path))) #-(or allegro ecl) (defmethod write-string-net ((net string-net) (path pathname)) (with-open-file (stream path :direction :output :if-exists :supersede :element-type 'bit) (loop for c across (symbol-name (class-name (class-of net))) for code = (char-code c) do (loop for i from -7 to 0 do (write-byte (logand 1 (ash code i)) stream))) (dotimes (i 8) (write-byte 0 stream)) (write-string-net net stream))) #+allegro (defmethod write-string-net ((net string-net) (path pathname)) (with-open-file (stream path :direction :output :if-exists :supersede :element-type 'unsigned-byte) (let ((bit-stream (make-instance 'u::buffered-bit-stream :stream stream))) (loop for c across (symbol-name (class-name (class-of net))) for code = (char-code c) do (loop for i from -7 to 0 do (u::%write-bit (logand 1 (ash code i)) bit-stream))) (dotimes (i 8) (u::%write-bit 0 bit-stream)) (write-string-net net bit-stream) #-test (dotimes (i 8) (u::%write-bit 0 bit-stream)) #+test (close-bit-stream bit-stream) #+test (dotimes (i 100) (u::%write-bit 0 bit-stream)) #+bug (with-slots (stream u::buffer position) bit-stream (unless (zerop position) ;; flush buffer (write-byte u::buffer stream)))))) (defmethod write-string-net ((net list-string-net) (stream stream)) (let ((compression-tree (calculate-gw-compression-tree net))) (u::write-binary-tree (u::gw-tree compression-tree) stream) (write-compressed-net net compression-tree stream))) (defmethod read-net ((path string) &key (translate-p t) (package :string-net)) (read-net (pathname path) :translate-p translate-p :package package)) (defmethod read-net ((path pathname) &key (translate-p t) (package :string-net)) (format t "~&Reading ~s.~%" path) (with-open-file (stream path :direction :input :element-type 'unsigned-byte); :external-format :iso-8859-1) (read-net stream :translate-p translate-p :package package))) ;;(inspect (read-net "projects:string-net;nets;nbo-lexicon.net")) ;;(inspect (read-net "projects:string-net;nets;nbo-word-like-abbreviations.net")) (defmethod read-net ((stream stream) &key (translate-p t) (package :string-net)) (let* ((class-name (with-output-to-string (str-stream) (loop for byte = (read-byte stream) and i from 0 to 32 until (= byte 0) do (write-char (char-upcase (code-char byte)) str-stream)))) (net (make-instance (intern class-name package)))) (read-string-net net stream :translate-p translate-p))) (defmethod read-string-net ((net list-string-net) stream &key (translate-p t)) (let ((gw-tree (u::read-binary-tree stream))) ;; :translate-fn #'restore-char))) (let ((stream (make-instance 'u::buffered-bit-stream :stream stream)) (node-table (make-hash-table))) (labels ((walk (node-id) (when node-id ;; nil means end-marker (or (gethash node-id node-table) (setf (gethash node-id node-table) (destructuring-bind (label . arcs) (read-decompress-node stream gw-tree) (cons (if translate-p (translate-char-code label) label) (mapcar #'walk arcs)))))))) (setf (list-net net) (cdr (walk 0))) net)))) #+test (string-net::read-net "~/lisp/projects/cgp/nets/nbo-lexicon.net") #+test (defmethod read-compressed-net ((net list-string-net) stream) #+ignore-yet (let ((flattened-tree-size #-mcl 0 #+mcl (parse-integer (read-line stream))) (arc-size #-mcl 0 #+mcl (parse-integer (read-line stream)))) #-mcl (loop for byte = (read-byte stream) until (= byte (char-code #\return)) do (setf flattened-tree-size (+ (- byte (char-code #\0)) (* 10 flattened-tree-size)))) #-mcl (loop for byte = (read-byte stream) until (= byte (char-code #\return)) do (setf arc-size (+ (- byte (char-code #\0)) (* 10 arc-size)))) (with-slots (compression-tree integer-compression-tree arcs) net (setf arcs (make-array (* arc-size 8) :element-type 'bit)) (let ((flattened-tree (make-array flattened-tree-size :element-type #-mcl 'character #+mcl 'base-character)) (flattened-integer-tree (make-array flattened-tree-size :element-type '(unsigned-byte 8)))) (dotimes (i flattened-tree-size) (let ((val #+mcl (read-char stream) #-mcl (read-byte stream))) (setf (aref flattened-tree i) #+mcl val #-mcl(code-char (translate-char-code val)) (aref flattened-integer-tree i) #+mcl(char-code val) #-mcl val))) (setf compression-tree (u::restore-flattened-binary-tree flattened-tree) integer-compression-tree (u::restore-flattened-binary-tree flattened-integer-tree))))) (with-slots (list-net compression-tree) net (let ((node-table (make-hash-table)) (gw-tree (u::gw-tree (compression-tree net)))) (labels ((walk (node-id) (when node-id ;; nil means end-marker (or (gethash node-id node-table) (setf (gethash node-id node-table) (destructuring-bind (label . arcs) (read-decompress-node stream gw-tree) (cons label (mapcar #'walk arcs)))))))) (walk 0))))) #+test (defmethod nmap-strings ((net list-string-net) fn &optional (start 0) string) (declare (optimize (safety 0) (space 0) (speed 3)) (fixnum start)) (with-slots (nodes compression-tree) net (let ((gw-tree (u::gw-tree compression-tree)) (string (or string (make-array 0 :element-type #-mcl 'character #+mcl 'base-character :adjustable t :fill-pointer t)))) (declare (string string)) (labels ((walk (pos) (declare (fixnum pos)) (destructuring-bind (char . arcs) (decompress-node nodes gw-tree pos) (unless (zerop pos) (vector-push-extend char string)) (dolist (arc arcs) (if arc (walk arc) (funcall fn string))) (unless (zerop pos) (decf (fill-pointer string)))))) (walk start))))) #+under-construction (defmethod compress-net ((net list-string-net) &key (iterate t)) (calculate-gw-compression-tree net) (with-slots (nodes compression-tree arcs pointers prev-nodes compression-type) net (setf nodes (vectorize-string-net net) arcs (make-array 0 :element-type 'unsigned-byte :adjustable t :fill-pointer 0)) (let ((pointer-table (make-hash-table)) (node-count -1)) (loop for node across nodes and i from 0 do (setf (gethash i pointer-table) (incf node-count))) ;; adjust arc pointers (loop for node across nodes do (loop for tail on (cdr node) by #'cddr do (setf (cadr tail) (gethash (aref pointers (cadr tail)) pointer-table)))) (let ((new-nodes (make-array (1+ node-count) :element-type 'unsigned-byte))) (loop for node across nodes for pos from 0 with i = -1 do (setf (aref new-nodes (incf i)) (fill-pointer arcs)) (compress-arcs arcs compression-tree (car node) (cdr node) #-absolute-addresses i)) (setf prev-nodes nil ;; prev-nodes (copy-seq new-nodes) ; *** debug only, remove! nodes new-nodes arcs (if iterate arcs (coerce arcs 'simple-vector)) pointers nil ;; pointers arcs ;; *** debug only, remove! compression-type :simple) (when iterate (compress-iterate net nil 1)))))) #+under-construction (defmethod compress-iterate ((net list-string-net) node-table count) (with-slots (compression-tree nodes arcs pointers) net (format t "~%compressing (~d) ... [~d]" count (length arcs)) (unless node-table (setf node-table (make-hash-table)) (loop for node across nodes for i from 0 do (setf (gethash i node-table) node (aref nodes i) i))) (let ((new-arcs (make-array 0 :element-type '(unsigned-byte 8) :adjustable t :fill-pointer 0)) (new-node-table (make-hash-table)) (size (length nodes))) (dotimes (i size) (let* ((old-pos (aref nodes i)) (arc (gethash old-pos node-table))) (setf (aref nodes i) arc (gethash (aref nodes i) new-node-table) (fill-pointer new-arcs)) (multiple-value-bind (label-state-pairs endp) (%decode-arcs arcs compression-tree arc #-absolute-addresses old-pos) (loop for tail on label-state-pairs by #'cddr do (setf (car tail) (char-code (car tail)) (cadr tail) (gethash (cadr tail) node-table))) (compress-arcs new-arcs compression-tree endp label-state-pairs #-absolute-addresses arc)))) (let ((old-size (length arcs)) (new-size (length new-arcs))) (setf arcs new-arcs) (if (/= old-size new-size) (compress-iterate net new-node-table (1+ count)) (progn (setf compression-tree (u::gw-tree compression-tree)) (change-class net 'vector-string-net))))))) #| (defparameter *net-vector* (vectorize-string-net *georgian-noun-net*)) (defparameter *pnet-vector* (vectorize-string-net *georgian-pronoun-net*)) (nmap-strings *pnet-vector* (lambda (v) (print v)) 0) (print-strings *georgian-pronoun-net*) (defun write-vector-net-to-file (net-vector file) (with-open-file (stream file :direction :output :if-exists :supersede :external-format :iso-8859-1) (loop for vector across net-vector do (loop for i across vector do (write i :stream stream) (write-char #\Space stream)) (write-char #\newline stream)))) (write-vector-net-to-file (cdr *net-vector*) "projects:georgian-parser;noun-vector-net.txt") (defmethod nmap-strings ((net array) fn &optional start-node string) (declare (optimize (safety 0) (space 0) (speed 3))) (let ((string (or string (make-array 0 :element-type #-mcl 'character #+mcl 'base-character :adjustable t :fill-pointer t)))) (declare (string string)) (labels ((walk (node) (let* ((node-vector (aref net node)) (c (code-char (aref node-vector 0))) (first t)) (vector-push-extend c string) (loop for sub-node across node-vector do (cond (first (setf first nil)) ((zerop sub-node) (funcall fn string)) (t (walk sub-node)))) (decf (fill-pointer string))))) (walk start-node)))) ;;(4.445 seconds) #+test (time (defparameter *nbo-lexicon* (load-string-net "projects:cgp;nets;nbo-lexicon.net"))) (defparameter *nbo-lexicon-list-net* (make-instance 'list-string-net)) (time (let ((count 0)) ;; (1008.988 seconds) (nmap-strings *nbo-lexicon* (lambda (string) (when (zerop (mod (incf count) 10000)) (print count)) (add-string *nbo-lexicon-list-net* string))) (minimize-net *nbo-lexicon-list-net*))) (count-nodes *nbo-lexicon-list-net*) ; 241649 (count-strings *nbo-lexicon-list-net*) ; 868882 (count-nodes *nbo-lexicon*) ; 212199 (count-strings *nbo-lexicon*) ; 868882 10971160 (calculate-gw-compression-tree *nbo-lexicon-list-net*) #+test (with-open-file (stream "projects:cgp;nets;nbo-lexicon.list-net1" :direction :output :if-exists :supersede :element-type 'bit) (time (write-net *nbo-lexicon-list-net* stream))) 16316780 15858748 14044778 13769364 (defparameter *vector-net* nil) (defparameter *list-net* nil) (let ((list-net (make-instance 'list-string-net)) (vector-net (make-instance 'active-string-net)) (strings '("as" "aw" "ams" "lull" "fuff" "rarr" "larr"))) (dolist (string strings) (add-string list-net string)) (minimize-net list-net) (dolist (string strings) (add-string vector-net string)) (minimize-net vector-net) (setf *list-net* list-net) (calculate-gw-compression-tree vector-net) (compress-net vector-net) (setf *vector-net* vector-net) (print (count-nodes list-net)) (print (count-nodes vector-net)) (count-nodes (vector-to-list-net *vector-net*))) (defmethod vector-to-list-net ((net active-string-net)) (with-slots (nodes pointers) net (let ((node-table (make-hash-table))) (labels ((build (pos) (or (gethash pos node-table) (setf (gethash pos node-table) (collecting (destructuring-bind (end-p . codes+arcs) (aref nodes pos) (when end-p (collect nil)) (loop for (code arc) on codes+arcs by #'cddr do (collect (cons (code-char code) (build (aref pointers arc))))))))))) (make-instance 'list-string-net :list-net (build 0)))))) (defmethod vector-to-list-net ((net vector-string-net)) (with-slots (arcs pointers compression-tree) net (let ((node-table (make-hash-table :test #'equal))) (labels ((build (pos) (declare (fixnum pos)) (or (gethash pos node-table) (setf (gethash pos node-table) (collecting (multiple-value-bind (arc-list end-p) (decode-arcs arcs compression-tree pos nil) (when end-p (collect nil)) (loop for (code arc) on arc-list by #'cddr do (collect (or (gethash (cons code arc) node-table) (setf (gethash (cons code arc) node-table) (cons code (build arc)))))))))))) (make-instance 'list-string-net :list-net (build 0)))))) (defmethod vector-to-list-net ((net bit-string-net)) (with-slots (arcs pointers compression-tree) net (let ((node-table (make-hash-table)) (arc-table (make-array 256))) (dotimes (i 256) (setf (aref arc-table i) (make-hash-table))) (labels ((build (pos) (declare (fixnum pos)) (or (gethash pos node-table) (setf (gethash pos node-table) (collecting (multiple-value-bind (arc-list end-p) (bit-decode-arcs arcs compression-tree pos nil) (when end-p (collect nil)) (loop for (code arc) on arc-list by #'cddr do (collect (or (gethash arc (aref arc-table (char-code code))) (setf (gethash arc (aref arc-table (char-code code))) (cons code (build arc)))))))))))) (make-instance 'list-string-net :list-net (build 0)))))) (count-nodes (vector-to-list-net *vector-net*)) (defparameter *nbo-list-net* (vector-to-list-net *nbo-lexicon*)) (count-nodes *nbo-list-net*) ;; 241649 (count-strings *nbo-list-net*) ;; 868882 (print-strings *nbo-list-net*) |# :eof