;;; -*- Mode: LISP; Package: UTILS; BASE: 10; Syntax: ANSI-Common-Lisp; -*- ;; Copyright (C) Paul Meurer 2000 - 2002. All rights reserved. ;; paul.meurer@hit.uib.no ;; HIT-centre, University of Bergen ;; ;; Version 0.8 ;; Implementation of the Garsia-Wachs optimal alphabetic tree algorithm ;; according to Donald E. Knuth, The Art of Computer Programming Vol. 3 ;;------------------------------------------------------------------------------------- ;; USAGE: ;; (make-instance 'optimal-tree :weights weights) builds an optimal tree ;; (object) from the list of (char . weight) pairs WEIGHTS. ;; ;;------------------------------------------------------------------------------------- ;; TO DO: a lot..., e.g.: ;; - give it its own package ;; - implement optimal alphabetic trees ;;------------------------------------------------------------------------------------- ;; change later (in-package "UTILS") (defconstant $max-character-code 256) (defclass optimal-tree () (;; GW-TREE is a binary tree whose leaves are characters. ;; Each node is encoded as a list whose CAR and CDR are the left and right daughters, ;; or a character if the node is a leaf node. (gw-tree :initform nil :reader gw-tree) ;; CHAR-WEIGHTS is a list of (char . weight) pairs (char-weights :reader char-weights) ;; Keys are character codes and values bit vectors denoting the path in the ;; GW-TREE to that character (0 means CAR/left daughter, 1 means CDR/right daughter). (bit-vectors :initform (make-array (1+ $max-character-code) :initial-element nil) :accessor gw-bit-vectors))) ;; preliminary (defclass optimal-alphabetic-tree (optimal-tree) ()) ;; does all calculation (defmethod initialize-instance ((tree optimal-tree) &key weights) (with-slots (char-weights gw-tree) tree (call-next-method) (setf char-weights (sort-weights tree weights) gw-tree (gw-tree-to-list (make-gw-tree tree))) (set-gw-bit-vectors tree) tree)) ;; used internally (defstruct gw-node weight value left-link right-link) (defmethod sort-weights ((tree optimal-tree) weights) (sort weights #'> :key #'cdr)) ;; don't sort for an alphabetic tree! (defmethod sort-weights ((tree optimal-alphabetic-tree) weights) weights) ;; The main function. ;; Not very lispy... straightforward implementation of Knuth's algorithm. ;; beware of typo in the book! ;; Exercise: rewrite in lisp style. ;; The code is not very efficient O(n^2), but sufficient for our purpose. ;; The labels C1 ... refer to the algorithm description in Knuth's book. (defmethod make-gw-tree ((tree optimal-tree)) (with-slots (char-weights) tree (let* ((leaf-count (length char-weights)) (nodes (make-array (* leaf-count 2))) (pointers (make-array (+ leaf-count 1) :initial-element nil)) (pos -1) (infinity 0) ;-) ; set to one more than the sum of all weights (tt 1) (m (1- leaf-count))) (labels ((combine (&optional (k tt)) ; C1 ;; C2 ; create new node (incf m) (setf (gw-node-left-link (aref nodes m)) (aref pointers (1- k)) (gw-node-right-link (aref nodes m)) (aref pointers k) (gw-node-weight (aref nodes m)) (+ (gw-node-weight (aref pointers (1- k))) (gw-node-weight (aref pointers k)))) ;; C3 (decf tt) (loop for j from k to tt do (setf (aref pointers j) ; ??? (aref pointers (1+ j)))) ;; C4 (let ((j (- k 2))) (loop while (and (>= j 0) (< (gw-node-weight (aref pointers j)) (gw-node-weight (aref nodes m)))) do (setf (aref pointers (1+ j)) (aref pointers j)) (decf j)) ;; C5 (setf (aref pointers (1+ j)) (aref nodes m)) (when (and (> j 0) (<= (gw-node-weight (aref pointers (1- j))) (gw-node-weight (aref nodes m)))) (combine j))))) ;; G1 ; initialize nodes (loop for (value . weight) in char-weights do (setf (aref nodes (incf pos)) (make-gw-node :weight weight :value value)) (incf infinity weight)) (incf infinity) (dotimes (i leaf-count) (setf (aref nodes (incf pos)) (make-gw-node))) (setf (aref pointers 0) (aref nodes (1- (* leaf-count 2))) (gw-node-weight (aref pointers 0)) infinity (aref pointers 1) (aref nodes 0)) ;; G2 (loop for j from 1 to (1- leaf-count) do (loop until (> (gw-node-weight (aref pointers (1- tt))) (gw-node-weight (aref nodes j))) do (combine tt)) (incf tt) (setf (aref pointers tt) (aref nodes j))) ;; G3 (loop do (combine) until (= tt 1)) (aref nodes (* (1- leaf-count) 2)))))) (defmethod make-gw-tree ((tree optimal-alphabetic-tree)) (let ((tree (call-next-method))) ;; disentangle the tree tree)) ;; rewrite! #+ignore (defun print-gw-tree (tree) (terpri) (let ((cost 0)) (labels ((walk (branch level) (when branch (walk (gw-node-right-link branch) (1+ level)) (dotimes (i (* level 5)) (write-char #\Space)) (format t "~d " (gw-node-weight branch)) (if (gw-node-value branch) (progn (write-char (gw-node-value branch)) (terpri) (incf cost (* level (gw-node-weight branch)))) (format t "~%" (gw-node-weight branch))) (walk (gw-node-left-link branch) (1+ level))))) (walk tree 0)) cost)) #+no-longer-used (defmethod gw-tree-cost ((tree t)) "computes the cost of a tree" (let ((cost 0)) (labels ((walk (branch level) (cond ((gw-node-value branch) (incf cost (* level (gw-node-weight branch)))) (t (walk (gw-node-right-link branch) (1+ level)) (walk (gw-node-left-link branch) (1+ level)))))) (walk tree 0)) cost)) (defmethod gw-tree-cost ((tree optimal-tree)) "computes the cost of a tree" (with-slots (gw-tree char-weights) tree (let ((abs-cost 0) (weight-sum 0)) (labels ((walk (branch level) (cond ((not (consp branch)) (let ((weight (cdr (assoc branch char-weights)))) (incf abs-cost (* level weight)) (incf weight-sum weight))) (t (walk (car branch) (1+ level)) (walk (cdr branch) (1+ level)))))) (walk gw-tree 0)) (coerce (/ abs-cost weight-sum) 'double-float)))) ;; changes the representation of the tree to use nested lists ;; instead of gw-node structures (defun gw-tree-to-list (tree) (labels ((walk (branch) (or (gw-node-value branch) (cons (walk (gw-node-right-link branch)) (walk (gw-node-left-link branch)))))) (walk tree))) (defmethod set-gw-bit-vectors ((tree optimal-tree)) (with-slots (gw-tree bit-vectors) tree (labels ((walk (branch bit-list) (cond ((eq branch :end) (setf (aref bit-vectors $max-character-code) (coerce (reverse bit-list) 'bit-vector))) ((not (consp branch)) (setf (aref bit-vectors (char-code branch)) (coerce (reverse bit-list) 'bit-vector))) (t (walk (car branch) (cons 0 bit-list)) (walk (cdr branch) (cons 1 bit-list)))))) (walk gw-tree ())))) ;; returns a bit-vector (defmethod gw-compress-char ((tree optimal-tree) c-code) (with-slots (bit-vectors) tree (aref bit-vectors c-code))) (defun %bw-code (bw) (declare (bit-vector bw)) (loop with code fixnum = 0 for bit fixnum across bw and i from 0 do (incf code (ash bit i)) finally (return (the fixnum code)))) ;(%bw-code #*1110010011000100010) ;; treats the initial bits of C-CODE as encoding (defmethod gw-decompress-char ((tree optimal-tree) c-code) (declare (fixnum c-code)) (with-slots (gw-tree) tree (%gw-decompress-char gw-tree (the fixnum c-code)))) #+test (defun %gw-decompress-char (gw-tree c-code &optional (size 0)) (declare (fixnum c-code)) (cond ((not (consp gw-tree)) (values gw-tree size)) ((zerop (logand (the fixnum c-code) 1)) (%gw-decompress-char (car gw-tree) (the fixnum (ash (the fixnum c-code) -1)) (1+ size))) (t (%gw-decompress-char (cdr gw-tree) (the fixnum (ash (the fixnum c-code) -1)) (1+ size))))) ;; extra fast (defun %gw-decompress-char (gw-tree c-code) (declare (fixnum c-code)) (cond ((not (consp gw-tree)) gw-tree) ((zerop (logand (the fixnum c-code) 1)) (%gw-decompress-char (car gw-tree) (the fixnum (ash (the fixnum c-code) -1)))) (t (%gw-decompress-char (cdr gw-tree) (the fixnum (ash (the fixnum c-code) -1)))))) (defun %gw-decompress-bv-char (gw-tree bv pos) (cond ((not (consp gw-tree)) (values gw-tree pos)) ((zerop (aref bv pos)) (%gw-decompress-bv-char (car gw-tree) bv (the fixnum (1+ pos)))) (t (%gw-decompress-bv-char (cdr gw-tree) bv (the fixnum (1+ pos)))))) (defclass buffered-bit-stream (#-(or ecl clisp sbcl) stream) ((stream :initform nil :initarg :stream) (buffer :initform 0); :type 'integer) ;; ?? (position :initform 0); :type 'integer) )) (defmethod write-bit ((bit-stream buffered-bit-stream) bit) (with-slots (stream buffer position) bit-stream (setf buffer (ash buffer 1)) (when (= bit 1) (incf buffer 1)) (when (= (incf position) 8) (write-byte buffer stream) (setf position 0 buffer 0)))) (defmethod read-bit ((bit-stream buffered-bit-stream)) (declare (optimize (safety 0) (space 0) (speed 3))) (with-slots (stream buffer position) bit-stream (when (zerop position) (setf position 8 buffer (the fixnum (read-byte stream)))) (the fixnum (logand (ash buffer (the fixnum (- (the fixnum (decf position))))) 1)))) (defmethod read-bits ((bit-stream buffered-bit-stream) count &optional add-max-bit-p) (declare (optimize (speed 3) (safety 0)) (fixnum count)) (with-slots (stream buffer position) bit-stream (let ((val (if add-max-bit-p 1 0))) (declare (fixnum val)) (dotimes (i count) (when (zerop position) (setf position 8 buffer (the fixnum (read-byte stream)))) (setf val (+ (the fixnum (ash val 1)) (the fixnum (logand (ash buffer (the fixnum (- (the fixnum (decf position))))) 1))))) val))) (defun gw-decompress-bit-stream-char (gw-tree buffered-bit-stream) (declare (optimize (safety 0) (space 0) (speed 3))) (if (consp gw-tree) (let ((bit (read-bit buffered-bit-stream))) (declare (fixnum bit)) (gw-decompress-bit-stream-char (if (zerop bit) (car gw-tree) (cdr gw-tree)) buffered-bit-stream)) gw-tree)) (defun %gw-get-char (gw-tree array pos bit-pos) (declare (optimize (speed 3) (safety 0)) (fixnum pos bit-pos)) (let ((byte (aref array pos))) (declare (fixnum byte)) (values (loop with new-byte-p = nil unless (consp gw-tree) return gw-tree when new-byte-p do (setf byte (the fixnum (aref array (the fixnum pos))) new-byte-p nil) do (if (zerop (the fixnum (logand (the fixnum byte) (the fixnum (ash 1 bit-pos))))) (setf gw-tree (car gw-tree)) (setf gw-tree (cdr gw-tree))) when (= (the fixnum (incf bit-pos)) 8) do (setf bit-pos 0 pos (the fixnum (1+ pos)) new-byte-p t)) pos bit-pos))) (defun %gw-bit-get-char (gw-tree array pos) (declare (optimize (speed 3) (safety 0)) (fixnum pos) (bit-vector array)) (values (loop unless (consp gw-tree) return gw-tree do (if (zerop (the fixnum (sbit array (the fixnum pos)))) (setf gw-tree (car gw-tree)) (setf gw-tree (cdr gw-tree))) (incf pos)) pos)) ;(%gw-get-char (gw-tree *a-z-tree*) #(128 161 5) 0 7) ;; efficient sequential storing ;; destructively modifies vector (defun flatten-binary-tree (tree) "computes a flat (string) representation of a binary tree" (let ((vector (make-array 1 :fill-pointer t :adjustable t :initial-element (1+ $max-character-code)))) (labels ((flatten (branch) (cond ((eq branch :end) (vector-push-extend $max-character-code vector)) ((not (consp branch)) (vector-push-extend (char-code branch) vector)) (t (incf (aref vector (1- (fill-pointer vector)))) (flatten (car branch)) (vector-push-extend (1+ $max-character-code) vector) (flatten (cdr branch)))))) (flatten tree) (let ((string (make-string (* 2 (length vector)) :element-type 'base-string))) (loop for n across vector with pos = -1 do (setf (char string (incf pos)) (code-char (logand n 255)) (char string (incf pos)) (code-char (ash n -8)))) string)))) #+allegro (defun %write-bit (bit stream) (write-bit stream bit)) #-allegro (defun %write-bit (bit stream) (write-byte bit stream)) (defun write-binary-tree (tree stream) ;; stream is bit stream! "computes a flat (string) representation of a binary tree" (let ((prev-int (1+ $max-character-code))) (labels ((flatten (branch) (cond ((eq branch :end) (loop for i from 15 downto 0 do (%write-bit (logand 1 (ash prev-int (- i))) stream)) (setf prev-int $max-character-code)) ((not (consp branch)) (loop for i from 15 downto 0 do (%write-bit (logand 1 (ash prev-int (- i))) stream)) (setf prev-int (char-code branch))) (t (incf prev-int) (flatten (car branch)) (loop for i from 15 downto 0 do (%write-bit (logand 1 (ash prev-int (- i))) stream)) (setf prev-int (1+ $max-character-code)) (flatten (cdr branch)))))) (flatten tree) (loop for i from 15 downto 0 do (%write-bit (logand 1 (ash prev-int (- i))) stream))))) (defmethod read-binary-tree ((stream stream) &key translate-fn) ;; stream is unsigned-byte stream! "restores the binary tree from flat stream representation" (let ((value (+ (* 256 (read-byte stream)) (read-byte stream)))) (labels ((restore () (cond ((<= value $max-character-code) (setf value (+ (* 256 (read-byte stream)) (read-byte stream))) (restore)) ((= value (1+ $max-character-code)) (setf value (+ (* 256 (read-byte stream)) (read-byte stream))) (cond ((= value $max-character-code) :end) (translate-fn (funcall translate-fn (code-char value))) (t (code-char value)))) (t (decf value) (cons (restore) (restore)))))) (restore)))) ;; new (defmethod restore-flattened-binary-tree ((string string)) "restores the binary tree from flat string representation" (let ((pos 0) (vector (make-array (/ (length string) 2)))) (loop for i from 0 to (1- (length vector)) with pos = -1 do (setf (aref vector i) (+ (* 256 (char-code (char string (incf pos)))) (char-code (char string (incf pos)))))) (labels ((restore () (let ((value (aref vector pos))) (cond ((<= value $max-character-code) (incf pos) (restore)) ((= value (1+ $max-character-code)) (let ((code (aref vector (incf pos)))) (if (= code $max-character-code) :end (code-char code)))) (t (decf (aref vector pos)) (cons (restore) (restore))))))) (restore)))) (defmethod %restore-flattened-binary-tree ((string string)) "restores the binary tree from flat string representation" (let ((pos 0) (vector (make-array (/ (length string) 2)))) (loop for i from 0 to (1- (length vector)) with pos = -1 do (setf (aref vector i) (+ (char-code (char string (incf pos))) (* 256 (char-code (char string (incf pos))))))) (labels ((restore () (let ((value (aref vector pos))) (cond ((<= value $max-character-code) (incf pos) (restore)) ((= value (1+ $max-character-code)) (let ((code (aref vector (incf pos)))) (if (= code $max-character-code) :end (code-char code)))) (t (decf (aref vector pos)) (cons (restore) (restore))))))) (restore)))) (defmethod restore-flattened-binary-tree ((array array)) "restores the binary tree from flat string representation" (let ((pos 0) (vector (make-array (/ (length array) 2)))) (loop for i from 0 to (1- (length vector)) with pos = -1 do (setf (aref vector i) (+ (aref array (incf pos)) (* 256 (aref array (incf pos)))))) (labels ((restore () (let ((value (aref vector pos))) (cond ((<= value $max-character-code) (incf pos) (restore)) ((= value (1+ $max-character-code)) (let ((code (aref vector (incf pos)))) (if (= code $max-character-code) :end code))) (t (decf (aref vector pos)) (cons (restore) (restore))))))) (restore)))) #|| nicer version, shows the algorithm more clearly, but less space-efficient ;; prepares tree for storing in file (defun flatten-binary-tree (tree) (let ((vector (make-array 0 :fill-pointer t :adjustable t))) (labels ((flatten (tree) (cond ((not (consp tree)) (vector-push-extend tree vector)) (t (vector-push-extend 0 vector) (flatten (car tree)) (vector-push-extend 1 vector) (flatten (cdr tree)))))) (flatten tree) vector))) ;; restores it from flat representation (defun restore-flattened-binary-tree (vector) (let ((pos -1)) (labels ((restore () (let ((value (aref vector (incf pos)))) (cond ((eq value 0) (cons (restore) (restore))) ((eq value 1) (restore)) (t value))))) (restore)))) ||# ;;; testing ;; example from the book #+test (defparameter *weights* '((#\* . 186) (#\A . 64) (#\B . 13) (#\C . 22) (#\D . 32) (#\E . 103) (#\F . 21) (#\G . 15) (#\H . 47) (#\I . 57) (#\J . 1) (#\K . 5) (#\L . 32) (#\M . 20) (#\N . 57) (#\O . 63) (#\P . 15) (#\Q . 1) (#\R . 48) (#\S . 51) (#\T . 80) (#\U . 23) (#\V . 8) (#\W . 18) (#\X . 1) (#\Y . 16) (#\Z . 1))) #+test (defparameter *a-z-tree* (make-instance 'optimal-tree :weights *weights*)) #+test (defparameter *a-z-tree* (make-instance 'optimal-alphabetic-tree :weights *weights*)) #+test (gw-tree-cost *a-z-tree*) #+test (dotimes (i 100) (write-char (gw-decompress-char *a-z-tree* i))) #+test (print-gw-tree (make-gw-tree *weights*)) #+test (gw-tree-cost (make-instance 'optimal-tree :weights *weights*))