;;;-*- Mode: Lisp; Package: DAT -*- ;; ;; Copyright (C) Paul Meurer 2002-2004. All rights reserved. ;; paul.meurer@aksis.uib.no ;; Aksis, University of Bergen ;; ;; Implementation of Patricia trees ;; (Donald Knuth: The Art of Computer Programming Vol. 3 pp. 498) ;; ;;------------------------------------------------------------------------------------- ;; TO DO: ;; ;; - efficiency ;; ;;------------------------------------------------------------------------------------- (in-package :dat) #+debug (defun %write-byte (byte stream) (print (list :pos (file-position stream) :byte byte)) (write-byte byte stream)) (defclass pt-basic-node () ((key :initform nil :initarg :key :accessor pt-key) (llink :initform nil :initarg :llink :accessor pt-llink) (ltag :initform nil :initarg :ltag :accessor pt-ltag))) (defclass pt-header (pt-basic-node) ()) (defclass pt-string-header (pt-header) ((text :initform nil :initarg :text :accessor pt-text))) (defclass string-vector () ((string-vector :initform nil :initarg :string-vector :accessor string-vector) (text-size :initform nil :reader text-length) (string-count :initform nil :initarg string-count) (max-string-size :initform (- (expt 2 24) 1)) (last-string-size :initform nil :initarg last-string-size) (position :initform 0 :accessor sv-position))) (defmethod initialize-instance :after ((sv string-vector) &key file ((:max-string-size mss)) end-char &allow-other-keys) (with-slots (string-vector text-size string-count max-string-size last-string-size) sv (multiple-value-setq (string-vector text-size string-count max-string-size last-string-size) (with-open-file (stream file) (read-large-file stream mss end-char))))) (defclass pt-string-vector-header (pt-header) ((string-vector :initform nil :initarg :string-vector :accessor pt-string-vector) (node-vector :initform nil :initarg :node-vector :accessor pt-node-vector))) ;; new 8.6.2002 (defclass pt-file-vector-header (pt-header) ((text-file :initform nil :initarg :text-file :accessor pt-text-file) (node-file :initform nil :initarg :node-file :accessor pt-node-file) (node-vector :initform nil :initarg :node-vector :accessor pt-node-vector))) (defclass pt-file-node-header (pt-header) ((text-file :initform nil :initarg :text-file :accessor pt-text-file) (node-file :initform nil :initarg :node-file :accessor pt-node-file) (node-vector :initform nil :initarg :node-vector :accessor pt-node-vector))) (defmethod initialize-instance :after ((pt-header pt-string-vector-header) &key file max-string-size end-char &allow-other-keys) (setf (pt-string-vector pt-header) (make-instance 'string-vector :file file :max-string-size max-string-size :end-char end-char))) (defclass pt-file-header (pt-header) ((file :initform nil :initarg :file :accessor pt-file))) (defclass pt-node (pt-basic-node) ((rlink :initform nil :initarg :rlink :accessor pt-rlink) (rtag :initform nil :initarg :rtag :accessor pt-rtag) (skip :initform nil :initarg :skip :accessor pt-skip))) (defmethod print-object ((node pt-basic-node) stream) (print-unreadable-object (node stream :type nil :identity nil) (format stream "~d" (pt-key node)))) (defmethod print-object ((node pt-node) stream) (print-unreadable-object (node stream :type nil :identity nil) (with-slots (key skip ltag rtag) node (format stream "pt key: ~d, skip: ~d, tag: ~d/~d" key skip ltag rtag)))) (defmethod first-differing-bit ((str1 string) (str2 string) start1 start2) (loop for i from start1 for j from start2 until (= j (length str2)) do (let ((ci (char str1 i)) (cj (char str2 j))) (when (char/= ci cj) (return-from first-differing-bit (+ (* 8 (- i start1)) (let ((cci (char-code ci)) (ccj (char-code cj))) (loop for i from 0 when (/= (logand cci 1) (logand ccj 1)) do (return i) do (setf cci (ash cci -1) ccj (ash ccj -1)))))))))) (defmethod first-differing-bit ((sv string-vector) (str2 string) start1 start2) (declare ;;(optimize (speed 3) (safety 0)) (fixnum start2) (integer start1)) (with-slots (string-vector string-count max-string-size last-string-size) sv (multiple-value-bind (s-count s-start1) (floor start1 (the fixnum max-string-size)) (declare (fixnum s-count s-start1)) (let ((str1 (aref string-vector s-count))) (loop for i fixnum from s-start1 for j fixnum from start2 until (= j (length str2)) do (when (= i (the fixnum max-string-size)) (setf i 0 str1 (aref string-vector (incf s-count)))) (let ((ci (char str1 i)) (cj (char str2 j))) (when (char/= ci cj) (return-from first-differing-bit (+ (the fixnum (* 8 (the fixnum (- j start2)))) (the fixnum (let ((cci (char-code ci)) (ccj (char-code cj))) (declare (fixnum cci ccj)) (loop for i fixnum from 0 when (/= (logand cci 1) (logand ccj 1)) do (return i) do (setf cci (ash cci -1) ccj (ash ccj -1)))))))))))))) ;; sv1 and sv2 are supposed to be eq (defmethod first-differing-bit ((sv1 string-vector) (sv2 string-vector) start1 start2) (declare ;;(optimize (speed 3) (safety 0)) (integer start1 start2)) (with-slots (string-vector string-count max-string-size last-string-size) sv1 (multiple-value-bind (s-count1 s-start1) (floor start1 max-string-size) (multiple-value-bind (s-count2 s-start2) (floor start2 max-string-size) (let ((str1 (aref string-vector s-count1)) (str2 (aref string-vector s-count2)) (delta s-count1)) (loop for i from s-start1 for j from s-start2 for common-length from 0 until (and (= i last-string-size) (= s-count1 (1- string-count)) (incf *iterations* common-length) (incf *iterations-count*) (setf *max-iterations* (max *max-iterations* common-length))) do (when (= i max-string-size) (setf i 0 str1 (aref string-vector (incf s-count1))) (incf delta)) (when (= j max-string-size) (setf j 0 str2 (aref string-vector (incf s-count2)))) (let ((ci (char str1 i)) (cj (char str2 j))) (when (char/= ci cj) (incf *iterations* common-length) (incf *iterations-count*) (setf *max-iterations* (max *max-iterations* common-length)) (return-from first-differing-bit (+ (* 8 (+ (- i start1) (* delta max-string-size))) (let ((cci (char-code ci)) (ccj (char-code cj))) (loop for i from 0 when (/= (logand cci 1) (logand ccj 1)) do (return i) do (setf cci (ash cci -1) ccj (ash ccj -1)))))))))))))) (defparameter *iterations* 0) (defparameter *max-iterations* 0) (defparameter *iterations-count* 0) (defmethod first-differing-bit-profile ((sv1 string-vector) (sv2 string-vector) start1 start2) (declare ;;(optimize (speed 3) (safety 0)) (integer start1 start2)) (with-slots (string-vector string-count max-string-size last-string-size) sv1 (multiple-value-bind (s-count1 s-start1) (floor start1 max-string-size) (multiple-value-bind (s-count2 s-start2) (floor start2 max-string-size) (let ((str1 (aref string-vector s-count1)) (str2 (aref string-vector s-count2)) (delta s-count1)) (loop for i from s-start1 for j from s-start2 for common-length from 0 until (and (= i last-string-size) (= s-count1 (1- string-count)) (incf *iterations* common-length) (incf *iterations-count*) (setf *max-iterations* (max *max-iterations* common-length))) do (when (= i max-string-size) (setf i 0 str1 (aref string-vector (incf s-count1))) (incf delta)) (when (= j max-string-size) (setf j 0 str2 (aref string-vector (incf s-count2)))) (let ((ci (char str1 i)) (cj (char str2 j))) (when (char/= ci cj) (incf *iterations* common-length) (incf *iterations-count*) (setf *max-iterations* (max *max-iterations* common-length)) (return-from first-differing-bit-profile (+ (* 8 (+ (- i start1) (* delta max-string-size))) (let ((cci (char-code ci)) (ccj (char-code cj))) (loop for i from 0 when (/= (logand cci 1) (logand ccj 1)) do (return i) do (setf cci (ash cci -1) ccj (ash ccj -1)))))))))))))) (defmethod first-differing-bit ((stream stream) (str2 string) start1 start2) (file-position stream start1) (loop for j from start2 until (= j (length str2)) do (let ((ci (read-char stream nil #\Null)) (cj (char str2 j))) (when (char/= ci cj) (return-from first-differing-bit (+ (* 8 (- j start2)) (let ((cci (char-code ci)) (ccj (char-code cj))) (loop for i from 0 when (/= (logand cci 1) (logand ccj 1)) do (return i) do (setf cci (ash cci -1) ccj (ash ccj -1)))))))))) (defmethod first-differing-bit ((stream1 stream) (stream2 stream) start1 start2) (file-position stream1 start1) (file-position stream2 start2) (loop for ci = (read-char stream1 nil #\Null) for cj = (read-char stream2 nil #\Null) for i from start1 do (when (char/= ci cj) (return-from first-differing-bit (+ (* 8 (- i start1)) (let ((cci (char-code ci)) (ccj (char-code cj))) (loop for i from 0 when (/= (logand cci 1) (logand ccj 1)) do (return i) do (setf cci (ash cci -1) ccj (ash ccj -1))))))) while (and (char/= ci #\Null) (char/= cj #\Null)))) (defmethod nth-bit-profile ((string string) n start) (declare ;;(optimize (speed 3) (safety 0)) #+ignore (fixnum n start)) (multiple-value-bind (pos i) (floor n 8) (logand 1 (ash (char-code (char string (+ pos start))) (- i))))) (defmethod nth-bit-profile ((sv string-vector) n start) (declare ;;(optimize (speed 3) (safety 0)) #+ignore (fixnum n start)) (with-slots (string-vector string-count max-string-size last-string-size) sv (multiple-value-bind (pos i) (floor n 8) (multiple-value-bind (s-count s-pos) (floor (+ pos start) max-string-size) (let ((string (aref string-vector s-count))) (logand 1 (ash (char-code (char string s-pos)) (- i)))))))) (defmethod nth-bit ((string string) n start) (declare ;;(optimize (speed 3) (safety 0)) #+ignore (fixnum n start)) (multiple-value-bind (pos i) (floor n 8) (logand 1 (ash (char-code (char string (+ pos start))) (- i))))) (defmethod nth-bit ((sv string-vector) n start) (declare ;;(optimize (speed 3) (safety 0)) #+ignore (fixnum n start)) (with-slots (string-vector string-count max-string-size last-string-size) sv (multiple-value-bind (pos i) (floor n 8) (multiple-value-bind (s-count s-pos) (floor (+ pos start) max-string-size) (let ((string (aref string-vector s-count))) (logand 1 (ash (char-code (char string s-pos)) (- i)))))))) (defmethod nth-bit ((stream stream) n start) (declare ;;(optimize (speed 3) (safety 0)) #+ignore (fixnum n)) (multiple-value-bind (pos i) (floor n 8) (file-position stream (+ pos start)) (logand 1 (ash (char-code (read-char stream nil #\Null)) (- i))))) (defmethod text-length ((text string)) (length text)) (defmethod text-length ((text stream)) (1+ (file-length text))) (defmethod search-patricia-tree ((pt pt-string-header) string &key (start 0) n return-bit-p build-p count) (let ((n (or n (* 8 (- (length string) start))))) ;; number of bits in string (with-slots (text) pt (%search-patricia-tree pt text string start n return-bit-p build-p count)))) (defmethod search-patricia-tree ((pt pt-string-vector-header) string &key (start 0) n return-bit-p build-p count) (let ((n (or n (* 8 (- (length string) start))))) ;; number of bits in string (with-slots (string-vector) pt (%search-patricia-tree pt string-vector string start n return-bit-p build-p count)))) (defmethod search-patricia-tree ((pt pt-file-vector-header) string &key (start 0) n return-bit-p build-p count) (let ((n (or n (* 8 (- (length string) start))))) ;; number of bits in string (with-slots (text-file) pt (with-open-file (stream text-file) (%search-patricia-tree pt stream string start n return-bit-p build-p count))))) (defmethod search-patricia-tree ((pt pt-file-header) string &key (start 0) n return-bit-p build-p count) (let ((n (or n (* 8 (- (text-length string) start))))) ;; number of bits in string (with-slots (file) pt (with-open-file (stream file) (%search-patricia-tree pt stream string start n return-bit-p build-p count))))) ;; count = NIL: only one occurrence is returned ;; count = T: all occurrences are counted or returned as list of positions ;; count = integer (number of occurrences to return) ;; second return value indicates whether all occurrences are found ;; return-as = :list (list of positions) or :count (count occurrences) (defmethod search-patricia-tree ((pt pt-file-node-header) string &key (start 0) n return-bit-p build-p count (return-as :list)) (let ((n (or n (* 8 (- (text-length string) start))))) ;; number of bits in string (with-slots (text-file node-file) pt (with-open-file (text-stream text-file) (with-open-file (node-stream node-file :element-type '(unsigned-byte 8)) (%search-file-patricia-tree pt node-stream text-stream string start n return-bit-p build-p count return-as)))))) ;; Basic Algorithm P (p. 500) (defmethod %search-patricia-tree ((pt pt-header) text string start n return-bit-p build-p count) (declare (ignore count)) (labels ((move-left (pt j) (let* ((qt pt) (pt (pt-llink pt))) ;; P2 (cond ((= 1 (pt-ltag qt)) (compare pt qt j)) ;; P6 (t ;; P3 skip bits (incf j (pt-skip pt)) (if (>= j n) (compare pt qt j) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt j) (move-right pt j))))))) (move-right (pt j) (let* ((qt pt) (pt (pt-rlink pt))) ;; P5 (cond ((= 1 (pt-rtag qt)) (compare pt qt j)) ;; P6 (t ;; P3 skip bits (incf j (pt-skip pt)) (if (>= j n) (compare pt qt j) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt j) (move-right pt j))))))) (compare (pt qt j) (let ((first-diff-bit-pos (first-differing-bit text string (pt-key pt) start))) (cond ((null first-diff-bit-pos) (pt-key pt)) (return-bit-p (values pt qt (nth-bit string first-diff-bit-pos start) (1- first-diff-bit-pos) j)) ((< n first-diff-bit-pos) (pt-key pt)) (build-p (values first-diff-bit-pos (pt-key pt) start)) (t nil))))) (move-left pt 0))) (defun build-patricia-tree (&key text file (keep-file-in-memory-p t) (end-char #\null) keys key-fn max-string-size node-file duplicate-threshold duplicates-fn) (assert (or text file)) (let ((header (cond (text (make-instance 'pt-string-header :text text :key 0 :ltag 1)) (keep-file-in-memory-p (make-instance 'pt-string-vector-header :file file :key 0 :ltag 1 :max-string-size max-string-size :end-char end-char)) (t ;; everything on disk (make-instance 'pt-file-node-header :text-file file :node-file (or node-file (concat file ".pt")) :key 0 :ltag 1)) #+test (t (make-instance 'pt-file-vector-header :text-file file :key 0 :ltag 1)) #+old (t (make-instance 'pt-file-header :file file :key 0 :ltag 1))))) (cond (text (%build-patricia-tree header :text text :keys keys :key-fn key-fn :duplicates-fn duplicates-fn)) (keep-file-in-memory-p (%build-patricia-tree header :text (pt-string-vector header) :file file :keys keys :key-fn key-fn :duplicate-threshold duplicate-threshold)) (t (with-open-file (stream file) (with-open-file (stream1 file) (with-open-file (stream2 file) (%build-patricia-tree header :stream stream :stream1 stream1 :stream2 stream2 :keys keys :key-fn key-fn :duplicate-threshold duplicate-threshold)))))))) (defmethod %build-patricia-tree ((header pt-header) &key text keys key-fn duplicates-fn stream stream1 stream2) (setf (pt-llink header) header) (let ((count 0) (text-length (text-length (or text stream)))) (labels ((add (key) (when (zerop (mod (incf count) 10000)) (print count)) (multiple-value-bind (first-diff-bit-pos pos1 pos2) (%search-patricia-tree header (or text stream1) (or text stream2) key (* 8 (- text-length key)) nil t nil) (let ((node (make-instance 'pt-node :key key)) (tag nil)) (when duplicates-fn (funcall duplicates-fn text pos2 pos1 first-diff-bit-pos)) (multiple-value-bind (pt qt bit l j) (%search-patricia-tree header (or text stream1) (or text stream2) key first-diff-bit-pos t t nil) (if (eq (pt-llink qt) pt) (setf (pt-llink qt) node tag (pt-ltag qt) (pt-ltag qt) 0) (setf (pt-rlink qt) node tag (pt-rtag qt) (pt-rtag qt) 0)) (if (zerop bit) (setf (pt-ltag node) 1 (pt-llink node) node (pt-rtag node) tag (pt-rlink node) pt) (setf (pt-rtag node) 1 (pt-rlink node) node (pt-ltag node) tag (pt-llink node) pt)) (if (= tag 1) (setf (pt-skip node) (+ 1 (- l j))) (setf (pt-skip node) (+ 1 (- l j) (pt-skip pt)) (pt-skip pt) (- j l 1)))))))) (cond (keys (dolist (key keys) (add key))) (key-fn (loop for key = (funcall key-fn (or text stream)) while key when (not (zerop key)) do (add key))) (t (dotimes (i (- text-length 2)) (add (1+ i))))) header))) (defclass node-vector () ((node-vector :initform nil :initarg :node-vector :accessor node-vector) (node-size :initform nil :initarg :node-size) (node-count :initform nil :initarg :node-count) (sub-vector-node-count :initform nil :initarg :sub-vector-node-count) (ltag-pos :initform 0) (ltag-size :initform 1) (rtag-pos :initform 1) (rtag-size :initform 1) (skip-pos :initform 2) (skip-size :initform nil :initarg :skip-size) (key-pos :initform nil :initarg :key-pos) (key-size :initform nil :initarg :key-size) (llink-pos :initform nil :initarg :llink-pos) (llink-size :initform nil :initarg :llink-size) (rlink-pos :initform nil :initarg :rlink-pos) (rlink-size :initform nil :initarg :rlink-size))) ;; (write-patricia-tree *wittgenstein* "projects:dat;mecs.pt") (defmethod write-patricia-tree ((pt pt-string-vector-header) file) (with-open-file (stream file :direction :output :if-exists :supersede :element-type '(unsigned-byte 8)) (with-slots (node-vector) pt (with-slots (node-vector node-size skip-size key-size llink-size) node-vector (write-byte node-size stream) (write-byte skip-size stream) (write-byte key-size stream) (write-byte llink-size stream) (loop for vector across node-vector do (loop for byte across vector do (write-byte byte stream))))))) (defmethod write-patricia-tree ((pt pt-file-vector-header) file) (with-open-file (stream file :direction :output :if-exists :supersede :element-type '(unsigned-byte 8)) (with-slots (node-vector) pt (with-slots (node-vector node-size skip-size key-size llink-size) node-vector (write-byte node-size stream) (write-byte skip-size stream) (write-byte key-size stream) (write-byte llink-size stream) (loop for vector across node-vector do (loop for byte across vector do (write-byte byte stream))))))) (defun read-patricia-tree (node-file text-file) (with-open-file (node-stream node-file :element-type '(unsigned-byte 8)) (with-open-file (text-stream text-file) (let* ((node-size (read-byte node-stream)) (skip-size (read-byte node-stream)) (key-size (read-byte node-stream)) (llink-size (read-byte node-stream)) (text-size (file-length text-stream)) (llink-pos (+ 2 skip-size (ceiling (log text-size 2)))) (node-vector (make-instance 'node-vector :node-size node-size :skip-size skip-size :key-pos (+ 2 skip-size) :key-size key-size :llink-pos llink-pos :llink-size llink-size :rlink-pos (+ llink-pos llink-size) :rlink-size llink-size)) (pt (make-instance 'pt-file-node-header :text-file text-file :node-file node-file :node-vector node-vector))) pt)))) #+(or allegro pcl) (defun move-left (pt-nsv pt-pos j rlink-pos rlink-size llink-pos llink-size node-vector sub-vector-node-count node-size skip-pos skip-size n string start text key-pos key-size return-bit-p build-p) (let* ((qt-nsv pt-nsv) (qt-pos pt-pos) (pt (node-integer qt-nsv qt-pos llink-pos llink-size))) ;; P2 (multiple-value-bind (pt-nsv pt-pos) (nsv-and-pos node-vector pt sub-vector-node-count node-size) (cond ((= 1 (pt-nsv-ltag qt-nsv qt-pos)) (compare pt pt-nsv pt-pos qt-nsv qt-pos j text string key-pos key-size start return-bit-p n build-p)) ;; P6 (t ;; P3 skip bits (incf j (node-integer pt-nsv pt-pos skip-pos skip-size)) (if (>= j n) (compare pt pt-nsv pt-pos qt-nsv qt-pos j text string key-pos key-size start return-bit-p n build-p) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt-nsv pt-pos j rlink-pos rlink-size llink-pos llink-size node-vector sub-vector-node-count node-size skip-pos skip-size n string start text key-pos key-size return-bit-p build-p) (move-right pt-nsv pt-pos j rlink-pos rlink-size llink-pos llink-size node-vector sub-vector-node-count node-size skip-pos skip-size n string start text key-pos key-size return-bit-p build-p)))))))) #+(or allegro pcl) (defun move-right (pt-nsv pt-pos j rlink-pos rlink-size llink-pos llink-size node-vector sub-vector-node-count node-size skip-pos skip-size n string start text key-pos key-size return-bit-p build-p) #+debug(print (list :move-right pt-nsv pt-pos j)) (let* ((qt-nsv pt-nsv) (qt-pos pt-pos) (pt (node-integer qt-nsv qt-pos rlink-pos rlink-size))) ;; P5 (multiple-value-bind (pt-nsv pt-pos) (nsv-and-pos node-vector pt sub-vector-node-count node-size) #+debug(print (list :pt pt :pt-nsv-ltag (pt-nsv-ltag qt-nsv qt-pos) qt-pos)) (cond ((= 1 (pt-nsv-rtag qt-nsv qt-pos)) (compare pt pt-nsv pt-pos qt-nsv qt-pos j text string key-pos key-size start return-bit-p n build-p)) ;; P6 (t ;; P3 skip bits (incf j (node-integer pt-nsv pt-pos skip-pos skip-size)) (if (>= j n) (compare pt pt-nsv pt-pos qt-nsv qt-pos j text string key-pos key-size start return-bit-p n build-p) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt-nsv pt-pos j rlink-pos rlink-size llink-pos llink-size node-vector sub-vector-node-count node-size skip-pos skip-size n string start text key-pos key-size return-bit-p build-p) (move-right pt-nsv pt-pos j rlink-pos rlink-size llink-pos llink-size node-vector sub-vector-node-count node-size skip-pos skip-size n string start text key-pos key-size return-bit-p build-p)))))))) #+(or allegro pcl) (defun move-left (pt-nsv pt-pos j rlink-pos rlink-size llink-pos llink-size node-vector sub-vector-node-count node-size skip-pos skip-size n string start text key-pos key-size return-bit-p build-p) #+debug(print (list :move-left pt-nsv pt-pos j)) (let* ((qt-nsv pt-nsv) (qt-pos pt-pos) (pt (node-integer qt-nsv qt-pos llink-pos llink-size))) ;; P2 (multiple-value-bind (pt-nsv pt-pos) (nsv-and-pos node-vector pt sub-vector-node-count node-size) #+debug(print (list :pt pt :pt-nsv-ltag (pt-nsv-ltag qt-nsv qt-pos) qt-pos)) (cond ((= 1 (pt-nsv-ltag qt-nsv qt-pos)) (compare pt pt-nsv pt-pos qt-nsv qt-pos j text string key-pos key-size start return-bit-p n build-p)) ;; P6 (t ;; P3 skip bits (incf j (node-integer pt-nsv pt-pos skip-pos skip-size)) #+debug(print (list :j j :n n :pt-pos pt-pos :skip-pos skip-pos)) (if (>= j n) (compare pt pt-nsv pt-pos qt-nsv qt-pos j text string key-pos key-size start return-bit-p n build-p) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt-nsv pt-pos j rlink-pos rlink-size llink-pos llink-size node-vector sub-vector-node-count node-size skip-pos skip-size n string start text key-pos key-size return-bit-p build-p) (move-right pt-nsv pt-pos j rlink-pos rlink-size llink-pos llink-size node-vector sub-vector-node-count node-size skip-pos skip-size n string start text key-pos key-size return-bit-p build-p)))))))) #+(or allegro pcl) (defun move-left-profile (pt-nsv pt-pos j rlink-pos rlink-size llink-pos llink-size node-vector sub-vector-node-count node-size skip-pos skip-size n string start text key-pos key-size return-bit-p build-p) (let* ((qt-nsv pt-nsv) (qt-pos pt-pos) (pt (node-integer-profile qt-nsv qt-pos llink-pos llink-size))) ;; P2 (multiple-value-bind (pt-nsv pt-pos) (nsv-and-pos-profile node-vector pt sub-vector-node-count node-size) (cond ((= 1 (pt-nsv-ltag-profile qt-nsv qt-pos)) (compare-profile pt pt-nsv pt-pos qt-nsv qt-pos j text string key-pos key-size start return-bit-p n build-p)) ;; P6 (t ;; P3 skip bits (incf j (node-integer-profile pt-nsv pt-pos skip-pos skip-size)) (if (>= j n) (compare-profile pt pt-nsv pt-pos qt-nsv qt-pos j text string key-pos key-size start return-bit-p n build-p) ;; P4 test bit (if (zerop (nth-bit-profile string j start)) (move-left-profile pt-nsv pt-pos j rlink-pos rlink-size llink-pos llink-size node-vector sub-vector-node-count node-size skip-pos skip-size n string start text key-pos key-size return-bit-p build-p) (move-right-profile pt-nsv pt-pos j rlink-pos rlink-size llink-pos llink-size node-vector sub-vector-node-count node-size skip-pos skip-size n string start text key-pos key-size return-bit-p build-p)))))))) #+(or allegro pcl) (defun move-right-profile (pt-nsv pt-pos j rlink-pos rlink-size llink-pos llink-size node-vector sub-vector-node-count node-size skip-pos skip-size n string start text key-pos key-size return-bit-p build-p) #+debug(print (list :move-right pt-nsv pt-pos j)) (let* ((qt-nsv pt-nsv) (qt-pos pt-pos) (pt (node-integer-profile qt-nsv qt-pos rlink-pos rlink-size))) ;; P5 (multiple-value-bind (pt-nsv pt-pos) (nsv-and-pos-profile node-vector pt sub-vector-node-count node-size) #+debug(print (list :pt pt :pt-nsv-ltag (pt-nsv-ltag qt-nsv qt-pos) qt-pos)) (cond ((= 1 (pt-nsv-rtag-profile qt-nsv qt-pos)) (compare-profile pt pt-nsv pt-pos qt-nsv qt-pos j text string key-pos key-size start return-bit-p n build-p)) ;; P6 (t ;; P3 skip bits (incf j (node-integer-profile pt-nsv pt-pos skip-pos skip-size)) (if (>= j n) (compare-profile pt pt-nsv pt-pos qt-nsv qt-pos j text string key-pos key-size start return-bit-p n build-p) ;; P4 test bit (if (zerop (nth-bit-profile string j start)) (move-left-profile pt-nsv pt-pos j rlink-pos rlink-size llink-pos llink-size node-vector sub-vector-node-count node-size skip-pos skip-size n string start text key-pos key-size return-bit-p build-p) (move-right-profile pt-nsv pt-pos j rlink-pos rlink-size llink-pos llink-size node-vector sub-vector-node-count node-size skip-pos skip-size n string start text key-pos key-size return-bit-p build-p)))))))) #+(or allegro pcl) (defun compare (pt pt-nsv pt-pos qt-nsv qt-pos j text string key-pos key-size start return-bit-p n build-p) #+debug(print (list :compare pt-pos qt-pos j)) (let ((first-diff-bit-pos (first-differing-bit text string (node-integer pt-nsv pt-pos key-pos key-size) start))) (cond ((null first-diff-bit-pos) (node-integer pt-nsv pt-pos key-pos key-size)) (return-bit-p (values pt pt-nsv pt-pos qt-nsv qt-pos (nth-bit string first-diff-bit-pos start) (1- first-diff-bit-pos) j)) ((< n first-diff-bit-pos) (node-integer pt-nsv pt-pos key-pos key-size)) (build-p first-diff-bit-pos) (t nil)))) #+(or allegro pcl) (defun compare-profile (pt pt-nsv pt-pos qt-nsv qt-pos j text string key-pos key-size start return-bit-p n build-p) #+debug(print (list :compare pt-pos qt-pos j)) (let ((first-diff-bit-pos (first-differing-bit-profile text string (node-integer-profile pt-nsv pt-pos key-pos key-size) start))) (cond ((null first-diff-bit-pos) (node-integer-profile pt-nsv pt-pos key-pos key-size)) (return-bit-p (values pt pt-nsv pt-pos qt-nsv qt-pos (nth-bit-profile string first-diff-bit-pos start) (1- first-diff-bit-pos) j)) ((< n first-diff-bit-pos) (node-integer-profile pt-nsv pt-pos key-pos key-size)) (build-p first-diff-bit-pos) (t nil)))) #+(or allegro) (defmethod %search-patricia-tree ((pt pt-string-vector-header) text string start n return-bit-p build-p count) (declare (ignore count)) (with-slots (node-vector) pt (with-slots (node-vector node-size node-count sub-vector-node-count last-sub-vector-size ltag-pos ltag-size rtag-pos rtag-size skip-pos skip-size key-pos key-size llink-pos llink-size rlink-pos rlink-size) node-vector (multiple-value-bind (pt-nsv pt-pos) (nsv-and-pos node-vector 0 sub-vector-node-count node-size) (move-left pt-nsv pt-pos 0 rlink-pos rlink-size llink-pos llink-size node-vector sub-vector-node-count node-size skip-pos skip-size n string start text key-pos key-size return-bit-p build-p))))) #+(or allegro pcl) (defmethod %search-patricia-tree-profile ((pt pt-string-vector-header) text string start n return-bit-p build-p count) (declare (ignore count)) (with-slots (node-vector) pt (with-slots (node-vector node-size node-count sub-vector-node-count last-sub-vector-size ltag-pos ltag-size rtag-pos rtag-size skip-pos skip-size key-pos key-size llink-pos llink-size rlink-pos rlink-size) node-vector (multiple-value-bind (pt-nsv pt-pos) (nsv-and-pos-profile node-vector 0 sub-vector-node-count node-size) (move-left-profile pt-nsv pt-pos 0 rlink-pos rlink-size llink-pos llink-size node-vector sub-vector-node-count node-size skip-pos skip-size n string start text key-pos key-size return-bit-p build-p))))) #+allegro (defmethod %search-patricia-tree ((pt pt-file-vector-header) text string start n return-bit-p build-p count) (declare (ignore count)) (with-slots (node-vector) pt (with-slots (node-vector node-size node-count sub-vector-node-count last-sub-vector-size ltag-pos ltag-size rtag-pos rtag-size skip-pos skip-size key-pos key-size llink-pos llink-size rlink-pos rlink-size) node-vector (multiple-value-bind (pt-nsv pt-pos) (nsv-and-pos node-vector 0 sub-vector-node-count node-size) (move-left pt-nsv pt-pos 0 rlink-pos rlink-size llink-pos llink-size node-vector sub-vector-node-count node-size skip-pos skip-size n string start text key-pos key-size return-bit-p build-p))))) ;; labels() does a lot of consing in allegro! #-(or allegro) (defmethod %search-patricia-tree ((pt pt-string-vector-header) text string start n return-bit-p build-p count) (declare (ignore count) (integer start n)) (with-slots (node-vector) pt (with-slots (node-vector node-size node-count sub-vector-node-count last-sub-vector-size ltag-pos ltag-size rtag-pos rtag-size skip-pos skip-size key-pos key-size llink-pos llink-size rlink-pos rlink-size) node-vector (labels ((move-left (pt-nsv pt-pos j) (let* ((qt-nsv pt-nsv) (qt-pos pt-pos) (pt (node-integer qt-nsv qt-pos llink-pos llink-size))) ;; P2 (multiple-value-bind (pt-nsv pt-pos) (nsv-and-pos node-vector pt sub-vector-node-count node-size) (cond ((= 1 (pt-nsv-ltag qt-nsv qt-pos)) (compare pt pt-nsv pt-pos qt-nsv qt-pos j)) ;; P6 (t ;; P3 skip bits (incf j (node-integer pt-nsv pt-pos skip-pos skip-size)) (if (>= j n) (compare pt pt-nsv pt-pos qt-nsv qt-pos j) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt-nsv pt-pos j) (move-right pt-nsv pt-pos j)))))))) (move-right (pt-nsv pt-pos j) (let* ((qt-nsv pt-nsv) (qt-pos pt-pos) (pt (node-integer qt-nsv qt-pos rlink-pos rlink-size))) ;; P5 (multiple-value-bind (pt-nsv pt-pos) (nsv-and-pos node-vector pt sub-vector-node-count node-size) (cond ((= 1 (pt-nsv-rtag qt-nsv qt-pos)) (compare pt pt-nsv pt-pos qt-nsv qt-pos j)) ;; P6 (t ;; P3 skip bits (incf j (node-integer pt-nsv pt-pos skip-pos skip-size)) (if (>= j n) (compare pt pt-nsv pt-pos qt-nsv qt-pos j) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt-nsv pt-pos j) (move-right pt-nsv pt-pos j)))))))) (compare (pt pt-nsv pt-pos qt-nsv qt-pos j) (let ((first-diff-bit-pos (first-differing-bit text string (node-integer pt-nsv pt-pos key-pos key-size) start))) (cond ((null first-diff-bit-pos) (node-integer pt-nsv pt-pos key-pos key-size)) (return-bit-p (values pt pt-nsv pt-pos qt-nsv qt-pos (nth-bit string first-diff-bit-pos start) (1- first-diff-bit-pos) j)) ((< n first-diff-bit-pos) (node-integer pt-nsv pt-pos key-pos key-size)) (build-p (values first-diff-bit-pos (node-integer pt-nsv pt-pos key-pos key-size) start)) (t nil))))) (multiple-value-bind (pt-nsv pt-pos) (nsv-and-pos node-vector 0 sub-vector-node-count node-size) (move-left pt-nsv pt-pos 0)))))) #+test (defmacro multiple-value-bind (varlist values-form &body body) (let ((ignore (make-symbol "IGNORE"))) `(multiple-value-call #'(lambda (&optional ,@varlist &rest ,ignore) (declare (ignore ,ignore)) nil ,@body) ,values-form))) #+test (defmacro multiple-value-bind ((&rest bindings) values-form &body body) (with-gensyms (mv-list) `(let ((,mv-list (multiple-value-list ,values-form))) (declare (dynamic-extent ,mv-list)) (let ,bindings ,@(loop for i from 0 and val in bindings collect `(setq ,val (nth ,i ,mv-list))) ,@body)))) #|| (LET ((#:G1000 (MULTIPLE-VALUE-LIST (VALUES 1 2)))) (LET (A B) (SETQ B (NTH 1 #:G1000)) (SETQ A (NTH 0 #:G1000)) NIL (LIST A B))) (print (multiple-value-bind (a b) (values 1 2) (list a b))) ||# #+test (defparameter *wit* (read-patricia-tree "projects:dat;mecs.pt" "projects:mlcd;mecs;mecs.strings")) #+test (let ((pos (search-patricia-tree *wit* "allerhand"))) (when pos (with-open-file (text-stream "projects:mlcd;mecs;mecs.strings") (file-position text-stream (max 0 (- pos 20))) (with-output-to-string (stream) (loop for i from 0 to 50 for c = (read-char text-stream nil nil) do (write-char c stream)))))) #+test (time (search-patricia-tree-all *wit* "Homo" 100)) #+test (with-open-file (text-stream "projects:mlcd;mecs;mecs.strings") (map-patricia-tree *wit* "Logik" (lambda (pos) (file-position text-stream (max 0 (- pos 20))) (loop for i from 0 to 50 for c = (read-char text-stream nil nil) do (write-char c *standard-output*)) (terpri)))) #+copy (defmethod search-patricia-tree-all ((pt pt-file-node-header) string &optional (count t) start-node) (declare (ignore start-node)) (let* ((start 0) (n (* 8 (- (text-length string) start)))) ;; number of bits in string (with-slots (text-file node-file) pt (with-open-file (text-stream text-file) (with-open-file (node-stream node-file :element-type '(unsigned-byte 8)) (%search-file-patricia-tree pt node-stream text-stream string start n nil nil count)))))) (defmethod map-patricia-tree ((pt pt-file-vector-header) string fun) (with-slots (text-file node-file) pt (with-open-file (text-stream text-file) (with-open-file (node-stream node-file :element-type '(unsigned-byte 8)) (%map-file-patricia-tree pt node-stream text-stream :string string :fun fun))))) (defun pt-file-ltag (node-stream pos) (declare (optimize (speed 3) (safety 0))) (file-position node-stream (+ pos 4)) (if (logbitp 7 (the fixnum (read-byte node-stream))) 1 0)) (defun pt-file-rtag (node-stream pos) (declare (optimize (speed 3) (safety 0))) (file-position node-stream (+ pos 4)) (if (logbitp 6 (the fixnum (read-byte node-stream))) 1 0)) (defun file-bytes (stream) (file-position stream 4) (loop for b = (read-byte stream nil nil) while b collect b)) (defmethod %search-file-patricia-tree ((pt pt-file-node-header) node-stream text-stream string start n return-bit-p build-p count return-as) (let ((fni-list ()) (count (if (eq count t) -1 count)) (occurrences 0)) (with-slots (node-vector) pt (with-slots (node-size ltag-pos ltag-size rtag-pos rtag-size skip-pos skip-size key-pos key-size llink-pos llink-size rlink-pos rlink-size) node-vector (labels ((move-left (pt-pos j recurse-p) (let* ((qt-pos pt-pos) (pt (file-node-integer node-stream qt-pos llink-pos llink-size)) (pt-pos (* pt node-size))) ;; P2 #+debug(print (list :move-left pt-pos j qt-pos llink-pos llink-size)) (cond ((and recurse-p (eq count 0)) nil) ((= 1 (pt-file-ltag node-stream qt-pos)) (cond (recurse-p (decf count) (if (eq return-as :count) (incf occurrences) (push (file-node-integer node-stream pt-pos key-pos key-size) fni-list))) (t (compare pt pt-pos qt-pos j nil)))) ;; P6 (recurse-p (move-left pt-pos 0 t) (move-right pt-pos 0 t)) (t ;; P3 skip bits (incf j (file-node-integer node-stream pt-pos skip-pos skip-size)) (if (>= j n) (compare pt pt-pos qt-pos j count) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt-pos j nil) (move-right pt-pos j nil))))))) (move-right (pt-pos j recurse-p) (let* ((qt-pos pt-pos) (pt (file-node-integer node-stream qt-pos rlink-pos rlink-size)) (pt-pos (* pt node-size))) ;; P5 #+debug(print (list :move-right pt-pos j recurse-p qt-pos rlink-pos rlink-size)) (cond ((and recurse-p (eq count 0)) nil) ((= 1 (pt-file-rtag node-stream qt-pos)) (cond (recurse-p (decf count) (if (eq return-as :count) (incf occurrences) (push (file-node-integer node-stream pt-pos key-pos key-size) fni-list))) (t (compare pt pt-pos qt-pos j nil)))) ;; P6 (recurse-p (move-left pt-pos 0 t) (move-right pt-pos 0 t)) (t ;; P3 skip bits (incf j (file-node-integer node-stream pt-pos skip-pos skip-size)) (if (>= j n) (compare pt pt-pos qt-pos j count) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt-pos j nil) (move-right pt-pos j nil))))))) (compare (pt pt-pos qt-pos j recurse-p) #+debug(print (list :compare pt-pos qt-pos j recurse-p)) (let* ((start0 (file-node-integer node-stream pt-pos key-pos key-size)) (first-diff-bit-pos (first-differing-bit text-stream string start0 start))) (cond ((null first-diff-bit-pos) (cond (recurse-p (move-left pt-pos j t) (move-right pt-pos j t)) (count (decf count) (if (eq return-as :count) (incf occurrences) (push (file-node-integer node-stream pt-pos key-pos key-size) fni-list))) (t (file-node-integer node-stream pt-pos key-pos key-size)))) (return-bit-p (values pt pt-pos qt-pos (nth-bit string first-diff-bit-pos start) (1- first-diff-bit-pos) j)) ((< n first-diff-bit-pos) (cond (recurse-p (move-left pt-pos 0 t) (move-right pt-pos 0 t)) (count (if (eq return-as :count) (incf occurrences) (push (file-node-integer node-stream pt-pos key-pos key-size) fni-list))) (t (file-node-integer node-stream pt-pos key-pos key-size)))) (build-p (values first-diff-bit-pos start0 start)) (t nil))))) (cond ((null count) (move-left 0 0 nil)) ((eq return-as :count) (progn (move-left 0 0 nil) (values occurrences ;; all found? (not (eq count 0))))) (t (progn (move-left 0 0 nil) (values fni-list (not (eq count 0))))))))))) #+ignore (defmethod %search-patricia-tree-all ((pt pt-string-vector-header) node-stream text-stream &key string max-count) (let* ((count 0) (start 0) (n (* 8 (- (length string) start)))) ;; number of bits in string (with-slots (node-vector) pt (with-slots (node-size ltag-pos ltag-size rtag-pos rtag-size skip-pos skip-size key-pos key-size llink-pos llink-size rlink-pos rlink-size) node-vector (collecting (labels ((move-left (pt-pos j) (let* ((qt-pos pt-pos) (pt (file-node-integer node-stream qt-pos llink-pos llink-size)) (pt-pos (+ 4 (* pt node-size)))) ;; P2 (cond ((= 1 (pt-file-ltag node-stream qt-pos)) (compare pt-pos nil)) ;; P6 (t ;; P3 skip bits (incf j (file-node-integer node-stream pt-pos skip-pos skip-size)) (if (>= j n) (compare pt-pos t) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt-pos j) (move-right pt-pos j))))))) (move-right (pt-pos j) (let* ((qt-pos pt-pos) (pt (file-node-integer node-stream qt-pos rlink-pos rlink-size)) (pt-pos (+ 4 (* pt node-size)))) ;; P5 (cond ((= 1 (pt-file-rtag node-stream qt-pos)) (compare pt-pos nil)) ;; P6 (t ;; P3 skip bits (incf j (file-node-integer node-stream pt-pos skip-pos skip-size)) (if (>= j n) (compare pt-pos t) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt-pos j) (move-right pt-pos j))))))) (move-left-rec (pt-pos) (let* ((qt-pos pt-pos) (pt (file-node-integer node-stream qt-pos llink-pos llink-size)) (pt-pos (+ 4 (* pt node-size)))) ;; P2 (cond ((and max-count (= count max-count)) nil) ((= 1 (pt-file-ltag node-stream qt-pos)) (incf count) (collect (file-node-integer node-stream pt-pos key-pos key-size))) (t (move-left-rec pt-pos) (move-right-rec pt-pos))))) (move-right-rec (pt-pos) (let* ((qt-pos pt-pos) (pt (file-node-integer node-stream qt-pos rlink-pos rlink-size)) (pt-pos (+ 4 (* pt node-size)))) ;; P5 (cond ((and max-count (= count max-count)) nil) ((= 1 (pt-file-rtag node-stream qt-pos)) (incf count) (collect (file-node-integer node-stream pt-pos key-pos key-size))) (t (move-left-rec pt-pos) (move-right-rec pt-pos))))) (compare (pt-pos &optional recurse-p) (let ((first-diff-bit-pos (first-differing-bit text-stream string (file-node-integer node-stream pt-pos key-pos key-size) start))) (cond ((or (null first-diff-bit-pos) (< n first-diff-bit-pos)) (cond (recurse-p (move-left-rec pt-pos) (move-right-rec pt-pos)) (t (collect (file-node-integer node-stream pt-pos key-pos key-size))))) (t nil))))) (move-left 4 0))))))) (defmethod %search-file-patricia-tree-all ((pt pt-file-vector-header) node-stream text-stream &key string max-count) (let* ((count 0) (start 0) (n (* 8 (- (length string) start)))) ;; number of bits in string (with-slots (node-vector) pt (with-slots (node-size ltag-pos ltag-size rtag-pos rtag-size skip-pos skip-size key-pos key-size llink-pos llink-size rlink-pos rlink-size) node-vector (collecting (labels ((move-left (pt-pos j) (let* ((qt-pos pt-pos) (pt (file-node-integer node-stream qt-pos llink-pos llink-size)) (pt-pos (+ 4 (* pt node-size)))) ;; P2 (cond ((= 1 (pt-file-ltag node-stream qt-pos)) (compare pt-pos nil)) ;; P6 (t ;; P3 skip bits (incf j (file-node-integer node-stream pt-pos skip-pos skip-size)) (if (>= j n) (compare pt-pos t) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt-pos j) (move-right pt-pos j))))))) (move-right (pt-pos j) (let* ((qt-pos pt-pos) (pt (file-node-integer node-stream qt-pos rlink-pos rlink-size)) (pt-pos (+ 4 (* pt node-size)))) ;; P5 (cond ((= 1 (pt-file-rtag node-stream qt-pos)) (compare pt-pos nil)) ;; P6 (t ;; P3 skip bits (incf j (file-node-integer node-stream pt-pos skip-pos skip-size)) (if (>= j n) (compare pt-pos t) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt-pos j) (move-right pt-pos j))))))) (move-left-rec (pt-pos) (let* ((qt-pos pt-pos) (pt (file-node-integer node-stream qt-pos llink-pos llink-size)) (pt-pos (+ 4 (* pt node-size)))) ;; P2 (cond ((and max-count (= count max-count)) nil) ((= 1 (pt-file-ltag node-stream qt-pos)) (incf count) (collect (file-node-integer node-stream pt-pos key-pos key-size))) (t (move-left-rec pt-pos) (move-right-rec pt-pos))))) (move-right-rec (pt-pos) (let* ((qt-pos pt-pos) (pt (file-node-integer node-stream qt-pos rlink-pos rlink-size)) (pt-pos (+ 4 (* pt node-size)))) ;; P5 (cond ((and max-count (= count max-count)) nil) ((= 1 (pt-file-rtag node-stream qt-pos)) (incf count) (collect (file-node-integer node-stream pt-pos key-pos key-size))) (t (move-left-rec pt-pos) (move-right-rec pt-pos))))) (compare (pt-pos &optional recurse-p) (let ((first-diff-bit-pos (first-differing-bit text-stream string (file-node-integer node-stream pt-pos key-pos key-size) start))) (cond ((or (null first-diff-bit-pos) (< n first-diff-bit-pos)) (cond (recurse-p (move-left-rec pt-pos) (move-right-rec pt-pos)) (t (collect (file-node-integer node-stream pt-pos key-pos key-size))))) (t nil))))) (move-left 4 0))))))) (defmethod %map-file-patricia-tree ((pt pt-file-vector-header) node-stream text-stream &key string fun) (let* ((start 0) (n (* 8 (- (length string) start)))) ;; number of bits in string (with-slots (node-vector) pt (with-slots (node-size ltag-pos ltag-size rtag-pos rtag-size skip-pos skip-size key-pos key-size llink-pos llink-size rlink-pos rlink-size) node-vector (values (labels ((move-left (pt-pos j) (let* ((qt-pos pt-pos) (pt (file-node-integer node-stream qt-pos llink-pos llink-size)) (pt-pos (+ 4 (* pt node-size)))) ;; P2 (cond ((= 1 (pt-file-ltag node-stream qt-pos)) (compare pt-pos nil)) ;; P6 (t ;; P3 skip bits (incf j (file-node-integer node-stream pt-pos skip-pos skip-size)) (if (>= j n) (compare pt-pos t) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt-pos j) (move-right pt-pos j))))))) (move-right (pt-pos j) (let* ((qt-pos pt-pos) (pt (file-node-integer node-stream qt-pos rlink-pos rlink-size)) (pt-pos (+ 4 (* pt node-size)))) ;; P5 (cond ((= 1 (pt-file-rtag node-stream qt-pos)) (compare pt-pos nil)) ;; P6 (t ;; P3 skip bits (incf j (file-node-integer node-stream pt-pos skip-pos skip-size)) (if (>= j n) (compare pt-pos t) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt-pos j) (move-right pt-pos j))))))) (move-left-rec (pt-pos) (let* ((qt-pos pt-pos) (pt (file-node-integer node-stream qt-pos llink-pos llink-size)) (pt-pos (+ 4 (* pt node-size)))) ;; P2 (cond ((= 1 (pt-file-ltag node-stream qt-pos)) (funcall fun (file-node-integer node-stream pt-pos key-pos key-size))) (t (move-left-rec pt-pos) (move-right-rec pt-pos))))) (move-right-rec (pt-pos) (let* ((qt-pos pt-pos) (pt (file-node-integer node-stream qt-pos rlink-pos rlink-size)) (pt-pos (+ 4 (* pt node-size)))) ;; P5 (cond ((= 1 (pt-file-rtag node-stream qt-pos)) (funcall fun (file-node-integer node-stream pt-pos key-pos key-size))) (t (move-left-rec pt-pos) (move-right-rec pt-pos))))) (compare (pt-pos &optional recurse-p) (let ((first-diff-bit-pos (first-differing-bit text-stream string (file-node-integer node-stream pt-pos key-pos key-size) start))) (cond ((or (null first-diff-bit-pos) (< n first-diff-bit-pos)) (cond (recurse-p (move-left-rec pt-pos) (move-right-rec pt-pos)) (t (funcall fun (file-node-integer node-stream pt-pos key-pos key-size))))) (t nil))))) (move-left 4 0) nil)))))) (defmethod search-patricia-tree-all ((pt pt-file-vector-header) string &optional count start-node) (declare (ignore start-node)) (with-slots (text-file node-file) pt (with-open-file (text-stream text-file) (with-open-file (node-stream node-file :element-type '(unsigned-byte 8)) (%search-file-patricia-tree-all pt node-stream text-stream :string string :max-count count ;;:start-node start-node ))))) (defmethod search-patricia-tree-all ((pt pt-file-node-header) string &optional (count t) start-node) (declare (ignore start-node)) (let* ((start 0) (n (* 8 (- (text-length string) start)))) ;; number of bits in string (with-slots (text-file node-file) pt (with-open-file (text-stream text-file) (with-open-file (node-stream node-file :element-type '(unsigned-byte 8)) (%search-file-patricia-tree pt node-stream text-stream string start n nil nil count nil)))))) ;; change "string" to something else! (defun make-node-vector (text-size key-count &key (skip-size 12) max-sub-vector-size) (multiple-value-bind (node-size key-size llink-pos llink-size rlink-pos rlink-size) (node-size text-size key-count skip-size) (let* ((sub-vector-node-count (floor (or max-sub-vector-size (- (expt 2 24) 1)) node-size)) ;; multiple of NODE-SIZE (max-sub-vector-size (* node-size sub-vector-node-count)) (sub-vector-count (ceiling key-count sub-vector-node-count)) (last-node-count (mod key-count sub-vector-node-count)) (node-vector (make-array sub-vector-count))) (print (list :needed (* key-count node-size) :node-size node-size :max-sub-vector-size max-sub-vector-size :sub-vector-count sub-vector-count :skip-size skip-size :key-pos (+ 2 skip-size) :key-size key-size :llink-pos llink-pos :llink-size llink-size :rlink-pos rlink-pos :rlink-size rlink-size)) (loop for i from 0 to (1- sub-vector-count) do (print i) (let ((sub-string (make-array (if (= i (1- sub-vector-count)) (* last-node-count node-size) max-sub-vector-size) :element-type '(unsigned-byte 8) :initial-element 0))) (setf (aref node-vector i) sub-string))) (make-instance 'node-vector :node-vector node-vector :node-size node-size :node-count key-count :sub-vector-node-count sub-vector-node-count ;;:last-sub-vector-size :skip-size skip-size :key-pos (+ 2 skip-size) :key-size key-size :llink-pos llink-pos :llink-size llink-size :rlink-pos rlink-pos :rlink-size rlink-size)))) ;;(time (progn (defparameter *s* (make-array 2048 :element-type '(unsigned-byte 8))) nil)) ;;(time (progn (defparameter *s* (make-string 2048 :element-type 'base-character)) nil)) #+test (inspect (make-node-vector 103 51 :max-sub-vector-size 11)) ;; byte size of nodes (defun node-size (text-size key-count &optional (skip-size 12)) (values ;; node-size (ceiling (/ (+ 2 skip-size (ceiling (log text-size 2)) (* 2 (ceiling (log key-count 2)))) 8)) ;; key-size (ceiling (log text-size 2)) ;; llink-pos (+ 2 skip-size (ceiling (log text-size 2))) ;; llink-size (ceiling (log key-count 2)) ;; rlink-pos (+ 2 skip-size (ceiling (log text-size 2)) (ceiling (log key-count 2))) ;; rlink-size (ceiling (log key-count 2)))) ;; node layout: ;; ;; LTAG 1 bit ;; RTAG 1 bit ;; SKIP skip-size bits ;; KEY ln(text-size) bits ;; LLINK ln(key-count) bits ;; RLINK ln(key-count) bits (defparameter *max-offset* 0) (defparameter *max-skip* 0) (defparameter *max-length* 0) (defun node-integer (nsv pos offset length) (declare ;;(optimize (speed 3) (safety 0)) (integer pos offset length)) (setf *max-offset* (the integer (max offset (the integer *max-offset*)))) (setf *max-length* (the integer (max length (the integer *max-length*)))) (multiple-value-bind (offset-byte offset-bit) (floor offset 8) (declare (integer offset-byte offset-bit)) (let* ((int 0) (byte-pos (+ pos offset-byte)) (byte (aref nsv byte-pos))) (declare (integer int byte-pos byte)) (setf int (the integer (if (zerop offset-bit) byte (logand byte (the integer (ash 255 (the integer (- offset-bit)))))))) (decf length (the integer (- 8 offset-bit))) (loop while (> length 0) do (setf int (the integer (+ (the integer (ash int 8)) (the integer (aref nsv (the integer (incf byte-pos))))))) (decf length 8)) (ash int length)))) (defun node-integer-profile (nsv pos offset length) (declare ;;(optimize (speed 3) (safety 0)) (integer pos offset length)) (setf *max-offset* (the integer (max offset (the integer *max-offset*)))) (setf *max-length* (the integer (max length (the integer *max-length*)))) (multiple-value-bind (offset-byte offset-bit) (floor offset 8) (declare (integer offset-byte offset-bit)) (let* ((int 0) (byte-pos (+ pos offset-byte)) (byte (aref nsv byte-pos))) (declare (integer int byte-pos byte)) (setf int (the integer (if (zerop offset-bit) byte (logand byte (the integer (ash 255 (the integer (- offset-bit)))))))) (decf length (the integer (- 8 offset-bit))) (loop while (> length 0) do (setf int (the integer (+ (the integer (ash int 8)) (the integer (aref nsv (the integer (incf byte-pos))))))) (decf length 8)) (ash int length)))) (defun file-node-integer (node-stream pos offset length) (declare ;;(optimize (speed 3) (safety 0)) #+ignore (fixnum offset length)) (multiple-value-bind (offset-byte offset-bit) (floor offset 8) (declare (fixnum offset-byte offset-bit)) (let* ((int 0) (byte-pos (+ pos offset-byte 4))) (file-position node-stream byte-pos) (let ((byte (read-byte node-stream))) (declare (fixnum int byte)) ;;(print (list :byte byte :byte-pos byte-pos)) (setf int (if (zerop offset-bit) byte (logand byte (ash 255 (the fixnum (- offset-bit)))))) (decf length (the fixnum (- 8 offset-bit))) (loop while (> length 0) do (setf int (+ (ash int 8) (read-byte node-stream))) (decf length 8)) (ash int length))))) #+test (let ((*print-base* 2)) (print (deposit-field #b11110111 #b00111000 #b11011010))) #+test (let ((*print-base* 2)) (print 128) (print (byte 1 7))) (defun pt-nsv-ltag (nsv pos) (declare ;;(optimize (speed 3) (safety 0)) (array nsv) (fixnum pos)) (if (logbitp 7 (aref nsv pos)) 1 0)) (defun (setf pt-nsv-ltag) (tag nsv pos) (declare ;;(optimize (speed 3) (safety 0)) (array nsv) (fixnum pos)) (let ((byte-spec (byte 1 7))) (declare (dynamic-extent byte-spec)) (setf (aref nsv pos) (deposit-field (ash tag 7) byte-spec (aref nsv pos))))) (defun pt-nsv-rtag (nsv pos) (declare ;;(optimize (speed 3) (safety 0)) (array nsv) (fixnum pos)) (if (logbitp 6 (aref nsv pos)) 1 0)) (defun (setf pt-nsv-rtag) (tag nsv pos) (declare ;;(optimize (speed 3) (safety 0)) (array nsv) (fixnum pos)) (let ((byte-spec (byte 1 6))) (declare (dynamic-extent byte-spec)) (setf (aref nsv pos) (deposit-field (ash tag 6) byte-spec (aref nsv pos))))) (defun pt-nsv-ltag-profile (nsv pos) (declare ;;(optimize (speed 3) (safety 0)) (array nsv) (fixnum pos)) (if (logbitp 7 (aref nsv pos)) 1 0)) (defun (setf pt-nsv-ltag-profile) (tag nsv pos) (declare ;;(optimize (speed 3) (safety 0)) (array nsv) (fixnum pos)) (let ((byte-spec (byte 1 7))) (declare (dynamic-extent byte-spec)) (setf (aref nsv pos) (deposit-field (ash tag 7) byte-spec (aref nsv pos))))) (defun pt-nsv-rtag-profile (nsv pos) (declare ;;(optimize (speed 3) (safety 0)) (array nsv) (fixnum pos)) (if (logbitp 6 (aref nsv pos)) 1 0)) (defun (setf pt-nsv-rtag-profile) (tag nsv pos) (declare ;;(optimize (speed 3) (safety 0)) (array nsv) (fixnum pos)) (let ((byte-spec (byte 1 6))) (declare (dynamic-extent byte-spec)) (setf (aref nsv pos) (deposit-field (ash tag 6) byte-spec (aref nsv pos))))) (defun file-pt-nsv-ltag (nsv pos) (declare (optimize (speed 3) (safety 0))) (file-position nsv (+ pos 4)) (if (logbitp 7 (the fixnum (read-byte nsv))) 1 0)) (defun (setf file-pt-nsv-ltag) (tag nsv pos) (declare (optimize (speed 3) (safety 0)) (fixnum tag)) (file-position nsv (+ pos 4)) (let ((byte (the fixnum (read-byte nsv))) (byte-spec (byte 1 7))) (declare (dynamic-extent byte-spec)) (file-position nsv (+ pos 4)) (write-byte (deposit-field (the fixnum (ash tag 7)) byte-spec byte) nsv))) (defun file-pt-nsv-rtag (nsv pos) (declare (optimize (speed 3) (safety 0))) (file-position nsv (+ pos 4)) (if (logbitp 6 (the fixnum (read-byte nsv))) 1 0)) (defun (setf file-pt-nsv-rtag) (tag nsv pos) (declare (optimize (speed 3) (safety 0)) (fixnum tag)) (file-position nsv (+ pos 4)) (let ((byte (the fixnum (read-byte nsv))) (byte-spec (byte 1 6))) (declare (dynamic-extent byte-spec)) (file-position nsv (+ pos 4)) (write-byte (deposit-field (the fixnum (ash tag 6)) byte-spec byte) nsv))) #+test (let ((*print-base* 2)) (let ((nsv #(#b00111101 #b01011010 #b00111101))) (setf (pt-nsv-ltag nsv 1) 1 (pt-nsv-rtag nsv 1) 0) (print nsv))) #+test (let ((offset-bit 0) (length 3)) (let ((delta (min 0 (- 8 offset-bit length))) (fifi (max 0 (- 8 offset-bit length)))) (print (list (+ delta length) fifi)) (let ((*print-base* 2)) (print (byte (+ (min 0 (- 8 offset-bit length)) length) (max 0 (- 8 offset-bit length)))) (print (ash (ash 255 (- length 8)) (- 8 offset-bit length)))))) #+test (let ((*print-base* 2) (length 8)) (Print (byte 8 (- 8 length))) (Print (ash 255 (- 8 length)))) #+ignore (node-integer (setf (node-integer #(128 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0) 0 24 3) 1) 0 24 3) (defun (setf node-integer) (value nsv pos offset length) (declare ;;(optimize (speed 3) (safety 0)) #+ignore(fixnum offset length value)) (assert (and (>= value 0) (>= pos 0))) (multiple-value-bind (offset-byte offset-bit) (floor offset 8) (let ((byte-pos (+ pos offset-byte)) (byte-spec (byte (+ (min 0 (- 8 offset-bit length)) length) (max 0 (- 8 offset-bit length))))) (declare (dynamic-extent byte-spec)) #+debug(print (list :byte-pos byte-pos)) ;; 2 (setf (aref nsv byte-pos) (deposit-field (ash value (- 8 offset-bit length)) byte-spec (aref nsv byte-pos))) (decf length (- 8 offset-bit)) (loop while (> length 8) do (setf (aref nsv (incf byte-pos)) (logand 255 (ash value (- 8 length)))) (decf length 8)) (when (> length 0) (let ((byte-spec (byte 8 (- 8 length)))) (declare (dynamic-extent byte-spec)) (setf (aref nsv (incf byte-pos)) (deposit-field (logand 255 (ash value (- 8 length))) byte-spec (aref nsv byte-pos)))))) nsv)) (defun (setf node-integer-profile) (value nsv pos offset length) (declare ;;(optimize (speed 3) (safety 0)) #+ignore(fixnum offset length value)) (assert (and (>= value 0) (>= pos 0))) (multiple-value-bind (offset-byte offset-bit) (floor offset 8) (let ((byte-pos (+ pos offset-byte)) (byte-spec (byte (+ (min 0 (- 8 offset-bit length)) length) (max 0 (- 8 offset-bit length))))) (declare (dynamic-extent byte-spec)) #+debug(print (list :byte-pos byte-pos)) ;; 2 (setf (aref nsv byte-pos) (deposit-field (ash value (- 8 offset-bit length)) byte-spec (aref nsv byte-pos))) (decf length (- 8 offset-bit)) (loop while (> length 8) do (setf (aref nsv (incf byte-pos)) (logand 255 (ash value (- 8 length)))) (decf length 8)) (when (> length 0) (let ((byte-spec (byte 8 (- 8 length)))) (declare (dynamic-extent byte-spec)) (setf (aref nsv (incf byte-pos)) (deposit-field (logand 255 (ash value (- 8 length))) byte-spec (aref nsv byte-pos)))))) nsv)) (defun (setf file-node-integer) (value node-stream pos offset length) (declare (optimize (speed 3) (safety 0)) #+ignore(fixnum offset length value)) (assert (and (>= value 0) (>= pos 0))) (multiple-value-bind (offset-byte offset-bit) (floor offset 8) (let ((byte-pos (+ pos offset-byte 4)) (byte-spec (byte (+ (min 0 (- 8 offset-bit length)) length) (max 0 (- 8 offset-bit length))))) (declare (dynamic-extent byte-spec)) #+debug(print (list :byte-pos (- byte-pos 4))) ;; 2 (file-position node-stream byte-pos) (let ((byte (read-byte node-stream))) (file-position node-stream byte-pos) (write-byte (deposit-field (ash value (- 8 offset-bit length)) byte-spec byte) node-stream)) (decf length (- 8 offset-bit)) (loop while (> length 8) do (write-byte (logand 255 (ash value (- 8 length))) node-stream) (incf byte-pos) (decf length 8)) (when (> length 0) (let ((byte-spec (byte 8 (- 8 length)))) (declare (dynamic-extent byte-spec)) #+pcl(file-position node-stream (1+ byte-pos)) (let* ((node-byte (read-byte node-stream)) (byte (deposit-field (logand 255 (ash value (- 8 length))) byte-spec node-byte))) #+debug(print (list :byte byte :value value :length length :byte-spec byte-spec :node-byte node-byte)) #+debug(print (list :file-pos (file-position node-stream) :byte-pos (+ 1 -4 byte-pos))) (file-position node-stream (1+ byte-pos)) (write-byte byte node-stream))))) node-stream)) (defun nsv-and-pos (node-vector pt sub-vector-node-count node-size) #+ignore (declare (optimize (speed 3) (safety 0)) (array node-vector) (fixnum pt sub-vector-node-count node-size)) (multiple-value-bind (pt-nsv pt-pos) (floor pt sub-vector-node-count) (values (aref node-vector pt-nsv) (the fixnum (* pt-pos node-size))))) (defun nsv-and-pos-profile (node-vector pt sub-vector-node-count node-size) #+ignore (declare (optimize (speed 3) (safety 0)) (array node-vector) (fixnum pt sub-vector-node-count node-size)) (multiple-value-bind (pt-nsv pt-pos) (floor pt sub-vector-node-count) (values (aref node-vector pt-nsv) (the fixnum (* pt-pos node-size))))) #+test (let* ((*print-base* 2) (nsv #(#b11111111 #b0 #b0)) (offset 10)) (setf (node-integer nsv 0 offset 4) 11) (print nsv) (print (node-integer nsv 0 offset 4))) #+test (search-patricia-tree *spt* "s and") #+test (defparameter *spt* (build-patricia-tree :file "projects:dat;test1.txt")) ;; ********* (defmethod %search-patricia-tree-all ((pt pt-string-vector-header) text string start n) (let ((n (or n (* 8 (- (length string) start))))) ;; number of bits in string (with-slots (node-vector) pt (with-slots (node-vector node-size node-count sub-vector-node-count last-sub-vector-size ltag-pos ltag-size rtag-pos rtag-size skip-pos skip-size key-pos key-size llink-pos llink-size rlink-pos rlink-size) node-vector (collecting (labels ((move-left (pt-nsv pt-pos j) (let* ((qt-nsv pt-nsv) (qt-pos pt-pos) (pt (node-integer qt-nsv qt-pos llink-pos llink-size))) ;; P2 (multiple-value-bind (pt-nsv pt-pos) (nsv-and-pos node-vector pt sub-vector-node-count node-size) (cond ((= 1 (pt-nsv-ltag qt-nsv qt-pos)) (compare pt-nsv pt-pos nil)) ;; P6 (t ;; P3 skip bits (incf j (node-integer pt-nsv pt-pos skip-pos skip-size)) (if (>= j n) (compare pt-nsv pt-pos t) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt-nsv pt-pos j) (move-right pt-nsv pt-pos j)))))))) (move-right (pt-nsv pt-pos j) (let* ((qt-nsv pt-nsv) (qt-pos pt-pos) (pt (node-integer qt-nsv qt-pos rlink-pos rlink-size))) ;; P5 (multiple-value-bind (pt-nsv pt-pos) (nsv-and-pos node-vector pt sub-vector-node-count node-size) (cond ((= 1 (pt-nsv-rtag qt-nsv qt-pos)) (compare pt-nsv pt-pos nil)) ;; P6 (t ;; P3 skip bits (incf j (node-integer pt-nsv pt-pos skip-pos skip-size)) (if (>= j n) (compare pt-nsv pt-pos t) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt-nsv pt-pos j) (move-right pt-nsv pt-pos j)))))))) (move-left-rec (pt-nsv pt-pos) (let* ((qt-nsv pt-nsv) (qt-pos pt-pos) (pt (node-integer qt-nsv qt-pos llink-pos llink-size))) ;; P2 (multiple-value-bind (pt-nsv pt-pos) (nsv-and-pos node-vector pt sub-vector-node-count node-size) (cond ((= 1 (pt-nsv-ltag qt-nsv qt-pos)) (collect (node-integer pt-nsv pt-pos key-pos key-size))) (t (move-left-rec pt-nsv pt-pos) (move-right-rec pt-nsv pt-pos)))))) (move-right-rec (pt-nsv pt-pos) (let* ((qt-nsv pt-nsv) (qt-pos pt-pos) (pt (node-integer qt-nsv qt-pos rlink-pos rlink-size))) ;; P5 (multiple-value-bind (pt-nsv pt-pos) (nsv-and-pos node-vector pt sub-vector-node-count node-size) (cond ((= 1 (pt-nsv-rtag qt-nsv qt-pos)) (collect (node-integer pt-nsv pt-pos key-pos key-size))) (t (move-left-rec pt-nsv pt-pos) (move-right-rec pt-nsv pt-pos)))))) (compare (pt-nsv pt-pos recurse-p) (let ((first-diff-bit-pos (first-differing-bit text string (node-integer pt-nsv pt-pos key-pos key-size) start))) (cond ((or (null first-diff-bit-pos) (< n first-diff-bit-pos)) (cond (recurse-p (move-left-rec pt-nsv pt-pos) (move-right-rec pt-nsv pt-pos)) (t (collect (node-integer pt-nsv pt-pos key-pos key-size))))) (t nil))))) (multiple-value-bind (pt-nsv pt-pos) (nsv-and-pos node-vector 0 sub-vector-node-count node-size) #+debug(print (list :pt-nsv pt-nsv :pt-pos pt-pos)) (move-left pt-nsv pt-pos 0)))))))) #+test (defparameter *spt* (build-patricia-tree :file "projects:dat;test.txt" :max-string-size 10)) (defmethod %build-patricia-tree ((header pt-string-vector-header) &key text file keys key-fn duplicate-threshold) (format t "~&Allocating...") (print text) (let* ((text-length (text-length text)) (key-count (cond (keys (length keys)) (key-fn (let ((count 1)) (if file (with-open-file (stream file) (loop for key = (funcall key-fn stream) while key when (not (zerop key)) do (incf count))) (loop for key = (funcall key-fn text) while key when (not (zerop key)) do (incf count))) (print count))) (t text-length))) (node-vector (make-node-vector text-length key-count))) (setf (pt-node-vector header) node-vector) (format t "~&Everything allocated.") (%build-allocated-patricia-tree header :text text ;; :file file :keys keys :key-fn key-fn :duplicate-threshold duplicate-threshold) #+debug (profiler:with-profiling (:type :space) (%build-allocated-patricia-tree header :text text ;; :file file :keys keys :key-fn key-fn)))) (defmethod %build-patricia-tree ((header pt-file-vector-header) &key stream stream1 stream2 keys key-fn duplicate-threshold &allow-other-keys) (format t "~&Allocating...") (let* ((text-length (text-length stream)) (key-count (cond (keys (length keys)) (key-fn (let ((count 1)) (loop for key = (funcall key-fn stream) while key when (not (zerop key)) do (incf count)) (print count))) (t text-length))) (node-vector (make-node-vector text-length key-count))) (setf (pt-node-vector header) node-vector) (format t "~&Everything allocated.") #-orig (%build-allocated-patricia-tree header :stream stream :stream1 stream1 :stream2 stream2 :keys keys :key-fn key-fn :duplicate-threshold duplicate-threshold) #+debug (profiler:with-profiling (:type :space) (%build-allocated-patricia-tree header :stream stream :stream1 stream1 :stream2 stream2 :keys keys :key-fn key-fn)))) (defmethod %build-patricia-tree ((header pt-file-node-header) &key stream stream1 stream2 keys key-fn duplicate-threshold) (format t "~&Allocating...") (let* ((skip-size 12) (text-length (text-length stream)) (key-count (cond (keys (length keys)) (key-fn (let ((count 1)) (loop for key = (funcall key-fn stream) while key when (not (zerop key)) do (incf count)) (print count))) (t text-length)))) ;;(setf (pt-node-file header) node-file) (with-open-file (node-stream (pt-node-file header) :element-type '(unsigned-byte 8) :if-exists :supersede :if-does-not-exist :create :direction :io) (multiple-value-bind (node-size key-size llink-pos llink-size rlink-pos rlink-size) (node-size text-length key-count skip-size) (print (list :needed (* key-count node-size) :node-size node-size :skip-size skip-size :key-pos (+ 2 skip-size) :key-size key-size :llink-pos llink-pos :llink-size llink-size :rlink-pos rlink-pos :rlink-size rlink-size)) (setf (pt-node-vector header) (make-instance 'node-vector :node-vector node-stream :node-size node-size :node-count key-count :skip-size skip-size :key-pos (+ 2 skip-size) :key-size key-size :llink-pos llink-pos :llink-size llink-size :rlink-pos rlink-pos :rlink-size rlink-size)) (write-byte node-size node-stream) (write-byte skip-size node-stream) (write-byte key-size node-stream) (write-byte llink-size node-stream) (loop for i from 0 to (1- (* key-count node-size)) do (write-byte 0 node-stream))) (format t "~&Everything allocated.") #-orig (%build-allocated-patricia-tree header :stream stream :stream1 stream1 :stream2 stream2 :keys keys :key-fn key-fn :duplicate-threshold duplicate-threshold) #+debug (profiler:with-profiling (:type :space) (%build-allocated-patricia-tree header :stream stream :stream1 stream1 :stream2 stream2 :keys keys :key-fn key-fn))))) (defmethod %build-allocated-patricia-tree ((header pt-file-node-header) &key stream stream1 stream2 keys key-fn duplicate-threshold) (let* ((node 0) (text-length (text-length stream)) (duplicate-length nil) (duplicates-list ()) (prev-key 0)) (with-slots (node-vector node-size node-count ltag-pos ltag-size rtag-pos rtag-size skip-pos skip-size key-pos key-size llink-pos llink-size rlink-pos rlink-size) (pt-node-vector header) (setf (file-node-integer node-vector 0 llink-pos llink-size) 0 (file-node-integer node-vector 0 key-pos key-size) 0 (file-pt-nsv-ltag node-vector 0) 1) (labels ((add (key) (when (zerop (mod node 10000)) (print node)) (multiple-value-bind (first-diff-bit-pos start1 start2) (%search-file-patricia-tree header node-vector stream1 stream2 key (* 8 (- text-length key)) nil t nil nil) (let ((tag nil)) (cond ((and duplicate-threshold (> first-diff-bit-pos (* 8 duplicate-threshold))) (setf duplicate-length (floor first-diff-bit-pos 8)) (push (list duplicate-length start1 start2) duplicates-list) (format t "~&Duplicate of length ~d found at positions ~d and ~d." duplicate-length start1 start2)) (t (incf node) (multiple-value-bind (pt pt-pos qt-pos bit l j) (%search-file-patricia-tree header node-vector stream1 stream2 key first-diff-bit-pos t t nil nil) (let ((node-pos (* node node-size))) ;;(print (list :set-fni node-pos key-pos key-size key :pt pt-pos qt-pos bit l j)) (setf (file-node-integer node-vector node-pos key-pos key-size) key) (if (= (file-node-integer node-vector qt-pos llink-pos llink-size) pt) (setf (file-node-integer node-vector qt-pos llink-pos llink-size) node tag (file-pt-nsv-ltag node-vector qt-pos) (file-pt-nsv-ltag node-vector qt-pos) 0) (setf (file-node-integer node-vector qt-pos rlink-pos rlink-size) node tag (file-pt-nsv-rtag node-vector qt-pos) (file-pt-nsv-rtag node-vector qt-pos) 0)) (if (zerop bit) (setf (file-pt-nsv-ltag node-vector node-pos) 1 (file-node-integer node-vector node-pos llink-pos llink-size) node (file-pt-nsv-rtag node-vector node-pos) tag (file-node-integer node-vector node-pos rlink-pos rlink-size) pt) (setf (file-pt-nsv-rtag node-vector node-pos) 1 (file-node-integer node-vector node-pos rlink-pos rlink-size) node (file-pt-nsv-ltag node-vector node-pos) tag (file-node-integer node-vector node-pos llink-pos llink-size) pt)) (if (= tag 1) (setf (file-node-integer node-vector node-pos skip-pos skip-size) (+ 1 (- l j))) (setf (file-node-integer node-vector node-pos skip-pos skip-size) (+ 1 (- l j) (file-node-integer node-vector pt-pos skip-pos skip-size)) (file-node-integer node-vector pt-pos skip-pos skip-size) (- j l 1))))))))))) (cond (keys (dolist (key keys) (add key))) (key-fn (file-position stream 0) (loop for key = (funcall key-fn stream) while (and key (< key text-length)) when (not (zerop key)) do (cond (duplicate-length (decf duplicate-length (- key prev-key)) (when (minusp duplicate-length) (setf duplicate-length nil))) (t ;;(print (list :adding key)) (add key))) (setf prev-key key))) (t (time (dotimes (i (- text-length 1)) ;; 2 (add (1+ i)))))) (values header duplicates-list))))) (defmethod %build-allocated-patricia-tree ((header pt-string-vector-header) &key text file keys key-fn duplicate-threshold) (let* ((node 0) (text-length (text-length text)) (duplicate-length nil) (duplicates-list ()) (prev-key 0)) (with-slots (node-vector node-size node-count sub-vector-node-count last-sub-vector-size ltag-pos ltag-size rtag-pos rtag-size skip-pos skip-size key-pos key-size llink-pos llink-size rlink-pos rlink-size) (pt-node-vector header) (setf (node-integer (aref node-vector 0) 0 llink-pos llink-size) 0 (node-integer (aref node-vector 0) 0 key-pos key-size) 0 (pt-nsv-ltag (aref node-vector 0) 0) 1) (labels ((add (key) (when (zerop (mod node 50000)) (unless (zerop *iterations-count*) (print (list node (float (/ *iterations* *iterations-count*)) *max-iterations*))) (setf *iterations* 0 *max-iterations* 0 *iterations-count* 0)) (multiple-value-bind (first-diff-bit-pos start1 start2) (%search-patricia-tree header text text key (* 8 (- text-length key)) nil t nil) (let ((tag nil)) (cond ((and duplicate-threshold (> first-diff-bit-pos (* 8 duplicate-threshold))) (setf duplicate-length (floor first-diff-bit-pos 8)) (push (list duplicate-length start1 start2) duplicates-list) (format t "~&Duplicate of length ~d found at positions ~d and ~d." duplicate-length start1 start2)) (t (incf node) (multiple-value-bind (pt pt-nsv pt-pos qt-nsv qt-pos bit l j) (%search-patricia-tree header text text key first-diff-bit-pos t t nil) (multiple-value-bind (node-nsv node-pos) (nsv-and-pos node-vector node sub-vector-node-count node-size) (setf (node-integer node-nsv node-pos key-pos key-size) key) (if (= (node-integer qt-nsv qt-pos llink-pos llink-size) pt) (setf (node-integer qt-nsv qt-pos llink-pos llink-size) node tag (pt-nsv-ltag qt-nsv qt-pos) (pt-nsv-ltag qt-nsv qt-pos) 0) (setf (node-integer qt-nsv qt-pos rlink-pos rlink-size) node tag (pt-nsv-rtag qt-nsv qt-pos) (pt-nsv-rtag qt-nsv qt-pos) 0)) (if (zerop bit) (setf (pt-nsv-ltag node-nsv node-pos) 1 (node-integer node-nsv node-pos llink-pos llink-size) node (pt-nsv-rtag node-nsv node-pos) tag (node-integer node-nsv node-pos rlink-pos rlink-size) pt) (setf (pt-nsv-rtag node-nsv node-pos) 1 (node-integer node-nsv node-pos rlink-pos rlink-size) node (pt-nsv-ltag node-nsv node-pos) tag (node-integer node-nsv node-pos llink-pos llink-size) pt)) (if (= tag 1) (setf (node-integer node-nsv node-pos skip-pos skip-size) (+ 1 (- l j))) (setf (node-integer node-nsv node-pos skip-pos skip-size) (+ 1 (- l j) (node-integer pt-nsv pt-pos skip-pos skip-size)) (node-integer pt-nsv pt-pos skip-pos skip-size) (- j l 1))))))))))) (cond (keys (dolist (key keys) (add key))) (key-fn (if file (with-open-file (stream file) (loop for key = (funcall key-fn stream) while (and key (< key text-length)) when (not (zerop key)) do (add key))) (loop for key = (funcall key-fn text) while key when (not (zerop key)) do (cond (duplicate-length (decf duplicate-length (- key prev-key)) (when (minusp duplicate-length) (setf duplicate-length nil))) (t (add key))) (setf prev-key key)))) (t (time (dotimes (i (- text-length 1)) ;; 2 (add (1+ i)))))) (values header duplicates-list))))) (defmethod %build-allocated-patricia-tree ((header pt-file-vector-header) &key stream stream1 stream2 keys key-fn) (let* ((node 0) (text-length (text-length stream))) (with-slots (node-vector node-size node-count sub-vector-node-count last-sub-vector-size ltag-pos ltag-size rtag-pos rtag-size skip-pos skip-size key-pos key-size llink-pos llink-size rlink-pos rlink-size) (pt-node-vector header) (setf (node-integer (aref node-vector 0) 0 llink-pos llink-size) 0 (node-integer (aref node-vector 0) 0 key-pos key-size) 0 (pt-nsv-ltag (aref node-vector 0) 0) 1) (labels ((add (key) (when (zerop (mod node 10000)) (print node)) (let ((first-diff-bit-pos (%search-patricia-tree header stream1 stream2 key (* 8 (- text-length key)) nil t nil)) (tag nil)) (incf node) (multiple-value-bind (pt pt-nsv pt-pos qt-nsv qt-pos bit l j) (%search-patricia-tree header stream1 stream2 key first-diff-bit-pos t t nil) (multiple-value-bind (node-nsv node-pos) (nsv-and-pos node-vector node sub-vector-node-count node-size) (setf (node-integer node-nsv node-pos key-pos key-size) key) (if (= (node-integer qt-nsv qt-pos llink-pos llink-size) pt) (setf (node-integer qt-nsv qt-pos llink-pos llink-size) node tag (pt-nsv-ltag qt-nsv qt-pos) (pt-nsv-ltag qt-nsv qt-pos) 0) (setf (node-integer qt-nsv qt-pos rlink-pos rlink-size) node tag (pt-nsv-rtag qt-nsv qt-pos) (pt-nsv-rtag qt-nsv qt-pos) 0)) (if (zerop bit) (setf (pt-nsv-ltag node-nsv node-pos) 1 (node-integer node-nsv node-pos llink-pos llink-size) node (pt-nsv-rtag node-nsv node-pos) tag (node-integer node-nsv node-pos rlink-pos rlink-size) pt) (setf (pt-nsv-rtag node-nsv node-pos) 1 (node-integer node-nsv node-pos rlink-pos rlink-size) node (pt-nsv-ltag node-nsv node-pos) tag (node-integer node-nsv node-pos llink-pos llink-size) pt)) (if (= tag 1) (setf (node-integer node-nsv node-pos skip-pos skip-size) (+ 1 (- l j))) (setf (node-integer node-nsv node-pos skip-pos skip-size) (+ 1 (- l j) (node-integer pt-nsv pt-pos skip-pos skip-size)) (node-integer pt-nsv pt-pos skip-pos skip-size) (- j l 1)))))))) (cond (keys (dolist (key keys) (add key))) (key-fn (file-position stream 0) (loop for key = (funcall key-fn stream) while (and key (< key text-length)) when (not (zerop key)) do (add key))) (t (time (dotimes (i (- text-length 1)) ;; 2 (add (1+ i)))))) header)))) (defmethod search-patricia-tree-all ((pt pt-file-header) string &optional (start 0) n) (with-slots (file) pt (with-open-file (stream file) (%search-patricia-tree-all pt stream string start n)))) (defmethod search-patricia-tree-all ((pt pt-string-header) string &optional (start 0) n) (with-slots (text) pt (%search-patricia-tree-all pt text string start n))) (defmethod search-patricia-tree-all ((pt pt-string-vector-header) string &optional (start 0) n) (with-slots (string-vector) pt (%search-patricia-tree-all pt string-vector string start n))) (defmethod %search-patricia-tree-all ((pt pt-header) text string start n) (let ((n (or n (* 8 (- (length string) start))))) ;; number of bits in string (collecting (labels ((move-left (pt j) (let* ((qt pt) (pt (pt-llink pt))) ;; P2 (cond ((= 1 (pt-ltag qt)) (compare pt nil)) ;; P6 (t ;; P3 skip bits (incf j (pt-skip pt)) (if (>= j n) (compare pt t) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt j) (move-right pt j))))))) (move-right (pt j) (let* ((qt pt) (pt (pt-rlink pt))) ;; P5 (cond ((= 1 (pt-rtag qt)) (compare pt nil)) ;; P6 (t ;; P3 skip bits (incf j (pt-skip pt)) (if (>= j n) (compare pt t) ;; P4 test bit (if (zerop (nth-bit string j start)) (move-left pt j) (move-right pt j))))))) (move-left-rec (pt) (let* ((qt pt) (pt (pt-llink pt))) ;; P2 (cond ((= 1 (pt-ltag qt)) (collect (pt-key pt))) (t (move-left-rec pt) (move-right-rec pt))))) (move-right-rec (pt) (let* ((qt pt) (pt (pt-rlink pt))) ;; P5 (cond ((= 1 (pt-rtag qt)) (collect (pt-key pt))) (t (move-left-rec pt) (move-right-rec pt))))) (compare (pt &optional recurse-p) (let ((first-diff-bit-pos (first-differing-bit text string (pt-key pt) start))) (cond ((or (null first-diff-bit-pos) (< n first-diff-bit-pos)) (cond (recurse-p (move-left-rec pt) (move-right-rec pt)) (t (collect (pt-key pt))))) (t nil))))) (move-left pt 0))))) ;;(make-string (- (expt 2 24) 1) :element-type :base-character) (defun read-large-file (stream &optional max-string-size end-char) ;;(declare (optimize (speed 3) (safety 0))) (let* ((max-string-size (or max-string-size (- (expt 2 24) 1))) (size (if end-char (1+ (file-length stream)) (file-length stream))) (sub-string-count (ceiling size max-string-size)) (last-sub-string-size (mod size max-string-size)) (string-vector (make-array sub-string-count)) (count 0)) (loop for i from 0 to (1- sub-string-count) do (let ((sub-string (make-string (if (= i (1- sub-string-count)) last-sub-string-size max-string-size) :element-type #+mcl 'base-character #-mcl 'character))) (setf (aref string-vector i) sub-string) #-mcl (loop for j fixnum from 0 to (1- max-string-size) for c = (read-char stream nil nil) when (and end-char (null c) (= i (1- sub-string-count))) do (setf (schar sub-string (the fixnum j)) end-char) while c do (setf (schar sub-string (the fixnum j)) c) #+test (when (zerop (mod (incf count) 10000000)) (print count))) #+mcl (multiple-value-bind (reader-function value) (ccl::stream-reader stream) (loop for j fixnum from 0 to (1- max-string-size) for c = (funcall reader-function value) when (and end-char (null c) (= i (1- sub-string-count))) do (setf (schar sub-string (the fixnum j)) end-char) while c do (setf (schar sub-string (the fixnum j)) c))))) (values string-vector size sub-string-count max-string-size last-sub-string-size))) ;; 36.010 seconds, 48,978,768 bytes ;; 32.711 seconds w. decl. ;; 14,151 with stream-reader #+test (time (defparameter *large-string* (with-open-file (stream "projects:mlcd;mecs;mecs.strings") (read-large-file stream)))) ;; (log 20000000 2) #+test (node-size 20000000 20000000 12) #+test (defparameter *spt* (build-patricia-tree :file "projects:dat;test.txt" :max-string-size 10)) #+test (search-patricia-tree *spt* "s and") #+test (search-patricia-tree-all *spt* "mad") #+test (defparameter *pt* (build-patricia-tree :text "this is the house that jack built?" :keys ;;'(8 12 18 23 28) '(5 8 12 18 23 28))) #+test (search-patricia-tree *pt* "house that j") #+test (defparameter *pt* (build-patricia-tree :text "ich glaub ich muss gleich gehn.")) #+test (search-patricia-tree-all *pt* "ich") #+test (defparameter *spt* (build-patricia-tree :file "projects:dat;test2.txt")) ;; 1,284 ;; 1,021 ;; 0.369 #+test (search-patricia-tree-all *pt* "seven") #+test (defparameter *pt* (build-patricia-tree "ae#")) #+test (search-patricia-tree *pt* "seven") #+test (nth-bit "asdf" 9 0) #+test (defparameter *pt* (build-patricia-tree "aaaaaaaaaaaaa#")) #+test (search-patricia-tree-all *pt* "ba") #+test (search-patricia-tree *pt* "baa") #+test (let ((count 0)) (with-file-lines (line "projects:mlcd;mecs;mecs.strings") (incf count (count #\home line))) count) #+test (time (defparameter *wittgenstein* (build-patricia-tree :file "projects:mlcd;mecs;mecs.strings" ;;:key-fn #+ignore (lambda (stream) (loop for char = (read-char stream nil nil) while char when (or (char= char #\Space) (char= char #\Home)) do (return (file-position stream))))))) #+test (search-patricia-tree *wittgenstein* "Claus Hui") #+test (with-open-file (stream "/usr/local/cwb/corpora/avis/delkorpus" :direction :output :if-exists :supersede) (let ((count 0)) (block lines (with-file-lines (line "/usr/local/cwb/corpora/avis/korpus") (when (= (incf count) 100000000) (return-from lines)) (write-line line stream))))) #+test (with-open-file (stream "projects:dat;data;delkorpus" :direction :output :if-exists :supersede) (let ((count 0)) (block lines (with-file-lines (line "projects:dat;data;korpus") (when (= (incf count) 10000000) (return-from lines)) (write-line line stream))))) #+test (MAKE-NODE-VECTOR 228828091 40000000) #+test (describe *a*) #+test (print (search-patricia-tree-all *avis* "Dato")) #+test (write-patricia-tree *avis* "/usr/local/cwb/corpora/avis/delkorpus.pt") #+test (defparameter *a* (read-patricia-tree "/usr/local/cwb/corpora/avis/delkorpus.pt" "/usr/local/cwb/corpora/avis/delkorpus")) ;; prints concordance of search string (defun print-context (file context-list) (with-open-file (stream file) (dolist (pos context-list) (file-position stream (max (- pos 40) 0)) (print (with-output-to-string (str-stream) (loop for i from 0 to 100 for char = (read-char stream nil nil) while char do (when (char= char #\Newline) (setf char #\Space)) (write-char char str-stream))))))) #+test (print-context "/usr/local/cwb/corpora/avis/delkorpus" (search-patricia-tree *a* "fisk" :count t)) #+test (print (length (search-patricia-tree *a* "f" :count t))) #+test (alien:def-alien-variable ("dynamic_space_size" dynamic-space-size) c-call::int) #+test (describe (pt-node-vector *a*)) (defparameter *avis* nil) (defparameter *duplicates-list* nil) ;;(print *duplicates-list*) #+test (multiple-value-setq (*avis* *duplicates-list*) (build-avis-patricia-tree :file "/usr/local/cwb/corpora/avis/delkorpus00" :duplicate-threshold 1000)) (defun build-avis-patricia-tree (&key file duplicate-threshold) (let ((count 0)) (build-patricia-tree :file #+macosx "projects:dat;data;delkorpus0" #-macosx file :keep-file-in-memory-p nil :key-fn (lambda (stream &optional print-line-p) (add-avis-line stream (incf count) print-line-p)) :duplicate-threshold duplicate-threshold))) (defmethod add-avis-line ((stream stream) &optional (count 1) print-line-p) (declare (ignore print-line-p)) (loop for char = (read-char stream nil nil) while char when (char= char #\Newline) do #-test (when (zerop (mod count 1000000)) (file-position stream)) (return (file-position stream)))) (defmethod read-sv-char ((sv string-vector) &optional pos) (with-slots (string-vector string-count max-string-size last-string-size position) sv (when (< (or pos position) (text-length sv)) (multiple-value-bind (s-count s-pos) (floor (or pos position) (the fixnum max-string-size)) (declare (fixnum s-count s-pos)) (prog1 (char (aref string-vector s-count) s-pos) (if pos (setf position (1+ pos)) (incf position))))))) (defmethod add-avis-line ((sv string-vector) &optional (count 1) print-line-p) (loop for char = (read-sv-char sv) while char when print-line-p do (write-char char) when (char= char #\Newline) do (when (zerop (mod count 100000)) (print (list (sv-position sv) :time (now)))) (return (sv-position sv)))) #+test (profile:unprofile) #+test (profile:profile-all) #+test (profile:report-time) #+test (with-open-file (stream #+macosx "projects:dat;data;delkorpus1" #-macosx "/usr/local/cwb/corpora/avis/korpus-uten-dn" :direction :output :if-exists :supersede) (let ((count 0) (in-dn-p nil)) (block lines (with-file-lines (line #+macosx "projects:dat;data;korpus" #-macosx "/usr/local/cwb/corpora/avis/korpus") (when (and (> (length line) 2) (string= line " (length line) 2) (string= line " count (+ 13685000 141500)) (write-line line))))))) #+test (with-open-file (stream #+macosx "projects:dat;data;delkorpus00" #-macosx "/usr/local/cwb/corpora/avis/delkorpus00" :direction :output :if-exists :supersede) (let ((count 0) (in-dn-p nil)) (block lines (with-file-lines (line #+macosx "projects:dat;data;korpus" #-macosx "/usr/local/cwb/corpora/avis/korpus") #+ignore (when (and (> (length line) 2) (string= line "