;;;-*- Mode: Lisp; Package: STRING-NET -*- (in-package :string-net) #+test (dict::close-all-pheaps-and-windows) #+test (defparameter *avar* (wood:open-pheap "projects:dictionaries;Awarisch (Saidov).sgmd")) (defparameter *avar-index* nil) (defparameter *lak-index* nil) (defclass xml-net-dictionary () ((index-path :initform nil :initarg :index-path :accessor dictionary-index-path) (index :initform nil :accessor dictionary-index) (xml-path :initform nil :initarg :xml-path :accessor dictionary-xml-path))) (defmethod initialize-instance :after ((dictionary xml-net-dictionary) &key index-path) (with-slots (index) dictionary (setf index (read-net index-path :translate-p nil)))) ;;#+test (defparameter *avar-dict* (make-instance 'xml-net-dictionary :index-path "projects:dictionaries;avar;avar.net" :xml-path "projects:dictionaries;avar;avar.xml")) (defparameter *lak-dict* (make-instance 'xml-net-dictionary :index-path "projects:dictionaries;lak;lak.net" :xml-path "projects:dictionaries;lak;lak.xml")) #+test (write-dictionary-xml-and-index *avar* "projects:dictionaries;avar;avar.xml" "projects:dictionaries;avar;avar.net") (defmacro do-stream-chunk-lines ((line stream start &optional end) &body body) (with-gensyms (pos) `(let ((,pos ,start)) (file-position ,stream ,pos) (loop for ,line = (read-line ,stream nil nil) while ,line do (setf ,pos (file-position ,stream)) (when (and ,end (< ,end ,pos)) (setf ,line (subseq ,line 0 (1+ (- (length ,line) (- ,pos ,end)))))) (progn ,@body) while (or (null ,end) (< ,pos ,end)))))) ;; 8046868 ;; 5263748 rel. offsets ;; 6753597 utf-8 (defparameter *monaco-cauc-to-unicode-table* (make-hash-table)) (defun monaco-cauc-to-unicode-char (c) (or (gethash c *monaco-cauc-to-unicode-table*) (setf (gethash c *monaco-cauc-to-unicode-table*) (let* ((pos (position c "ÄÅÇÉÑÖÜáàâäãåçéèêëíìîïñóòôªõúùûüý·’“”ÂÊÁËÈÍÎÏÌÓÔ•ÒÚÛÙž–—¯˜™š¸›œþ")) (unicode (when pos (+ pos 1040)))) (or unicode (let ((code (getf '(#\ð 1025 #\Þ 1105 #\¢ #\ý #\£ #\ #\ƒ #\Ë #\± #\Ó #\¾ #\Û #\„ #\š #\µ #\› #\ #\œ #\ø #\þ #\Ž #\Þ #\¬ #\Í #\¿ #\Î #\¡ #\Ï #\Ÿ #\Ô #\ˆ #\Ò #\‰ #\Ú #\ #\ž #\« #\– #\» #\— #\¦ #x0406 #+ignore #x04c0) c))) (cond ((characterp code) (monaco-cauc-to-unicode-char code)) ((null code) (char-code c)) (t code)))))))) (defun monaco-cauc-to-unicode (string) (with-output-to-string (stream) (loop for c across string do (xml::write-unicode-to-utf-8 (monaco-cauc-to-unicode-char c) stream)))) #+mcl (defun write-dictionary-xml-and-index (pheap xml-file index-file) (let ((pos 0) (count 0) (net (make-instance 'string-net::list-string-net))) (with-open-file (xml-stream xml-file :direction :output :if-exists :supersede) (wood:p-map-btree (dict::btree (wood:p-load (wood:root-object pheap))) (lambda (key value) (let* ((entry (wood:p-load value)) (key (remove-if (lambda (c) (find c "*0123456789:")) (multiple-value-bind (word language) (dict::unmark-word key) (loop for i from 0 to (1- (length word)) do (setf (char word i) (dict::strip-diacritics (or (dict::upper-to-lowercase (char word i) language) (char word i)) language))) word))) (entry (monaco-cauc-to-unicode entry) #+ignore (with-output-to-string (stream) (loop for c across entry do (xml::write-unicode-to-utf-8 (monaco-cauc-to-unicode c) stream)))) (entry-length (1+ (length entry))) (index (concat (monaco-cauc-to-unicode key) ":" (write-to-string pos) ":" (write-to-string entry-length)))) (incf pos entry-length) (when (zerop (mod (incf count) 100)) (print index)) (string-net::add-string net index) (write-line entry xml-stream))))) (setf *avar-index* net) (minimize-net net) (write-string-net net index-file))) #+test (print (get-word-entries *avar-dict* (monaco-cauc-to-unicode "þÚËÁÂ"))) (defmethod get-word-entries ((dictionary xml-net-dictionary) word) (with-slots (xml-path index) dictionary (with-open-file (xml-stream xml-path) (map-string-values index word (lambda (pair) (destructuring-bind (start delta) (string-parse pair :whitespace ":") (let* ((start (parse-integer start)) (end (+ start (parse-integer delta)))) (do-stream-chunk-lines (line xml-stream start end) (print line))))))))) #+test (with-open-file (stream "Unicode.html" :direction :output :if-exists :supersede) #m(html (head (title "Unicode-code -> UTF-8") (meta/ :http-equiv "Content-type" :content "text/htm" :charset "UTF-8")) (body (h3 "Unicode-code -> UTF-8") ((table :style "font-family: Arial") #L(loop for i from 20 to 10000 by 10 do #m(tr #L(dotimes (j 10) #m((td :align "right") #L(write (+ i j) :stream stream)) #m((td :bgcolor "#e080e0") #L(xml::write-unicode-to-utf-8 (+ i j) stream))))))))) #+test (print-strings (dictionary-index *avar-dict*)) ;; EOF