;;; -*- Mode: LISP; Package: CL-USER; BASE: 10; Syntax: ANSI-Common-Lisp; -*-

;;; Scratch script for the multitagger string nets: pulls (oppslag, grunnform,
;;; tag) rows out of the tagger database, adds them to string nets as
;;; "word:base-form:tag" strings, compresses the nets, and dumps / rebuilds
;;; them from text files.
;;;
;;; NOTE(review): this looks like a file whose forms are evaluated selectively
;;; from the editor rather than loaded top to bottom (e.g. each net is stored
;;; right after it is created, before anything has been added) -- confirm
;;; before loading the file as a whole.

;; change later
(in-package "CL-USER")

(um:use-module :sql "sql:sql;sql-system")

(use-package :sql)

(initialize-database-type)

;(disable-sql-reader-syntax)
(enable-sql-reader-syntax)

;; NOTE(review): the credentials are hard-coded in the source; consider moving
;; them out of the file.
; (disconnect)
(connect "Oracle"
         :user-id "system"
         :password "gvprckvnis"
         :if-exists :warn-old)

;; Ad-hoc query against the abbreviations view; only read when the TEST
;; feature is present.
#+test
(select [oppslag] [grunnform] [tag] [ordliknande]
        :from [tagger v-tagger-forkortingar]
        ;:where [= [ordliknande] "N"]
        ;:where [< [rownum] 10]
        )

;;; Create one active-string-net per category and store each one to its
;;; .net file.

(defparameter *abbreviations* (make-instance 'active-string-net))
(store-net *abbreviations* "projects:cgp;multitagger;abbreviations.net")

(defparameter *word-like-abbreviations* (make-instance 'active-string-net))
(store-net *word-like-abbreviations*
           "projects:cgp;multitagger;word-like-abbreviations.net")

(defparameter *titles* (make-instance 'active-string-net))
(store-net *titles* "projects:cgp;multitagger;titles.net")

(defparameter *symbols* (make-instance 'active-string-net))
(store-net *symbols* "projects:cgp;multitagger;symbols.net")

(defparameter *expressions* (make-instance 'active-string-net))
(store-net *expressions* "projects:cgp;multitagger;expressions.net")

;(defparameter *lemmata* (make-instance 'active-string-net))

;;; Fill one net from the v-tagger-uttrykk view, then minimize and compress
;;; it.  The target net is chosen by editing the binding of NET; the
;;; commented-out names are the other candidates.  The value of the form is
;;; the final string count.
(let ((n 0)
      (net *expressions*
           ;; *symbols*
           ;; *titles*
           ;; *abbreviations*
           ;; *word-like-abbreviations*
           ))
  (do-query ((word base-form tag)
             [select [oppslag] [grunnform] [tag]
                     :from [tagger v-tagger-uttrykk]
                     ;:where [= [ordliknande] "Y"]
                     ])
    (add-string net (u:concat word ":" base-form ":" tag))
    ;; Progress line for every 1000th row.
    (when (zerop (mod (incf n) 1000))
      (format t "~%~5d ~a" n word)))
  (print n)
  (print (count-strings net))
  (print (count-nodes net))
  (minimize-tree net)
  (calculate-compression-mapping net)
  (compress-net net :iterate t)
  (count-strings net))

;;; Same query again, this time into a huge-active-string-net.  Here the
;;; statistics/compression steps inside the LET sit behind #+ignore; the
;;; compression is instead done by the separate top-level forms that follow.

(defparameter *test-net* (make-instance 'huge-active-string-net))

(let ((n 0)
      (net *test-net*))
  (do-query ((word base-form tag)
             [select [oppslag] [grunnform] [tag]
                     :from [tagger v-tagger-uttrykk]])
    (add-string net (u:concat word ":" base-form ":" tag))
    ;; Progress line for every 1000th row.
    (when (zerop (mod (incf n) 1000))
      (format t "~%~5d ~a" n word)))
  (print n)
  #+ignore
  (progn
    (print (count-strings net))
    (print (count-nodes net))
    (minimize-tree net)
    (calculate-compression-mapping net)
    (compress-net net :iterate t)
    (count-strings net)))

(minimize-tree *test-net*)
(calculate-compression-mapping *test-net*)
(compress-net *test-net* :iterate t)

(print-strings *test-net*)

;;; Re-bind the net variables to the results of LOAD-STRING-NET on the
;;; stored .net files.

(defparameter *abbreviations*
  (load-string-net "projects:cgp;multitagger;abbreviations.net"))

(defparameter *word-like-abbreviations*
  (load-string-net "projects:cgp;multitagger;word-like-abbreviations.net"))

(defparameter *titles*
  (load-string-net "projects:cgp;multitagger;titles.net"))

(defparameter *symbols*
  (load-string-net "projects:cgp;multitagger;symbols.net"))

(defparameter *expressions*
  (load-string-net "projects:cgp;multitagger;expressions.net"))

;;; Dump every net to a text file, one string per line.
;;; NOTE(review): *fullforms* and *lemmata* are not bound by any active form
;;; in this part of the file (the *lemmata* defparameter above is commented
;;; out) -- presumably they are defined elsewhere; verify.
;;; NOTE(review): lemmata is written under "projects:cgp;nets;" here, while
;;; the rebuild loop below reads it from "projects:cgp;multitagger;" --
;;; confirm that this difference is intended.
(loop for (net file)
        in (list (list *fullforms*
                       "projects:cgp;multitagger;fullforms.text")
                 (list *lemmata*
                       "projects:cgp;nets;lemmata.text")
                 (list *abbreviations*
                       "projects:cgp;multitagger;abbreviations.text")
                 (list *word-like-abbreviations*
                       "projects:cgp;multitagger;word-like-abbreviations.text")
                 (list *titles*
                       "projects:cgp;multitagger;titles.text")
                 (list *symbols*
                       "projects:cgp;multitagger;symbols.text")
                 (list *expressions*
                       "projects:cgp;multitagger;expressions.text"))
      do (with-open-file (stream file
                                 :direction :output
                                 :if-exists :supersede)
           (map-strings net
                        (lambda (line)
                          (write-line line stream)))))

;;; Rebuild each .net file from its .text dump: read the lines into a fresh
;;; active-string-net, minimize, compress, and store.  Only read when the
;;; TEST feature is present.  The GARSIA-WACHS feature selects
;;; CALCULATE-GW-COMPRESSION-TREE in place of CALCULATE-COMPRESSION-MAPPING.
#+test
(loop for (file net-file)
        in '(("projects:cgp;multitagger;fullforms.text"
              "projects:cgp;multitagger;fullforms.net")
             ("projects:cgp;multitagger;lemmata.text"
              "projects:cgp;multitagger;lemmata.net")
             ("projects:cgp;multitagger;abbreviations.text"
              "projects:cgp;multitagger;abbreviations.net")
             ("projects:cgp;multitagger;word-like-abbreviations.text"
              "projects:cgp;multitagger;word-like-abbreviations.net")
             ("projects:cgp;multitagger;titles.text"
              "projects:cgp;multitagger;titles.net")
             ("projects:cgp;multitagger;symbols.text"
              "projects:cgp;multitagger;symbols.net")
             ("projects:cgp;multitagger;expressions.text"
              "projects:cgp;multitagger;expressions.net"))
      do (let ((net (make-instance 'active-string-net))
               (n 0))
           (print file)
           (u:with-file-lines (line file)
             (add-string net line)
             ;; Progress line for every 1000th input line.
             (when (zerop (mod (incf n) 1000))
               (format t "~%~5d ~a" n line)))
           (print n)
           (minimize-tree net)
           #-garsia-wachs
           (calculate-compression-mapping net)
           #+garsia-wachs
           (calculate-gw-compression-tree net)
           (compress-net net :iterate t)
           (print (count-strings net))
           (store-net net net-file)))