;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; -*- ;; ;; Copyright (C) Paul Meurer 2004. All rights reserved. ;; paul.meurer@aksis.uib.no ;; Aksis, University of Bergen ;; (in-package :cgp) (defparameter *part-of-speech* #(subst ;; 0 adj ;; 1 verb ;; 2 adv ;; 3 prep ;; 4 interj ;; 5 (det kvant) ;; 6 (det dem) ;; 7 det ;; 8 (pron poss) ;; 9 (pron pers) ;; 10 (pron refl) ;; 11 pron ;; 12 inf-merke ;; 13 sbu ;; 14 fork ;; 15 foreign ;; 16 ukjent ;; 17 ;; 18 ;; 19 clb ;; 20 konj ;; 21 ;; 22 ;; 23 ;; 24 symb));; 25 (defun part-of-speech-code (fv) (loop for i from 0 for pos across *part-of-speech* when (or (and (listp pos) (has-features-p fv pos)) (has-feature-p fv pos)) do (return-from part-of-speech-code i)) #+orig (let ((*package* (find-package :cgp))) (print fv) (warn "No pos code found for ~(~s~) (in CGP::PART-OF-SPEECH-CODE)." (code-features fv))) 0) #|| ;; Train a HMM corresponding to a unigram statistics (defun build-pos-bigram-hmm-from-correct-corpus (stream) (let ((vector (make-array 0 :adjustable t :fill-pointer 0)) (size (length *part-of-speech*)) (*cg* *nbo-cg*) (*tagger* *nbo-tagger*)) (with-stream-sentences (sentence stream) (map-tokens sentence (lambda (token) ;;(print token) (loop for reading in (token-features token) when (and reading (has-feature-p (cdr reading) ')) do (return (vector-push-extend (part-of-speech-code (cdr reading)) vector)))))) (print (map 'list (lambda (code) (aref *part-of-speech* code)) vector)) (hmm-train (coerce vector 'simple-vector) (hmm::make-random-hmm size size) :iterations 30 :debug t))) #+test (defparameter *nbo-pos-bigram-hmm* (let ((*cg* *nbo-cg*) (*tagger* *nbo-tagger*)) (with-open-file (stream "projects:cgp;training;delkorpbm-fork.cor" ;;"projects:cgp;training;test.cor" :direction :input) (print (build-pos-bigram-hmm-from-correct-corpus stream))))) #+test (print (map 'list (lambda (code) (let ((*cg* *nbo-cg*) (*tagger* *nbo-tagger*)) (aref *part-of-speech* code))) (hmm::hmm-generate *nbo-pos-bigram-hmm* 1000))) #+test (hmm::pretty-print-hmm *nbo-pos-bigram-hmm1*) ||# ;; mistakes in test corpus: r15 -> rl15, rl9 (defun %encode-features (&rest features) (let ((feature-code (make-array (code-vector-length) :element-type 'bit :initial-element 0))) (dolist (feature features) (let ((feature (case feature ;;(nøyt 'noeyt) (1 '\1) (2 '\2) (3 '\3) ;;(høflig 'hoeflig) ;;(ubøy 'uboey) ;;('@løs-np '@loes-np) (otherwise feature)))) (let* ((code (feature-code feature))) (when code (setf (sbit feature-code code) 1))))) feature-code)) ;;; bigram implementation from scratch (defun build-pos-bigram-tree-from-correct-corpus (stream) (let ((tree (dat::make-string-tree)) (*cg* *nbo-cg*) (*tagger* *nbo-tagger*) (bigram (make-string 3)) (word-frequency (dat::make-string-tree))) (with-stream-sentences (sentence stream) (setf (char bigram 1) #\Null (char bigram 2) #\Null) (map-tokens sentence (lambda (token) (loop for reading in (token-features token) when (and reading (has-feature-p (cdr reading) ')) do (setf (char bigram 0) (char bigram 1) (char bigram 1) (char bigram 2) (char bigram 2) (code-char (1+ (part-of-speech-code (cdr reading))))) (incf (dat::string-tree-get tree bigram 0)) (incf (dat::string-tree-get tree bigram 0 1)) (incf (dat::string-tree-get word-frequency (concat ;;(token-value token) (car reading) "#" (string (code-char (1+ (part-of-speech-code (cdr reading)))))) 0)) #+ignore ;; only last reading (return nil))))) (values tree word-frequency))) (defvar *nbo-bigram-tree*) (defvar *nbo-word-frequency-tree*) (let ((*cg* *nbo-cg*) (*tagger* *nbo-tagger*)) (with-open-file (stream "projects:cgp;training;delkorpbm-fork.cor" :external-format :iso-8859-1) (multiple-value-setq (*nbo-bigram-tree* *nbo-word-frequency-tree*) (build-pos-bigram-tree-from-correct-corpus stream)))) #+test (print *nbo-word-frequency-tree*) ;; destructively changes bv1 (defun feature-intersection (bv1 bv2) (loop for i from 0 for b1 across bv1 for b2 across bv2 when (and (not (zerop b1)) (zerop b2)) do (setf (bit bv1 i) 0)) bv1) #+test (print (feature-intersection #*0 #*1)) (defmethod sentence-filter-features ((sentence sentence) &key feature-filter &allow-other-keys) (when feature-filter (map-tokens sentence (lambda (token) (when (stringp (token-value token)) (loop for reading in (token-features token) when (cdr reading) do (feature-intersection (cdr reading) feature-filter)))))) sentence) (defmethod collapse-readings ((sentence sentence) &key &allow-other-keys) (map-tokens sentence (lambda (token) (unless (insignificant-p token) (collapse-readings token))))) (defmethod collapse-readings ((token token) &key &allow-other-keys) (let ((collapse-list '((mask fem);; . m/f) (ent fl)))) (with-slots (features) token (loop for (f . rest) on features when f do (loop for rf in rest with b1 and b2 when (and rf (string= (car f) (car rf))) when (block loop (loop for bit1 across (cdr f) for bit2 across (cdr rf) for i from 0 do (cond ((and (= bit1 1) (zerop bit2)) (when b1 (return-from loop)) (setf b1 i)) ((and (zerop bit1) (= bit2 1)) (when b2 (return-from loop)) (setf b2 i))) finally (return t))) when (and b1 b2) do (loop for cl in collapse-list when (or (and (= (feature-code (car cl)) b1) (= (feature-code (cadr cl)) b2)) (and (= (feature-code (car cl)) b2) (= (feature-code (cadr cl)) b1))) do (if (cddr cl) (setf (bit (cdr f) b1) 0 (bit (cdr f) b2) 0 (bit (cdr f) (feature-code (cddr cl))) 1) (setf (bit (cdr f) b1) 1 (bit (cdr f) b2) 1)) (setf (car rest) nil))))))) (defmethod sort-readings-by-ngram-frequency ((sentence sentence) &key total-disambiguate-p feature-filter collapse-readings-p &allow-other-keys) (let ((bigram (make-string 3 :initial-element #\Null))) (map-tokens sentence (lambda (token) (when (stringp (token-value token)) (setf (char bigram 0) (char bigram 1) (char bigram 1) (char bigram 2)) (let ((readings.frequencies (collecting (loop for reading in (token-features token) when reading do (setf (char bigram 2) (code-char (1+ (part-of-speech-code (cdr reading))))) (let ((bigram-freq (dat::string-tree-get *nbo-bigram-tree* bigram 0)) (unigram-freq (dat::string-tree-get *nbo-bigram-tree* bigram 0 1)) (w-pos-freq (dat::string-tree-get *nbo-word-frequency-tree* (concat (car reading) ;;(token-value token) "#" (string (code-char (1+ (part-of-speech-code (cdr reading)))))) 0))) (collect (list reading ;;(* bigram-freq (+ 1 (* 10 w-pos-freq))) ;; linear combination (+ (* 10 bigram-freq) unigram-freq (* 120 w-pos-freq)) bigram-freq unigram-freq w-pos-freq))))))) (let ((sorted-readings.frequencies (sort readings.frequencies #'> :key #'cadr))) #+debug(print (mapcar #'cdr sorted-readings.frequencies)) (let ((readings (if total-disambiguate-p (list (caar sorted-readings.frequencies)) (mapcar #'car sorted-readings.frequencies)))) (setf (char bigram 2) (if (token-features token) (code-char (1+ (part-of-speech-code (cdr (car readings))))) #\Null)) (setf (token-features token) readings))) #+ignore (when feature-filter (loop for reading in (token-features token) when (cdr reading) do (feature-intersection (cdr reading) feature-filter))))))) (when collapse-readings-p (collapse-readings sentence)) (apply-feature-filter sentence feature-filter) sentence)) #+test (disambiguate-from-string ;; "Dette er tre fisker som spiser fisk." "Hva gjør dere her?" ;;:tagging-niveau :multi-tagging :cg *nbo-cg* :print-function (lambda (s &rest rest) (sort-readings-by-ngram-frequency s :disambiguate-p t) (apply #'print-sentence s rest))) #|| (print *nbo-bigram-tree*) (print (hmm::generate-start-state *nbo-pos-bigram-hmm*)) (hmm::generate-symbol (print (hmm::generate-start-state *nbo-pos-bigram-hmm*)) *nbo-pos-bigram-hmm*) (defun file-to-vector (file) (with-open-file (stream file) (let* ((n (file-length stream)) (vector (make-array n))) (dotimes (i n vector) (let ((char (char-downcase (read-char stream)))) (setf (svref vector i) (case char (#\Space 0) (#\æ 27) (#\ø 28) (#\å 29) (otherwise (min (max 0 (- (char-code char) 96)) 26))))))))) (print (char-code #\z)) (defun markov-test (file &key (hmm (hmm::make-random-hmm 30 30)) (iterations 5) (debug t)) (let ((sequence (file-to-vector file))) (hmm-train sequence hmm :iterations iterations :debug debug))) (defparameter *hmm* (markov-test "hmmtext.txt")) (describe *hmm*) (hmm::test-hmm-maximal-path *hmm* 5) (multiple-value-bind (seq states) (hmm::hmm-generate *hmm* 1000) (print (map 'string (lambda (c) (cond ((zerop c) #\Space) ((= c 27) #\æ) ((= c 28) #\ø) ((= c 29) #\å) (t (code-char (+ 96 c))))) seq)) states) ||# :eof