;; -*- Mode: lisp; Syntax: ansi-common-lisp; Package: XLE; Base: 10; Readtable: augmented-readtable -*- ;;;; XLE-Web users are granted the rights to distribute and use this software ;;;; as governed by the terms of the Lisp Lesser GNU Public License ;;;; (http://opensource.franz.com/preamble.html), also known as the LLGPL. ;;;; Author: Paul Meurer (paul.meurer@aksis.uib.no) ;;;; Aksis, Unifob, University of Bergen, Norway ;;;; http://www.aksis.uib.no (in-package :XLE) ;; struct 'RegExp { (define-foreign-type RegExp ;; RegExpType type; (type :unsigned-int) ;; int size; (size :int) ;; int char_pos; /* the left character position of an RE_LABEL */ (char-pos :int) ;; int16 label; /* used by RE_LABEL */ (label :short) ;; RegExp *op1; /* used by everything but RE_LABEL */ (op1 :unsigned-long #+orig(* RegExp)) ;; RegExp *op2; /* used by RE_SEQUENCE & RE_ALTSET */ (op2 :unsigned-long #+orig(* RegExp)) ;;}; ) ;; RegExp *re_option(RegExp *re1, HEAPptr heap); (define-foreign-function "re_option" ((re1 :unsigned-long) (heap :unsigned-long) ) :unsigned-long :lisp-name re-option :module +xle-module-path+) ;; RegExp *re_altset(RegExp *re1, RegExp *re2, HEAPptr heap); (define-foreign-function "re_altset" ((re1 :unsigned-long) (re2 :unsigned-long) (heap :unsigned-long) ) :unsigned-long :lisp-name re-altset :module +xle-module-path+) ;; RegExp *re_sequence(RegExp *re1, RegExp *re2, HEAPptr heap); (define-foreign-function "re_sequence" ((re1 :unsigned-long) (re2 :unsigned-long) (heap :unsigned-long) ) :unsigned-long :lisp-name re-sequence :module +xle-module-path+) ;; RegExp *re_zerostar(RegExp *re1, HEAPptr heap); (define-foreign-function "re_zerostar" ((re1 :unsigned-long) (heap :unsigned-long) ) :unsigned-long :lisp-name re-zerostar :module +xle-module-path+) ;; RegExp *re_symbol_pair(char *upper, char *lower, HEAPptr heap); ;; /* Create a regular expression that represents the upper and */ ;; /* lower symbols of a two-level pair. If upper = lower, then */ ;; /* this is equivalent to a one-level symbol. Epsilon is */ ;; /* represented by NULL or "", not "0". When re_symbol_pair */ ;; /* is being used to produce output for analyze_to_regexp in */ ;; /* a grammar library, upper and lower should be the same. */ (define-foreign-function "re_symbol_pair" ((upper #+sbcl(sb-alien::c-string :external-format :iso-8859-1) #+allegro string-ptr) (lower #+sbcl(sb-alien::c-string :external-format :iso-8859-1) #+allegro string-ptr) (heap :unsigned-long) ) :unsigned-long :lisp-name %re-symbol-pair :module +xle-module-path+) (define-foreign-function "re_symbol_pair_pos" ((upper #+sbcl(sb-alien::c-string :external-format :iso-8859-1) #+allegro string-ptr) (lower #+sbcl(sb-alien::c-string :external-format :iso-8859-1) #+allegro string-ptr) ;;(upper :unsigned-long) ;;(lower :unsigned-long) (pos :int) (heap :unsigned-long) ) :unsigned-long :lisp-name %re-symbol-pair-pos :module +xle-module-path+) ;; used?? (defmacro %with-cstr ((ptr str) &body body) `(let ((,ptr #+allegro(excl::string-to-char* ,str) #+sbcl(string-to-char* ,str))) (unwind-protect (progn ,@body) #+ignore (excl:aclfree ,ptr)))) (defvar *allocated-strings*) #+test (print (string-to-char* "upper")) (defmacro with-allocated-strings (() &body body) (with-gensyms (ptr) `(let ((*allocated-strings* ())) (unwind-protect (progn ,@body) (dolist (,ptr *allocated-strings*) #+allegro (excl:aclfree ,ptr) #+sbcl (sb-alien:free-alien ,ptr)))))) #+sbcl (defun string-to-char* (str) (print (list :str str)) (let ((ptr (sb-alien:make-alien #+ignore char (sb-alien::unsigned 8) (1+ (length str))) )) (loop for i from 0 for c across str do (setf (sb-alien:deref ptr i) (if (< (char-code c) 1128) (char-code c) (- 255 (char-code c))))) (setf (sb-alien:deref ptr (length str)) 0) (print (list :ptr ptr)) ptr)) #+test (with-allocated-strings () (let* ( (heap (init-heap 124 100 "RegExp")) (re1 (%re-symbol-pair-pos "füfö" "fifix" 0 heap)) (re2 (re-symbol-pair "lolo" "lolox" heap)) (regexp ;;(re-symbol-pair "fifi" "fifix" heap) (re-sequence re1 re2 heap))) (print (list re1 re2 regexp)) (print :hier) (print (sb-alien::sap-alien (sb-alien::int-sap regexp) RegExp)) (print (foreign-slot regexp RegExp type)) (print-net-as-regexp (regexp-to-fsm regexp) 0 0 1) (free-heap heap))) (defun re-symbol-pair (upper lower heap) #+allegro (let ((upper-ptr #+allegro(excl::string-to-char* upper) #+sbcl(string-to-char* upper)) (lower-ptr #+allegro(excl::string-to-char* lower) #+sbcl(string-to-char* lower))) (push upper-ptr *allocated-strings*) (push lower-ptr *allocated-strings*) (%re-symbol-pair upper-ptr lower-ptr heap)) #+sbcl (%re-symbol-pair upper lower heap)) (defun re-symbol-pair-pos (upper lower pos heap) ;;(print (list upper lower pos)) #+allegro (let ((upper-ptr #+allegro(excl::string-to-char* upper) #+sbcl(string-to-char* upper)) (lower-ptr #+allegro(excl::string-to-char* lower) #+sbcl(string-to-char* lower))) (push upper-ptr *allocated-strings*) (push lower-ptr *allocated-strings*) (if *use-parse-regexp-p* (%re-symbol-pair-pos upper-ptr lower-ptr pos heap) (%re-symbol-pair upper-ptr lower-ptr heap))) #+sbcl (if *use-parse-regexp-p* (%re-symbol-pair-pos upper lower pos heap) (%re-symbol-pair upper lower heap))) ;; HEAPptr init_heap(size_t cell_size, size_t block_size, char *name); (define-foreign-function "init_heap" ((cell-size :unsigned-int) (block-size :unsigned-int) (name string-ptr) ) :unsigned-long :lisp-name init-heap :module +xle-module-path+) ;; void free_heap(HEAPptr heap); (define-foreign-function "free_heap" ((heap :unsigned-long) ) #+allegro :void #+sbcl ffc::void :lisp-name free-heap :module +xle-module-path+) ;; void reset_heap(HEAPptr heap); (define-foreign-function "reset_heap" ((heap :unsigned-long) ) #+allegro :void #+sbcl ffc::void :lisp-name reset-heap :module +xle-module-path+) ;; Graph *parse_regexp(Chart *parser, RegExp *regexp, char *root_cat, ;; int no_morphology); ;; /* Parse a regular expression (as defined in regexp.h). If */ ;; /* no_morphology, then the regexp should represent the two-level */ ;; /* output that the morphology would normally produce. Use the */ ;; /* Tcl function "morphemes {...} twolevel" to determine what */ ;; /* symbol pairs are the output of the morphology. */ (define-foreign-function "parse_regexp" ((parser :unsigned-long) (regexp :unsigned-long) (root-cat string-ptr) (no-morphology :int) ) :unsigned-long :lisp-name parse-regexp :module +xle-module-path+) ;; NETptr regexp_to_fsm(RegExp *regexp); (define-foreign-function "regexp_to_fsm" ((regexp :unsigned-long) ) :unsigned-long :lisp-name regexp-to-fsm :module +xle-module-path+) (defun morph-server::build-morph-seq-re (heap lower upper positions &optional regexp) (cond ((cdr lower) (re-sequence (re-symbol-pair-pos (car lower) (car upper) (or (car positions) 0) heap) (morph-server::build-morph-seq-re heap (cdr lower) (cdr upper) (cdr positions) regexp) heap)) (regexp (re-sequence (re-symbol-pair-pos (car lower) (car upper) (or (car positions) 0) heap) regexp heap)) (t (re-symbol-pair-pos (car lower) (car upper) (or (car positions) 0) heap)))) (defun morph-server::build-morph-or-re (heap re1 re2 &optional regexp) (if regexp (re-sequence (re-altset re1 re2 heap) regexp heap) (re-altset re1 re2 heap))) #+test (let ((heap (init-heap 96 100 "RegExp"))) (print (morph-server::norgram-morphology-regexp "sov" nil :heap heap :regexp (re-symbol-pair "" "@" heap)))) #+test (with-allocated-strings () (let ((heap (init-heap 1240 2000 "RegExp"))) (print (re-symbol-pair-pos "fifi" "fifix" 0 heap)))) #+test (with-allocated-strings () (let ((heap (init-heap 1240 2000 "RegExp"))) (print (re-symbol-pair "fifi" "fifix" heap)))) ;; (format nil "~{[~a]~}" (mapcar #'analyzed-token-regexp analyzed-sentence-list)) #+test (with-allocated-strings () (build-morph-regexp "M/B Bitihorn")) ;; simpler (defun build-morph-regexp (sentence &key cg-preparse-p) (let* ((heap #+orig(init-heap 96 100 "RegExp") (init-heap 128 100 "RegExp") ) ;; not sure how to determine exact size of RegExp (segments (dat:make-string-tree)) (tokenized-list (morph-server::norgram-tokenize-sentence sentence :segments segments :preparse-p cg-preparse-p :heap t))) #+debug(format t "~&tokenized-list: ~s~%" tokenized-list) (labels ((segment-regexp (seg-list &key regexp inner-boundary segment-type) #+debug(print (list :seg-list seg-list)) (when seg-list (destructuring-bind (str pos &optional id dcp next-hyphen-p unanalized-hyphen-p) (car seg-list) (declare (ignore unanalized-hyphen-p)) (let* ((length (when id (length (dat::string-tree-get segments str)))) (segment-end-pos (if pos (+ pos 1 (length str)) 0)) (morph-regexp (morph-server::norgram-morphology-regexp (if (and id (> length 1)) (format nil "~a\\~d" str id) str) segments :segment-type segment-type :pos (when pos (1+ pos)) :heap heap :add-+token-p t))) (cond ((cdr seg-list) (re-sequence morph-regexp (re-sequence (re-symbol-pair-pos "" "TB" segment-end-pos heap) (segment-regexp (cdr seg-list) :regexp regexp :inner-boundary inner-boundary :segment-type (case segment-type ((:first :middle) (if (cddr seg-list) :middle :last)) ((:first-hyphen :middle-hyphen) (if (cddr seg-list) :middle-hyphen :last-hyphen)) (otherwise segment-type))) heap) heap)) (regexp (re-sequence morph-regexp (re-sequence (re-symbol-pair-pos "" "TB" segment-end-pos heap) regexp heap) heap)) (t ;; morph-regexp (re-sequence morph-regexp (re-symbol-pair-pos "" "TB" segment-end-pos heap) heap) )))))) (seq-regexp (dis &optional regexp) (let* ((rest-token-regexp (segment-regexp (cdr dis))) (first-token-regexp (cond ((cadddr (cddr (caar dis))) ;; unanalyzed hyphenated compound (ugly cddrerrarardar!) (segment-regexp (car dis) :regexp rest-token-regexp) ;;"~{~a~^ ~}" ) ((cadddr (cdaar dis)) (segment-regexp (car dis) :regexp rest-token-regexp :segment-type (when (cdar dis) :first-hyphen)) ;;"~{~a~^+~}-" ) (t (segment-regexp (car dis) :regexp rest-token-regexp :segment-type (when (cdar dis) :first)) ;;"~{~a~^+~}" )))) first-token-regexp)) (analyzed-token-regexp (disjunction-list) (if (cdr disjunction-list) (re-altset (seq-regexp (car disjunction-list)) (analyzed-token-regexp (cdr disjunction-list)) heap) (seq-regexp (car disjunction-list)))) (sentence-regexp (tokenized-list) (if (cdr tokenized-list) (re-sequence (analyzed-token-regexp (car tokenized-list)) (sentence-regexp (cdr tokenized-list)) heap) (analyzed-token-regexp (car tokenized-list))))) (let* ((regexp (if tokenized-list (sentence-regexp tokenized-list) (re-symbol-pair-pos "" "TB" 0 heap)))) #+debug(print-net-as-regexp (regexp-to-fsm regexp) 0 0 1) (values regexp heap))))) :eof