;;;-*- Mode: Lisp; Package: REGEX -*- ;;; ;;; Regular expression parser. (in-package regex) #+ignore (eval-when (load compile eval) (use-package '(parser finite-state-automata)) (export '(compile-regexp string-match substring-match))) #+ignore (eval-when (load compile eval) (use-package '(:parser :common-lisp :common-lisp-user :fsa :fsa-standard :standard-states :standard-symbols :list-sets :skip-list :skip-list-relations ))) ;;====================== ;; finite state automata #|| ; bit-vectors don't work well with deterministic automata (declaim (inline simple-bit-vector-order)) (defun simple-bit-vector-order (bv1 bv2) (declare #.cl-user::*highly-optimized*) (dotimes (i (length bv1)) (declare (fixnum i)) (when (/= (sbit bv1 i) (sbit bv2 i)) (return-from simple-bit-vector-order (zerop (sbit bv1 i))))) :equal) ||# (defclass regexp-symbols () ()) (defmethod symbol-order-fn ((fsa regexp-symbols)) (lambda (s1 s2) (declare #.cl-user::*highly-optimized*) ;; sort chars before fixnums before strings before keywords (etypecase s1 (character (etypecase s2 (character (if (char= (the character s1) (the character s2)) :equal #+excl (and (char< (the character s1) (the character s2)) t) #-excl (char< (the character s1) (the character s2)))) ((or fixnum simple-string keyword) t))) (fixnum (etypecase s2 (character nil) (fixnum (if (= (the fixnum s1) (the fixnum s2)) :equal (< (the fixnum s1) (the fixnum s2)))) ((or simple-string keyword) t))) (simple-string (etypecase s2 ((or character fixnum) nil) (simple-string (if (string= s1 s2) :equal (and (string< s1 s2) t))) (keyword t))) (keyword (etypecase s2 ((or character fixnum simple-string) nil) (keyword (if (eq s1 s2) :equal (and (string< (symbol-name s1) (symbol-name s2)) t))))) #+ignore (simple-bit-vector (etypecase s2 ((or character fixnum simple-string) nil) (simple-bit-vector (simple-bit-vector-order s1 s2))))))) ;(inspect (make-instance 'regexp-fsa)) (defclass regexp-fsa (fixnum-states regexp-symbols skip-list-relations list-sets) ()) (defclass regexp-nfa (regexp-fsa nfa) ()) (defclass regexp-dfa (regexp-fsa dfa) ()) (defmethod make-nfa ((fsa regexp-fsa)) (make-instance 'regexp-nfa)) (defmethod make-dfa ((fsa regexp-fsa)) (make-instance 'regexp-dfa)) (defmacro child-edges (edge) `(parser::parse-edge-found ,edge)) (defmacro leaf-edge-p (edge) `(not (listp (child-edges ,edge)))) (defun edge-type (edge) (or (if (leaf-edge-p edge) (u:string-to-keyword (symbol-name (parser::parse-edge-rule edge))) ;(parser::parse-edge-rule edge) ) (parser::rule-type (parser::parse-edge-rule edge)) :skip)) (defmethod parse-regexp ((regexp string) &key (chart *regexp-chart*)) (let ((tokens (tokenize-string regexp chart :reverse-p (reverse-p chart)))) (initialize-chart chart tokens) (get-top-nodes chart 'expression))) ;;============================= ;; regular expressions compiler (defmethod dfa-compile-parsed-regexp ((chart regexp-chart) (fsa regexp-fsa)) ;;(let ((cache (make-hash-table :test 'equal))) (labels ((walk (edge) (let ((children (remove-if #'(lambda (e) (and (parser::parse-edge-p e) #|| #+mcl(ccl::structurep e) #+pcl(pcl::structurep e) #+allegro(excl::structurep e) #+lispworks(structurep e)||# (characterp (child-edges e)))) (child-edges edge)))) (case (edge-type edge) (:skip (walk (car children))) (:seq (minimize (apply #'fsa-concat (mapcar #'walk children)))) (:union (minimize (apply #'fsa-union (mapcar #'walk children)))) (:option (minimize (fsa-optional (walk (car children))))) (:closure (minimize (fsa-closure (walk (car children))))) (:plus (minimize (fsa-plus (walk (car children))))) #||(counter)||# (:str ; we make an arc for each char in the string (minimize (apply #'fsa-concat (map 'list #'(lambda (c) (fsa-symbol c fsa)) children)))) (:range (minimize (apply #'fsa-union (map 'list #'(lambda (c) (fsa-symbol c fsa)) (build-range children))))) (:@ (fsa-symbol (edge-type edge) fsa)) (otherwise (fsa-symbol children fsa)))))) (let ((top-nodes (get-top-nodes chart 'expression))) (when top-nodes (any-determinize (walk (car top-nodes))))))) (defmethod dfa-compile-parsed-regexp ((list-regexp list) (fsa regexp-fsa)) (labels ((walk (exp) (etypecase exp (list (ecase (car exp) (:seq (minimize (apply #'fsa-concat (mapcar #'walk (cdr exp))))) (:or (minimize (apply #'fsa-union (mapcar #'walk (cdr exp))))) (:? (assert (null (cddr exp))) (minimize (fsa-optional (walk (cadr exp))))) (:* (assert (null (cddr exp))) (minimize (fsa-closure (walk (cadr exp))))) (:+ (assert (null (cddr exp))) (minimize (fsa-plus (walk (cadr exp))))) (:range (minimize (apply #'fsa-union (map 'list #'(lambda (c) (fsa-symbol c fsa)) (build-range (cdr exp)))))))) (string (minimize (apply #'fsa-concat (map 'list #'(lambda (c) (fsa-symbol c fsa)) exp)))) (symbol (case exp (:@ (fsa-symbol exp fsa)) (otherwise (fsa-symbol exp fsa))))))) (any-determinize (walk list-regexp)))) #+test (dfa-compile-parsed-regexp '(:seq (:or : (:seq : (:? "n") "ing")) (:* (:seq (:? (:or "s" "e")) (:or "" (:seq "" (:? "n") "ing"))))) (make-instance 'regexp-fsa)) ;; this one is much faster (appx. 10 times on simple regexps) (defmethod compile-parsed-regexp ((chart regexp-chart) (fsa regexp-fsa) &key (determinize t)) (let* ((fsa (make-nfa fsa)) (start-state (fsa-start-state fsa)) (final-state (make-state fsa))) (set-insert final-state (fsa-final-states fsa)) (labels ((generate-state () (let ((new-state (make-state fsa)) (new-relation (fsa::make-symbol-relation fsa))) (setf (relation-get new-state (fsa-delta fsa)) new-relation) new-state)) (walk (edge start final) (let ((at-start-p (= start start-state)) (children (remove-if #'(lambda (e) (and (parser::parse-edge-p e) #|| #+mcl(ccl::structurep e) #+pcl(pcl::structurep e) #+allegro(excl::structurep e) #+lispworks(structurep e) ||# (characterp (child-edges e)))) (child-edges edge)))) (case (edge-type edge) (:skip (walk (car children) start final)) (:seq (let ((new-state start)) (loop for sublist on children do (if (cdr sublist) (let ((new-new-state (generate-state))) (walk (car sublist) new-state new-new-state) (setq new-state new-new-state)) (walk (car sublist) new-state final))))) (:union (dolist (subtree children) (walk subtree start final))) (:option (add-transition fsa start final *epsilon*) (walk (car children) start final)) (:closure (let ((new-state (if at-start-p start (generate-state)))) (unless at-start-p (add-transition fsa start new-state *epsilon*)) (add-transition fsa new-state final *epsilon*) (walk (car children) new-state new-state))) (:plus (let ((new-start-state (if at-start-p start (generate-state))) (new-final-state (generate-state))) (unless at-start-p (add-transition fsa start new-start-state *epsilon*)) (add-transition fsa new-final-state final *epsilon*) (add-transition fsa new-final-state new-start-state *epsilon*) (walk (car children) new-start-state new-final-state))) (:str ; we make an arc for each char in the string (let* ((string children) (length (length string)) (new-state start)) (loop for i from 0 to (- length 2) do (let ((new-new-state (generate-state))) (compile-leaf fsa 'char (char string i) new-state new-new-state) (setq new-state new-new-state))) (compile-leaf fsa 'char (char string (1- length)) new-state final))) (:range (dolist (c (build-range children)) (compile-leaf fsa 'char c start final))) (otherwise (compile-leaf fsa (edge-type edge) children start final)))))) (let ((top-nodes (get-top-nodes chart 'expression))) (when top-nodes (walk (car top-nodes) start-state final-state) (if determinize (any-determinize fsa) fsa)))))) ;(compile-regexp "(12*)" :determinize t) (defmethod add-transition ((fsa regexp-fsa) state next-state symbol) (with-slots (fsa-delta) fsa (let ((symbol-states-rel (or (relation-get state fsa-delta) (setf (relation-get state fsa-delta) (fsa::make-symbol-relation fsa))))) (set-insert next-state (or (relation-get symbol symbol-states-rel) (setf (relation-get symbol symbol-states-rel) (fsa::make-symbol-set fsa))))))) (defmethod compile-leaf ((fsa regexp-fsa) cat leaf start final) (let ((label (ecase cat ((char str) leaf) (:template (u:string-to-keyword (parser::parse-edge-found (car leaf))) ;(utils:string-to-keyword leaf) ) (:@ :@) (:range (build-range leaf))))) (add-transition fsa start final label))) #|| (time (defparameter *fsa* (compile-regexp " [01]?[0-9]\\.([01]?[0-9]\\.|(jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|des)\\.?)(19)?[0-9][0-9][ \\.\\,\\;\\:]"))) ||# (defmethod compile-regexp ((regexp string) &key intermediate-det ;(goal 'S) (chart *regexp-chart*) (determinize t) (fsa-class 'regexp-fsa)) (let ((tokens (tokenize-string regexp chart :reverse-p (reverse-p chart)))) (initialize-chart chart tokens) (or (if intermediate-det (dfa-compile-parsed-regexp chart (make-instance fsa-class)) (compile-parsed-regexp chart (make-instance fsa-class) :determinize determinize)) (error "the regular expression <~a> did not compile" regexp)))) #|| (defparameter *fsa* (any-determinize (compile-regexp "(.*|wu)tz"))) (fsa::fsa-print *fsa*) (in-language-p "atz" *fsa*) (in-language-p "wollalulflloho" *fsa*) (defparameter *fsa* (compile-regexp "(jaja|trull)+")) ; 600, 378, 367, 355, 450, class state: 493, more than 1 final-state: 645 ; dfa: 273; 340; lw: 120 (time (dotimes ( i 100) (substring-match *fsa* "wollwutzriguswollwutzwollwutzwollwutzwollwutzrigusrigus jaja " :minimal nil))) ||# (defun build-range (edge-list) (labels ((string-to-char-list (str) (coerce str 'list)) (range-to-char-list (start end) (let ((start-char (char start 0)) (end-char (char end 0))) (if (char< end-char start-char) (error "error in range") (u:collecting (loop for i from (char-code start-char) to (char-code end-char) do (u:collect (code-char i))))))) (build (edges) (ecase (length edges) ; keep in mind that rules are inverted! (1 (coerce (child-edges (car edges)) 'list)) (2 (coerce (child-edges (cadr edges)) 'list) (build (child-edges (car edges)))) (3 (range-to-char-list (child-edges (car edges)) (child-edges (caddr edges)))) (4 (range-to-char-list (child-edges (cadr edges)) (child-edges (cadddr edges))) (build (child-edges (car edges))))))) (build (child-edges (car edge-list))))) #|| (defun build-range (edge-list) (let ((range-vector (make-array 256 :element-type 'bit :initial-element 0))) (labels ((string-to-range-vector (str) (loop for c across str do (setf (sbit range-vector (char-code c)) 1))) (range-to-range-vector (start end) (let ((start-char (char start 0)) (end-char (char end 0))) (if (char< end-char start-char) (error "error in range") (loop for i from (char-code start-char) to (char-code end-char) do (setf (sbit range-vector i) 1))))) (build (edges) (ecase (length edges) ; keep in mind that rules are inverted! (1 (string-to-range-vector (child-edges (car edges)))) (2 (string-to-range-vector (child-edges (cadr edges))) (build (child-edges (car edges)))) (3 (range-to-range-vector (child-edges (car edges)) (child-edges (caddr edges)))) (4 (range-to-range-vector (child-edges (cadr edges)) (child-edges (cadddr edges))) (build (child-edges (car edges))))))) (build (child-edges (car edge-list))) range-vector))) ||# ;(compile-regexp "(a[bc-d])") #|| ; 54 ms with big grammar and intermediate tree build ; 31 ms with smaller grammar and no tree build ; 17 ms without extended-lists ; 11 ms in reversed order ; 20 ms with one-char-strings (cfg-chart-parse "(((woll){3,5}|)?[1a-rt]+o\\+ho)*" :chart *regexp-chart* :goal 'expression) (tokenize-string "(((woll){3,5}|)?[1a-rt]+o\\+ho)*" *regexp-chart*) ; 27 ms (print-fsa (compile-regexp "(((woll){3,5}|)?[1a-rt]+o\\+ho)*")) (fsa::fsa-print (compile-regexp "(((woll)|rigus)?.1a\\-rt+o\\+ho)*")) (print-fsa (compile-regexp "((woll|)?rt+o\\+ho)*")) (fsa::fsa-print (compile-regexp "woll*" :chart *regexp-chart*)) (defparameter *fsa* (compile-regexp "(woll.?)+")) (fsa::fsa-print *fsa*) (defparameter *fsa* (compile-regexp "[a-zA-Z]+.3\\*.?")) (time (dotimes (i 100) (string-match *fsa* "aaaaaahadddddd43*r"))) (get-matching-substring *fsa* "aaaaaahadddddd3*") (string-match *fsa* "aaaaaahadddddd43*r") ||# (defmethod any-determinize ((fsa regexp-dfa)) (let ((nfa (fsa::copy-to-nfa fsa))) (any-determinize nfa))) (defmethod any-determinize ((nfa regexp-nfa)) (relation-map #'(lambda (state relation) (declare (ignore state)) (let ((any-dest (relation-get :@ relation))) (when any-dest (relation-map #'(lambda (key value) (unless (eq key :@) (set-map #'(lambda (dest) (set-insert dest value)) any-dest))) relation)))) (fsa-delta nfa)) (minimize nfa)) (defun regexp-delta-get (symbol state delta) (let ((relation (relation-get state delta))) (when relation (or (relation-get symbol relation) (relation-get :@ relation))))) (defmethod in-language-p (vector (fsa regexp-dfa)) (do ((delta (fsa-delta fsa)) (state (fsa-start-state fsa) (regexp-delta-get (aref vector i) state delta)) (i 0 (1+ i))) ((or (= i (length vector)) (null state)) (and state (set-member-p state (fsa-final-states fsa)))))) #+old (defmethod string-match ((fsa regexp-dfa) (string string) &key minimal exact) "If exact = t returns t if there is an exact match, and if MINIMAL = T only if the exact match is minimal. If EXACT = NIL returns all matching endpositions, or only the first one if MINIMAL = T" (let ((end ())) (with-slots (fsa-start-state fsa-final-states fsa-delta) fsa (loop with state = fsa-start-state and i = 0 when (set-member-p state fsa-final-states) do (if exact (let ((success (= i (length string)))) (when (or minimal success) (return-from string-match success))) (push i end)) until (or (= i (length string)) (null state) (and minimal end)) do (setf state (delta-get (char string i) state fsa-delta)) (incf i))) end)) (defmethod string-match ((fsa regexp-dfa) (string string) &key minimal maximal exact &allow-other-keys) "If exact = t returns t if there is an exact match, and if MINIMAL/MAXIMAL = T only if the exact match is minimal/maximal. If EXACT = NIL returns all matching endpositions, only the first one if MINIMAL = T, only the last one if MAXIMAL = T" (declare #.cl-user::*highly-optimized*) (let ((end ())) (with-slots (fsa-start-state fsa-final-states fsa-delta) fsa (loop with state fixnum = fsa-start-state and i fixnum = 0 when (and state (set-member-p state fsa-final-states)) do (cond (exact (let ((success (= i (length string)))) (when (or minimal success) (return-from string-match success)))) (maximal (if (and minimal end) (return-from string-match) (setf end i))) (minimal (setf end i)) (t (push i end))) until (or (= i (length string)) (null state) (and minimal (not maximal) end)) do (setf state (regexp-delta-get (char string i) state fsa-delta)) (incf i))) end)) (defmethod substring-match ((fsa regexp-dfa) (string string) &key minimal (start 0)) (declare (fixnum start)) (with-slots (fsa-start-state fsa-final-states fsa-delta) fsa (let ((end nil)) (loop do (loop with state = fsa-start-state and i = start when (and state (set-member-p state fsa-final-states)) do (setq end i) until (or (= i (length string)) (null state) (and minimal end)) do (setf state (regexp-delta-get (char string i) state fsa-delta)) (incf i)) until (or end (= start (length string))) do (incf start)) (when end (values start end))))) #|| (defparameter *fsa* (compile-regexp "\\((~|¢).*; ?(~|¢).*\\)")) (string-match *fsa* "(~a;~b) )" :exact nil :minimal nil) (defparameter *fsa* (compile-regexp "(jaja|trull)+")) (fsa::fsa-print *fsa*) ; 600, 378, 367, 355, 392, 480; dfa: 311; lw: 140 (time (dotimes ( i 1000) (substring-match *fsa* "wollwutzriguswollwutzwollwutzwollwutzwollwutzrigusrigus jaja " :minimal nil))) (defparameter *fsa* (compile-regexp "(woll.?|wutz[i4-9o])+")) (get-matching-substring *fsa* "ja diese wollowollwollwutz5rigus wolllllllllllllelein oho") (time (defparameter *fsa* (compile-regexp " [01]?[0-9]\\.([01]?[0-9]\\.|(jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|des)\\.?)(19)?[0-9][0-9][ \\.\\,\\;\\:]"))) (tokenize-string "[a-zA-Z]+.3\\*" *regexp-chart*) (tokenize-string " [01]?[0-9]\\.([01]?[0-9]\\.|(jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|des)\\.?)(19)?[0-9][0-9][ \\.\\,\\;\\:]" *regexp-chart*) (get-matching-substring *fsa* "aaaaaahadddddd3*45*") (substring-match *fsa* "aaaaaahadddddd3*") (fsa::fsa-print *fsa*) (get-matching-substring *fsa* "am 1.feb.1998 da hat das Wollwutzerl Geburtstag.") ||# #|| ; without backtracking (defmethod substring-match ((fsa fsa) (stream stream) &key minimal (start 0)) (let ((end nil) (i 0)) (multiple-value-bind (reader-function value) (stream-reader stream) ;(locally (declare (optimize (speed 3))) (loop do (setf start i) until (do ((states (epsilon-closure (fsa-start-state fsa) fsa) (extended-delta char states fsa)) (char (progn (incf i) (funcall reader-function value)) (progn (incf i) (funcall reader-function value)))) ((or (not char) (null states) (and minimal end)) (when (not char) (return-from substring-match end)) end) (when (intersection (fsa-final-states fsa) states) (setf end (1- i))))) (when end (values start end))))) ||# (defmethod get-substring-match ((fsa fsa) (string string) &key minimal) (multiple-value-bind (start end) (substring-match fsa string :minimal minimal) (when start (subseq string start end)))) #|| (defmethod get-substring-match ((fsa fsa) (stream stream) &key minimal) (let ((found nil) (string nil)) (multiple-value-bind (reader-function value) (stream-reader stream) ;(locally (declare (optimize (speed 3))) (loop do (setf string (with-output-to-string (string-stream) (do ((states (epsilon-closure (fsa-start-state fsa) fsa) (extended-delta char states fsa)) (char (funcall reader-function value) (funcall reader-function value))) ((or (not char) (null states) (and minimal found)) (unread-char char stream) (when (not char) (return-from get-substring-match :end))) (write-char char string-stream) (if (intersection (fsa-final-states fsa) states) (setf found t))))) until found) (when string (subseq string 0 (1- (length string))))))) ||# :eof