;;;-*- Mode: Lisp; Package: PARSER -*- (in-package :parser) (eval-when (compile load eval) #+ignore (um:use-module :context-free-parser "projects:parser;context-free-parser") (export '(expression regexp-chart *regexp-chart* tokenize-string reverse-p )) (setf *super-optimize-primary-slot-access* t)) ;;========================== ;; regular expression parser (defclass regexp-chart (cfg-chart) ()) #| (defmethod initialize-chart ((chart regexp-chart) (regexp string)) ; reset and adjust the edge arrays (let ((array-length (array-dimension (active-edges chart) 0)) (regexp-length (length regexp))) (when (>= regexp-length array-length) (setf (active-edges chart) (adjust-array (active-edges chart) (1+ regexp-length))) (setf (inactive-edges chart) (adjust-array (inactive-edges chart) (1+ regexp-length)))) (loop for i from 0 to (max regexp-length (1- array-length)) do (setf (aref (active-edges chart) i) nil (aref (inactive-edges chart) i) nil))) ; initialize with lexical edges (let ((vertex 0)) (loop with i = (1- (length regexp)) while (>= i 0) do (let* ((char (char regexp i)) (fun (lexical-cat-fn chart)) (cat (cond ((char= char #\>) ( (setf cat 'template)) ((and (not (zerop i)) (char= (char regexp (1- i)) #\\)) (decf i) (funcall fun char t)) (t (funcall fun char)))) (dolist (char regexp) (add-lexical-edges chart char ;(print (gethash word (lexicon chart))) (funcall (lexical-cat-fn chart) char) vertex) (incf vertex))) (setf (string-length chart) (length string))) |# (declaim (inline number-char-p)) (defun number-char-p (c) (char<= #\0 c #\9)) (defun regexp-cat (c) (cond ((not (stringp c)) (list c)) ((not (find-if-not #'number-char-p c)) '(str num)) (t '(str)))) (defparameter *regexp-chart* (make-instance 'regexp-chart :lexical-cat-fn #'regexp-cat)) #| (defmethod cfg-chart-parse ((regexp string) &key (goal 'expression) (chart *regexp-chart*) (display t) (tokenize t)) "parses a regular expression given as a string" (let ((tokens (if tokenize (tokenize-string regexp chart :reverse-p (reverse-p chart)) regexp))) (time (initialize-chart chart tokens)) (display-parse chart :display display :goal goal :title regexp))) |# #| ; not needed (defmethod build-tree ((chart regexp-chart) goal) (labels ((build (edge) (let ((subnodes (parse-edge-found edge))) (if (listp subnodes) (cons (or (rule-type (parse-edge-rule edge)) (car (rule-flat-rule (parse-edge-rule edge)))) (nreverse (mapcar #'build subnodes))) (cons (string (parse-edge-rule edge)) (string (parse-edge-found edge))))))) (mapcar #'build (get-top-nodes chart goal)))) |# ;;; initialize the chart (bottom-up version) #| (defmethod tokenize-string (regexp (chart regexp-chart) &key) (loop for c across regexp collect (string c))) |# (defun regexp-char-to-symbol (c) (if (find c "+*?|()[]{}<>,-^.") c (string c))) ; this is very regexp-specific and rather opaque ; maybe we should parse from right to left; which is more natural for regexps (defmethod tokenize-string (regexp (chart regexp-chart) &key (reverse-p nil)) (let ((pos 0) (tokens ()) (next-single? nil) (escaped? nil) (in-template? nil) (escape-positions ())) (loop for i from 0 to (1- (length regexp)) do (let ((char (char regexp i))) (cond (escaped? ; treat it as a nonspecial character (setf escaped? nil)) ((and (not next-single?) (or (and in-template? (char/= char #\>)) #+digitool(ccl::xalphanumericp char) #-digitool(alphanumericp char)) ) ; is alphanumericp buggy? nil) ((char= char #\\) ; escape char ; afterwards we have to remove escape chars; store their pos (push (- i pos) escape-positions) (setf escaped? t)) ((and in-template? (char/= char #\>)) nil) (t (cond ((and (not in-template?) (find char "-+*?{<")) ; those with higher precedence than seq (setf next-single? (char= char #\-) in-template? (char= char #\<)) (when (< pos (1- i)) (let ((substr (subseq regexp pos (1- i)))) ; remove escape chars (reverse order is important!) (dolist (pos escape-positions) (setf substr (delete #\\ substr :start pos :end (1+ pos)))) (setf escape-positions ()) (unless (zerop (length substr)) (push substr tokens)))) (when (< pos i) (push (subseq regexp (1- i) i) tokens))) (t (setq in-template? nil next-single? nil) (when (< pos i) (let ((substr (subseq regexp pos i))) (dolist (pos escape-positions) (setf substr (delete #\\ substr :start pos :end (1+ pos)))) (setf escape-positions ()) (unless (zerop (length substr)) (push substr tokens)))))) (push (regexp-char-to-symbol (char regexp i)) tokens) (setf pos (1+ i) ;in-template? nil )))) finally (when (< pos (length regexp)) (let ((substr (subseq regexp pos))) (dolist (pos escape-positions) (setf substr (delete #\\ substr :start pos :end (1+ pos)))) (unless (zerop (length substr)) (push substr tokens))))) (if reverse-p tokens (nreverse tokens)))) #| (defmethod tokenize-string (regexp (chart regexp-chart) &key (reverse-p nil)) (let ((escaped-p nil) (tokens ())) (loop for i from 0 to (1- (length regexp)) do (let ((char (char regexp i))) (cond ((or escaped-p (ccl::xalphanumericp char)) (setf escaped-p nil) (push (string char) tokens)) ((char= char #\\) (setf escaped-p t)) (t (push char tokens))))) (if reverse-p tokens (nreverse tokens)))) |# #| (cfg-chart-parse "(d)" :chart *regexp-chart*) (cfg-chart-parse "(woll){3}" :chart *regexp-chart*) (cfg-chart-parse "(((woll){3,5}|)?[1a-rt]+o\\+ho)*" :chart *regexp-chart* :goal 'expression) (cfg-chart-parse "((wollwutz)|oho)*" :chart *regexp-chart*) (cfg-chart-parse "(((woll)|rigus)?lull+oho)*" :chart *regexp-chart*) (tokenize-string "(((woll){3,5}|<@rigus>)?[1a-rt]+o\\+ho)*" *regexp-chart*) (tokenize-string "(<@num>)?" *regexp-chart*) (tokenize-string "(num)?" *regexp-chart*) (defparameter *fsa* (compile-regexp "(((woll)|rigus)?lull+oho)*" :chart *regexp-chart*)) (defparameter *fsa* (compile-regexp "(((woll){3,5}|)?[1a-rt]+o\\+ho)*" :chart *regexp-chart*)) |# (defconstant $any :@) (defmethod build-cfg-grammar ((chart regexp-chart) rules-list &key reverse-p) (with-slots (grammar) chart (setf grammar nil (reverse-p chart) reverse-p) (dolist (rule rules-list) (let ((flat-rule (if reverse-p (cons (car rule) (reverse (cadr rule))) (cons (car rule) (cadr rule))))) (push (make-rule :flat-rule flat-rule ;:applicable-rules nil :function (nth 2 rule) :type (nth 2 rule)) grammar))) (dolist (rule grammar) (dolist (r grammar) (when (eq (cadr (rule-flat-rule r)) (car (rule-flat-rule rule))) (push r (rule-applicable-rules rule)))))) t) (build-cfg-grammar *regexp-chart* '((expression (sequence #\| expression) :union) (expression (sequence)) (sequence (sequence term) :seq) (sequence (term) :seq) (term (#\( expression #\))) (term (term #\*) :closure) (term (term #\+) :plus) (term (term #\?) :option) (term (#\.) :@) (term (term #\{ counter #\}) :counter) (term (#\[ range #\]) :range) (term (str)) (term (#\< str #\>) :template) (counter (num) :count-exact) (counter (num #\,) :count-lower-bound) (counter (#\, num) :count-upper-bound) (counter (num #\, num) :count-both-bounds) (range (range str)) (range (range str #\- str)) (range (str)) (range (str #\- str))) :reverse-p t) #| (build-cfg-grammar *regexp-chart* '((expression sequence [#\| expression]) (sequence term [sequence]) (term #\( expression #\)) (term term #\*) (term term #\+) (term term #\?) (term term #\{ iteration #\}) (term #\[ range #\]) (term str) (term #\< str #\>) (iteration num [#\, num]) (range str [#\- str] [range]))) |#