;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- ;;; $Header$ ;;; This file defines the REGEX class and some utility methods for ;;; this class. REGEX objects are used to represent the (transformed) ;;; parse trees internally ;;; Copyright (c) 2002-2004, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions ;;; are met: ;;; * Redistributions of source code must retain the above copyright ;;; notice, this list of conditions and the following disclaimer. ;;; * Redistributions in binary form must reproduce the above ;;; copyright notice, this list of conditions and the following ;;; disclaimer in the documentation and/or other materials ;;; provided with the distribution. ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (in-package #:cl-ppcre) ;; Genera need the eval-when, here, or the types created by the class ;; definitions aren't seen by the typep calls later in the file. (eval-when (:compile-toplevel :load-toplevel :execute) (locally (declare (optimize speed (safety 0) (space 0) (debug 0) (compilation-speed 0) #+:lispworks (hcl:fixnum-safety 0))) (defclass regex () () (:documentation "The REGEX base class. All other classes inherit from this one.")) (defclass seq (regex) ((elements :initarg :elements :accessor elements :type cons :documentation "A list of REGEX objects.")) (:documentation "SEQ objects represents sequences of regexes. (Like \"ab\" is the sequence of \"a\" and \"b\".)")) (defclass alternation (regex) ((choices :initarg :choices :accessor choices :type cons :documentation "A list of REGEX objects")) (:documentation "ALTERNATION objects represent alternations of regexes. (Like \"a|b\" ist the alternation of \"a\" or \"b\".)")) (defclass lookahead (regex) ((regex :initarg :regex :accessor regex :documentation "The REGEX object we're checking.") (positivep :initarg :positivep :reader positivep :documentation "Whether this assertion is positive.")) (:documentation "LOOKAHEAD objects represent look-ahead assertions.")) (defclass lookbehind (regex) ((regex :initarg :regex :accessor regex :documentation "The REGEX object we're checking.") (positivep :initarg :positivep :reader positivep :documentation "Whether this assertion is positive.") (len :initarg :len :accessor len :type fixnum :documentation "The (fixed) length of the enclosed regex.")) (:documentation "LOOKBEHIND objects represent look-behind assertions.")) (defclass repetition (regex) ((regex :initarg :regex :accessor regex :documentation "The REGEX that's repeated.") (greedyp :initarg :greedyp :reader greedyp :documentation "Whether the repetition is greedy.") (minimum :initarg :minimum :accessor minimum :type fixnum :documentation "The minimal number of repetitions.") (maximum :initarg :maximum :accessor maximum :documentation "The maximal number of repetitions. Can be NIL for unbounded.") (min-len :initarg :min-len :reader min-len :documentation "The minimal length of the enclosed regex.") (len :initarg :len :reader len :documentation "The length of the enclosed regex. NIL if unknown.") (min-rest :initform 0 :accessor min-rest :type fixnum :documentation "The minimal number of characters which must appear after this repetition.") (contains-register-p :initarg :contains-register-p :reader contains-register-p :documentation "If the regex contains a register.")) (:documentation "REPETITION objects represent repetitions of regexes.")) (defclass register (regex) ((regex :initarg :regex :accessor regex :documentation "The inner regex.") (num :initarg :num :reader num :type fixnum :documentation "The number of this register, starting from 0. This is the index into *REGS-START* and *REGS-END*.")) (:documentation "REGISTER objects represent register groups.")) (defclass standalone (regex) ((regex :initarg :regex :accessor regex :documentation "The inner regex.")) (:documentation "A standalone regular expression.")) (defclass back-reference (regex) ((num :initarg :num :accessor num :type fixnum :documentation "The number of the register this reference refers to.") (case-insensitive-p :initarg :case-insensitive-p :reader case-insensitive-p :documentation "Whether we check case-insensitively.")) (:documentation "BACK-REFERENCE objects represent backreferences.")) (defclass char-class (regex) ((hash :initarg :hash :reader hash :type (or hash-table null) :documentation "A hash table the keys of which are the characters; the values are always T.") (case-insensitive-p :initarg :case-insensitive-p :reader case-insensitive-p :documentation "If the char class case-insensitive.") (invertedp :initarg :invertedp :reader invertedp :documentation "Whether we mean the inverse of the char class.") (word-char-class-p :initarg :word-char-class-p :reader word-char-class-p :documentation "Whether this CHAR CLASS represents the special class WORD-CHAR-CLASS.")) (:documentation "CHAR-CLASS objects represent character classes.")) (defclass str (regex) ((str :initarg :str :accessor str :type string :documentation "The actual string.") (len :initform 0 :accessor len :type fixnum :documentation "The length of the string.") (case-insensitive-p :initarg :case-insensitive-p :reader case-insensitive-p :documentation "If we match case-insensitively.") (offset :initform nil :accessor offset :documentation "Offset from the left of the whole parse tree. The first regex has offset 0. NIL if unknown, i.e. behind a variable-length regex.") (skip :initform nil :initarg :skip :accessor skip :documentation "If we can avoid testing for this string because the SCAN function has done this already.") (start-of-end-string-p :initform nil :accessor start-of-end-string-p :documentation "If this is the unique STR which starts END-STRING (a slot of MATCHER).")) (:documentation "STR objects represent string.")) (defclass anchor (regex) ((startp :initarg :startp :reader startp :documentation "Whether this is a \"start anchor\".") (multi-line-p :initarg :multi-line-p :reader multi-line-p :documentation "Whether we're in multi-line mode, i.e. whether each #\\Newline is surrounded by anchors.") (no-newline-p :initarg :no-newline-p :reader no-newline-p :documentation "Whether we ignore #\\Newline at the end.")) (:documentation "ANCHOR objects represent anchors like \"^\" or \"$\".")) (defclass everything (regex) ((single-line-p :initarg :single-line-p :reader single-line-p :documentation "Whether we're in single-line mode, i.e. whether we also match #\\Newline.")) (:documentation "EVERYTHING objects represent regexes matching \"everything\", i.e. dots.")) (defclass word-boundary (regex) ((negatedp :initarg :negatedp :reader negatedp :documentation "Whether we mean the opposite, i.e. no word-boundary.")) (:documentation "WORD-BOUNDARY objects represent word-boundary assertions.")) (defclass branch (regex) ((test :initarg :test :accessor test :documentation "The test of this branch, one of LOOKAHEAD, LOOKBEHIND, or a number.") (then-regex :initarg :then-regex :accessor then-regex :documentation "The regex that's to be matched if the test succeeds.") (else-regex :initarg :else-regex :initform (make-instance 'void) :accessor else-regex :documentation "The regex that's to be matched if the test fails.")) (:documentation "BRANCH objects represent Perl's conditional regular expressions.")) (defclass filter (regex) ((fn :initarg :fn :accessor fn :type (or function symbol) :documentation "The user-defined function.") (len :initarg :len :reader len :documentation "The fixed length of this filter or NIL.")) (:documentation "FILTER objects represent arbitrary functions defined by the user.")) (defclass void (regex) () (:documentation "VOID objects represent empty regular expressions.")))) (defmethod initialize-instance :after ((char-class char-class) &rest init-args) (declare (optimize speed (safety 0) (space 0) (debug 0) (compilation-speed 0) #+:lispworks (hcl:fixnum-safety 0))) "Make large hash tables smaller, if possible." (let ((hash (getf init-args :hash))) (when (and hash (> *regex-char-code-limit* 256) (> (hash-table-count hash) (/ *regex-char-code-limit* 2))) (setf (slot-value char-class 'hash) (merge-inverted-hash (make-hash-table) hash) (slot-value char-class 'invertedp) (not (slot-value char-class 'invertedp)))))) (declaim (ftype (function (t) simple-string) str)) ;;; The following four methods allow a VOID object to behave like a ;;; zero-length STR object (only readers needed) (defmethod initialize-instance :after ((str str) &rest init-args) (declare (optimize speed (safety 0) (space 0) (debug 0) (compilation-speed 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (ignore init-args)) "Automatically computes the length of a STR after initialization." (let ((str-slot (slot-value str 'str))) (unless (typep str-slot 'simple-string) (setf (slot-value str 'str) (coerce str-slot 'simple-string)))) (setf (len str) (length (str str)))) (defmethod len ((void void)) (declare (optimize speed (safety 0) (space 0) (debug 0) (compilation-speed 0) #+:lispworks (hcl:fixnum-safety 0))) 0) (defmethod str ((void void)) (declare (optimize speed (safety 0) (space 0) (debug 0) (compilation-speed 0) #+:lispworks (hcl:fixnum-safety 0))) "") (defmethod skip ((void void)) (declare (optimize speed (safety 0) (space 0) (debug 0) (compilation-speed 0) #+:lispworks (hcl:fixnum-safety 0))) nil) (defmethod start-of-end-string-p ((void void)) (declare (optimize speed (safety 0) (space 0) (debug 0) (compilation-speed 0) #+:lispworks (hcl:fixnum-safety 0))) nil) (defgeneric case-mode (regex old-case-mode) (declare (optimize speed (safety 0) (space 0) (debug 0) (compilation-speed 0) #+:lispworks (hcl:fixnum-safety 0))) (:documentation "Utility function used by the optimizer (see GATHER-STRINGS). Returns a keyword denoting the case-(in)sensitivity of a STR or its second argument if the STR has length 0. Returns NIL for REGEX objects which are not of type STR.")) (defmethod case-mode ((str str) old-case-mode) (cond ((zerop (len str)) old-case-mode) ((case-insensitive-p str) :case-insensitive) (t :case-sensitive))) (defmethod case-mode ((regex regex) old-case-mode) (declare (ignore old-case-mode)) nil) (defgeneric copy-regex (regex) (declare (optimize speed (safety 0) (space 0) (debug 0) (compilation-speed 0) #+:lispworks (hcl:fixnum-safety 0))) (:documentation "Implements a deep copy of a REGEX object.")) (defmethod copy-regex ((anchor anchor)) (make-instance 'anchor :startp (startp anchor) :multi-line-p (multi-line-p anchor) :no-newline-p (no-newline-p anchor))) (defmethod copy-regex ((everything everything)) (make-instance 'everything :single-line-p (single-line-p everything))) (defmethod copy-regex ((word-boundary word-boundary)) (make-instance 'word-boundary :negatedp (negatedp word-boundary))) (defmethod copy-regex ((void void)) (make-instance 'void)) (defmethod copy-regex ((lookahead lookahead)) (make-instance 'lookahead :regex (copy-regex (regex lookahead)) :positivep (positivep lookahead))) (defmethod copy-regex ((seq seq)) (make-instance 'seq :elements (mapcar #'copy-regex (elements seq)))) (defmethod copy-regex ((alternation alternation)) (make-instance 'alternation :choices (mapcar #'copy-regex (choices alternation)))) (defmethod copy-regex ((branch branch)) (with-slots ((test test)) branch (make-instance 'branch :test (if (typep test 'regex) (copy-regex test) test) :then-regex (copy-regex (then-regex branch)) :else-regex (copy-regex (else-regex branch))))) (defmethod copy-regex ((lookbehind lookbehind)) (make-instance 'lookbehind :regex (copy-regex (regex lookbehind)) :positivep (positivep lookbehind) :len (len lookbehind))) (defmethod copy-regex ((repetition repetition)) (make-instance 'repetition :regex (copy-regex (regex repetition)) :greedyp (greedyp repetition) :minimum (minimum repetition) :maximum (maximum repetition) :min-len (min-len repetition) :len (len repetition) :contains-register-p (contains-register-p repetition))) (defmethod copy-regex ((register register)) (make-instance 'register :regex (copy-regex (regex register)) :num (num register))) (defmethod copy-regex ((standalone standalone)) (make-instance 'standalone :regex (copy-regex (regex standalone)))) (defmethod copy-regex ((back-reference back-reference)) (make-instance 'back-reference :num (num back-reference) :case-insensitive-p (case-insensitive-p back-reference))) (defmethod copy-regex ((char-class char-class)) (make-instance 'char-class :hash (hash char-class) :case-insensitive-p (case-insensitive-p char-class) :invertedp (invertedp char-class) :word-char-class-p (word-char-class-p char-class))) (defmethod copy-regex ((str str)) (make-instance 'str :str (str str) :case-insensitive-p (case-insensitive-p str))) (defmethod copy-regex ((filter filter)) (make-instance 'filter :fn (fn filter) :len (len filter))) ;;; Note that COPY-REGEX and REMOVE-REGISTERS could have easily been ;;; wrapped into one function. Maybe in the next release... ;;; Further note that this function is used by CONVERT to factor out ;;; complicated repetitions, i.e. cases like ;;; (a)* -> (?:a*(a))? ;;; This won't work for, say, ;;; ((a)|(b))* -> (?:(?:a|b)*((a)|(b)))? ;;; and therefore we stop REGISTER removal once we see an ALTERNATION. (defgeneric remove-registers (regex) (declare (optimize speed (safety 0) (space 0) (debug 0) (compilation-speed 0) #+:lispworks (hcl:fixnum-safety 0))) (:documentation "Returns a deep copy of a REGEX (see COPY-REGEX) and optionally removes embedded REGISTER objects if possible and if the special variable REMOVE-REGISTERS-P is true.")) (defmethod remove-registers ((register register)) (declare (special remove-registers-p reg-seen)) (cond (remove-registers-p (remove-registers (regex register))) (t ;; mark REG-SEEN as true so enclosing REPETITION objects ;; (see method below) know if they contain a register or not (setq reg-seen t) (copy-regex register)))) (defmethod remove-registers ((repetition repetition)) (let* (reg-seen (inner-regex (remove-registers (regex repetition)))) ;; REMOVE-REGISTERS will set REG-SEEN (see method above) if ;; (REGEX REPETITION) contains a REGISTER (declare (special reg-seen)) (make-instance 'repetition :regex inner-regex :greedyp (greedyp repetition) :minimum (minimum repetition) :maximum (maximum repetition) :min-len (min-len repetition) :len (len repetition) :contains-register-p reg-seen))) (defmethod remove-registers ((standalone standalone)) (make-instance 'standalone :regex (remove-registers (regex standalone)))) (defmethod remove-registers ((lookahead lookahead)) (make-instance 'lookahead :regex (remove-registers (regex lookahead)) :positivep (positivep lookahead))) (defmethod remove-registers ((lookbehind lookbehind)) (make-instance 'lookbehind :regex (remove-registers (regex lookbehind)) :positivep (positivep lookbehind) :len (len lookbehind))) (defmethod remove-registers ((branch branch)) (with-slots ((test test)) branch (make-instance 'branch :test (if (typep test 'regex) (remove-registers test) test) :then-regex (remove-registers (then-regex branch)) :else-regex (remove-registers (else-regex branch))))) (defmethod remove-registers ((alternation alternation)) (declare (special remove-registers-p)) ;; an ALTERNATION, so we can't remove REGISTER objects further down (setq remove-registers-p nil) (copy-regex alternation)) (defmethod remove-registers ((regex regex)) (copy-regex regex)) (defmethod remove-registers ((seq seq)) (make-instance 'seq :elements (mapcar #'remove-registers (elements seq)))) (defgeneric everythingp (regex) (declare (optimize speed (safety 0) (space 0) (debug 0) (compilation-speed 0) #+:lispworks (hcl:fixnum-safety 0))) (:documentation "Returns an EVERYTHING object if REGEX is equivalent to this object, otherwise NIL. So, \"(.){1}\" would return true (i.e. the object corresponding to \".\", for example.")) (defmethod everythingp ((seq seq)) ;; we might have degenerate cases like (:SEQUENCE :VOID ...) ;; due to the parsing process (let ((cleaned-elements (remove-if #'(lambda (element) (typep element 'void)) (elements seq)))) (and (= 1 (length cleaned-elements)) (everythingp (first cleaned-elements))))) (defmethod everythingp ((alternation alternation)) (with-slots ((choices choices)) alternation (and (= 1 (length choices)) ;; this is unlikely to happen for human-generated regexes, ;; but machine-generated ones might look like this (everythingp (first choices))))) (defmethod everythingp ((repetition repetition)) (with-slots ((maximum maximum) (minimum minimum) (regex regex)) repetition (and maximum (= 1 minimum maximum) ;; treat "{1,1}" like "" (everythingp regex)))) (defmethod everythingp ((register register)) (everythingp (regex register))) (defmethod everythingp ((standalone standalone)) (everythingp (regex standalone))) (defmethod everythingp ((everything everything)) everything) (defmethod everythingp ((regex regex)) ;; the general case for ANCHOR, BACK-REFERENCE, BRANCH, CHAR-CLASS, ;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY nil) (defgeneric regex-length (regex) (declare (optimize speed (safety 0) (space 0) (debug 0) (compilation-speed 0) #+:lispworks (hcl:fixnum-safety 0))) (:documentation "Return the length of REGEX if it is fixed, NIL otherwise.")) (defmethod regex-length ((seq seq)) ;; simply add all inner lengths unless one of them is NIL (loop for sub-regex in (elements seq) for len = (regex-length sub-regex) if (not len) do (return nil) sum len)) (defmethod regex-length ((alternation alternation)) ;; only return a true value if all inner lengths are non-NIL and ;; mutually equal (loop for sub-regex in (choices alternation) for old-len = nil then len for len = (regex-length sub-regex) if (or (not len) (and old-len (/= len old-len))) do (return nil) finally (return len))) (defmethod regex-length ((branch branch)) ;; only return a true value if both alternations have a length and ;; if they're equal (let ((then-length (regex-length (then-regex branch)))) (and then-length (eql then-length (regex-length (else-regex branch))) then-length))) (defmethod regex-length ((repetition repetition)) ;; we can only compute the length of a REPETITION object if the ;; number of repetitions is fixed; note that we don't call ;; REGEX-LENGTH for the inner regex, we assume that the LEN slot is ;; always set correctly (with-slots ((len len) (minimum minimum) (maximum maximum)) repetition (if (and len (eq minimum maximum)) (* minimum len) nil))) (defmethod regex-length ((register register)) (regex-length (regex register))) (defmethod regex-length ((standalone standalone)) (regex-length (regex standalone))) (defmethod regex-length ((back-reference back-reference)) ;; with enough effort we could possibly do better here, but ;; currently we just give up and return NIL nil) (defmethod regex-length ((char-class char-class)) 1) (defmethod regex-length ((everything everything)) 1) (defmethod regex-length ((str str)) (len str)) (defmethod regex-length ((filter filter)) (len filter)) (defmethod regex-length ((regex regex)) ;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and ;; WORD-BOUNDARY (which all have zero-length) 0) (defgeneric regex-min-length (regex) (declare (optimize speed (safety 0) (space 0) (debug 0) (compilation-speed 0) #+:lispworks (hcl:fixnum-safety 0))) (:documentation "Returns the minimal length of REGEX.")) (defmethod regex-min-length ((seq seq)) ;; simply add all inner minimal lengths (loop for sub-regex in (elements seq) for len = (regex-min-length sub-regex) sum len)) (defmethod regex-min-length ((alternation alternation)) ;; minimal length of an alternation is the minimal length of the ;; "shortest" element (loop for sub-regex in (choices alternation) for len = (regex-min-length sub-regex) minimize len)) (defmethod regex-min-length ((branch branch)) ;; minimal length of both alternations (min (regex-min-length (then-regex branch)) (regex-min-length (else-regex branch)))) (defmethod regex-min-length ((repetition repetition)) ;; obviously the product of the inner minimal length and the minimal ;; number of repetitions (* (minimum repetition) (min-len repetition))) (defmethod regex-min-length ((register register)) (regex-min-length (regex register))) (defmethod regex-min-length ((standalone standalone)) (regex-min-length (regex standalone))) (defmethod regex-min-length ((char-class char-class)) 1) (defmethod regex-min-length ((everything everything)) 1) (defmethod regex-min-length ((str str)) (len str)) (defmethod regex-min-length ((filter filter)) (or (len filter) 0)) (defmethod regex-min-length ((regex regex)) ;; the general case for ANCHOR, BACK-REFERENCE, LOOKAHEAD, ;; LOOKBEHIND, VOID, and WORD-BOUNDARY 0) (defgeneric compute-offsets (regex start-pos) (declare (optimize speed (safety 0) (space 0) (debug 0) (compilation-speed 0) #+:lispworks (hcl:fixnum-safety 0))) (:documentation "Returns the offset the following regex would have relative to START-POS or NIL if we can't compute it. Sets the OFFSET slot of REGEX to START-POS if REGEX is a STR. May also affect OFFSET slots of STR objects further down the tree.")) ;; note that we're actually only interested in the offset of ;; "top-level" STR objects (see ADVANCE-FN in the SCAN function) so we ;; can stop at variable-length alternations and don't need to descend ;; into repetitions (defmethod compute-offsets ((seq seq) start-pos) (loop for element in (elements seq) ;; advance offset argument for next call while looping through ;; the elements for pos = start-pos then curr-offset for curr-offset = (compute-offsets element pos) while curr-offset finally (return curr-offset))) (defmethod compute-offsets ((alternation alternation) start-pos) (loop for choice in (choices alternation) for old-offset = nil then curr-offset for curr-offset = (compute-offsets choice start-pos) ;; we stop immediately if two alternations don't result in the ;; same offset if (or (not curr-offset) (and old-offset (/= curr-offset old-offset))) do (return nil) finally (return curr-offset))) (defmethod compute-offsets ((branch branch) start-pos) ;; only return offset if both alternations have equal value (let ((then-offset (compute-offsets (then-regex branch) start-pos))) (and then-offset (eql then-offset (compute-offsets (else-regex branch) start-pos)) then-offset))) (defmethod compute-offsets ((repetition repetition) start-pos) ;; no need to descend into the inner regex (with-slots ((len len) (minimum minimum) (maximum maximum)) repetition (if (and len (eq minimum maximum)) ;; fixed number of repetitions, so we know how to proceed (+ start-pos (* minimum len)) ;; otherwise return NIL nil))) (defmethod compute-offsets ((register register) start-pos) (compute-offsets (regex register) start-pos)) (defmethod compute-offsets ((standalone standalone) start-pos) (compute-offsets (regex standalone) start-pos)) (defmethod compute-offsets ((char-class char-class) start-pos) (1+ start-pos)) (defmethod compute-offsets ((everything everything) start-pos) (1+ start-pos)) (defmethod compute-offsets ((str str) start-pos) (setf (offset str) start-pos) (+ start-pos (len str))) (defmethod compute-offsets ((back-reference back-reference) start-pos) ;; with enough effort we could possibly do better here, but ;; currently we just give up and return NIL (declare (ignore start-pos)) nil) (defmethod compute-offsets ((filter filter) start-pos) (let ((len (len filter))) (if len (+ start-pos len) nil))) (defmethod compute-offsets ((regex regex) start-pos) ;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and ;; WORD-BOUNDARY (which all have zero-length) start-pos)