1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // regexcmp.h 5 // 6 // Copyright (C) 2002-2016, International Business Machines Corporation and others. 7 // All Rights Reserved. 8 // 9 // This file contains declarations for the class RegexCompile 10 // 11 // This class is internal to the regular expression implementation. 12 // For the public Regular Expression API, see the file "unicode/regex.h" 13 // 14 15 16 #ifndef RBBISCAN_H 17 #define RBBISCAN_H 18 19 #include "unicode/utypes.h" 20 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 21 22 #include "unicode/parseerr.h" 23 #include "unicode/uniset.h" 24 #include "unicode/uobject.h" 25 #include "unicode/utext.h" 26 #include "uhash.h" 27 #include "uvector.h" 28 #include "uvectr32.h" 29 30 31 32 U_NAMESPACE_BEGIN 33 34 35 //-------------------------------------------------------------------------------- 36 // 37 // class RegexCompile Contains the regular expression compiler. 38 // 39 //-------------------------------------------------------------------------------- 40 class RegexPattern; 41 42 43 class U_I18N_API RegexCompile : public UMemory { 44 public: 45 46 enum { 47 kStackSize = 100 // The size of the state stack for 48 }; // pattern parsing. Corresponds roughly 49 // to the depth of parentheses nesting 50 // that is allowed in the rules. 51 52 struct RegexPatternChar { 53 UChar32 fChar; 54 UBool fQuoted; 55 }; 56 57 RegexCompile(RegexPattern *rp, UErrorCode &e); 58 59 void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); 60 void compile(UText *pat, UParseError &pp, UErrorCode &e); 61 62 63 virtual ~RegexCompile(); 64 65 void nextChar(RegexPatternChar &c); // Get the next char from the input stream. 66 67 68 // Categories of parentheses in pattern. 69 // The category is saved in the compile-time parentheses stack frame, and 70 // determines the code to be generated when the matching close ) is encountered. 71 enum EParenClass { 72 plain = -1, // No special handling 73 capturing = -2, 74 atomic = -3, 75 lookAhead = -4, 76 negLookAhead = -5, 77 flags = -6, 78 lookBehind = -7, 79 lookBehindN = -8 80 }; 81 82 private: 83 84 85 UBool doParseActions(int32_t a); 86 void error(UErrorCode e); // error reporting convenience function. 87 88 UChar32 nextCharLL(); 89 UChar32 peekCharLL(); 90 UnicodeSet *scanProp(); 91 UnicodeSet *scanPosixProp(); 92 void handleCloseParen(); 93 int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern 94 // at the top of the just completed block 95 // or operation, and optionally ensure that 96 // there is space to add an opcode there. 97 void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for 98 // a reference to a UnicodeSet. 99 void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier. 100 int32_t LoopOp); 101 UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier 102 void literalChar(UChar32 c); // Compile a literal char 103 void fixLiterals(UBool split=false); // Generate code for pending literal characters. 104 void insertOp(int32_t where); // Open up a slot for a new op in the 105 // generated code at the specified location. 106 void appendOp(int32_t op); // Append a new op to the compiled pattern. 107 void appendOp(int32_t type, int32_t val); // Build & append a new op to the compiled pattern. 108 int32_t buildOp(int32_t type, int32_t val); // Construct a new pcode instruction. 109 int32_t allocateData(int32_t size); // Allocate space in the matcher data area. 110 // Return index of the newly allocated data. 111 int32_t allocateStackData(int32_t size); // Allocate space in the match back-track stack frame. 112 // Return offset index in the frame. 113 int32_t minMatchLength(int32_t start, 114 int32_t end); 115 int32_t maxMatchLength(int32_t start, 116 int32_t end); 117 void matchStartType(); 118 void stripNOPs(); 119 120 void setEval(int32_t op); 121 void setPushOp(int32_t op); 122 UChar32 scanNamedChar(); 123 UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated); 124 125 public: // Public for testing only. 126 static void U_EXPORT2 findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars); 127 private: 128 129 130 UErrorCode *fStatus; 131 RegexPattern *fRXPat; 132 UParseError *fParseErr; 133 134 // 135 // Data associated with low level character scanning 136 // 137 int64_t fScanIndex; // Index of current character being processed 138 // in the rule input string. 139 UBool fQuoteMode; // Scan is in a \Q...\E quoted region 140 UBool fInBackslashQuote; // Scan is between a '\' and the following char. 141 UBool fEOLComments; // When scan is just after '(?', inhibit #... to 142 // end of line comments, in favor of (?#...) comments. 143 int64_t fLineNum; // Line number in input file. 144 int64_t fCharNum; // Char position within the line. 145 UChar32 fLastChar; // Previous char, needed to count CR-LF 146 // as a single line, not two. 147 UChar32 fPeekChar; // Saved char, if we've scanned ahead. 148 149 150 RegexPatternChar fC; // Current char for parse state machine 151 // processing. 152 153 uint16_t fStack[kStackSize]; // State stack, holds state pushes 154 int32_t fStackPtr; // and pops as specified in the state 155 // transition rules. 156 157 // 158 // Data associated with the generation of the pcode for the match engine 159 // 160 int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.) 161 // Always has high bit (31) set so that flag values 162 // on the paren stack are distinguished from relocatable 163 // pcode addresses. 164 int32_t fNewModeFlags; // New flags, while compiling (?i, holds state 165 // until last flag is scanned. 166 UBool fSetModeFlag; // true for (?ismx, false for (?-ismx 167 168 UnicodeString fLiteralChars; // Literal chars or strings from the pattern are accumulated here. 169 // Once completed, meaning that some non-literal pattern 170 // construct is encountered, the appropriate opcodes 171 // to match the literal will be generated, and this 172 // string will be cleared. 173 174 int64_t fPatternLength; // Length of the input pattern string. 175 176 UVector32 fParenStack; // parentheses stack. Each frame consists of 177 // the positions of compiled pattern operations 178 // needing fixup, followed by negative value. The 179 // first entry in each frame is the position of the 180 // spot reserved for use when a quantifier 181 // needs to add a SAVE at the start of a (block) 182 // The negative value (-1, -2,...) indicates 183 // the kind of paren that opened the frame. Some 184 // need special handling on close. 185 186 187 int32_t fMatchOpenParen; // The position in the compiled pattern 188 // of the slot reserved for a state save 189 // at the start of the most recently processed 190 // parenthesized block. Updated when processing 191 // a close to the location for the corresponding open. 192 193 int32_t fMatchCloseParen; // The position in the pattern of the first 194 // location after the most recently processed 195 // parenthesized block. 196 197 int32_t fIntervalLow; // {lower, upper} interval quantifier values. 198 int32_t fIntervalUpper; // Placed here temporarily, when pattern is 199 // initially scanned. Each new interval 200 // encountered overwrites these values. 201 // -1 for the upper interval value means none 202 // was specified (unlimited occurrences.) 203 204 UStack fSetStack; // Stack of UnicodeSets, used while evaluating 205 // (at compile time) set expressions within 206 // the pattern. 207 UStack fSetOpStack; // Stack of pending set operators (&&, --, union) 208 209 UChar32 fLastSetLiteral; // The last single code point added to a set. 210 // needed when "-y" is scanned, and we need 211 // to turn "x-y" into a range. 212 213 UnicodeString *fCaptureName; // Named Capture, the group name is built up 214 // in this string while being scanned. 215 }; 216 217 // Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions] 218 // The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself. 219 220 enum SetOperations { 221 setStart = 0 << 16 | 1, 222 setEnd = 1 << 16 | 2, 223 setNegation = 2 << 16 | 3, 224 setCaseClose = 2 << 16 | 9, 225 setDifference2 = 3 << 16 | 4, // '--' set difference operator 226 setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator 227 setUnion = 4 << 16 | 6, // implicit union of adjacent items 228 setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet. 229 setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet. 230 }; 231 232 U_NAMESPACE_END 233 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 234 #endif // RBBISCAN_H 235