00001 /************************************************* 00002 * Perl-Compatible Regular Expressions * 00003 *************************************************/ 00004 00005 00006 #define PCRE_VERSION "2.06 21-Jun-1999" 00007 00008 00009 /* This is a library of functions to support regular expressions whose syntax 00010 and semantics are as close as possible to those of the Perl 5 language. See 00011 the file Tech.Notes for some information on the internals. 00012 00013 Written by: Philip Hazel <ph10@cam.ac.uk> 00014 00015 Copyright (c) 1997-1999 University of Cambridge 00016 00017 ----------------------------------------------------------------------------- 00018 Permission is granted to anyone to use this software for any purpose on any 00019 computer system, and to redistribute it freely, subject to the following 00020 restrictions: 00021 00022 1. This software is distributed in the hope that it will be useful, 00023 but WITHOUT ANY WARRANTY; without even the implied warranty of 00024 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 00025 00026 2. The origin of this software must not be misrepresented, either by 00027 explicit claim or by omission. 00028 00029 3. Altered versions must be plainly marked as such, and must not be 00030 misrepresented as being the original software. 00031 00032 4. If PCRE is embedded in any software that is released under the GNU 00033 General Purpose Licence (GPL), then the terms of that licence shall 00034 supersede any condition above with which it is incompatible. 00035 ----------------------------------------------------------------------------- 00036 */ 00037 00038 /* This header contains definitions that are shared between the different 00039 modules, but which are not relevant to the outside. */ 00040 00041 /* To cope with SunOS4 and other systems that lack memmove() but have bcopy(), 00042 define a macro for memmove() if USE_BCOPY is defined. */ 00043 00044 #ifdef USE_BCOPY 00045 #undef memmove /* some systems may have a macro */ 00046 #define memmove(a, b, c) bcopy(b, a, c) 00047 #endif 00048 00049 /* Standard C headers plus the external interface definition */ 00050 00051 #include <ctype.h> 00052 #include <limits.h> 00053 #include <stddef.h> 00054 #include <stdio.h> 00055 #include <stdlib.h> 00056 #include <string.h> 00057 #include "pcre.h" 00058 00059 /* In case there is no definition of offsetof() provided - though any proper 00060 Standard C system should have one. */ 00061 00062 #ifndef offsetof 00063 #define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field)) 00064 #endif 00065 00066 /* These are the public options that can change during matching. */ 00067 00068 #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL) 00069 00070 /* Private options flags start at the most significant end of the two bytes. 00071 The public options defined in pcre.h start at the least significant end. Make 00072 sure they don't overlap! */ 00073 00074 #define PCRE_FIRSTSET 0x8000 /* first_char is set */ 00075 #define PCRE_STARTLINE 0x4000 /* start after \n for multiline */ 00076 #define PCRE_INGROUP 0x2000 /* compiling inside a group */ 00077 00078 /* Options for the "extra" block produced by pcre_study(). */ 00079 00080 #define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */ 00081 00082 /* Masks for identifying the public options which are permitted at compile 00083 time, run time or study time, respectively. */ 00084 00085 #define PUBLIC_OPTIONS \ 00086 (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ 00087 PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY) 00088 00089 #define PUBLIC_EXEC_OPTIONS (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL) 00090 00091 #define PUBLIC_STUDY_OPTIONS 0 /* None defined */ 00092 00093 /* Magic number to provide a small check against being handed junk. */ 00094 00095 #define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */ 00096 00097 /* Miscellaneous definitions */ 00098 00099 typedef int BOOL; 00100 00101 #define FALSE 0 00102 #define TRUE 1 00103 00104 /* These are escaped items that aren't just an encoding of a particular data 00105 value such as \n. They must have non-zero values, as check_escape() returns 00106 their negation. Also, they must appear in the same order as in the opcode 00107 definitions below, up to ESC_z. The final one must be ESC_REF as subsequent 00108 values are used for \1, \2, \3, etc. There is a test in the code for an escape 00109 greater than ESC_b and less than ESC_X to detect the types that may be 00110 repeated. If any new escapes are put in-between that don't consume a character, 00111 that code will have to change. */ 00112 00113 enum { ESC_A = 1, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w, 00114 ESC_Z, ESC_z, ESC_REF }; 00115 00116 /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets 00117 that extract substrings. Starting from 1 (i.e. after OP_END), the values up to 00118 OP_EOD must correspond in order to the list of escapes immediately above. */ 00119 00120 enum { 00121 OP_END, /* End of pattern */ 00122 00123 /* Values corresponding to backslashed metacharacters */ 00124 00125 OP_SOD, /* Start of data: \A */ 00126 OP_NOT_WORD_BOUNDARY, /* \B */ 00127 OP_WORD_BOUNDARY, /* \b */ 00128 OP_NOT_DIGIT, /* \D */ 00129 OP_DIGIT, /* \d */ 00130 OP_NOT_WHITESPACE, /* \S */ 00131 OP_WHITESPACE, /* \s */ 00132 OP_NOT_WORDCHAR, /* \W */ 00133 OP_WORDCHAR, /* \w */ 00134 OP_EODN, /* End of data or \n at end of data: \Z. */ 00135 OP_EOD, /* End of data: \z */ 00136 00137 OP_OPT, /* Set runtime options */ 00138 OP_CIRC, /* Start of line - varies with multiline switch */ 00139 OP_DOLL, /* End of line - varies with multiline switch */ 00140 OP_ANY, /* Match any character */ 00141 OP_CHARS, /* Match string of characters */ 00142 OP_NOT, /* Match anything but the following char */ 00143 00144 OP_STAR, /* The maximizing and minimizing versions of */ 00145 OP_MINSTAR, /* all these opcodes must come in pairs, with */ 00146 OP_PLUS, /* the minimizing one second. */ 00147 OP_MINPLUS, /* This first set applies to single characters */ 00148 OP_QUERY, 00149 OP_MINQUERY, 00150 OP_UPTO, /* From 0 to n matches */ 00151 OP_MINUPTO, 00152 OP_EXACT, /* Exactly n matches */ 00153 00154 OP_NOTSTAR, /* The maximizing and minimizing versions of */ 00155 OP_NOTMINSTAR, /* all these opcodes must come in pairs, with */ 00156 OP_NOTPLUS, /* the minimizing one second. */ 00157 OP_NOTMINPLUS, /* This first set applies to "not" single characters */ 00158 OP_NOTQUERY, 00159 OP_NOTMINQUERY, 00160 OP_NOTUPTO, /* From 0 to n matches */ 00161 OP_NOTMINUPTO, 00162 OP_NOTEXACT, /* Exactly n matches */ 00163 00164 OP_TYPESTAR, /* The maximizing and minimizing versions of */ 00165 OP_TYPEMINSTAR, /* all these opcodes must come in pairs, with */ 00166 OP_TYPEPLUS, /* the minimizing one second. These codes must */ 00167 OP_TYPEMINPLUS, /* be in exactly the same order as those above. */ 00168 OP_TYPEQUERY, /* This set applies to character types such as \d */ 00169 OP_TYPEMINQUERY, 00170 OP_TYPEUPTO, /* From 0 to n matches */ 00171 OP_TYPEMINUPTO, 00172 OP_TYPEEXACT, /* Exactly n matches */ 00173 00174 OP_CRSTAR, /* The maximizing and minimizing versions of */ 00175 OP_CRMINSTAR, /* all these opcodes must come in pairs, with */ 00176 OP_CRPLUS, /* the minimizing one second. These codes must */ 00177 OP_CRMINPLUS, /* be in exactly the same order as those above. */ 00178 OP_CRQUERY, /* These are for character classes and back refs */ 00179 OP_CRMINQUERY, 00180 OP_CRRANGE, /* These are different to the three seta above. */ 00181 OP_CRMINRANGE, 00182 00183 OP_CLASS, /* Match a character class */ 00184 OP_REF, /* Match a back reference */ 00185 00186 OP_ALT, /* Start of alternation */ 00187 OP_KET, /* End of group that doesn't have an unbounded repeat */ 00188 OP_KETRMAX, /* These two must remain together and in this */ 00189 OP_KETRMIN, /* order. They are for groups the repeat for ever. */ 00190 00191 /* The assertions must come before ONCE and COND */ 00192 00193 OP_ASSERT, /* Positive lookahead */ 00194 OP_ASSERT_NOT, /* Negative lookahead */ 00195 OP_ASSERTBACK, /* Positive lookbehind */ 00196 OP_ASSERTBACK_NOT, /* Negative lookbehind */ 00197 OP_REVERSE, /* Move pointer back - used in lookbehind assertions */ 00198 00199 /* ONCE and COND must come after the assertions, with ONCE first, as there's 00200 a test for >= ONCE for a subpattern that isn't an assertion. */ 00201 00202 OP_ONCE, /* Once matched, don't back up into the subpattern */ 00203 OP_COND, /* Conditional group */ 00204 OP_CREF, /* Used to hold an extraction string number */ 00205 00206 OP_BRAZERO, /* These two must remain together and in this */ 00207 OP_BRAMINZERO, /* order. */ 00208 00209 OP_BRA /* This and greater values are used for brackets that 00210 extract substrings. */ 00211 }; 00212 00213 /* The highest extraction number. This is limited by the number of opcodes 00214 left after OP_BRA, i.e. 255 - OP_BRA. We actually set it somewhat lower. */ 00215 00216 #define EXTRACT_MAX 99 00217 00218 /* The texts of compile-time error messages are defined as macros here so that 00219 they can be accessed by the POSIX wrapper and converted into error codes. Yes, 00220 I could have used error codes in the first place, but didn't feel like changing 00221 just to accommodate the POSIX wrapper. */ 00222 00223 #define ERR1 "\\ at end of pattern" 00224 #define ERR2 "\\c at end of pattern" 00225 #define ERR3 "unrecognized character follows \\" 00226 #define ERR4 "numbers out of order in {} quantifier" 00227 #define ERR5 "number too big in {} quantifier" 00228 #define ERR6 "missing terminating ] for character class" 00229 #define ERR7 "invalid escape sequence in character class" 00230 #define ERR8 "range out of order in character class" 00231 #define ERR9 "nothing to repeat" 00232 #define ERR10 "operand of unlimited repeat could match the empty string" 00233 #define ERR11 "internal error: unexpected repeat" 00234 #define ERR12 "unrecognized character after (?" 00235 #define ERR13 "too many capturing parenthesized sub-patterns" 00236 #define ERR14 "missing )" 00237 #define ERR15 "back reference to non-existent subpattern" 00238 #define ERR16 "erroffset passed as NULL" 00239 #define ERR17 "unknown option bit(s) set" 00240 #define ERR18 "missing ) after comment" 00241 #define ERR19 "too many sets of parentheses" 00242 #define ERR20 "regular expression too large" 00243 #define ERR21 "failed to get memory" 00244 #define ERR22 "unmatched parentheses" 00245 #define ERR23 "internal error: code overflow" 00246 #define ERR24 "unrecognized character after (?<" 00247 #define ERR25 "lookbehind assertion is not fixed length" 00248 #define ERR26 "malformed number after (?(" 00249 #define ERR27 "conditional group contains more than two branches" 00250 #define ERR28 "assertion expected after (?(" 00251 00252 /* All character handling must be done as unsigned characters. Otherwise there 00253 are problems with top-bit-set characters and functions such as isspace(). 00254 However, we leave the interface to the outside world as char *, because that 00255 should make things easier for callers. We define a short type for unsigned char 00256 to save lots of typing. I tried "uchar", but it causes problems on Digital 00257 Unix, where it is defined in sys/types, so use "uschar" instead. */ 00258 00259 typedef unsigned char uschar; 00260 00261 /* The real format of the start of the pcre block; the actual code vector 00262 runs on as long as necessary after the end. */ 00263 00264 typedef struct real_pcre { 00265 unsigned long int magic_number; 00266 const unsigned char *tables; 00267 unsigned short int options; 00268 unsigned char top_bracket; 00269 unsigned char top_backref; 00270 unsigned char first_char; 00271 unsigned char code[1]; 00272 } real_pcre; 00273 00274 /* The real format of the extra block returned by pcre_study(). */ 00275 00276 typedef struct real_pcre_extra { 00277 unsigned char options; 00278 unsigned char start_bits[32]; 00279 } real_pcre_extra; 00280 00281 00282 /* Structure for passing "static" information around between the functions 00283 doing the compiling, so that they are thread-safe. */ 00284 00285 typedef struct compile_data { 00286 const uschar *lcc; /* Points to lower casing table */ 00287 const uschar *fcc; /* Points to case-flippint table */ 00288 const uschar *cbits; /* Points to character type table */ 00289 const uschar *ctypes; /* Points to table of type maps */ 00290 } compile_data; 00291 00292 /* Structure for passing "static" information around between the functions 00293 doing the matching, so that they are thread-safe. */ 00294 00295 typedef struct match_data { 00296 int errorcode; /* As it says */ 00297 int *offset_vector; /* Offset vector */ 00298 int offset_end; /* One past the end */ 00299 int offset_max; /* The maximum usable for return data */ 00300 const uschar *lcc; /* Points to lower casing table */ 00301 const uschar *ctypes; /* Points to table of type maps */ 00302 BOOL offset_overflow; /* Set if too many extractions */ 00303 BOOL notbol; /* NOTBOL flag */ 00304 BOOL noteol; /* NOTEOL flag */ 00305 BOOL endonly; /* Dollar not before final \n */ 00306 const uschar *start_subject; /* Start of the subject string */ 00307 const uschar *end_subject; /* End of the subject string */ 00308 const uschar *end_match_ptr; /* Subject position at end match */ 00309 int end_offset_top; /* Highwater mark at end of match */ 00310 } match_data; 00311 00312 /* Bit definitions for entries in the pcre_ctypes table. */ 00313 00314 #define ctype_space 0x01 00315 #define ctype_letter 0x02 00316 #define ctype_digit 0x04 00317 #define ctype_xdigit 0x08 00318 #define ctype_word 0x10 /* alphameric or '_' */ 00319 #define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */ 00320 00321 /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set 00322 of bits for a class map. */ 00323 00324 #define cbit_digit 0 /* for \d */ 00325 #define cbit_word 32 /* for \w */ 00326 #define cbit_space 64 /* for \s */ 00327 #define cbit_length 96 /* Length of the cbits table */ 00328 00329 /* Offsets of the various tables from the base tables pointer, and 00330 total length. */ 00331 00332 #define lcc_offset 0 00333 #define fcc_offset 256 00334 #define cbits_offset 512 00335 #define ctypes_offset (cbits_offset + cbit_length) 00336 #define tables_length (ctypes_offset + 256) 00337 00338 /* End of internal.h */