Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

pcreinternal.h

Go to the documentation of this file.
00001 /*************************************************
00002 *      Perl-Compatible Regular Expressions       *
00003 *************************************************/
00004 
00005 
00006 #define PCRE_VERSION       "2.06 21-Jun-1999"
00007 
00008 
00009 /* This is a library of functions to support regular expressions whose syntax
00010 and semantics are as close as possible to those of the Perl 5 language. See
00011 the file Tech.Notes for some information on the internals.
00012 
00013 Written by: Philip Hazel <ph10@cam.ac.uk>
00014 
00015            Copyright (c) 1997-1999 University of Cambridge
00016 
00017 -----------------------------------------------------------------------------
00018 Permission is granted to anyone to use this software for any purpose on any
00019 computer system, and to redistribute it freely, subject to the following
00020 restrictions:
00021 
00022 1. This software is distributed in the hope that it will be useful,
00023    but WITHOUT ANY WARRANTY; without even the implied warranty of
00024    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
00025 
00026 2. The origin of this software must not be misrepresented, either by
00027    explicit claim or by omission.
00028 
00029 3. Altered versions must be plainly marked as such, and must not be
00030    misrepresented as being the original software.
00031 
00032 4. If PCRE is embedded in any software that is released under the GNU
00033    General Purpose Licence (GPL), then the terms of that licence shall
00034    supersede any condition above with which it is incompatible.
00035 -----------------------------------------------------------------------------
00036 */
00037 
00038 /* This header contains definitions that are shared between the different
00039 modules, but which are not relevant to the outside. */
00040 
00041 /* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
00042 define a macro for memmove() if USE_BCOPY is defined. */
00043 
00044 #ifdef USE_BCOPY
00045 #undef  memmove        /* some systems may have a macro */
00046 #define memmove(a, b, c) bcopy(b, a, c)   /* bcopy() takes (src, dest, n), so a and b are swapped; unlike memmove(), the expansion yields no value */
00047 #endif
00048 
00049 /* Standard C headers plus the external interface definition */
00050 
00051 #include <ctype.h>
00052 #include <limits.h>
00053 #include <stddef.h>
00054 #include <stdio.h>
00055 #include <stdlib.h>
00056 #include <string.h>
00057 #include "pcre.h"
00058 
00059 /* In case there is no definition of offsetof() provided - though any proper
00060 Standard C system should have one. */
00061 
00062 #ifndef offsetof
00063 #define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))   /* address of field in a p_type notionally placed at address 0, i.e. its byte offset */
00064 #endif
00065 
00066 /* These are the public options that can change during matching. */
00067 
00068 #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
00069 
00070 /* Private options flags start at the most significant end of the two bytes.
00071 The public options defined in pcre.h start at the least significant end. Make
00072 sure they don't overlap! */
00073 
00074 #define PCRE_FIRSTSET           0x8000  /* first_char is set */
00075 #define PCRE_STARTLINE          0x4000  /* start after \n for multiline */
00076 #define PCRE_INGROUP            0x2000  /* compiling inside a group */
00077 
00078 /* Options for the "extra" block produced by pcre_study(). */
00079 
00080 #define PCRE_STUDY_MAPPED   0x01     /* a map of starting chars exists */
00081 
00082 /* Masks for identifying the public options which are permitted at compile
00083 time, run time or study time, respectively. */
00084 
00085 #define PUBLIC_OPTIONS \
00086   (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
00087    PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY)
00088 
00089 #define PUBLIC_EXEC_OPTIONS (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL)
00090 
00091 #define PUBLIC_STUDY_OPTIONS 0   /* None defined */
00092 
00093 /* Magic number to provide a small check against being handed junk. */
00094 
00095 #define MAGIC_NUMBER  0x50435245UL   /* 'PCRE' */
00096 
00097 /* Miscellaneous definitions */
00098 
00099 typedef int BOOL;
00100 
00101 #define FALSE   0
00102 #define TRUE    1
00103 
00104 /* These are escaped items that aren't just an encoding of a particular data
00105 value such as \n. They must have non-zero values, as check_escape() returns
00106 their negation. Also, they must appear in the same order as in the opcode
00107 definitions below, up to ESC_z. The final one must be ESC_REF as subsequent
00108 values are used for \1, \2, \3, etc. There is a test in the code for an escape
00109 greater than ESC_b and less than ESC_Z to detect the types that may be
00110 repeated. If any new escapes are put in-between that don't consume a character,
00111 that code will have to change. */
00112 
00113 enum { ESC_A = 1, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w,   /* starts at 1: values must be non-zero */
00114        ESC_Z, ESC_z, ESC_REF };   /* ESC_REF stays last: subsequent values stand for \1, \2, \3, etc. */
00115 
00116 /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
00117 that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
00118 OP_EOD must correspond in order to the list of escapes immediately above. */
00119 
00120 enum {
00121   OP_END,            /* End of pattern */
00122 
00123   /* Values corresponding to backslashed metacharacters */
00124 
00125   OP_SOD,            /* Start of data: \A */
00126   OP_NOT_WORD_BOUNDARY,  /* \B */
00127   OP_WORD_BOUNDARY,      /* \b */
00128   OP_NOT_DIGIT,          /* \D */
00129   OP_DIGIT,              /* \d */
00130   OP_NOT_WHITESPACE,     /* \S */
00131   OP_WHITESPACE,         /* \s */
00132   OP_NOT_WORDCHAR,       /* \W */
00133   OP_WORDCHAR,           /* \w */
00134   OP_EODN,           /* End of data or \n at end of data: \Z. */
00135   OP_EOD,            /* End of data: \z */
00136 
00137   OP_OPT,            /* Set runtime options */
00138   OP_CIRC,           /* Start of line - varies with multiline switch */
00139   OP_DOLL,           /* End of line - varies with multiline switch */
00140   OP_ANY,            /* Match any character */
00141   OP_CHARS,          /* Match string of characters */
00142   OP_NOT,            /* Match anything but the following char */
00143 
00144   OP_STAR,           /* The maximizing and minimizing versions of */
00145   OP_MINSTAR,        /* all these opcodes must come in pairs, with */
00146   OP_PLUS,           /* the minimizing one second. */
00147   OP_MINPLUS,        /* This first set applies to single characters */
00148   OP_QUERY,
00149   OP_MINQUERY,
00150   OP_UPTO,           /* From 0 to n matches */
00151   OP_MINUPTO,
00152   OP_EXACT,          /* Exactly n matches */
00153 
00154   OP_NOTSTAR,        /* The maximizing and minimizing versions of */
00155   OP_NOTMINSTAR,     /* all these opcodes must come in pairs, with */
00156   OP_NOTPLUS,        /* the minimizing one second. */
00157   OP_NOTMINPLUS,     /* This first set applies to "not" single characters */
00158   OP_NOTQUERY,
00159   OP_NOTMINQUERY,
00160   OP_NOTUPTO,        /* From 0 to n matches */
00161   OP_NOTMINUPTO,
00162   OP_NOTEXACT,       /* Exactly n matches */
00163 
00164   OP_TYPESTAR,       /* The maximizing and minimizing versions of */
00165   OP_TYPEMINSTAR,    /* all these opcodes must come in pairs, with */
00166   OP_TYPEPLUS,       /* the minimizing one second. These codes must */
00167   OP_TYPEMINPLUS,    /* be in exactly the same order as those above. */
00168   OP_TYPEQUERY,      /* This set applies to character types such as \d */
00169   OP_TYPEMINQUERY,
00170   OP_TYPEUPTO,       /* From 0 to n matches */
00171   OP_TYPEMINUPTO,
00172   OP_TYPEEXACT,      /* Exactly n matches */
00173 
00174   OP_CRSTAR,         /* The maximizing and minimizing versions of */
00175   OP_CRMINSTAR,      /* all these opcodes must come in pairs, with */
00176   OP_CRPLUS,         /* the minimizing one second. These codes must */
00177   OP_CRMINPLUS,      /* be in exactly the same order as those above. */
00178   OP_CRQUERY,        /* These are for character classes and back refs */
00179   OP_CRMINQUERY,
00180   OP_CRRANGE,        /* These are different to the three sets above. */
00181   OP_CRMINRANGE,
00182 
00183   OP_CLASS,          /* Match a character class */
00184   OP_REF,            /* Match a back reference */
00185 
00186   OP_ALT,            /* Start of alternation */
00187   OP_KET,            /* End of group that doesn't have an unbounded repeat */
00188   OP_KETRMAX,        /* These two must remain together and in this */
00189   OP_KETRMIN,        /* order. They are for groups that repeat for ever. */
00190 
00191   /* The assertions must come before ONCE and COND */
00192 
00193   OP_ASSERT,         /* Positive lookahead */
00194   OP_ASSERT_NOT,     /* Negative lookahead */
00195   OP_ASSERTBACK,     /* Positive lookbehind */
00196   OP_ASSERTBACK_NOT, /* Negative lookbehind */
00197   OP_REVERSE,        /* Move pointer back - used in lookbehind assertions */
00198 
00199   /* ONCE and COND must come after the assertions, with ONCE first, as there's
00200   a test for >= ONCE for a subpattern that isn't an assertion. */
00201 
00202   OP_ONCE,           /* Once matched, don't back up into the subpattern */
00203   OP_COND,           /* Conditional group */
00204   OP_CREF,           /* Used to hold an extraction string number */
00205 
00206   OP_BRAZERO,        /* These two must remain together and in this */
00207   OP_BRAMINZERO,     /* order. */
00208 
00209   OP_BRA             /* This and greater values are used for brackets that
00210                         extract substrings. */
00211 };
00212 
00213 /* The highest extraction number. This is limited by the number of opcodes
00214 left after OP_BRA, i.e. 255 - OP_BRA. We actually set it somewhat lower. */
00215 
00216 #define EXTRACT_MAX  99
00217 
00218 /* The texts of compile-time error messages are defined as macros here so that
00219 they can be accessed by the POSIX wrapper and converted into error codes.  Yes,
00220 I could have used error codes in the first place, but didn't feel like changing
00221 just to accommodate the POSIX wrapper. */
00222 
00223 #define ERR1  "\\ at end of pattern"
00224 #define ERR2  "\\c at end of pattern"
00225 #define ERR3  "unrecognized character follows \\"
00226 #define ERR4  "numbers out of order in {} quantifier"
00227 #define ERR5  "number too big in {} quantifier"
00228 #define ERR6  "missing terminating ] for character class"
00229 #define ERR7  "invalid escape sequence in character class"
00230 #define ERR8  "range out of order in character class"
00231 #define ERR9  "nothing to repeat"
00232 #define ERR10 "operand of unlimited repeat could match the empty string"
00233 #define ERR11 "internal error: unexpected repeat"
00234 #define ERR12 "unrecognized character after (?"
00235 #define ERR13 "too many capturing parenthesized sub-patterns"
00236 #define ERR14 "missing )"
00237 #define ERR15 "back reference to non-existent subpattern"
00238 #define ERR16 "erroffset passed as NULL"
00239 #define ERR17 "unknown option bit(s) set"
00240 #define ERR18 "missing ) after comment"
00241 #define ERR19 "too many sets of parentheses"
00242 #define ERR20 "regular expression too large"
00243 #define ERR21 "failed to get memory"
00244 #define ERR22 "unmatched parentheses"
00245 #define ERR23 "internal error: code overflow"
00246 #define ERR24 "unrecognized character after (?<"
00247 #define ERR25 "lookbehind assertion is not fixed length"
00248 #define ERR26 "malformed number after (?("
00249 #define ERR27 "conditional group contains more than two branches"
00250 #define ERR28 "assertion expected after (?("
00251 
00252 /* All character handling must be done as unsigned characters. Otherwise there
00253 are problems with top-bit-set characters and functions such as isspace().
00254 However, we leave the interface to the outside world as char *, because that
00255 should make things easier for callers. We define a short type for unsigned char
00256 to save lots of typing. I tried "uchar", but it causes problems on Digital
00257 Unix, where it is defined in sys/types, so use "uschar" instead. */
00258 
00259 typedef unsigned char uschar;
00260 
00261 /* The real format of the start of the pcre block; the actual code vector
00262 runs on as long as necessary after the end. */
00263 
00264 typedef struct real_pcre {
00265   unsigned long int magic_number;   /* presumably holds MAGIC_NUMBER as a check against junk - confirm in the compile code */
00266   const unsigned char *tables;      /* presumably the base tables pointer that the *_offset macros are relative to - confirm */
00267   unsigned short int options;       /* two bytes: public options at the low end, private PCRE_* flags at the high end */
00268   unsigned char top_bracket;        /* presumably the highest extracting bracket number - confirm */
00269   unsigned char top_backref;        /* presumably the highest back reference number - confirm */
00270   unsigned char first_char;         /* PCRE_FIRSTSET flags that this has been set */
00271   unsigned char code[1];            /* start of the code vector, which runs on past the end of the struct */
00272 } real_pcre;
00273 
00274 /* The real format of the extra block returned by pcre_study(). */
00275 
00276 typedef struct real_pcre_extra {
00277   unsigned char options;          /* PCRE_STUDY_* flags for this block, e.g. PCRE_STUDY_MAPPED */
00278   unsigned char start_bits[32];   /* 32 bytes (256 bits with 8-bit bytes); presumably the starting-char map flagged by PCRE_STUDY_MAPPED - confirm */
00279 } real_pcre_extra;
00280 
00281 
00282 /* Structure for passing "static" information around between the functions
00283 doing the compiling, so that they are thread-safe. */
00284 
00285 typedef struct compile_data {
00286   const uschar *lcc;            /* Points to lower casing table */
00287   const uschar *fcc;            /* Points to case-flipping table */
00288   const uschar *cbits;          /* Points to character type table (class bitmaps; see the cbit_* offsets) */
00289   const uschar *ctypes;         /* Points to table of type maps */
00290 } compile_data;
00291 
00292 /* Structure for passing "static" information around between the functions
00293 doing the matching, so that they are thread-safe. */
00294 
00295 typedef struct match_data {
00296   int    errorcode;             /* Error code recorded during matching */
00297   int   *offset_vector;         /* Offset vector */
00298   int    offset_end;            /* One past the end (presumably of offset_vector - confirm) */
00299   int    offset_max;            /* The maximum usable for return data */
00300   const uschar *lcc;            /* Points to lower casing table */
00301   const uschar *ctypes;         /* Points to table of type maps */
00302   BOOL   offset_overflow;       /* Set if too many extractions */
00303   BOOL   notbol;                /* NOTBOL flag */
00304   BOOL   noteol;                /* NOTEOL flag */
00305   BOOL   endonly;               /* Dollar not before final \n */
00306   const uschar *start_subject;  /* Start of the subject string */
00307   const uschar *end_subject;    /* End of the subject string */
00308   const uschar *end_match_ptr;  /* Subject position at end of match */
00309   int     end_offset_top;       /* Highwater mark at end of match */
00310 } match_data;
00311 
00312 /* Bit definitions for entries in the pcre_ctypes table. */
00313 
00314 #define ctype_space   0x01
00315 #define ctype_letter  0x02
00316 #define ctype_digit   0x04
00317 #define ctype_xdigit  0x08
00318 #define ctype_word    0x10   /* alphameric or '_' */
00319 #define ctype_meta    0x80   /* regexp meta char or zero (end pattern) */
00320 
00321 /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
00322 of bits for a class map. */
00323 
00324 #define cbit_digit    0      /* for \d */
00325 #define cbit_word    32      /* for \w */
00326 #define cbit_space   64      /* for \s */
00327 #define cbit_length  96      /* Length of the cbits table */
00328 
00329 /* Offsets of the various tables from the base tables pointer, and
00330 total length. */
00331 
00332 #define lcc_offset      0   /* lower casing table: 256 bytes */
00333 #define fcc_offset    256   /* case-flipping table: 256 bytes */
00334 #define cbits_offset  512   /* class bitmaps: cbit_length bytes */
00335 #define ctypes_offset (cbits_offset + cbit_length)   /* type maps: 256 bytes, starting at 608 */
00336 #define tables_length (ctypes_offset + 256)   /* total size of all four tables: 864 bytes */
00337 
00338 /* End of pcreinternal.h */

Generated on Thu Mar 28 02:43:59 2024 for VMD (current) by doxygen1.2.14 written by Dimitri van Heesch, © 1997-2002