Blame - jni/libpcre/sources/pcre_compile.c - jami-client-android

blob: ca07db1d15957a6e24ae087a122471dddf2ed2fc [file] [log] [blame]

Tristan Matthews	0461646	2013-11-14 16:09:34 -0500	[diff] [blame]	1	/*************************************************
				2	* Perl-Compatible Regular Expressions *
				3	*************************************************/
				4
				5	/* PCRE is a library of functions to support regular expressions whose syntax
				6	and semantics are as close as possible to those of the Perl 5 language.
				7
				8	Written by Philip Hazel
				9	Copyright (c) 1997-2011 University of Cambridge
				10
				11	-----------------------------------------------------------------------------
				12	Redistribution and use in source and binary forms, with or without
				13	modification, are permitted provided that the following conditions are met:
				14
				15	* Redistributions of source code must retain the above copyright notice,
				16	this list of conditions and the following disclaimer.
				17
				18	* Redistributions in binary form must reproduce the above copyright
				19	notice, this list of conditions and the following disclaimer in the
				20	documentation and/or other materials provided with the distribution.
				21
				22	* Neither the name of the University of Cambridge nor the names of its
				23	contributors may be used to endorse or promote products derived from
				24	this software without specific prior written permission.
				25
				26	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
				27	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
				28	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
				29	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
				30	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
				31	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
				32	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
				33	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
				34	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
				35	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
				36	POSSIBILITY OF SUCH DAMAGE.
				37	-----------------------------------------------------------------------------
				38	*/
				39
				40
				41	/* This module contains the external function pcre_compile(), along with
				42	supporting internal functions that are not used by other modules. */
				43
				44
				45	#ifdef HAVE_CONFIG_H
				46	#include "config.h"
				47	#endif
				48
				49	#define NLBLOCK cd /* Block containing newline information */
				50	#define PSSTART start_pattern /* Field containing processed string start */
				51	#define PSEND end_pattern /* Field containing processed string end */
				52
				53	#include "pcre_internal.h"
				54
				55
				56	/* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
				57	also used by pcretest. PCRE_DEBUG is not defined when building a production
				58	library. */
				59
				60	#ifdef PCRE_DEBUG
				61	#include "pcre_printint.src"
				62	#endif
				63
				64
				65	/* Macro for setting individual bits in class bitmaps. */
				66
				/* Set bit number b in the byte-array bitmap a (8 bits per element). The
				arguments and the whole expansion are parenthesized so that an expression
				argument such as SETBIT(map, c + 1) selects the intended byte and bit. Note
				that b is evaluated twice, so it must not have side effects. */
				#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
				68
				69	/* Maximum length value to check against when making sure that the integer that
				70	holds the compiled pattern length does not overflow. We make it a bit less than
				71	INT_MAX to allow for adding in group terminating bytes, so that we don't have
				72	to check them every time. */
				73
				74	#define OFLOW_MAX (INT_MAX - 20)
				75
				76
				77	/*************************************************
				78	* Code parameters and static tables *
				79	*************************************************/
				80
				81	/* This value specifies the size of stack workspace that is used during the
				82	first pre-compile phase that determines how much memory is required. The regex
				83	is partly compiled into this space, but the compiled parts are discarded as
				84	soon as they can be, so that hopefully there will never be an overrun. The code
				85	does, however, check for an overrun. The largest amount I've seen used is 218,
				86	so this number is very generous.
				87
				88	The same workspace is used during the second, actual compile phase for
				89	remembering forward references to groups so that they can be filled in at the
				90	end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
				91	is 4 there is plenty of room for most patterns. However, the memory can get
				92	filled up by repetitions of forward references, for example patterns like
				93	/(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
				94	that the workspace is expanded using malloc() in this situation. The value
				95	below is therefore a minimum, and we put a maximum on it for safety. The
				96	minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
				97	kicks in at the same number of forward references in all cases. */
				98
				99	#define COMPILE_WORK_SIZE (2048*LINK_SIZE)
				100	#define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
				101
				102	/* The overrun tests check for a slightly smaller size so that they detect the
				103	overrun before it actually does run off the end of the data block. */
				104
				105	#define WORK_SIZE_SAFETY_MARGIN (100)
				106
				107
				108	/* Table for handling escaped characters in the range '0'-'z'. Positive returns
				109	are simple data values; negative values are for special things like \d and so
				110	on. Zero means further processing is needed (for things like \x), or the escape
				111	is invalid. */
				112
				113	#ifndef EBCDIC
				114
				115	/* This is the "normal" table for ASCII systems or for EBCDIC systems running
				116	in UTF-8 mode. */
				117
				/* Escape lookup for ASCII/UTF-8 builds. check_escape() indexes this table with
				(c - CHAR_0), so the entries run from '0' to 'z', two per row. The trailing
				comment on each row names the two characters that row covers. */
				118	static const short int escapes[] = {
				119	0, 0, /* '0' '1' */
				120	0, 0, /* '2' '3' */
				121	0, 0, /* '4' '5' */
				122	0, 0, /* '6' '7' */
				123	0, 0, /* '8' '9' */
				124	CHAR_COLON, CHAR_SEMICOLON, /* ':' ';' */
				125	CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* '<' '=' */
				126	CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* '>' '?' */
				127	CHAR_COMMERCIAL_AT, -ESC_A, /* '@' 'A' */
				128	-ESC_B, -ESC_C, /* 'B' 'C' */
				129	-ESC_D, -ESC_E, /* 'D' 'E' */
				130	0, -ESC_G, /* 'F' 'G' */
				131	-ESC_H, 0, /* 'H' 'I' */
				132	0, -ESC_K, /* 'J' 'K' */
				133	0, 0, /* 'L' 'M' */
				134	-ESC_N, 0, /* 'N' 'O' */
				135	-ESC_P, -ESC_Q, /* 'P' 'Q' */
				136	-ESC_R, -ESC_S, /* 'R' 'S' */
				137	0, 0, /* 'T' 'U' */
				138	-ESC_V, -ESC_W, /* 'V' 'W' */
				139	-ESC_X, 0, /* 'X' 'Y' */
				140	-ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* 'Z' '[' */
				141	CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* backslash ']' */
				142	CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* '^' '_' */
				143	CHAR_GRAVE_ACCENT, 7, /* '`' 'a' (data value 7 for \a) */
				144	-ESC_b, 0, /* 'b' 'c' */
				145	-ESC_d, ESC_e, /* 'd' 'e' */
				146	ESC_f, 0, /* 'f' 'g' */
				147	-ESC_h, 0, /* 'h' 'i' */
				148	0, -ESC_k, /* 'j' 'k' */
				149	0, 0, /* 'l' 'm' */
				150	ESC_n, 0, /* 'n' 'o' */
				151	-ESC_p, 0, /* 'p' 'q' */
				152	ESC_r, -ESC_s, /* 'r' 's' */
				153	ESC_tee, 0, /* 't' 'u' */
				154	-ESC_v, -ESC_w, /* 'v' 'w' */
				155	0, 0, /* 'x' 'y' */
				156	-ESC_z /* 'z' */
				157	};
				158
				159	#else
				160
				161	/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
				162
				163	static const short int escapes[] = {
				164	/* 48 */ 0, 0, 0, '.', '<', '(', '+', '\|',
				165	/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
				166	/* 58 / 0, 0, '!', '$', '', ')', ';', '~',
				167	/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
				168	/* 68 */ 0, 0, '\|', ',', '%', '_', '>', '?',
				169	/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
				170	/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
				171	/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
				172	/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
				173	/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
				174	/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
				175	/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
				176	/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
				177	/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
				178	/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
				179	/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
				180	/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
				181	/* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
				182	/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
				183	/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
				184	/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
				185	/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
				186	/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
				187	};
				188	#endif
				189
				190
				191	/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
				192	searched linearly. Put all the names into a single string, in order to reduce
				193	the number of relocations when a shared library is dynamically linked. The
				194	string is built from string macros so that it works in UTF-8 mode on EBCDIC
				195	platforms. */
				196
				/* One row of the verbs[] table: the length of a verb name plus the opcodes
				to use when the verb appears without and with an argument. */
				197	typedef struct verbitem {
				198	int len; /* Length of verb name */
				199	int op; /* Op when no arg, or -1 if arg mandatory */
				200	int op_arg; /* Op when arg present, or -1 if not allowed */
				201	} verbitem;
				202
				/* All verb names concatenated into one string. The order of the names lines
				up with the rows of verbs[] (the name lengths there match, row for row).
				NOTE(review): the STRING_xxx0 macros are defined elsewhere; presumably each
				ends with a NUL separator - confirm in pcre_internal.h. */
				203	static const char verbnames[] =
				204	"\0" /* Empty name is a shorthand for MARK */
				205	STRING_MARK0
				206	STRING_ACCEPT0
				207	STRING_COMMIT0
				208	STRING_F0
				209	STRING_FAIL0
				210	STRING_PRUNE0
				211	STRING_SKIP0
				212	STRING_THEN;
				213
				/* One row per name in verbnames, in the same order. Fields are { name length,
				opcode without argument (-1 if an argument is mandatory), opcode with argument
				(-1 if an argument is not allowed) }. */
				214	static const verbitem verbs[] = {
				215	{ 0, -1, OP_MARK }, /* empty name, shorthand for MARK */
				216	{ 4, -1, OP_MARK }, /* MARK */
				217	{ 6, OP_ACCEPT, -1 }, /* ACCEPT */
				218	{ 6, OP_COMMIT, -1 }, /* COMMIT */
				219	{ 1, OP_FAIL, -1 }, /* F */
				220	{ 4, OP_FAIL, -1 }, /* FAIL */
				221	{ 5, OP_PRUNE, OP_PRUNE_ARG }, /* PRUNE */
				222	{ 4, OP_SKIP, OP_SKIP_ARG }, /* SKIP */
				223	{ 4, OP_THEN, OP_THEN_ARG } /* THEN */
				224	};
				225
				/* Number of rows in verbs[], computed at compile time. */
				226	static const int verbcount = sizeof(verbs)/sizeof(verbitem);
				227
				228
				229	/* Tables of names of POSIX character classes and their lengths. The names are
				230	now all in a single string, to reduce the number of relocations when a shared
				231	library is dynamically loaded. The list of lengths is terminated by a zero
				232	length entry. The first three must be alpha, lower, upper, as this is assumed
				233	for handling case independence. */
				234
				/* POSIX class names packed into one string, in the same order as the entries
				of posix_name_lengths. alpha, lower and upper must stay first. */
				235	static const char posix_names[] =
				236	STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
				237	STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
				238	STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
				239	STRING_word0 STRING_xdigit;
				240
				/* Length of each name in posix_names, in order: twelve five-letter names
				(alpha ... space), then word (4) and xdigit (6). The final 0 terminates the
				list. */
				241	static const uschar posix_name_lengths[] = {
				242	5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
				243
				244	/* Table of class bit maps for each POSIX class. Each class is formed from a
				245	base map, with an optional addition or removal of another map. Then, for some
				246	classes, there is some additional tweaking: for [:blank:] the vertical space
				247	characters are removed, and for [:alpha:] and [:alnum:] the underscore
				248	character is removed. The triples in the table consist of the base map offset,
				249	second map offset or -1 if no second map, and a non-negative value for map
				250	addition or a negative value for map subtraction (if there are two maps). The
				251	absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
				252	remove vertical space characters, 2 => remove underscore. */
				253
				/* Three ints per POSIX class, in posix_names order: { base map offset, second
				map offset or -1, tweak code }. A negative third value means the second map is
				subtracted rather than added; its absolute value selects the tweak (0 none,
				1 remove vertical space, 2 remove underscore). */
				254	static const int posix_class_maps[] = {
				255	cbit_word, cbit_digit, -2, /* alpha */
				256	cbit_lower, -1, 0, /* lower */
				257	cbit_upper, -1, 0, /* upper */
				258	cbit_word, -1, 2, /* alnum - word without underscore */
				259	cbit_print, cbit_cntrl, 0, /* ascii */
				260	cbit_space, -1, 1, /* blank - a GNU extension */
				261	cbit_cntrl, -1, 0, /* cntrl */
				262	cbit_digit, -1, 0, /* digit */
				263	cbit_graph, -1, 0, /* graph */
				264	cbit_print, -1, 0, /* print */
				265	cbit_punct, -1, 0, /* punct */
				266	cbit_space, -1, 0, /* space */
				267	cbit_word, -1, 0, /* word - a Perl extension */
				268	cbit_xdigit,-1, 0 /* xdigit */
				269	};
				270
				271	/* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
				272	substitutes must be in the order of the names, defined above, and there are
				273	both positive and negative cases. NULL means no substitute. */
				274
				275	#ifdef SUPPORT_UCP
				276	static const uschar *substitutes[] = {
				277	(uschar )"\\P{Nd}", / \D */
				278	(uschar )"\\p{Nd}", / \d */
				279	(uschar )"\\P{Xsp}", / \S / / NOTE: Xsp is Perl space */
				280	(uschar )"\\p{Xsp}", / \s */
				281	(uschar )"\\P{Xwd}", / \W */
				282	(uschar )"\\p{Xwd}" / \w */
				283	};
				284
				285	static const uschar *posix_substitutes[] = {
				286	(uschar )"\\p{L}", / alpha */
				287	(uschar )"\\p{Ll}", / lower */
				288	(uschar )"\\p{Lu}", / upper */
				289	(uschar )"\\p{Xan}", / alnum */
				290	NULL, /* ascii */
				291	(uschar )"\\h", / blank */
				292	NULL, /* cntrl */
				293	(uschar )"\\p{Nd}", / digit */
				294	NULL, /* graph */
				295	NULL, /* print */
				296	NULL, /* punct */
				297	(uschar )"\\p{Xps}", / space / / NOTE: Xps is POSIX space */
				298	(uschar )"\\p{Xwd}", / word */
				299	NULL, /* xdigit */
				300	/* Negated cases */
				301	(uschar )"\\P{L}", / ^alpha */
				302	(uschar )"\\P{Ll}", / ^lower */
				303	(uschar )"\\P{Lu}", / ^upper */
				304	(uschar )"\\P{Xan}", / ^alnum */
				305	NULL, /* ^ascii */
				306	(uschar )"\\H", / ^blank */
				307	NULL, /* ^cntrl */
				308	(uschar )"\\P{Nd}", / ^digit */
				309	NULL, /* ^graph */
				310	NULL, /* ^print */
				311	NULL, /* ^punct */
				312	(uschar )"\\P{Xps}", / ^space / / NOTE: Xps is POSIX space */
				313	(uschar )"\\P{Xwd}", / ^word */
				314	NULL /* ^xdigit */
				315	};
				316	#define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
				317	#endif
				318
				319	#define STRING(a) # a
				320	#define XSTRING(s) STRING(s)
				321
				322	/* The texts of compile-time error messages. These are "char *" because they
				323	are passed to the outside world. Do not ever re-use any error number, because
				324	they are documented. Always add a new error instead. Messages marked DEAD below
				325	are no longer used. This used to be a table of strings, but in order to reduce
				326	the number of relocations needed when a shared library is loaded dynamically,
				327	it is now one long string. We cannot use a table of offsets, because the
				328	lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
				329	simply count through to the one we want - this isn't a performance issue
				330	because these strings are used only when there is a compilation error.
				331
				332	Each substring ends with \0 to insert a null character. This includes the final
				333	substring, so that the whole string ends with \0\0, which can be detected when
				334	counting through. */
				335
				336	static const char error_texts[] =
				337	"no error\0"
				338	"\\ at end of pattern\0"
				339	"\\c at end of pattern\0"
				340	"unrecognized character follows \\\0"
				341	"numbers out of order in {} quantifier\0"
				342	/* 5 */
				343	"number too big in {} quantifier\0"
				344	"missing terminating ] for character class\0"
				345	"invalid escape sequence in character class\0"
				346	"range out of order in character class\0"
				347	"nothing to repeat\0"
				348	/* 10 */
				349	"operand of unlimited repeat could match the empty string\0" / DEAD /
				350	"internal error: unexpected repeat\0"
				351	"unrecognized character after (? or (?-\0"
				352	"POSIX named classes are supported only within a class\0"
				353	"missing )\0"
				354	/* 15 */
				355	"reference to non-existent subpattern\0"
				356	"erroffset passed as NULL\0"
				357	"unknown option bit(s) set\0"
				358	"missing ) after comment\0"
				359	"parentheses nested too deeply\0" / DEAD /
				360	/* 20 */
				361	"regular expression is too large\0"
				362	"failed to get memory\0"
				363	"unmatched parentheses\0"
				364	"internal error: code overflow\0"
				365	"unrecognized character after (?<\0"
				366	/* 25 */
				367	"lookbehind assertion is not fixed length\0"
				368	"malformed number or name after (?(\0"
				369	"conditional group contains more than two branches\0"
				370	"assertion expected after (?(\0"
				371	"(?R or (?[+-]digits must be followed by )\0"
				372	/* 30 */
				373	"unknown POSIX class name\0"
				374	"POSIX collating elements are not supported\0"
				375	"this version of PCRE is not compiled with PCRE_UTF8 support\0"
				376	"spare error\0" / DEAD /
				377	"character value in \\x{...} sequence is too large\0"
				378	/* 35 */
				379	"invalid condition (?(0)\0"
				380	"\\C not allowed in lookbehind assertion\0"
				381	"PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
				382	"number after (?C is > 255\0"
				383	"closing ) for (?C expected\0"
				384	/* 40 */
				385	"recursive call could loop indefinitely\0"
				386	"unrecognized character after (?P\0"
				387	"syntax error in subpattern name (missing terminator)\0"
				388	"two named subpatterns have the same name\0"
				389	"invalid UTF-8 string\0"
				390	/* 45 */
				391	"support for \\P, \\p, and \\X has not been compiled\0"
				392	"malformed \\P or \\p sequence\0"
				393	"unknown property name after \\P or \\p\0"
				394	"subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
				395	"too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
				396	/* 50 */
				397	"repeated subpattern is too long\0" / DEAD /
				398	"octal value is greater than \\377 (not in UTF-8 mode)\0"
				399	"internal error: overran compiling workspace\0"
				400	"internal error: previously-checked referenced subpattern not found\0"
				401	"DEFINE group contains more than one branch\0"
				402	/* 55 */
				403	"repeating a DEFINE group is not allowed\0" / DEAD /
				404	"inconsistent NEWLINE options\0"
				405	"\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
				406	"a numbered reference must not be zero\0"
				407	"an argument is not allowed for (ACCEPT), (FAIL), or (*COMMIT)\0"
				408	/* 60 */
				409	"(*VERB) not recognized\0"
				410	"number is too big\0"
				411	"subpattern name expected\0"
				412	"digit expected after (?+\0"
				413	"] is an invalid data character in JavaScript compatibility mode\0"
				414	/* 65 */
				415	"different names for subpatterns of the same number are not allowed\0"
				416	"(*MARK) must have an argument\0"
				417	"this version of PCRE is not compiled with PCRE_UCP support\0"
				418	"\\c must be followed by an ASCII character\0"
				419	"\\k is not followed by a braced, angle-bracketed, or quoted name\0"
				420	/* 70 */
				421	"internal error: unknown opcode in find_fixedlength()\0"
				422	"\\N is not supported in a class\0"
				423	"too many forward references\0"
				424	;
				425
				426	/* Table to identify digits and hex digits. This is used when compiling
				427	patterns. Note that the tables in chartables are dependent on the locale, and
				428	may mark arbitrary characters as digits - but the PCRE compiling code expects
				429	to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
				430	a private table here. It costs 256 bytes, but it is a lot faster than doing
				431	character value tests (at least in some simple cases I timed), and in some
				432	applications one wants PCRE to compile efficiently as well as match
				433	efficiently.
				434
				435	For convenience, we use the same bit definitions as in chartables:
				436
				437	0x04 decimal digit
				438	0x08 hexadecimal digit
				439
				440	Then we can use ctype_digit and ctype_xdigit in the code. */
				441
				442	#ifndef EBCDIC
				443
				444	/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
				445	UTF-8 mode. */
				446
				/* Indexed by character value (256 entries). 0x04 marks a decimal digit and
				0x08 a hexadecimal digit, so 0x0c flags '0'-'9' as both and 0x08 flags
				'A'-'F' and 'a'-'f' as hexadecimal only. */
				447	static const unsigned char digitab[] =
				448	{
				449	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
				450	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
				451	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
				452	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
				453	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* space - ' */
				454	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
				455	0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
				456	0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
				457	0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
				458	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
				459	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
				460	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
				461	0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
				462	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
				463	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
				464	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
				465	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
				466	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
				467	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
				468	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
				469	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
				470	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
				471	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
				472	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
				473	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
				474	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
				475	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
				476	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
				477	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
				478	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
				479	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
				480	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
				481
				482	#else
				483
				484	/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
				485
				/* EBCDIC variant, indexed by character value (256 entries). 0x04 marks a
				decimal digit and 0x08 a hexadecimal digit, so 0x0c is both. The third column
				of some row comments is the row's starting code in hexadecimal. */
				486	static const unsigned char digitab[] =
				487	{
				488	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
				489	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
				490	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
				491	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
				492	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
				493	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
				494	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
				495	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
				496	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
				497	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
				498	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
				499	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
				500	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
				501	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
				502	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
				503	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
				504	0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
				505	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
				506	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
				507	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
				508	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
				509	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
				510	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
				511	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
				512	0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
				513	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
				514	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
				515	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
				516	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
				517	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
				518	0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
				519	0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
				520
				/* Character-type flags for EBCDIC builds, indexed by character value.
				check_escape() masks an entry with 0x0E to decide whether a character is
				alphanumeric and therefore needs a lookup in escapes[].
				NOTE(review): the meaning of the individual bits is not shown here;
				presumably they follow the ctype_xxx definitions used by chartables -
				confirm in pcre_internal.h. */
				521	static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
				522	0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
				523	0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
				524	0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
				525	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
				526	0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
				527	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
				528	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
				529	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
				530	0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
				531	0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
				532	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
				533	0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
				534	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
				535	0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
				536	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
				537	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
				538	0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
				539	0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
				540	0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
				541	0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
				542	0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
				543	0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
				544	0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
				545	0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
				546	0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
				547	0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
				548	0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
				549	0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
				550	0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
				551	0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
				552	0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
				553	0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
				554	#endif
				555
				556
				557	/* Definition to allow mutual recursion */
				558
				559	static BOOL
				560	compile_regex(int, uschar , const uschar , int *, BOOL, BOOL, int, int,
				561	int , int , branch_chain , compile_data , int *);
				562
				563
				564
				565	/*************************************************
				566	* Find an error text *
				567	*************************************************/
				568
				569	/* The error texts are now all in one long string, to save on relocations. As
				570	some of the text is of unknown length, we can't use a table of offsets.
				571	Instead, just count through the strings. This is not a performance issue
				572	because it happens only when there has been a compilation error.
				573
				574	Argument: the error number
				575	Returns: pointer to the error string
				576	*/
				577
				578	static const char *
				579	find_error_text(int n)
				580	{
				581	const char *s = error_texts;
				582	for (; n > 0; n--)
				583	{
				584	while (*s++ != 0) {};
				585	if (*s == 0) return "Error text not found (please report)";
				586	}
				587	return s;
				588	}
				589
				590
				591	/*************************************************
				592	* Expand the workspace *
				593	*************************************************/
				594
				595	/* This function is called during the second compiling phase, if the number of
				596	forward references fills the existing workspace, which is originally a block on
				597	the stack. A larger block is obtained from malloc() unless the ultimate limit
				598	has been reached or the increase will be rather small.
				599
				600	Argument: pointer to the compile data block
				601	Returns: 0 if all went well, else an error number
				602	*/
				603
				604	static int
				605	expand_workspace(compile_data *cd)
				606	{
				607	uschar *newspace;
				608	int newsize = cd->workspace_size * 2;
				609
				610	if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
				611	if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX \|\|
				612	newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
				613	return ERR72;
				614
				615	newspace = (pcre_malloc)(newsize);
				616	if (newspace == NULL) return ERR21;
				617
				618	memcpy(newspace, cd->start_workspace, cd->workspace_size);
				619	cd->hwm = (uschar *)newspace + (cd->hwm - cd->start_workspace);
				620	if (cd->workspace_size > COMPILE_WORK_SIZE)
				621	(pcre_free)((void *)cd->start_workspace);
				622	cd->start_workspace = newspace;
				623	cd->workspace_size = newsize;
				624	return 0;
				625	}
				626
				627
				628
				629	/*************************************************
				630	* Check for counted repeat *
				631	*************************************************/
				632
				633	/* This function is called when a '{' is encountered in a place where it might
				634	start a quantifier. It looks ahead to see if it really is a quantifier or not.
				635	It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
				636	where the ddds are digits.
				637
				638	Arguments:
				639	p pointer to the first char after '{'
				640
				641	Returns: TRUE or FALSE
				642	*/
				643
				644	static BOOL
				645	is_counted_repeat(const uschar *p)
				646	{
				647	if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
				648	while ((digitab[*p] & ctype_digit) != 0) p++;
				649	if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
				650
				651	if (*p++ != CHAR_COMMA) return FALSE;
				652	if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
				653
				654	if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
				655	while ((digitab[*p] & ctype_digit) != 0) p++;
				656
				657	return (*p == CHAR_RIGHT_CURLY_BRACKET);
				658	}
				659
				660
				661
				662	/*************************************************
				663	* Handle escapes *
				664	*************************************************/
				665
				666	/* This function is called when a \ has been encountered. It either returns a
				667	positive value for a simple escape such as \n, or a negative value which
				668	encodes one of the more complicated things such as \d. A backreference to group
				669	n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
				670	UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
				671	ptr is pointing at the \. On exit, it is on the final character of the escape
				672	sequence.
				673
				674	Arguments:
				675	ptrptr points to the pattern position pointer
				676	errorcodeptr points to the errorcode variable
				677	bracount number of previous extracting brackets
				678	options the options bits
				679	isclass TRUE if inside a character class
				680
				681	Returns: zero or positive => a data character
				682	negative => a special escape sequence
				683	on error, errorcodeptr is set
				684	*/
				685
				686	static int
				687	check_escape(const uschar *ptrptr, int errorcodeptr, int bracount,
				688	int options, BOOL isclass)
				689	{
				690	BOOL utf8 = (options & PCRE_UTF8) != 0;
				691	const uschar ptr = ptrptr + 1;
				692	int c, i;
				693
				694	GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
				695	ptr--; /* Set pointer back to the last byte */
				696
				697	/* If backslash is at the end of the pattern, it's an error. */
				698
				699	if (c == 0) *errorcodeptr = ERR1;
				700
				701	/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
				702	in a table. A non-zero result is something that can be returned immediately.
				703	Otherwise further processing may be required. */
				704
				705	#ifndef EBCDIC /* ASCII/UTF-8 coding */
				706	else if (c < CHAR_0 \|\| c > CHAR_z) {} /* Not alphanumeric */
				707	else if ((i = escapes[c - CHAR_0]) != 0) c = i;
				708
				709	#else /* EBCDIC coding */
				710	else if (c < 'a' \|\| (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
				711	else if ((i = escapes[c - 0x48]) != 0) c = i;
				712	#endif
				713
				714	/* Escapes that need further processing, or are illegal. */
				715
				716	else
				717	{
				718	const uschar *oldptr;
				719	BOOL braced, negated;
				720
				721	switch (c)
				722	{
				723	/* A number of Perl escapes are not handled by PCRE. We give an explicit
				724	error. */
				725
				726	case CHAR_l:
				727	case CHAR_L:
				728	*errorcodeptr = ERR37;
				729	break;
				730
				731	case CHAR_u:
				732	if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
				733	{
				734	/* In JavaScript, \u must be followed by four hexadecimal numbers.
				735	Otherwise it is a lowercase u letter. */
				736	if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0
				737	&& (digitab[ptr[3]] & ctype_xdigit) != 0 && (digitab[ptr[4]] & ctype_xdigit) != 0)
				738	{
				739	c = 0;
				740	for (i = 0; i < 4; ++i)
				741	{
				742	register int cc = *(++ptr);
				743	#ifndef EBCDIC /* ASCII/UTF-8 coding */
				744	if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
				745	c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
				746	#else /* EBCDIC coding */
				747	if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
				748	c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
				749	#endif
				750	}
				751	}
				752	}
				753	else
				754	*errorcodeptr = ERR37;
				755	break;
				756
				757	case CHAR_U:
				758	/* In JavaScript, \U is an uppercase U letter. */
				759	if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
				760	break;
				761
				762	/* In a character class, \g is just a literal "g". Outside a character
				763	class, \g must be followed by one of a number of specific things:
				764
				765	(1) A number, either plain or braced. If positive, it is an absolute
				766	backreference. If negative, it is a relative backreference. This is a Perl
				767	5.10 feature.
				768
				769	(2) Perl 5.10 also supports \g{name} as a reference to a named group. This
				770	is part of Perl's movement towards a unified syntax for back references. As
				771	this is synonymous with \k{name}, we fudge it up by pretending it really
				772	was \k.
				773
				774	(3) For Oniguruma compatibility we also support \g followed by a name or a
				775	number either in angle brackets or in single quotes. However, these are
				776	(possibly recursive) subroutine calls, _not_ backreferences. Just return
				777	the -ESC_g code (cf \k). */
				778
				779	case CHAR_g:
				780	if (isclass) break;
				781	if (ptr[1] == CHAR_LESS_THAN_SIGN \|\| ptr[1] == CHAR_APOSTROPHE)
				782	{
				783	c = -ESC_g;
				784	break;
				785	}
				786
				787	/* Handle the Perl-compatible cases */
				788
				789	if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
				790	{
				791	const uschar *p;
				792	for (p = ptr+2; p != 0 && p != CHAR_RIGHT_CURLY_BRACKET; p++)
				793	if (p != CHAR_MINUS && (digitab[p] & ctype_digit) == 0) break;
				794	if (p != 0 && p != CHAR_RIGHT_CURLY_BRACKET)
				795	{
				796	c = -ESC_k;
				797	break;
				798	}
				799	braced = TRUE;
				800	ptr++;
				801	}
				802	else braced = FALSE;
				803
				804	if (ptr[1] == CHAR_MINUS)
				805	{
				806	negated = TRUE;
				807	ptr++;
				808	}
				809	else negated = FALSE;
				810
				811	c = 0;
				812	while ((digitab[ptr[1]] & ctype_digit) != 0)
				813	c = c * 10 + *(++ptr) - CHAR_0;
				814
				815	if (c < 0) /* Integer overflow */
				816	{
				817	*errorcodeptr = ERR61;
				818	break;
				819	}
				820
				821	if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
				822	{
				823	*errorcodeptr = ERR57;
				824	break;
				825	}
				826
				827	if (c == 0)
				828	{
				829	*errorcodeptr = ERR58;
				830	break;
				831	}
				832
				833	if (negated)
				834	{
				835	if (c > bracount)
				836	{
				837	*errorcodeptr = ERR15;
				838	break;
				839	}
				840	c = bracount - (c - 1);
				841	}
				842
				843	c = -(ESC_REF + c);
				844	break;
				845
				846	/* The handling of escape sequences consisting of a string of digits
				847	starting with one that is not zero is not straightforward. By experiment,
				848	the way Perl works seems to be as follows:
				849
				850	Outside a character class, the digits are read as a decimal number. If the
				851	number is less than 10, or if there are that many previous extracting
				852	left brackets, then it is a back reference. Otherwise, up to three octal
				853	digits are read to form an escaped byte. Thus \123 is likely to be octal
				854	123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
				855	value is greater than 377, the least significant 8 bits are taken. Inside a
				856	character class, \ followed by a digit is always an octal number. */
				857
				858	case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
				859	case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
				860
				861	if (!isclass)
				862	{
				863	oldptr = ptr;
				864	c -= CHAR_0;
				865	while ((digitab[ptr[1]] & ctype_digit) != 0)
				866	c = c * 10 + *(++ptr) - CHAR_0;
				867	if (c < 0) /* Integer overflow */
				868	{
				869	*errorcodeptr = ERR61;
				870	break;
				871	}
				872	if (c < 10 \|\| c <= bracount)
				873	{
				874	c = -(ESC_REF + c);
				875	break;
				876	}
				877	ptr = oldptr; /* Put the pointer back and fall through */
				878	}
				879
				880	/* Handle an octal number following \. If the first digit is 8 or 9, Perl
				881	generates a binary zero byte and treats the digit as a following literal.
				882	Thus we have to pull back the pointer by one. */
				883
				884	if ((c = *ptr) >= CHAR_8)
				885	{
				886	ptr--;
				887	c = 0;
				888	break;
				889	}
				890
				891	/* \0 always starts an octal number, but we may drop through to here with a
				892	larger first octal digit. The original code used just to take the least
				893	significant 8 bits of octal numbers (I think this is what early Perls used
				894	to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
				895	than 3 octal digits. */
				896
				897	case CHAR_0:
				898	c -= CHAR_0;
				899	while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
				900	c = c * 8 + *(++ptr) - CHAR_0;
				901	if (!utf8 && c > 255) *errorcodeptr = ERR51;
				902	break;
				903
				904	/* \x is complicated. \x{ddd} is a character number which can be greater
				905	than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
				906	treated as a data character. */
				907
				908	case CHAR_x:
				909	if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
				910	{
				911	/* In JavaScript, \x must be followed by two hexadecimal numbers.
				912	Otherwise it is a lowercase x letter. */
				913	if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0)
				914	{
				915	c = 0;
				916	for (i = 0; i < 2; ++i)
				917	{
				918	register int cc = *(++ptr);
				919	#ifndef EBCDIC /* ASCII/UTF-8 coding */
				920	if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
				921	c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
				922	#else /* EBCDIC coding */
				923	if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
				924	c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
				925	#endif
				926	}
				927	}
				928	break;
				929	}
				930
				931	if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
				932	{
				933	const uschar *pt = ptr + 2;
				934	int count = 0;
				935
				936	c = 0;
				937	while ((digitab[*pt] & ctype_xdigit) != 0)
				938	{
				939	register int cc = *pt++;
				940	if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
				941	count++;
				942
				943	#ifndef EBCDIC /* ASCII/UTF-8 coding */
				944	if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
				945	c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
				946	#else /* EBCDIC coding */
				947	if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
				948	c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
				949	#endif
				950	}
				951
				952	if (*pt == CHAR_RIGHT_CURLY_BRACKET)
				953	{
				954	if (c < 0 \|\| count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
				955	ptr = pt;
				956	break;
				957	}
				958
				959	/* If the sequence of hex digits does not end with '}', then we don't
				960	recognize this construct; fall through to the normal \x handling. */
				961	}
				962
				963	/* Read just a single-byte hex-defined char */
				964
				965	c = 0;
				966	while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
				967	{
				968	int cc; /* Some compilers don't like */
				969	cc = (++ptr); / ++ in initializers */
				970	#ifndef EBCDIC /* ASCII/UTF-8 coding */
				971	if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
				972	c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
				973	#else /* EBCDIC coding */
				974	if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
				975	c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
				976	#endif
				977	}
				978	break;
				979
				980	/* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
				981	An error is given if the byte following \c is not an ASCII character. This
				982	coding is ASCII-specific, but then the whole concept of \cx is
				983	ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
				984
				985	case CHAR_c:
				986	c = *(++ptr);
				987	if (c == 0)
				988	{
				989	*errorcodeptr = ERR2;
				990	break;
				991	}
				992	#ifndef EBCDIC /* ASCII/UTF-8 coding */
				993	if (c > 127) /* Excludes all non-ASCII in either mode */
				994	{
				995	*errorcodeptr = ERR68;
				996	break;
				997	}
				998	if (c >= CHAR_a && c <= CHAR_z) c -= 32;
				999	c ^= 0x40;
				1000	#else /* EBCDIC coding */
				1001	if (c >= CHAR_a && c <= CHAR_z) c += 64;
				1002	c ^= 0xC0;
				1003	#endif
				1004	break;
				1005
				1006	/* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
				1007	other alphanumeric following \ is an error if PCRE_EXTRA was set;
				1008	otherwise, for Perl compatibility, it is a literal. This code looks a bit
				1009	odd, but there used to be some cases other than the default, and there may
				1010	be again in future, so I haven't "optimized" it. */
				1011
				1012	default:
				1013	if ((options & PCRE_EXTRA) != 0) switch(c)
				1014	{
				1015	default:
				1016	*errorcodeptr = ERR3;
				1017	break;
				1018	}
				1019	break;
				1020	}
				1021	}
				1022
				1023	/* Perl supports \N{name} for character names, as well as plain \N for "not
				1024	newline". PCRE does not support \N{name}. However, it does support
				1025	quantification such as \N{2,3}. */
				1026
				1027	if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
				1028	!is_counted_repeat(ptr+2))
				1029	*errorcodeptr = ERR37;
				1030
				1031	/* If PCRE_UCP is set, we change the values for \d etc. */
				1032
				1033	if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
				1034	c -= (ESC_DU - ESC_D);
				1035
				1036	/* Set the pointer to the final character before returning. */
				1037
				1038	*ptrptr = ptr;
				1039	return c;
				1040	}
				1041
				1042
				1043
				1044	#ifdef SUPPORT_UCP
				1045	/*************************************************
				1046	* Handle \P and \p *
				1047	*************************************************/
				1048
				1049	/* This function is called after \P or \p has been encountered, provided that
				1050	PCRE is compiled with support for Unicode properties. On entry, ptrptr is
				1051	pointing at the P or p. On exit, it is pointing at the final character of the
				1052	escape sequence.
				1053
				1054	Argument:
				1055	ptrptr points to the pattern position pointer
				1056	negptr points to a boolean that is set TRUE for negation else FALSE
				1057	dptr points to an int that is set to the detailed property value
				1058	errorcodeptr points to the error code variable
				1059
				1060	Returns: type value from ucp_type_table, or -1 for an invalid type
				1061	*/
				1062
				1063	static int
				1064	get_ucp(const uschar *ptrptr, BOOL negptr, int dptr, int errorcodeptr)
				1065	{
				1066	int c, i, bot, top;
				1067	const uschar ptr = ptrptr;
				1068	char name[32];
				1069
				1070	c = *(++ptr);
				1071	if (c == 0) goto ERROR_RETURN;
				1072
				1073	*negptr = FALSE;
				1074
				1075	/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
				1076	negation. */
				1077
				1078	if (c == CHAR_LEFT_CURLY_BRACKET)
				1079	{
				1080	if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
				1081	{
				1082	*negptr = TRUE;
				1083	ptr++;
				1084	}
				1085	for (i = 0; i < (int)sizeof(name) - 1; i++)
				1086	{
				1087	c = *(++ptr);
				1088	if (c == 0) goto ERROR_RETURN;
				1089	if (c == CHAR_RIGHT_CURLY_BRACKET) break;
				1090	name[i] = c;
				1091	}
				1092	if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
				1093	name[i] = 0;
				1094	}
				1095
				1096	/* Otherwise there is just one following character */
				1097
				1098	else
				1099	{
				1100	name[0] = c;
				1101	name[1] = 0;
				1102	}
				1103
				1104	*ptrptr = ptr;
				1105
				1106	/* Search for a recognized property name using binary chop */
				1107
				1108	bot = 0;
				1109	top = _pcre_utt_size;
				1110
				1111	while (bot < top)
				1112	{
				1113	i = (bot + top) >> 1;
				1114	c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
				1115	if (c == 0)
				1116	{
				1117	*dptr = _pcre_utt[i].value;
				1118	return _pcre_utt[i].type;
				1119	}
				1120	if (c > 0) bot = i + 1; else top = i;
				1121	}
				1122
				1123	*errorcodeptr = ERR47;
				1124	*ptrptr = ptr;
				1125	return -1;
				1126
				1127	ERROR_RETURN:
				1128	*errorcodeptr = ERR46;
				1129	*ptrptr = ptr;
				1130	return -1;
				1131	}
				1132	#endif
				1133
				1134
				1135
				1136
				1137	/*************************************************
				1138	* Read repeat counts *
				1139	*************************************************/
				1140
				1141	/* Read an item of the form {n,m} and return the values. This is called only
				1142	after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
				1143	so the syntax is guaranteed to be correct, but we need to check the values.
				1144
				1145	Arguments:
				1146	p pointer to first char after '{'
				1147	minp pointer to int for min
				1148	maxp pointer to int for max
				1149	returned as -1 if no max
				1150	errorcodeptr points to error code variable
				1151
				1152	Returns: pointer to '}' on success;
				1153	current ptr on error, with errorcodeptr set non-zero
				1154	*/
				1155
				1156	static const uschar *
				1157	read_repeat_counts(const uschar p, int minp, int maxp, int errorcodeptr)
				1158	{
				1159	int min = 0;
				1160	int max = -1;
				1161
				1162	/* Read the minimum value and do a paranoid check: a negative value indicates
				1163	an integer overflow. */
				1164
				1165	while ((digitab[p] & ctype_digit) != 0) min = min 10 + *p++ - CHAR_0;
				1166	if (min < 0 \|\| min > 65535)
				1167	{
				1168	*errorcodeptr = ERR5;
				1169	return p;
				1170	}
				1171
				1172	/* Read the maximum value if there is one, and again do a paranoid on its size.
				1173	Also, max must not be less than min. */
				1174
				1175	if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
				1176	{
				1177	if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
				1178	{
				1179	max = 0;
				1180	while((digitab[p] & ctype_digit) != 0) max = max 10 + *p++ - CHAR_0;
				1181	if (max < 0 \|\| max > 65535)
				1182	{
				1183	*errorcodeptr = ERR5;
				1184	return p;
				1185	}
				1186	if (max < min)
				1187	{
				1188	*errorcodeptr = ERR4;
				1189	return p;
				1190	}
				1191	}
				1192	}
				1193
				1194	/* Fill in the required variables, and pass back the pointer to the terminating
				1195	'}'. */
				1196
				1197	*minp = min;
				1198	*maxp = max;
				1199	return p;
				1200	}
				1201
				1202
				1203
				1204	/*************************************************
				1205	* Subroutine for finding forward reference *
				1206	*************************************************/
				1207
				1208	/* This recursive function is called only from find_parens() below. The
				1209	top-level call starts at the beginning of the pattern. All other calls must
				1210	start at a parenthesis. It scans along a pattern's text looking for capturing
				1211	subpatterns, and counting them. If it finds a named pattern that matches the
				1212	name it is given, it returns its number. Alternatively, if the name is NULL, it
				1213	returns when it reaches a given numbered subpattern. Recursion is used to keep
				1214	track of subpatterns that reset the capturing group numbers - the (?| feature.
				1215
				1216	This function was originally called only from the second pass, in which we know
				1217	that if (?< or (?' or (?P< is encountered, the name will be correctly
				1218	terminated because that is checked in the first pass. There is now one call to
				1219	this function in the first pass, to check for a recursive back reference by
				1220	name (so that we can make the whole group atomic). In this case, we need check
				1221	only up to the current position in the pattern, and that is still OK because
				1222	any previous occurrences will have been checked. To make this work, the test
				1223	for "end of pattern" is a check against cd->end_pattern in the main loop,
				1224	instead of looking for a binary zero. This means that the special first-pass
				1225	call can adjust cd->end_pattern temporarily. (Checks for binary zero while
				1226	processing items within the loop are OK, because afterwards the main loop will
				1227	terminate.)
				1228
				1229	Arguments:
				1230	ptrptr address of the current character pointer (updated)
				1231	cd compile background data
				1232	name name to seek, or NULL if seeking a numbered subpattern
				1233	lorn name length, or subpattern number if name is NULL
				1234	xmode TRUE if we are in /x mode
				1235	utf8 TRUE if we are in UTF-8 mode
				1236	count pointer to the current capturing subpattern number (updated)
				1237
				1238	Returns: the number of the named subpattern, or -1 if not found
				1239	*/
				1240
				1241	static int
				1242	find_parens_sub(uschar *ptrptr, compile_data cd, const uschar *name, int lorn,
				1243	BOOL xmode, BOOL utf8, int *count)
				1244	{
				1245	uschar ptr = ptrptr;
				1246	int start_count = *count;
				1247	int hwm_count = start_count;
				1248	BOOL dup_parens = FALSE;
				1249
				1250	/* If the first character is a parenthesis, check on the type of group we are
				1251	dealing with. The very first call may not start with a parenthesis. */
				1252
				1253	if (ptr[0] == CHAR_LEFT_PARENTHESIS)
				1254	{
				1255	/* Handle specials such as (SKIP) or (UTF8) etc. */
				1256
				1257	if (ptr[1] == CHAR_ASTERISK) ptr += 2;
				1258
				1259	/* Handle a normal, unnamed capturing parenthesis. */
				1260
				1261	else if (ptr[1] != CHAR_QUESTION_MARK)
				1262	{
				1263	*count += 1;
				1264	if (name == NULL && count == lorn) return count;
				1265	ptr++;
				1266	}
				1267
				1268	/* All cases now have (? at the start. Remember when we are in a group
				1269	where the parenthesis numbers are duplicated. */
				1270
				1271	else if (ptr[2] == CHAR_VERTICAL_LINE)
				1272	{
				1273	ptr += 3;
				1274	dup_parens = TRUE;
				1275	}
				1276
				1277	/* Handle comments; all characters are allowed until a ket is reached. */
				1278
				1279	else if (ptr[2] == CHAR_NUMBER_SIGN)
				1280	{
				1281	for (ptr += 3; ptr != 0; ptr++) if (ptr == CHAR_RIGHT_PARENTHESIS) break;
				1282	goto FAIL_EXIT;
				1283	}
				1284
				1285	/* Handle a condition. If it is an assertion, just carry on so that it
				1286	is processed as normal. If not, skip to the closing parenthesis of the
				1287	condition (there can't be any nested parens). */
				1288
				1289	else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
				1290	{
				1291	ptr += 2;
				1292	if (ptr[1] != CHAR_QUESTION_MARK)
				1293	{
				1294	while (ptr != 0 && ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
				1295	if (*ptr != 0) ptr++;
				1296	}
				1297	}
				1298
				1299	/* Start with (? but not a condition. */
				1300
				1301	else
				1302	{
				1303	ptr += 2;
				1304	if (ptr == CHAR_P) ptr++; / Allow optional P */
				1305
				1306	/* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
				1307
				1308	if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
				1309	ptr[1] != CHAR_EQUALS_SIGN) \|\| *ptr == CHAR_APOSTROPHE)
				1310	{
				1311	int term;
				1312	const uschar *thisname;
				1313	*count += 1;
				1314	if (name == NULL && count == lorn) return count;
				1315	term = *ptr++;
				1316	if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
				1317	thisname = ptr;
				1318	while (*ptr != term) ptr++;
				1319	if (name != NULL && lorn == ptr - thisname &&
				1320	strncmp((const char )name, (const char )thisname, lorn) == 0)
				1321	return *count;
				1322	term++;
				1323	}
				1324	}
				1325	}
				1326
				1327	/* Past any initial parenthesis handling, scan for parentheses or vertical
				1328	bars. Stop if we get to cd->end_pattern. Note that this is important for the
				1329	first-pass call when this value is temporarily adjusted to stop at the current
				1330	position. So DO NOT change this to a test for binary zero. */
				1331
				1332	for (; ptr < cd->end_pattern; ptr++)
				1333	{
				1334	/* Skip over backslashed characters and also entire \Q...\E */
				1335
				1336	if (*ptr == CHAR_BACKSLASH)
				1337	{
				1338	if (*(++ptr) == 0) goto FAIL_EXIT;
				1339	if (*ptr == CHAR_Q) for (;;)
				1340	{
				1341	while ((++ptr) != 0 && ptr != CHAR_BACKSLASH) {};
				1342	if (*ptr == 0) goto FAIL_EXIT;
				1343	if (*(++ptr) == CHAR_E) break;
				1344	}
				1345	continue;
				1346	}
				1347
				1348	/* Skip over character classes; this logic must be similar to the way they
				1349	are handled for real. If the first character is '^', skip it. Also, if the
				1350	first few characters (either before or after ^) are \Q\E or \E we skip them
				1351	too. This makes for compatibility with Perl. Note the use of STR macros to
				1352	encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
				1353
				1354	if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
				1355	{
				1356	BOOL negate_class = FALSE;
				1357	for (;;)
				1358	{
				1359	if (ptr[1] == CHAR_BACKSLASH)
				1360	{
				1361	if (ptr[2] == CHAR_E)
				1362	ptr+= 2;
				1363	else if (strncmp((const char *)ptr+2,
				1364	STR_Q STR_BACKSLASH STR_E, 3) == 0)
				1365	ptr += 4;
				1366	else
				1367	break;
				1368	}
				1369	else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
				1370	{
				1371	negate_class = TRUE;
				1372	ptr++;
				1373	}
				1374	else break;
				1375	}
				1376
				1377	/* If the next character is ']', it is a data character that must be
				1378	skipped, except in JavaScript compatibility mode. */
				1379
				1380	if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
				1381	(cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
				1382	ptr++;
				1383
				1384	while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
				1385	{
				1386	if (*ptr == 0) return -1;
				1387	if (*ptr == CHAR_BACKSLASH)
				1388	{
				1389	if (*(++ptr) == 0) goto FAIL_EXIT;
				1390	if (*ptr == CHAR_Q) for (;;)
				1391	{
				1392	while ((++ptr) != 0 && ptr != CHAR_BACKSLASH) {};
				1393	if (*ptr == 0) goto FAIL_EXIT;
				1394	if (*(++ptr) == CHAR_E) break;
				1395	}
				1396	continue;
				1397	}
				1398	}
				1399	continue;
				1400	}
				1401
				1402	/* Skip comments in /x mode */
				1403
				1404	if (xmode && *ptr == CHAR_NUMBER_SIGN)
				1405	{
				1406	ptr++;
				1407	while (*ptr != 0)
				1408	{
				1409	if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
				1410	ptr++;
				1411	#ifdef SUPPORT_UTF8
				1412	if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
				1413	#endif
				1414	}
				1415	if (*ptr == 0) goto FAIL_EXIT;
				1416	continue;
				1417	}
				1418
				1419	/* Check for the special metacharacters */
				1420
				1421	if (*ptr == CHAR_LEFT_PARENTHESIS)
				1422	{
				1423	int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
				1424	if (rc > 0) return rc;
				1425	if (*ptr == 0) goto FAIL_EXIT;
				1426	}
				1427
				1428	else if (*ptr == CHAR_RIGHT_PARENTHESIS)
				1429	{
				1430	if (dup_parens && count < hwm_count) count = hwm_count;
				1431	goto FAIL_EXIT;
				1432	}
				1433
				1434	else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
				1435	{
				1436	if (count > hwm_count) hwm_count = count;
				1437	*count = start_count;
				1438	}
				1439	}
				1440
				1441	FAIL_EXIT:
				1442	*ptrptr = ptr;
				1443	return -1;
				1444	}
				1445
				1446
				1447
				1448
				1449	/*************************************************
				1450	* Find forward referenced subpattern *
				1451	*************************************************/
				1452
				1453	/* This function scans along a pattern's text looking for capturing
				1454	subpatterns, and counting them. If it finds a named pattern that matches the
				1455	name it is given, it returns its number. Alternatively, if the name is NULL, it
				1456	returns when it reaches a given numbered subpattern. This is used for forward
				1457	references to subpatterns. We used to be able to start this scan from the
				1458	current compiling point, using the current count value from cd->bracount, and
				1459	do it all in a single loop, but the addition of the possibility of duplicate
				1460	subpattern numbers means that we have to scan from the very start, in order to
				1461	take account of such duplicates, and to use a recursive function to keep track
				1462	of the different types of group.
				1463
				1464	Arguments:
				1465	cd compile background data
				1466	name name to seek, or NULL if seeking a numbered subpattern
				1467	lorn name length, or subpattern number if name is NULL
				1468	xmode TRUE if we are in /x mode
				1469	utf8 TRUE if we are in UTF-8 mode
				1470
				1471	Returns: the number of the found subpattern, or -1 if not found
				1472	*/
				1473
				1474	static int
				1475	find_parens(compile_data cd, const uschar name, int lorn, BOOL xmode,
				1476	BOOL utf8)
				1477	{
				1478	uschar ptr = (uschar )cd->start_pattern;
				1479	int count = 0;
				1480	int rc;
				1481
				1482	/* If the pattern does not start with an opening parenthesis, the first call
				1483	to find_parens_sub() will scan right to the end (if necessary). However, if it
				1484	does start with a parenthesis, find_parens_sub() will return when it hits the
				1485	matching closing parens. That is why we have to have a loop. */
				1486
				1487	for (;;)
				1488	{
				1489	rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
				1490	if (rc > 0 \|\| *ptr++ == 0) break;
				1491	}
				1492
				1493	return rc;
				1494	}
				1495
				1496
				1497
				1498
				1499	/*************************************************
				1500	* Find first significant op code *
				1501	*************************************************/
				1502
				1503	/* This is called by several functions that scan a compiled expression looking
				1504	for a fixed first character, or an anchoring op code etc. It skips over things
				1505	that do not influence this. For some calls, it makes sense to skip negative
				1506	forward and all backward assertions, and also the \b assertion; for others it
				1507	does not.
				1508
				1509	Arguments:
				1510	code pointer to the start of the group
				1511	skipassert TRUE if certain assertions are to be skipped
				1512
				1513	Returns: pointer to the first significant opcode
				1514	*/
				1515
				1516	static const uschar*
				1517	first_significant_code(const uschar *code, BOOL skipassert)
				1518	{
				1519	for (;;)
				1520	{
				1521	switch ((int)*code)
				1522	{
				1523	case OP_ASSERT_NOT:
				1524	case OP_ASSERTBACK:
				1525	case OP_ASSERTBACK_NOT:
				1526	if (!skipassert) return code;
				1527	do code += GET(code, 1); while (*code == OP_ALT);
				1528	code += _pcre_OP_lengths[*code];
				1529	break;
				1530
				1531	case OP_WORD_BOUNDARY:
				1532	case OP_NOT_WORD_BOUNDARY:
				1533	if (!skipassert) return code;
				1534	/* Fall through */
				1535
				1536	case OP_CALLOUT:
				1537	case OP_CREF:
				1538	case OP_NCREF:
				1539	case OP_RREF:
				1540	case OP_NRREF:
				1541	case OP_DEF:
				1542	code += _pcre_OP_lengths[*code];
				1543	break;
				1544
				1545	default:
				1546	return code;
				1547	}
				1548	}
				1549	/* Control never reaches here */
				1550	}
				1551
				1552
				1553
				1554
				1555	/*************************************************
				1556	* Find the fixed length of a branch *
				1557	*************************************************/
				1558
				1559	/* Scan a branch and compute the fixed length of subject that will match it,
				1560	if the length is fixed. This is needed for dealing with backward assertions.
				1561	In UTF8 mode, the result is in characters rather than bytes. The branch is
				1562	temporarily terminated with OP_END when this function is called.
				1563
				1564	This function is called when a backward assertion is encountered, so that if it
				1565	fails, the error message can point to the correct place in the pattern.
				1566	However, we cannot do this when the assertion contains subroutine calls,
				1567	because they can be forward references. We solve this by remembering this case
				1568	and doing the check at the end; a flag specifies which mode we are running in.
				1569
				1570	Arguments:
				1571	code points to the start of the pattern (the bracket)
				1572	utf8 TRUE in UTF-8 mode
				1573	atend TRUE if called when the pattern is complete
				1574	cd the "compile data" structure
				1575
				1576	Returns: the fixed length,
				1577	or -1 if there is no fixed length,
				1578	or -2 if \C was encountered (in UTF-8 mode only)
				1579	or -3 if an OP_RECURSE item was encountered and atend is FALSE
				1580	or -4 if an unknown opcode was encountered (internal error)
				1581	*/
				1582
				1583	static int
				1584	find_fixedlength(uschar code, BOOL utf8, BOOL atend, compile_data cd)
				1585	{
				1586	int length = -1;
				1587
				1588	register int branchlength = 0;
				1589	register uschar *cc = code + 1 + LINK_SIZE;
				1590
				1591	/* Scan along the opcodes for this branch. If we get to the end of the
				1592	branch, check the length against that of the other branches. */
				1593
				1594	for (;;)
				1595	{
				1596	int d;
				1597	uschar ce, cs;
				1598	register int op = *cc;
				1599	switch (op)
				1600	{
				1601	/* We only need to continue for OP_CBRA (normal capturing bracket) and
				1602	OP_BRA (normal non-capturing bracket) because the other variants of these
				1603	opcodes are all concerned with unlimited repeated groups, which of course
				1604	are not of fixed length. */
				1605
				1606	case OP_CBRA:
				1607	case OP_BRA:
				1608	case OP_ONCE:
				1609	case OP_ONCE_NC:
				1610	case OP_COND:
				1611	d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);
				1612	if (d < 0) return d;
				1613	branchlength += d;
				1614	do cc += GET(cc, 1); while (*cc == OP_ALT);
				1615	cc += 1 + LINK_SIZE;
				1616	break;
				1617
				1618	/* Reached end of a branch; if it's a ket it is the end of a nested call.
				1619	If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
				1620	an ALT. If it is END it's the end of the outer call. All can be handled by
				1621	the same code. Note that we must not include the OP_KETRxxx opcodes here,
				1622	because they all imply an unlimited repeat. */
				1623
				1624	case OP_ALT:
				1625	case OP_KET:
				1626	case OP_END:
				1627	case OP_ACCEPT:
				1628	case OP_ASSERT_ACCEPT:
				1629	if (length < 0) length = branchlength;
				1630	else if (length != branchlength) return -1;
				1631	if (*cc != OP_ALT) return length;
				1632	cc += 1 + LINK_SIZE;
				1633	branchlength = 0;
				1634	break;
				1635
				1636	/* A true recursion implies not fixed length, but a subroutine call may
				1637	be OK. If the subroutine is a forward reference, we can't deal with
				1638	it until the end of the pattern, so return -3. */
				1639
				1640	case OP_RECURSE:
				1641	if (!atend) return -3;
				1642	cs = ce = (uschar )cd->start_code + GET(cc, 1); / Start subpattern */
				1643	do ce += GET(ce, 1); while (ce == OP_ALT); / End subpattern */
				1644	if (cc > cs && cc < ce) return -1; /* Recursion */
				1645	d = find_fixedlength(cs + 2, utf8, atend, cd);
				1646	if (d < 0) return d;
				1647	branchlength += d;
				1648	cc += 1 + LINK_SIZE;
				1649	break;
				1650
				1651	/* Skip over assertive subpatterns */
				1652
				1653	case OP_ASSERT:
				1654	case OP_ASSERT_NOT:
				1655	case OP_ASSERTBACK:
				1656	case OP_ASSERTBACK_NOT:
				1657	do cc += GET(cc, 1); while (*cc == OP_ALT);
				1658	/* Fall through */
				1659
				1660	/* Skip over things that don't match chars */
				1661
				1662	case OP_MARK:
				1663	case OP_PRUNE_ARG:
				1664	case OP_SKIP_ARG:
				1665	case OP_THEN_ARG:
				1666	cc += cc[1] + _pcre_OP_lengths[*cc];
				1667	break;
				1668
				1669	case OP_CALLOUT:
				1670	case OP_CIRC:
				1671	case OP_CIRCM:
				1672	case OP_CLOSE:
				1673	case OP_COMMIT:
				1674	case OP_CREF:
				1675	case OP_DEF:
				1676	case OP_DOLL:
				1677	case OP_DOLLM:
				1678	case OP_EOD:
				1679	case OP_EODN:
				1680	case OP_FAIL:
				1681	case OP_NCREF:
				1682	case OP_NRREF:
				1683	case OP_NOT_WORD_BOUNDARY:
				1684	case OP_PRUNE:
				1685	case OP_REVERSE:
				1686	case OP_RREF:
				1687	case OP_SET_SOM:
				1688	case OP_SKIP:
				1689	case OP_SOD:
				1690	case OP_SOM:
				1691	case OP_THEN:
				1692	case OP_WORD_BOUNDARY:
				1693	cc += _pcre_OP_lengths[*cc];
				1694	break;
				1695
				1696	/* Handle literal characters */
				1697
				1698	case OP_CHAR:
				1699	case OP_CHARI:
				1700	case OP_NOT:
				1701	case OP_NOTI:
				1702	branchlength++;
				1703	cc += 2;
				1704	#ifdef SUPPORT_UTF8
				1705	if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
				1706	#endif
				1707	break;
				1708
				1709	/* Handle exact repetitions. The count is already in characters, but we
				1710	need to skip over a multibyte character in UTF8 mode. */
				1711
				1712	case OP_EXACT:
				1713	case OP_EXACTI:
				1714	case OP_NOTEXACT:
				1715	case OP_NOTEXACTI:
				1716	branchlength += GET2(cc,1);
				1717	cc += 4;
				1718	#ifdef SUPPORT_UTF8
				1719	if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
				1720	#endif
				1721	break;
				1722
				1723	case OP_TYPEEXACT:
				1724	branchlength += GET2(cc,1);
				1725	if (cc[3] == OP_PROP \|\| cc[3] == OP_NOTPROP) cc += 2;
				1726	cc += 4;
				1727	break;
				1728
				1729	/* Handle single-char matchers */
				1730
				1731	case OP_PROP:
				1732	case OP_NOTPROP:
				1733	cc += 2;
				1734	/* Fall through */
				1735
				1736	case OP_HSPACE:
				1737	case OP_VSPACE:
				1738	case OP_NOT_HSPACE:
				1739	case OP_NOT_VSPACE:
				1740	case OP_NOT_DIGIT:
				1741	case OP_DIGIT:
				1742	case OP_NOT_WHITESPACE:
				1743	case OP_WHITESPACE:
				1744	case OP_NOT_WORDCHAR:
				1745	case OP_WORDCHAR:
				1746	case OP_ANY:
				1747	case OP_ALLANY:
				1748	branchlength++;
				1749	cc++;
				1750	break;
				1751
				1752	/* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
				1753	otherwise \C is coded as OP_ALLANY. */
				1754
				1755	case OP_ANYBYTE:
				1756	return -2;
				1757
				1758	/* Check a class for variable quantification */
				1759
				1760	#ifdef SUPPORT_UTF8
				1761	case OP_XCLASS:
				1762	cc += GET(cc, 1) - 33;
				1763	/* Fall through */
				1764	#endif
				1765
				1766	case OP_CLASS:
				1767	case OP_NCLASS:
				1768	cc += 33;
				1769
				1770	switch (*cc)
				1771	{
				1772	case OP_CRPLUS:
				1773	case OP_CRMINPLUS:
				1774	case OP_CRSTAR:
				1775	case OP_CRMINSTAR:
				1776	case OP_CRQUERY:
				1777	case OP_CRMINQUERY:
				1778	return -1;
				1779
				1780	case OP_CRRANGE:
				1781	case OP_CRMINRANGE:
				1782	if (GET2(cc,1) != GET2(cc,3)) return -1;
				1783	branchlength += GET2(cc,1);
				1784	cc += 5;
				1785	break;
				1786
				1787	default:
				1788	branchlength++;
				1789	}
				1790	break;
				1791
				1792	/* Anything else is variable length */
				1793
				1794	case OP_ANYNL:
				1795	case OP_BRAMINZERO:
				1796	case OP_BRAPOS:
				1797	case OP_BRAPOSZERO:
				1798	case OP_BRAZERO:
				1799	case OP_CBRAPOS:
				1800	case OP_EXTUNI:
				1801	case OP_KETRMAX:
				1802	case OP_KETRMIN:
				1803	case OP_KETRPOS:
				1804	case OP_MINPLUS:
				1805	case OP_MINPLUSI:
				1806	case OP_MINQUERY:
				1807	case OP_MINQUERYI:
				1808	case OP_MINSTAR:
				1809	case OP_MINSTARI:
				1810	case OP_MINUPTO:
				1811	case OP_MINUPTOI:
				1812	case OP_NOTMINPLUS:
				1813	case OP_NOTMINPLUSI:
				1814	case OP_NOTMINQUERY:
				1815	case OP_NOTMINQUERYI:
				1816	case OP_NOTMINSTAR:
				1817	case OP_NOTMINSTARI:
				1818	case OP_NOTMINUPTO:
				1819	case OP_NOTMINUPTOI:
				1820	case OP_NOTPLUS:
				1821	case OP_NOTPLUSI:
				1822	case OP_NOTPOSPLUS:
				1823	case OP_NOTPOSPLUSI:
				1824	case OP_NOTPOSQUERY:
				1825	case OP_NOTPOSQUERYI:
				1826	case OP_NOTPOSSTAR:
				1827	case OP_NOTPOSSTARI:
				1828	case OP_NOTPOSUPTO:
				1829	case OP_NOTPOSUPTOI:
				1830	case OP_NOTQUERY:
				1831	case OP_NOTQUERYI:
				1832	case OP_NOTSTAR:
				1833	case OP_NOTSTARI:
				1834	case OP_NOTUPTO:
				1835	case OP_NOTUPTOI:
				1836	case OP_PLUS:
				1837	case OP_PLUSI:
				1838	case OP_POSPLUS:
				1839	case OP_POSPLUSI:
				1840	case OP_POSQUERY:
				1841	case OP_POSQUERYI:
				1842	case OP_POSSTAR:
				1843	case OP_POSSTARI:
				1844	case OP_POSUPTO:
				1845	case OP_POSUPTOI:
				1846	case OP_QUERY:
				1847	case OP_QUERYI:
				1848	case OP_REF:
				1849	case OP_REFI:
				1850	case OP_SBRA:
				1851	case OP_SBRAPOS:
				1852	case OP_SCBRA:
				1853	case OP_SCBRAPOS:
				1854	case OP_SCOND:
				1855	case OP_SKIPZERO:
				1856	case OP_STAR:
				1857	case OP_STARI:
				1858	case OP_TYPEMINPLUS:
				1859	case OP_TYPEMINQUERY:
				1860	case OP_TYPEMINSTAR:
				1861	case OP_TYPEMINUPTO:
				1862	case OP_TYPEPLUS:
				1863	case OP_TYPEPOSPLUS:
				1864	case OP_TYPEPOSQUERY:
				1865	case OP_TYPEPOSSTAR:
				1866	case OP_TYPEPOSUPTO:
				1867	case OP_TYPEQUERY:
				1868	case OP_TYPESTAR:
				1869	case OP_TYPEUPTO:
				1870	case OP_UPTO:
				1871	case OP_UPTOI:
				1872	return -1;
				1873
				1874	/* Catch unrecognized opcodes so that when new ones are added they
				1875	are not forgotten, as has happened in the past. */
				1876
				1877	default:
				1878	return -4;
				1879	}
				1880	}
				1881	/* Control never gets here */
				1882	}
				1883
				1884
				1885
				1886
				1887	/*************************************************
				1888	* Scan compiled regex for specific bracket *
				1889	*************************************************/
				1890
				1891	/* This little function scans through a compiled pattern until it finds a
				1892	capturing bracket with the given number, or, if the number is negative, an
				1893	instance of OP_REVERSE for a lookbehind. The function is global in the C sense
				1894	so that it can be called from pcre_study() when finding the minimum matching
				1895	length.
				1896
				1897	Arguments:
				1898	code points to start of expression
				1899	utf8 TRUE in UTF-8 mode
				1900	number the required bracket number or negative to find a lookbehind
				1901
				1902	Returns: pointer to the opcode for the bracket, or NULL if not found
				1903	*/
				1904
				1905	const uschar *
				1906	_pcre_find_bracket(const uschar *code, BOOL utf8, int number)
				1907	{
				1908	for (;;)
				1909	{
				1910	register int c = *code;
				1911
				1912	if (c == OP_END) return NULL;
				1913
				1914	/* XCLASS is used for classes that cannot be represented just by a bit
				1915	map. This includes negated single high-valued characters. The length in
				1916	the table is zero; the actual length is stored in the compiled code. */
				1917
				1918	if (c == OP_XCLASS) code += GET(code, 1);
				1919
				1920	/* Handle recursion */
				1921
				1922	else if (c == OP_REVERSE)
				1923	{
				1924	if (number < 0) return (uschar *)code;
				1925	code += _pcre_OP_lengths[c];
				1926	}
				1927
				1928	/* Handle capturing bracket */
				1929
				1930	else if (c == OP_CBRA \|\| c == OP_SCBRA \|\|
				1931	c == OP_CBRAPOS \|\| c == OP_SCBRAPOS)
				1932	{
				1933	int n = GET2(code, 1+LINK_SIZE);
				1934	if (n == number) return (uschar *)code;
				1935	code += _pcre_OP_lengths[c];
				1936	}
				1937
				1938	/* Otherwise, we can get the item's length from the table, except that for
				1939	repeated character types, we have to test for \p and \P, which have an extra
				1940	two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
				1941	must add in its length. */
				1942
				1943	else
				1944	{
				1945	switch(c)
				1946	{
				1947	case OP_TYPESTAR:
				1948	case OP_TYPEMINSTAR:
				1949	case OP_TYPEPLUS:
				1950	case OP_TYPEMINPLUS:
				1951	case OP_TYPEQUERY:
				1952	case OP_TYPEMINQUERY:
				1953	case OP_TYPEPOSSTAR:
				1954	case OP_TYPEPOSPLUS:
				1955	case OP_TYPEPOSQUERY:
				1956	if (code[1] == OP_PROP \|\| code[1] == OP_NOTPROP) code += 2;
				1957	break;
				1958
				1959	case OP_TYPEUPTO:
				1960	case OP_TYPEMINUPTO:
				1961	case OP_TYPEEXACT:
				1962	case OP_TYPEPOSUPTO:
				1963	if (code[3] == OP_PROP \|\| code[3] == OP_NOTPROP) code += 2;
				1964	break;
				1965
				1966	case OP_MARK:
				1967	case OP_PRUNE_ARG:
				1968	case OP_SKIP_ARG:
				1969	code += code[1];
				1970	break;
				1971
				1972	case OP_THEN_ARG:
				1973	code += code[1];
				1974	break;
				1975	}
				1976
				1977	/* Add in the fixed length from the table */
				1978
				1979	code += _pcre_OP_lengths[c];
				1980
				1981	/* In UTF-8 mode, opcodes that are followed by a character may be followed by
				1982	a multi-byte character. The length in the table is a minimum, so we have to
				1983	arrange to skip the extra bytes. */
				1984
				1985	#ifdef SUPPORT_UTF8
				1986	if (utf8) switch(c)
				1987	{
				1988	case OP_CHAR:
				1989	case OP_CHARI:
				1990	case OP_EXACT:
				1991	case OP_EXACTI:
				1992	case OP_UPTO:
				1993	case OP_UPTOI:
				1994	case OP_MINUPTO:
				1995	case OP_MINUPTOI:
				1996	case OP_POSUPTO:
				1997	case OP_POSUPTOI:
				1998	case OP_STAR:
				1999	case OP_STARI:
				2000	case OP_MINSTAR:
				2001	case OP_MINSTARI:
				2002	case OP_POSSTAR:
				2003	case OP_POSSTARI:
				2004	case OP_PLUS:
				2005	case OP_PLUSI:
				2006	case OP_MINPLUS:
				2007	case OP_MINPLUSI:
				2008	case OP_POSPLUS:
				2009	case OP_POSPLUSI:
				2010	case OP_QUERY:
				2011	case OP_QUERYI:
				2012	case OP_MINQUERY:
				2013	case OP_MINQUERYI:
				2014	case OP_POSQUERY:
				2015	case OP_POSQUERYI:
				2016	if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
				2017	break;
				2018	}
				2019	#else
				2020	(void)(utf8); /* Keep compiler happy by referencing function argument */
				2021	#endif
				2022	}
				2023	}
				2024	}
				2025
				2026
				2027
				2028	/*************************************************
				2029	* Scan compiled regex for recursion reference *
				2030	*************************************************/
				2031
				2032	/* This little function scans through a compiled pattern until it finds an
				2033	instance of OP_RECURSE.
				2034
				2035	Arguments:
				2036	code points to start of expression
				2037	utf8 TRUE in UTF-8 mode
				2038
				2039	Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
				2040	*/
				2041
				2042	static const uschar *
				2043	find_recurse(const uschar *code, BOOL utf8)
				2044	{
				2045	for (;;)
				2046	{
				2047	register int c = *code;
				2048	if (c == OP_END) return NULL;
				2049	if (c == OP_RECURSE) return code;
				2050
				2051	/* XCLASS is used for classes that cannot be represented just by a bit
				2052	map. This includes negated single high-valued characters. The length in
				2053	the table is zero; the actual length is stored in the compiled code. */
				2054
				2055	if (c == OP_XCLASS) code += GET(code, 1);
				2056
				2057	/* Otherwise, we can get the item's length from the table, except that for
				2058	repeated character types, we have to test for \p and \P, which have an extra
				2059	two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
				2060	must add in its length. */
				2061
				2062	else
				2063	{
				2064	switch(c)
				2065	{
				2066	case OP_TYPESTAR:
				2067	case OP_TYPEMINSTAR:
				2068	case OP_TYPEPLUS:
				2069	case OP_TYPEMINPLUS:
				2070	case OP_TYPEQUERY:
				2071	case OP_TYPEMINQUERY:
				2072	case OP_TYPEPOSSTAR:
				2073	case OP_TYPEPOSPLUS:
				2074	case OP_TYPEPOSQUERY:
				2075	if (code[1] == OP_PROP \|\| code[1] == OP_NOTPROP) code += 2;
				2076	break;
				2077
				2078	case OP_TYPEPOSUPTO:
				2079	case OP_TYPEUPTO:
				2080	case OP_TYPEMINUPTO:
				2081	case OP_TYPEEXACT:
				2082	if (code[3] == OP_PROP \|\| code[3] == OP_NOTPROP) code += 2;
				2083	break;
				2084
				2085	case OP_MARK:
				2086	case OP_PRUNE_ARG:
				2087	case OP_SKIP_ARG:
				2088	code += code[1];
				2089	break;
				2090
				2091	case OP_THEN_ARG:
				2092	code += code[1];
				2093	break;
				2094	}
				2095
				2096	/* Add in the fixed length from the table */
				2097
				2098	code += _pcre_OP_lengths[c];
				2099
				2100	/* In UTF-8 mode, opcodes that are followed by a character may be followed
				2101	by a multi-byte character. The length in the table is a minimum, so we have
				2102	to arrange to skip the extra bytes. */
				2103
				2104	#ifdef SUPPORT_UTF8
				2105	if (utf8) switch(c)
				2106	{
				2107	case OP_CHAR:
				2108	case OP_CHARI:
				2109	case OP_EXACT:
				2110	case OP_EXACTI:
				2111	case OP_UPTO:
				2112	case OP_UPTOI:
				2113	case OP_MINUPTO:
				2114	case OP_MINUPTOI:
				2115	case OP_POSUPTO:
				2116	case OP_POSUPTOI:
				2117	case OP_STAR:
				2118	case OP_STARI:
				2119	case OP_MINSTAR:
				2120	case OP_MINSTARI:
				2121	case OP_POSSTAR:
				2122	case OP_POSSTARI:
				2123	case OP_PLUS:
				2124	case OP_PLUSI:
				2125	case OP_MINPLUS:
				2126	case OP_MINPLUSI:
				2127	case OP_POSPLUS:
				2128	case OP_POSPLUSI:
				2129	case OP_QUERY:
				2130	case OP_QUERYI:
				2131	case OP_MINQUERY:
				2132	case OP_MINQUERYI:
				2133	case OP_POSQUERY:
				2134	case OP_POSQUERYI:
				2135	if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
				2136	break;
				2137	}
				2138	#else
				2139	(void)(utf8); /* Keep compiler happy by referencing function argument */
				2140	#endif
				2141	}
				2142	}
				2143	}
				2144
				2145
				2146
				2147	/*************************************************
				2148	* Scan compiled branch for non-emptiness *
				2149	*************************************************/
				2150
				2151	/* This function scans through a branch of a compiled pattern to see whether it
				2152	can match the empty string or not. It is called from could_be_empty()
				2153	below and from compile_branch() when checking for an unlimited repeat of a
				2154	group that can match nothing. Note that first_significant_code() skips over
				2155	backward and negative forward assertions when its final argument is TRUE. If we
				2156	hit an unclosed bracket, we return "empty" - this means we've struck an inner
				2157	bracket whose current branch will already have been scanned.
				2158
				2159	Arguments:
				2160	code points to start of search
				2161	endcode points to where to stop
				2162	utf8 TRUE if in UTF8 mode
				2163	cd contains pointers to tables etc.
				2164
				2165	Returns: TRUE if what is matched could be empty
				2166	*/
				2167
				2168	static BOOL
				2169	could_be_empty_branch(const uschar code, const uschar endcode, BOOL utf8,
				2170	compile_data *cd)
				2171	{
				2172	register int c;
				2173	for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);
				2174	code < endcode;
				2175	code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))
				2176	{
				2177	const uschar *ccode;
				2178
				2179	c = *code;
				2180
				2181	/* Skip over forward assertions; the other assertions are skipped by
				2182	first_significant_code() with a TRUE final argument. */
				2183
				2184	if (c == OP_ASSERT)
				2185	{
				2186	do code += GET(code, 1); while (*code == OP_ALT);
				2187	c = *code;
				2188	continue;
				2189	}
				2190
				2191	/* For a recursion/subroutine call, if its end has been reached, which
				2192	implies a backward reference subroutine call, we can scan it. If it's a
				2193	forward reference subroutine call, we can't. To detect forward reference
				2194	we have to scan up the list that is kept in the workspace. This function is
				2195	called only when doing the real compile, not during the pre-compile that
				2196	measures the size of the compiled pattern. */
				2197
				2198	if (c == OP_RECURSE)
				2199	{
				2200	const uschar *scode;
				2201	BOOL empty_branch;
				2202
				2203	/* Test for forward reference */
				2204
				2205	for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
				2206	if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
				2207
				2208	/* Not a forward reference, test for completed backward reference */
				2209
				2210	empty_branch = FALSE;
				2211	scode = cd->start_code + GET(code, 1);
				2212	if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
				2213
				2214	/* Completed backwards reference */
				2215
				2216	do
				2217	{
				2218	if (could_be_empty_branch(scode, endcode, utf8, cd))
				2219	{
				2220	empty_branch = TRUE;
				2221	break;
				2222	}
				2223	scode += GET(scode, 1);
				2224	}
				2225	while (*scode == OP_ALT);
				2226
				2227	if (!empty_branch) return FALSE; /* All branches are non-empty */
				2228	continue;
				2229	}
				2230
				2231	/* Groups with zero repeats can of course be empty; skip them. */
				2232
				2233	if (c == OP_BRAZERO \|\| c == OP_BRAMINZERO \|\| c == OP_SKIPZERO \|\|
				2234	c == OP_BRAPOSZERO)
				2235	{
				2236	code += _pcre_OP_lengths[c];
				2237	do code += GET(code, 1); while (*code == OP_ALT);
				2238	c = *code;
				2239	continue;
				2240	}
				2241
				2242	/* A nested group that is already marked as "could be empty" can just be
				2243	skipped. */
				2244
				2245	if (c == OP_SBRA \|\| c == OP_SBRAPOS \|\|
				2246	c == OP_SCBRA \|\| c == OP_SCBRAPOS)
				2247	{
				2248	do code += GET(code, 1); while (*code == OP_ALT);
				2249	c = *code;
				2250	continue;
				2251	}
				2252
				2253	/* For other groups, scan the branches. */
				2254
				2255	if (c == OP_BRA \|\| c == OP_BRAPOS \|\|
				2256	c == OP_CBRA \|\| c == OP_CBRAPOS \|\|
				2257	c == OP_ONCE \|\| c == OP_ONCE_NC \|\|
				2258	c == OP_COND)
				2259	{
				2260	BOOL empty_branch;
				2261	if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
				2262
				2263	/* If a conditional group has only one branch, there is a second, implied,
				2264	empty branch, so just skip over the conditional, because it could be empty.
				2265	Otherwise, scan the individual branches of the group. */
				2266
				2267	if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
				2268	code += GET(code, 1);
				2269	else
				2270	{
				2271	empty_branch = FALSE;
				2272	do
				2273	{
				2274	if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
				2275	empty_branch = TRUE;
				2276	code += GET(code, 1);
				2277	}
				2278	while (*code == OP_ALT);
				2279	if (!empty_branch) return FALSE; /* All branches are non-empty */
				2280	}
				2281
				2282	c = *code;
				2283	continue;
				2284	}
				2285
				2286	/* Handle the other opcodes */
				2287
				2288	switch (c)
				2289	{
				2290	/* Check for quantifiers after a class. XCLASS is used for classes that
				2291	cannot be represented just by a bit map. This includes negated single
				2292	high-valued characters. The length in _pcre_OP_lengths[] is zero; the
				2293	actual length is stored in the compiled code, so we must update "code"
				2294	here. */
				2295
				2296	#ifdef SUPPORT_UTF8
				2297	case OP_XCLASS:
				2298	ccode = code += GET(code, 1);
				2299	goto CHECK_CLASS_REPEAT;
				2300	#endif
				2301
				2302	case OP_CLASS:
				2303	case OP_NCLASS:
				2304	ccode = code + 33;
				2305
				2306	#ifdef SUPPORT_UTF8
				2307	CHECK_CLASS_REPEAT:
				2308	#endif
				2309
				2310	switch (*ccode)
				2311	{
				2312	case OP_CRSTAR: /* These could be empty; continue */
				2313	case OP_CRMINSTAR:
				2314	case OP_CRQUERY:
				2315	case OP_CRMINQUERY:
				2316	break;
				2317
				2318	default: /* Non-repeat => class must match */
				2319	case OP_CRPLUS: /* These repeats aren't empty */
				2320	case OP_CRMINPLUS:
				2321	return FALSE;
				2322
				2323	case OP_CRRANGE:
				2324	case OP_CRMINRANGE:
				2325	if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
				2326	break;
				2327	}
				2328	break;
				2329
				2330	/* Opcodes that must match a character */
				2331
				2332	case OP_PROP:
				2333	case OP_NOTPROP:
				2334	case OP_EXTUNI:
				2335	case OP_NOT_DIGIT:
				2336	case OP_DIGIT:
				2337	case OP_NOT_WHITESPACE:
				2338	case OP_WHITESPACE:
				2339	case OP_NOT_WORDCHAR:
				2340	case OP_WORDCHAR:
				2341	case OP_ANY:
				2342	case OP_ALLANY:
				2343	case OP_ANYBYTE:
				2344	case OP_CHAR:
				2345	case OP_CHARI:
				2346	case OP_NOT:
				2347	case OP_NOTI:
				2348	case OP_PLUS:
				2349	case OP_MINPLUS:
				2350	case OP_POSPLUS:
				2351	case OP_EXACT:
				2352	case OP_NOTPLUS:
				2353	case OP_NOTMINPLUS:
				2354	case OP_NOTPOSPLUS:
				2355	case OP_NOTEXACT:
				2356	case OP_TYPEPLUS:
				2357	case OP_TYPEMINPLUS:
				2358	case OP_TYPEPOSPLUS:
				2359	case OP_TYPEEXACT:
				2360	return FALSE;
				2361
				2362	/* These are going to continue, as they may be empty, but we have to
				2363	fudge the length for the \p and \P cases. */
				2364
				2365	case OP_TYPESTAR:
				2366	case OP_TYPEMINSTAR:
				2367	case OP_TYPEPOSSTAR:
				2368	case OP_TYPEQUERY:
				2369	case OP_TYPEMINQUERY:
				2370	case OP_TYPEPOSQUERY:
				2371	if (code[1] == OP_PROP \|\| code[1] == OP_NOTPROP) code += 2;
				2372	break;
				2373
				2374	/* Same for these */
				2375
				2376	case OP_TYPEUPTO:
				2377	case OP_TYPEMINUPTO:
				2378	case OP_TYPEPOSUPTO:
				2379	if (code[3] == OP_PROP \|\| code[3] == OP_NOTPROP) code += 2;
				2380	break;
				2381
				2382	/* End of branch */
				2383
				2384	case OP_KET:
				2385	case OP_KETRMAX:
				2386	case OP_KETRMIN:
				2387	case OP_KETRPOS:
				2388	case OP_ALT:
				2389	return TRUE;
				2390
				2391	/* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
				2392	MINUPTO, and POSUPTO may be followed by a multibyte character */
				2393
				2394	#ifdef SUPPORT_UTF8
				2395	case OP_STAR:
				2396	case OP_STARI:
				2397	case OP_MINSTAR:
				2398	case OP_MINSTARI:
				2399	case OP_POSSTAR:
				2400	case OP_POSSTARI:
				2401	case OP_QUERY:
				2402	case OP_QUERYI:
				2403	case OP_MINQUERY:
				2404	case OP_MINQUERYI:
				2405	case OP_POSQUERY:
				2406	case OP_POSQUERYI:
				2407	if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
				2408	break;
				2409
				2410	case OP_UPTO:
				2411	case OP_UPTOI:
				2412	case OP_MINUPTO:
				2413	case OP_MINUPTOI:
				2414	case OP_POSUPTO:
				2415	case OP_POSUPTOI:
				2416	if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
				2417	break;
				2418	#endif
				2419
				2420	/* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
				2421	string. */
				2422
				2423	case OP_MARK:
				2424	case OP_PRUNE_ARG:
				2425	case OP_SKIP_ARG:
				2426	code += code[1];
				2427	break;
				2428
				2429	case OP_THEN_ARG:
				2430	code += code[1];
				2431	break;
				2432
				2433	/* None of the remaining opcodes are required to match a character. */
				2434
				2435	default:
				2436	break;
				2437	}
				2438	}
				2439
				2440	return TRUE;
				2441	}
				2442
				2443
				2444
				2445	/*************************************************
				2446	* Scan compiled regex for non-emptiness *
				2447	*************************************************/
				2448
				2449	/* This function is called to check for left recursive calls. We want to check
				2450	the current branch of the current pattern to see if it could match the empty
				2451	string. If it could, we must look outwards for branches at other levels,
				2452	stopping when we pass beyond the bracket which is the subject of the recursion.
				2453	This function is called only during the real compile, not during the
				2454	pre-compile.
				2455
				2456	Arguments:
				2457	code points to start of the recursion
				2458	endcode points to where to stop (current RECURSE item)
				2459	bcptr points to the chain of current (unclosed) branch starts
				2460	utf8 TRUE if in UTF-8 mode
				2461	cd pointers to tables etc
				2462
				2463	Returns: TRUE if what is matched could be empty
				2464	*/
				2465
				2466	static BOOL
				2467	could_be_empty(const uschar code, const uschar endcode, branch_chain *bcptr,
				2468	BOOL utf8, compile_data *cd)
				2469	{
				2470	while (bcptr != NULL && bcptr->current_branch >= code)
				2471	{
				2472	if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
				2473	return FALSE;
				2474	bcptr = bcptr->outer;
				2475	}
				2476	return TRUE;
				2477	}
				2478
				2479
				2480
				2481	/*************************************************
				2482	* Check for POSIX class syntax *
				2483	*************************************************/
				2484
				2485	/* This function is called when the sequence "[:" or "[." or "[=" is
				2486	encountered in a character class. It checks whether this is followed by a
				2487	sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
				2488	reach an unescaped ']' without the special preceding character, return FALSE.
				2489
				2490	Originally, this function only recognized a sequence of letters between the
				2491	terminators, but it seems that Perl recognizes any sequence of characters,
				2492	though of course unknown POSIX names are subsequently rejected. Perl gives an
				2493	"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
				2494	didn't consider this to be a POSIX class. Likewise for [:1234:].
				2495
				2496	The problem in trying to be exactly like Perl is in the handling of escapes. We
				2497	have to be sure that [abc[:x\]pqr] is not treated as containing a POSIX
				2498	class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
				2499	below handles the special case of \], but does not try to do any other escape
				2500	processing. This makes it different from Perl for cases such as [:l\ower:]
				2501	where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
				2502	"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
				2503	I think.
				2504
				2505	A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
				2506	It seems that the appearance of a nested POSIX class supersedes an apparent
				2507	external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
				2508	a digit.
				2509
				2510	In Perl, unescaped square brackets may also appear as part of class names. For
				2511	example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
				2512	[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
				2513	seem right at all. PCRE does not allow closing square brackets in POSIX class
				2514	names.
				2515
				2516	Arguments:
				2517	ptr pointer to the initial [
				2518	endptr where to return the end pointer
				2519
				2520	Returns: TRUE or FALSE
				2521	*/
				2522
				2523	static BOOL
				2524	check_posix_syntax(const uschar ptr, const uschar *endptr)
				2525	{
				2526	int terminator; /* Don't combine these lines; the Solaris cc */
				2527	terminator = (++ptr); / compiler warns about "non-constant" initializer. */
				2528	for (++ptr; *ptr != 0; ptr++)
				2529	{
				2530	if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
				2531	ptr++;
				2532	else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
				2533	else
				2534	{
				2535	if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
				2536	{
				2537	*endptr = ptr;
				2538	return TRUE;
				2539	}
				2540	if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
				2541	(ptr[1] == CHAR_COLON \|\| ptr[1] == CHAR_DOT \|\|
				2542	ptr[1] == CHAR_EQUALS_SIGN) &&
				2543	check_posix_syntax(ptr, endptr))
				2544	return FALSE;
				2545	}
				2546	}
				2547	return FALSE;
				2548	}
				2549
				2550
				2551
				2552
				2553	/*************************************************
				2554	* Check POSIX class name *
				2555	*************************************************/
				2556
				2557	/* This function is called to check the name given in a POSIX-style class entry
				2558	such as [:alnum:].
				2559
				2560	Arguments:
				2561	ptr points to the first letter
				2562	len the length of the name
				2563
				2564	Returns: a value representing the name, or -1 if unknown
				2565	*/
				2566
				2567	static int
				2568	check_posix_name(const uschar *ptr, int len)
				2569	{
				2570	const char *pn = posix_names;
				2571	register int yield = 0;
				2572	while (posix_name_lengths[yield] != 0)
				2573	{
				2574	if (len == posix_name_lengths[yield] &&
				2575	strncmp((const char *)ptr, pn, len) == 0) return yield;
				2576	pn += posix_name_lengths[yield] + 1;
				2577	yield++;
				2578	}
				2579	return -1;
				2580	}
				2581
				2582
				2583	/*************************************************
				2584	* Adjust OP_RECURSE items in repeated group *
				2585	*************************************************/
				2586
				2587	/* OP_RECURSE items contain an offset from the start of the regex to the group
				2588	that is referenced. This means that groups can be replicated for fixed
				2589	repetition simply by copying (because the recursion is allowed to refer to
				2590	earlier groups that are outside the current group). However, when a group is
				2591	optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
				2592	inserted before it, after it has been compiled. This means that any OP_RECURSE
				2593	items within it that refer to the group itself or any contained groups have to
				2594	have their offsets adjusted. That is one of the jobs of this function. Before it
				2595	is called, the partially compiled regex must be temporarily terminated with
				2596	OP_END.
				2597
				2598	This function has been extended with the possibility of forward references for
				2599	recursions and subroutine calls. It must also check the list of such references
				2600	for the group we are dealing with. If it finds that one of the recursions in
				2601	the current group is on this list, it adjusts the offset in the list, not the
				2602	value in the reference (which is a group number).
				2603
				2604	Arguments:
				2605	group points to the start of the group
				2606	adjust the amount by which the group is to be moved
				2607	utf8 TRUE in UTF-8 mode
				2608	cd contains pointers to tables etc.
				2609	save_hwm the hwm forward reference pointer at the start of the group
				2610
				2611	Returns: nothing
				2612	*/
				2613
				2614	static void
				2615	adjust_recurse(uschar group, int adjust, BOOL utf8, compile_data cd,
				2616	uschar *save_hwm)
				2617	{
				2618	uschar *ptr = group;
				2619
				2620	while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
				2621	{
				2622	int offset;
				2623	uschar *hc;
				2624
				2625	/* See if this recursion is on the forward reference list. If so, adjust the
				2626	reference. */
				2627
				2628	for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
				2629	{
				2630	offset = GET(hc, 0);
				2631	if (cd->start_code + offset == ptr + 1)
				2632	{
				2633	PUT(hc, 0, offset + adjust);
				2634	break;
				2635	}
				2636	}
				2637
				2638	/* Otherwise, adjust the recursion offset if it's after the start of this
				2639	group. */
				2640
				2641	if (hc >= cd->hwm)
				2642	{
				2643	offset = GET(ptr, 1);
				2644	if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
				2645	}
				2646
				2647	ptr += 1 + LINK_SIZE;
				2648	}
				2649	}
				2650
				2651
				2652
				2653	/*************************************************
				2654	* Insert an automatic callout point *
				2655	*************************************************/
				2656
				2657	/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
				2658	callout points before each pattern item.
				2659
				2660	Arguments:
				2661	code current code pointer
				2662	ptr current pattern pointer
				2663	cd pointers to tables etc
				2664
				2665	Returns: new code pointer
				2666	*/
				2667
				2668	static uschar *
				2669	auto_callout(uschar code, const uschar ptr, compile_data *cd)
				2670	{
				2671	*code++ = OP_CALLOUT;
				2672	*code++ = 255;
				2673	PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
				2674	PUT(code, LINK_SIZE, 0); /* Default length */
				2675	return code + 2*LINK_SIZE;
				2676	}
				2677
				2678
				2679
				2680	/*************************************************
				2681	* Complete a callout item *
				2682	*************************************************/
				2683
				2684	/* A callout item contains the length of the next item in the pattern, which
				2685	we can't fill in till after we have reached the relevant point. This is used
				2686	for both automatic and manual callouts.
				2687
				2688	Arguments:
				2689	previous_callout points to previous callout item
				2690	ptr current pattern pointer
				2691	cd pointers to tables etc
				2692
				2693	Returns: nothing
				2694	*/
				2695
				2696	static void
				2697	complete_callout(uschar previous_callout, const uschar ptr, compile_data *cd)
				2698	{
				2699	int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
				2700	PUT(previous_callout, 2 + LINK_SIZE, length);
				2701	}
				2702
				2703
				2704
				2705	#ifdef SUPPORT_UCP
				2706	/*************************************************
				2707	* Get othercase range *
				2708	*************************************************/
				2709
				2710	/* This function is passed the start and end of a class range, in UTF-8 mode
				2711	with UCP support. It searches up the characters, looking for internal ranges of
				2712	characters in the "other" case. Each call returns the next one, updating the
				2713	start address.
				2714
				2715	Arguments:
				2716	cptr points to starting character value; updated
				2717	d end value
				2718	ocptr where to put start of othercase range
				2719	odptr where to put end of othercase range
				2720
				2721	Yield: TRUE when range returned; FALSE when no more
				2722	*/
				2723
				2724	static BOOL
				2725	get_othercase_range(unsigned int cptr, unsigned int d, unsigned int ocptr,
				2726	unsigned int *odptr)
				2727	{
				2728	unsigned int c, othercase, next;
				2729
				2730	for (c = *cptr; c <= d; c++)
				2731	{ if ((othercase = UCD_OTHERCASE(c)) != c) break; }
				2732
				2733	if (c > d) return FALSE;
				2734
				2735	*ocptr = othercase;
				2736	next = othercase + 1;
				2737
				2738	for (++c; c <= d; c++)
				2739	{
				2740	if (UCD_OTHERCASE(c) != next) break;
				2741	next++;
				2742	}
				2743
				2744	*odptr = next - 1;
				2745	*cptr = c;
				2746
				2747	return TRUE;
				2748	}
				2749
				2750
				2751
				2752	/*************************************************
				2753	* Check a character and a property *
				2754	*************************************************/
				2755
				2756	/* This function is called by check_auto_possessive() when a property item
				2757	is adjacent to a fixed character.
				2758
				2759	Arguments:
				2760	c the character
				2761	ptype the property type
				2762	pdata the data for the type
				2763	negated TRUE if it's a negated property (\P or \p{^)
				2764
				2765	Returns: TRUE if auto-possessifying is OK
				2766	*/
				2767
				2768	static BOOL
				2769	check_char_prop(int c, int ptype, int pdata, BOOL negated)
				2770	{
				2771	const ucd_record *prop = GET_UCD(c);
				2772	switch(ptype)
				2773	{
				2774	case PT_LAMP:
				2775	return (prop->chartype == ucp_Lu \|\|
				2776	prop->chartype == ucp_Ll \|\|
				2777	prop->chartype == ucp_Lt) == negated;
				2778
				2779	case PT_GC:
				2780	return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
				2781
				2782	case PT_PC:
				2783	return (pdata == prop->chartype) == negated;
				2784
				2785	case PT_SC:
				2786	return (pdata == prop->script) == negated;
				2787
				2788	/* These are specials */
				2789
				2790	case PT_ALNUM:
				2791	return (_pcre_ucp_gentype[prop->chartype] == ucp_L \|\|
				2792	_pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
				2793
				2794	case PT_SPACE: /* Perl space */
				2795	return (_pcre_ucp_gentype[prop->chartype] == ucp_Z \|\|
				2796	c == CHAR_HT \|\| c == CHAR_NL \|\| c == CHAR_FF \|\| c == CHAR_CR)
				2797	== negated;
				2798
				2799	case PT_PXSPACE: /* POSIX space */
				2800	return (_pcre_ucp_gentype[prop->chartype] == ucp_Z \|\|
				2801	c == CHAR_HT \|\| c == CHAR_NL \|\| c == CHAR_VT \|\|
				2802	c == CHAR_FF \|\| c == CHAR_CR)
				2803	== negated;
				2804
				2805	case PT_WORD:
				2806	return (_pcre_ucp_gentype[prop->chartype] == ucp_L \|\|
				2807	_pcre_ucp_gentype[prop->chartype] == ucp_N \|\|
				2808	c == CHAR_UNDERSCORE) == negated;
				2809	}
				2810	return FALSE;
				2811	}
				2812	#endif /* SUPPORT_UCP */
				2813
				2814
				2815
				2816	/*************************************************
				2817	* Check if auto-possessifying is possible *
				2818	*************************************************/
				2819
				2820	/* This function is called for unlimited repeats of certain items, to see
				2821	whether the next thing could possibly match the repeated item. If not, it makes
				2822	sense to automatically possessify the repeated item.
				2823
				2824	Arguments:
				2825	previous pointer to the repeated opcode
				2826	utf8 TRUE in UTF-8 mode
				2827	ptr next character in pattern
				2828	options options bits
				2829	cd contains pointers to tables etc.
				2830
				2831	Returns: TRUE if possessifying is wanted
				2832	*/
				2833
				2834	static BOOL
				2835	check_auto_possessive(const uschar previous, BOOL utf8, const uschar ptr,
				2836	int options, compile_data *cd)
				2837	{
				2838	int c, next;
				2839	int op_code = *previous++;
				2840
				2841	/* Skip whitespace and comments in extended mode */
				2842
				2843	if ((options & PCRE_EXTENDED) != 0)
				2844	{
				2845	for (;;)
				2846	{
				2847	while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
				2848	if (*ptr == CHAR_NUMBER_SIGN)
				2849	{
				2850	ptr++;
				2851	while (*ptr != 0)
				2852	{
				2853	if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
				2854	ptr++;
				2855	#ifdef SUPPORT_UTF8
				2856	if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
				2857	#endif
				2858	}
				2859	}
				2860	else break;
				2861	}
				2862	}
				2863
				2864	/* If the next item is one that we can handle, get its value. A non-negative
				2865	value is a character, a negative value is an escape value. */
				2866
				2867	if (*ptr == CHAR_BACKSLASH)
				2868	{
				2869	int temperrorcode = 0;
				2870	next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
				2871	if (temperrorcode != 0) return FALSE;
				2872	ptr++; /* Point after the escape sequence */
				2873	}
				2874
				2875	else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
				2876	{
				2877	#ifdef SUPPORT_UTF8
				2878	if (utf8) { GETCHARINC(next, ptr); } else
				2879	#endif
				2880	next = *ptr++;
				2881	}
				2882
				2883	else return FALSE;
				2884
				2885	/* Skip whitespace and comments in extended mode */
				2886
				2887	if ((options & PCRE_EXTENDED) != 0)
				2888	{
				2889	for (;;)
				2890	{
				2891	while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
				2892	if (*ptr == CHAR_NUMBER_SIGN)
				2893	{
				2894	ptr++;
				2895	while (*ptr != 0)
				2896	{
				2897	if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
				2898	ptr++;
				2899	#ifdef SUPPORT_UTF8
				2900	if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
				2901	#endif
				2902	}
				2903	}
				2904	else break;
				2905	}
				2906	}
				2907
				2908	/* If the next thing is itself optional, we have to give up. */
				2909
				2910	if (ptr == CHAR_ASTERISK \|\| ptr == CHAR_QUESTION_MARK \|\|
				2911	strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
				2912	return FALSE;
				2913
				2914	/* Now compare the next item with the previous opcode. First, handle cases when
				2915	the next item is a character. */
				2916
				2917	if (next >= 0) switch(op_code)
				2918	{
				2919	case OP_CHAR:
				2920	#ifdef SUPPORT_UTF8
				2921	GETCHARTEST(c, previous);
				2922	#else
				2923	c = *previous;
				2924	#endif
				2925	return c != next;
				2926
				2927	/* For CHARI (caseless character) we must check the other case. If we have
				2928	Unicode property support, we can use it to test the other case of
				2929	high-valued characters. */
				2930
				2931	case OP_CHARI:
				2932	#ifdef SUPPORT_UTF8
				2933	GETCHARTEST(c, previous);
				2934	#else
				2935	c = *previous;
				2936	#endif
				2937	if (c == next) return FALSE;
				2938	#ifdef SUPPORT_UTF8
				2939	if (utf8)
				2940	{
				2941	unsigned int othercase;
				2942	if (next < 128) othercase = cd->fcc[next]; else
				2943	#ifdef SUPPORT_UCP
				2944	othercase = UCD_OTHERCASE((unsigned int)next);
				2945	#else
				2946	othercase = NOTACHAR;
				2947	#endif
				2948	return (unsigned int)c != othercase;
				2949	}
				2950	else
				2951	#endif /* SUPPORT_UTF8 */
				2952	return (c != cd->fcc[next]); /* Non-UTF-8 mode */
				2953
				2954	/* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
				2955	opcodes are not used for multi-byte characters, because they are coded using
				2956	an XCLASS instead. */
				2957
				2958	case OP_NOT:
				2959	return (c = *previous) == next;
				2960
				2961	case OP_NOTI:
				2962	if ((c = *previous) == next) return TRUE;
				2963	#ifdef SUPPORT_UTF8
				2964	if (utf8)
				2965	{
				2966	unsigned int othercase;
				2967	if (next < 128) othercase = cd->fcc[next]; else
				2968	#ifdef SUPPORT_UCP
				2969	othercase = UCD_OTHERCASE(next);
				2970	#else
				2971	othercase = NOTACHAR;
				2972	#endif
				2973	return (unsigned int)c == othercase;
				2974	}
				2975	else
				2976	#endif /* SUPPORT_UTF8 */
				2977	return (c == cd->fcc[next]); /* Non-UTF-8 mode */
				2978
				2979	/* Note that OP_DIGIT etc. are generated only when PCRE_UCP is not set.
				2980	When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
				2981
				2982	case OP_DIGIT:
				2983	return next > 127 \|\| (cd->ctypes[next] & ctype_digit) == 0;
				2984
				2985	case OP_NOT_DIGIT:
				2986	return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
				2987
				2988	case OP_WHITESPACE:
				2989	return next > 127 \|\| (cd->ctypes[next] & ctype_space) == 0;
				2990
				2991	case OP_NOT_WHITESPACE:
				2992	return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
				2993
				2994	case OP_WORDCHAR:
				2995	return next > 127 \|\| (cd->ctypes[next] & ctype_word) == 0;
				2996
				2997	case OP_NOT_WORDCHAR:
				2998	return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
				2999
				3000	case OP_HSPACE:
				3001	case OP_NOT_HSPACE:
				3002	switch(next)
				3003	{
				3004	case 0x09:
				3005	case 0x20:
				3006	case 0xa0:
				3007	case 0x1680:
				3008	case 0x180e:
				3009	case 0x2000:
				3010	case 0x2001:
				3011	case 0x2002:
				3012	case 0x2003:
				3013	case 0x2004:
				3014	case 0x2005:
				3015	case 0x2006:
				3016	case 0x2007:
				3017	case 0x2008:
				3018	case 0x2009:
				3019	case 0x200A:
				3020	case 0x202f:
				3021	case 0x205f:
				3022	case 0x3000:
				3023	return op_code == OP_NOT_HSPACE;
				3024	default:
				3025	return op_code != OP_NOT_HSPACE;
				3026	}
				3027
				3028	case OP_ANYNL:
				3029	case OP_VSPACE:
				3030	case OP_NOT_VSPACE:
				3031	switch(next)
				3032	{
				3033	case 0x0a:
				3034	case 0x0b:
				3035	case 0x0c:
				3036	case 0x0d:
				3037	case 0x85:
				3038	case 0x2028:
				3039	case 0x2029:
				3040	return op_code == OP_NOT_VSPACE;
				3041	default:
				3042	return op_code != OP_NOT_VSPACE;
				3043	}
				3044
				3045	#ifdef SUPPORT_UCP
				3046	case OP_PROP:
				3047	return check_char_prop(next, previous[0], previous[1], FALSE);
				3048
				3049	case OP_NOTPROP:
				3050	return check_char_prop(next, previous[0], previous[1], TRUE);
				3051	#endif
				3052
				3053	default:
				3054	return FALSE;
				3055	}
				3056
				3057
				3058	/* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
				3059	is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
				3060	generated only when PCRE_UCP is not set, that is, when only ASCII
				3061	characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
				3062	replaced by OP_PROP codes when PCRE_UCP is set. */
				3063
				3064	switch(op_code)
				3065	{
				3066	case OP_CHAR:
				3067	case OP_CHARI:
				3068	#ifdef SUPPORT_UTF8
				3069	GETCHARTEST(c, previous);
				3070	#else
				3071	c = *previous;
				3072	#endif
				3073	switch(-next)
				3074	{
				3075	case ESC_d:
				3076	return c > 127 \|\| (cd->ctypes[c] & ctype_digit) == 0;
				3077
				3078	case ESC_D:
				3079	return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
				3080
				3081	case ESC_s:
				3082	return c > 127 \|\| (cd->ctypes[c] & ctype_space) == 0;
				3083
				3084	case ESC_S:
				3085	return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
				3086
				3087	case ESC_w:
				3088	return c > 127 \|\| (cd->ctypes[c] & ctype_word) == 0;
				3089
				3090	case ESC_W:
				3091	return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
				3092
				3093	case ESC_h:
				3094	case ESC_H:
				3095	switch(c)
				3096	{
				3097	case 0x09:
				3098	case 0x20:
				3099	case 0xa0:
				3100	case 0x1680:
				3101	case 0x180e:
				3102	case 0x2000:
				3103	case 0x2001:
				3104	case 0x2002:
				3105	case 0x2003:
				3106	case 0x2004:
				3107	case 0x2005:
				3108	case 0x2006:
				3109	case 0x2007:
				3110	case 0x2008:
				3111	case 0x2009:
				3112	case 0x200A:
				3113	case 0x202f:
				3114	case 0x205f:
				3115	case 0x3000:
				3116	return -next != ESC_h;
				3117	default:
				3118	return -next == ESC_h;
				3119	}
				3120
				3121	case ESC_v:
				3122	case ESC_V:
				3123	switch(c)
				3124	{
				3125	case 0x0a:
				3126	case 0x0b:
				3127	case 0x0c:
				3128	case 0x0d:
				3129	case 0x85:
				3130	case 0x2028:
				3131	case 0x2029:
				3132	return -next != ESC_v;
				3133	default:
				3134	return -next == ESC_v;
				3135	}
				3136
				3137	/* When PCRE_UCP is set, these values get generated for \d etc. Find
				3138	their substitutions and process them. The result will always be either
				3139	-ESC_p or -ESC_P. Then fall through to process those values. */
				3140
				3141	#ifdef SUPPORT_UCP
				3142	case ESC_du:
				3143	case ESC_DU:
				3144	case ESC_wu:
				3145	case ESC_WU:
				3146	case ESC_su:
				3147	case ESC_SU:
				3148	{
				3149	int temperrorcode = 0;
				3150	ptr = substitutes[-next - ESC_DU];
				3151	next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
				3152	if (temperrorcode != 0) return FALSE;
				3153	ptr++; /* For compatibility */
				3154	}
				3155	/* Fall through */
				3156
				3157	case ESC_p:
				3158	case ESC_P:
				3159	{
				3160	int ptype, pdata, errorcodeptr;
				3161	BOOL negated;
				3162
				3163	ptr--; /* Make ptr point at the p or P */
				3164	ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
				3165	if (ptype < 0) return FALSE;
				3166	ptr++; /* Point past the final curly ket */
				3167
				3168	/* If the property item is optional, we have to give up. (When generated
				3169	from \d etc by PCRE_UCP, this test will have been applied much earlier,
				3170	to the original \d etc. At this point, ptr will point to a zero byte. */
				3171
				3172	if (ptr == CHAR_ASTERISK \|\| ptr == CHAR_QUESTION_MARK \|\|
				3173	strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
				3174	return FALSE;
				3175
				3176	/* Do the property check. */
				3177
				3178	return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
				3179	}
				3180	#endif
				3181
				3182	default:
				3183	return FALSE;
				3184	}
				3185
				3186	/* In principle, support for Unicode properties should be integrated here as
				3187	well. It means re-organizing the above code so as to get hold of the property
				3188	values before switching on the op-code. However, I wonder how many patterns
				3189	combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
				3190	these op-codes are never generated.) */
				3191
				3192	case OP_DIGIT:
				3193	return next == -ESC_D \|\| next == -ESC_s \|\| next == -ESC_W \|\|
				3194	next == -ESC_h \|\| next == -ESC_v \|\| next == -ESC_R;
				3195
				3196	case OP_NOT_DIGIT:
				3197	return next == -ESC_d;
				3198
				3199	case OP_WHITESPACE:
				3200	return next == -ESC_S \|\| next == -ESC_d \|\| next == -ESC_w \|\| next == -ESC_R;
				3201
				3202	case OP_NOT_WHITESPACE:
				3203	return next == -ESC_s \|\| next == -ESC_h \|\| next == -ESC_v;
				3204
				3205	case OP_HSPACE:
				3206	return next == -ESC_S \|\| next == -ESC_H \|\| next == -ESC_d \|\|
				3207	next == -ESC_w \|\| next == -ESC_v \|\| next == -ESC_R;
				3208
				3209	case OP_NOT_HSPACE:
				3210	return next == -ESC_h;
				3211
				3212	/* Can't have \S in here because VT matches \S (Perl anomaly) */
				3213	case OP_ANYNL:
				3214	case OP_VSPACE:
				3215	return next == -ESC_V \|\| next == -ESC_d \|\| next == -ESC_w;
				3216
				3217	case OP_NOT_VSPACE:
				3218	return next == -ESC_v \|\| next == -ESC_R;
				3219
				3220	case OP_WORDCHAR:
				3221	return next == -ESC_W \|\| next == -ESC_s \|\| next == -ESC_h \|\|
				3222	next == -ESC_v \|\| next == -ESC_R;
				3223
				3224	case OP_NOT_WORDCHAR:
				3225	return next == -ESC_w \|\| next == -ESC_d;
				3226
				3227	default:
				3228	return FALSE;
				3229	}
				3230
				3231	/* Control does not reach here */
				3232	}
				3233
				3234
				3235
				3236	/*************************************************
				3237	* Compile one branch *
				3238	*************************************************/
				3239
				3240	/* Scan the pattern, compiling it into a vector. If the options are
				3241	changed during the branch, the pointer is used to change the external options
				3242	bits. This function is used during the pre-compile phase when we are trying
				3243	to find out the amount of memory needed, as well as during the real compile
				3244	phase. The value of lengthptr distinguishes the two phases.
				3245
				3246	Arguments:
				3247	optionsptr pointer to the option bits
				3248	codeptr points to the pointer to the current code point
				3249	ptrptr points to the current pattern pointer
				3250	errorcodeptr points to error code variable
				3251	firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
				3252	reqbyteptr set to the last literal character required, else < 0
				3253	bcptr points to current branch chain
				3254	cond_depth conditional nesting depth
				3255	cd contains pointers to tables etc.
				3256	lengthptr NULL during the real compile phase
				3257	points to length accumulator during pre-compile phase
				3258
				3259	Returns: TRUE on success
				3260	FALSE, with *errorcodeptr set non-zero on error
				3261	*/
				3262
				3263	static BOOL
				3264	compile_branch(int optionsptr, uschar codeptr, const uschar *ptrptr,
				3265	int errorcodeptr, int firstbyteptr, int reqbyteptr, branch_chain bcptr,
				3266	int cond_depth, compile_data cd, int lengthptr)
				3267	{
				3268	int repeat_type, op_type;
				3269	int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
				3270	int bravalue = 0;
				3271	int greedy_default, greedy_non_default;
				3272	int firstbyte, reqbyte;
				3273	int zeroreqbyte, zerofirstbyte;
				3274	int req_caseopt, reqvary, tempreqvary;
				3275	int options = optionsptr; / May change dynamically */
				3276	int after_manual_callout = 0;
				3277	int length_prevgroup = 0;
				3278	register int c;
				3279	register uschar code = codeptr;
				3280	uschar *last_code = code;
				3281	uschar *orig_code = code;
				3282	uschar *tempcode;
				3283	BOOL inescq = FALSE;
				3284	BOOL groupsetfirstbyte = FALSE;
				3285	const uschar ptr = ptrptr;
				3286	const uschar *tempptr;
				3287	const uschar *nestptr = NULL;
				3288	uschar *previous = NULL;
				3289	uschar *previous_callout = NULL;
				3290	uschar *save_hwm = NULL;
				3291	uschar classbits[32];
				3292
				3293	/* We can fish out the UTF-8 setting once and for all into a BOOL, but we
				3294	must not do this for other options (e.g. PCRE_EXTENDED) because they may change
				3295	dynamically as we process the pattern. */
				3296
				3297	#ifdef SUPPORT_UTF8
				3298	BOOL class_utf8;
				3299	BOOL utf8 = (options & PCRE_UTF8) != 0;
				3300	uschar *class_utf8data;
				3301	uschar *class_utf8data_base;
				3302	uschar utf8_char[6];
				3303	#else
				3304	BOOL utf8 = FALSE;
				3305	#endif
				3306
				3307	#ifdef PCRE_DEBUG
				3308	if (lengthptr != NULL) DPRINTF((">> start branch\n"));
				3309	#endif
				3310
				3311	/* Set up the default and non-default settings for greediness */
				3312
				3313	greedy_default = ((options & PCRE_UNGREEDY) != 0);
				3314	greedy_non_default = greedy_default ^ 1;
				3315
				3316	/* Initialize no first byte, no required byte. REQ_UNSET means "no char
				3317	matching encountered yet". It gets changed to REQ_NONE if we hit something that
				3318	matches a non-fixed char first char; reqbyte just remains unset if we never
				3319	find one.
				3320
				3321	When we hit a repeat whose minimum is zero, we may have to adjust these values
				3322	to take the zero repeat into account. This is implemented by setting them to
				3323	zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
				3324	item types that can be repeated set these backoff variables appropriately. */
				3325
				3326	firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
				3327
				3328	/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
				3329	according to the current setting of the caseless flag. REQ_CASELESS is a bit
				3330	value > 255. It is added into the firstbyte or reqbyte variables to record the
				3331	case status of the value. This is used only for ASCII characters. */
				3332
				3333	req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
				3334
				3335	/* Switch on next character until the end of the branch */
				3336
				3337	for (;; ptr++)
				3338	{
				3339	BOOL negate_class;
				3340	BOOL should_flip_negation;
				3341	BOOL possessive_quantifier;
				3342	BOOL is_quantifier;
				3343	BOOL is_recurse;
				3344	BOOL reset_bracount;
				3345	int class_charcount;
				3346	int class_lastchar;
				3347	int newoptions;
				3348	int recno;
				3349	int refsign;
				3350	int skipbytes;
				3351	int subreqbyte;
				3352	int subfirstbyte;
				3353	int terminator;
				3354	int mclength;
				3355	int tempbracount;
				3356	uschar mcbuffer[8];
				3357
				3358	/* Get next byte in the pattern */
				3359
				3360	c = *ptr;
				3361
				3362	/* If we are at the end of a nested substitution, revert to the outer level
				3363	string. Nesting only happens one level deep. */
				3364
				3365	if (c == 0 && nestptr != NULL)
				3366	{
				3367	ptr = nestptr;
				3368	nestptr = NULL;
				3369	c = *ptr;
				3370	}
				3371
				3372	/* If we are in the pre-compile phase, accumulate the length used for the
				3373	previous cycle of this loop. */
				3374
				3375	if (lengthptr != NULL)
				3376	{
				3377	#ifdef PCRE_DEBUG
				3378	if (code > cd->hwm) cd->hwm = code; /* High water info */
				3379	#endif
				3380	if (code > cd->start_workspace + cd->workspace_size -
				3381	WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
				3382	{
				3383	*errorcodeptr = ERR52;
				3384	goto FAILED;
				3385	}
				3386
				3387	/* There is at least one situation where code goes backwards: this is the
				3388	case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
				3389	the class is simply eliminated. However, it is created first, so we have to
				3390	allow memory for it. Therefore, don't ever reduce the length at this point.
				3391	*/
				3392
				3393	if (code < last_code) code = last_code;
				3394
				3395	/* Paranoid check for integer overflow */
				3396
				3397	if (OFLOW_MAX - *lengthptr < code - last_code)
				3398	{
				3399	*errorcodeptr = ERR20;
				3400	goto FAILED;
				3401	}
				3402
				3403	*lengthptr += (int)(code - last_code);
				3404	DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code),
				3405	c));
				3406
				3407	/* If "previous" is set and it is not at the start of the work space, move
				3408	it back to there, in order to avoid filling up the work space. Otherwise,
				3409	if "previous" is NULL, reset the current code pointer to the start. */
				3410
				3411	if (previous != NULL)
				3412	{
				3413	if (previous > orig_code)
				3414	{
				3415	memmove(orig_code, previous, code - previous);
				3416	code -= previous - orig_code;
				3417	previous = orig_code;
				3418	}
				3419	}
				3420	else code = orig_code;
				3421
				3422	/* Remember where this code item starts so we can pick up the length
				3423	next time round. */
				3424
				3425	last_code = code;
				3426	}
				3427
				3428	/* In the real compile phase, just check the workspace used by the forward
				3429	reference list. */
				3430
				3431	else if (cd->hwm > cd->start_workspace + cd->workspace_size -
				3432	WORK_SIZE_SAFETY_MARGIN)
				3433	{
				3434	*errorcodeptr = ERR52;
				3435	goto FAILED;
				3436	}
				3437
				3438	/* If in \Q...\E, check for the end; if not, we have a literal */
				3439
				3440	if (inescq && c != 0)
				3441	{
				3442	if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
				3443	{
				3444	inescq = FALSE;
				3445	ptr++;
				3446	continue;
				3447	}
				3448	else
				3449	{
				3450	if (previous_callout != NULL)
				3451	{
				3452	if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
				3453	complete_callout(previous_callout, ptr, cd);
				3454	previous_callout = NULL;
				3455	}
				3456	if ((options & PCRE_AUTO_CALLOUT) != 0)
				3457	{
				3458	previous_callout = code;
				3459	code = auto_callout(code, ptr, cd);
				3460	}
				3461	goto NORMAL_CHAR;
				3462	}
				3463	}
				3464
				3465	/* Fill in length of a previous callout, except when the next thing is
				3466	a quantifier. */
				3467
				3468	is_quantifier =
				3469	c == CHAR_ASTERISK \|\| c == CHAR_PLUS \|\| c == CHAR_QUESTION_MARK \|\|
				3470	(c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
				3471
				3472	if (!is_quantifier && previous_callout != NULL &&
				3473	after_manual_callout-- <= 0)
				3474	{
				3475	if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
				3476	complete_callout(previous_callout, ptr, cd);
				3477	previous_callout = NULL;
				3478	}
				3479
				3480	/* In extended mode, skip white space and comments. */
				3481
				3482	if ((options & PCRE_EXTENDED) != 0)
				3483	{
				3484	if ((cd->ctypes[c] & ctype_space) != 0) continue;
				3485	if (c == CHAR_NUMBER_SIGN)
				3486	{
				3487	ptr++;
				3488	while (*ptr != 0)
				3489	{
				3490	if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
				3491	ptr++;
				3492	#ifdef SUPPORT_UTF8
				3493	if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
				3494	#endif
				3495	}
				3496	if (*ptr != 0) continue;
				3497
				3498	/* Else fall through to handle end of string */
				3499	c = 0;
				3500	}
				3501	}
				3502
				3503	/* No auto callout for quantifiers. */
				3504
				3505	if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
				3506	{
				3507	previous_callout = code;
				3508	code = auto_callout(code, ptr, cd);
				3509	}
				3510
				3511	switch(c)
				3512	{
				3513	/* ===================================================================*/
				3514	case 0: /* The branch terminates at string end */
				3515	case CHAR_VERTICAL_LINE: /* or \| or ) */
				3516	case CHAR_RIGHT_PARENTHESIS:
				3517	*firstbyteptr = firstbyte;
				3518	*reqbyteptr = reqbyte;
				3519	*codeptr = code;
				3520	*ptrptr = ptr;
				3521	if (lengthptr != NULL)
				3522	{
				3523	if (OFLOW_MAX - *lengthptr < code - last_code)
				3524	{
				3525	*errorcodeptr = ERR20;
				3526	goto FAILED;
				3527	}
				3528	lengthptr += (int)(code - last_code); / To include callout length */
				3529	DPRINTF((">> end branch\n"));
				3530	}
				3531	return TRUE;
				3532
				3533
				3534	/* ===================================================================*/
				3535	/* Handle single-character metacharacters. In multiline mode, ^ disables
				3536	the setting of any following char as a first character. */
				3537
				3538	case CHAR_CIRCUMFLEX_ACCENT:
				3539	previous = NULL;
				3540	if ((options & PCRE_MULTILINE) != 0)
				3541	{
				3542	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
				3543	*code++ = OP_CIRCM;
				3544	}
				3545	else *code++ = OP_CIRC;
				3546	break;
				3547
				3548	case CHAR_DOLLAR_SIGN:
				3549	previous = NULL;
				3550	*code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
				3551	break;
				3552
				3553	/* There can never be a first char if '.' is first, whatever happens about
				3554	repeats. The value of reqbyte doesn't change either. */
				3555
				3556	case CHAR_DOT:
				3557	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
				3558	zerofirstbyte = firstbyte;
				3559	zeroreqbyte = reqbyte;
				3560	previous = code;
				3561	*code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
				3562	break;
				3563
				3564
				3565	/* ===================================================================*/
				3566	/* Character classes. If the included characters are all < 256, we build a
				3567	32-byte bitmap of the permitted characters, except in the special case
				3568	where there is only one such character. For negated classes, we build the
				3569	map as usual, then invert it at the end. However, we use a different opcode
				3570	so that data characters > 255 can be handled correctly.
				3571
				3572	If the class contains characters outside the 0-255 range, a different
				3573	opcode is compiled. It may optionally have a bit map for characters < 256,
				3574	but those above are explicitly listed afterwards. A flag byte tells
				3575	whether the bitmap is present, and whether this is a negated class or not.
				3576
				3577	In JavaScript compatibility mode, an isolated ']' causes an error. In
				3578	default (Perl) mode, it is treated as a data character. */
				3579
				3580	case CHAR_RIGHT_SQUARE_BRACKET:
				3581	if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
				3582	{
				3583	*errorcodeptr = ERR64;
				3584	goto FAILED;
				3585	}
				3586	goto NORMAL_CHAR;
				3587
				3588	case CHAR_LEFT_SQUARE_BRACKET:
				3589	previous = code;
				3590
				3591	/* PCRE supports POSIX class stuff inside a class. Perl gives an error if
				3592	they are encountered at the top level, so we'll do that too. */
				3593
				3594	if ((ptr[1] == CHAR_COLON \|\| ptr[1] == CHAR_DOT \|\|
				3595	ptr[1] == CHAR_EQUALS_SIGN) &&
				3596	check_posix_syntax(ptr, &tempptr))
				3597	{
				3598	*errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
				3599	goto FAILED;
				3600	}
				3601
				3602	/* If the first character is '^', set the negation flag and skip it. Also,
				3603	if the first few characters (either before or after ^) are \Q\E or \E we
				3604	skip them too. This makes for compatibility with Perl. */
				3605
				3606	negate_class = FALSE;
				3607	for (;;)
				3608	{
				3609	c = *(++ptr);
				3610	if (c == CHAR_BACKSLASH)
				3611	{
				3612	if (ptr[1] == CHAR_E)
				3613	ptr++;
				3614	else if (strncmp((const char *)ptr+1,
				3615	STR_Q STR_BACKSLASH STR_E, 3) == 0)
				3616	ptr += 3;
				3617	else
				3618	break;
				3619	}
				3620	else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
				3621	negate_class = TRUE;
				3622	else break;
				3623	}
				3624
				3625	/* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
				3626	an initial ']' is taken as a data character -- the code below handles
				3627	that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
				3628	[^] must match any character, so generate OP_ALLANY. */
				3629
				3630	if (c == CHAR_RIGHT_SQUARE_BRACKET &&
				3631	(cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
				3632	{
				3633	*code++ = negate_class? OP_ALLANY : OP_FAIL;
				3634	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
				3635	zerofirstbyte = firstbyte;
				3636	break;
				3637	}
				3638
				3639	/* If a class contains a negative special such as \S, we need to flip the
				3640	negation flag at the end, so that support for characters > 255 works
				3641	correctly (they are all included in the class). */
				3642
				3643	should_flip_negation = FALSE;
				3644
				3645	/* Keep a count of chars with values < 256 so that we can optimize the case
				3646	of just a single character (as long as it's < 256). However, for higher
				3647	valued UTF-8 characters, we don't yet do any optimization. */
				3648
				3649	class_charcount = 0;
				3650	class_lastchar = -1;
				3651
				3652	/* Initialize the 32-char bit map to all zeros. We build the map in a
				3653	temporary bit of memory, in case the class contains only 1 character (less
				3654	than 256), because in that case the compiled code doesn't use the bit map.
				3655	*/
				3656
				3657	memset(classbits, 0, 32 * sizeof(uschar));
				3658
				3659	#ifdef SUPPORT_UTF8
				3660	class_utf8 = FALSE; /* No chars >= 256 */
				3661	class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
				3662	class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
				3663	#endif
				3664
				3665	/* Process characters until ] is reached. By writing this as a "do" it
				3666	means that an initial ] is taken as a data character. At the start of the
				3667	loop, c contains the first byte of the character. */
				3668
				3669	if (c != 0) do
				3670	{
				3671	const uschar *oldptr;
				3672
				3673	#ifdef SUPPORT_UTF8
				3674	if (utf8 && c > 127)
				3675	{ /* Braces are required because the */
				3676	GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
				3677	}
				3678
				3679	/* In the pre-compile phase, accumulate the length of any UTF-8 extra
				3680	data and reset the pointer. This is so that very large classes that
				3681	contain a zillion UTF-8 characters no longer overwrite the work space
				3682	(which is on the stack). */
				3683
				3684	if (lengthptr != NULL)
				3685	{
				3686	*lengthptr += (int)(class_utf8data - class_utf8data_base);
				3687	class_utf8data = class_utf8data_base;
				3688	}
				3689
				3690	#endif
				3691
				3692	/* Inside \Q...\E everything is literal except \E */
				3693
				3694	if (inescq)
				3695	{
				3696	if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
				3697	{
				3698	inescq = FALSE; /* Reset literal state */
				3699	ptr++; /* Skip the 'E' */
				3700	continue; /* Carry on with next */
				3701	}
				3702	goto CHECK_RANGE; /* Could be range if \E follows */
				3703	}
				3704
				3705	/* Handle POSIX class names. Perl allows a negation extension of the
				3706	form [:^name:]. A square bracket that doesn't match the syntax is
				3707	treated as a literal. We also recognize the POSIX constructions
				3708	[.ch.] and [=ch=] ("collating elements") and fault them, as Perl
				3709	5.6 and 5.8 do. */
				3710
				3711	if (c == CHAR_LEFT_SQUARE_BRACKET &&
				3712	(ptr[1] == CHAR_COLON \|\| ptr[1] == CHAR_DOT \|\|
				3713	ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
				3714	{
				3715	BOOL local_negate = FALSE;
				3716	int posix_class, taboffset, tabopt;
				3717	register const uschar *cbits = cd->cbits;
				3718	uschar pbits[32];
				3719
				3720	if (ptr[1] != CHAR_COLON)
				3721	{
				3722	*errorcodeptr = ERR31;
				3723	goto FAILED;
				3724	}
				3725
				3726	ptr += 2;
				3727	if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
				3728	{
				3729	local_negate = TRUE;
				3730	should_flip_negation = TRUE; /* Note negative special */
				3731	ptr++;
				3732	}
				3733
				3734	posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
				3735	if (posix_class < 0)
				3736	{
				3737	*errorcodeptr = ERR30;
				3738	goto FAILED;
				3739	}
				3740
				3741	/* If matching is caseless, upper and lower are converted to
				3742	alpha. This relies on the fact that the class table starts with
				3743	alpha, lower, upper as the first 3 entries. */
				3744
				3745	if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
				3746	posix_class = 0;
				3747
				3748	/* When PCRE_UCP is set, some of the POSIX classes are converted to
				3749	different escape sequences that use Unicode properties. */
				3750
				3751	#ifdef SUPPORT_UCP
				3752	if ((options & PCRE_UCP) != 0)
				3753	{
				3754	int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
				3755	if (posix_substitutes[pc] != NULL)
				3756	{
				3757	nestptr = tempptr + 1;
				3758	ptr = posix_substitutes[pc] - 1;
				3759	continue;
				3760	}
				3761	}
				3762	#endif
				3763	/* In the non-UCP case, we build the bit map for the POSIX class in a
				3764	chunk of local store because we may be adding and subtracting from it,
				3765	and we don't want to subtract bits that may be in the main map already.
				3766	At the end we or the result into the bit map that is being built. */
				3767
				3768	posix_class *= 3;
				3769
				3770	/* Copy in the first table (always present) */
				3771
				3772	memcpy(pbits, cbits + posix_class_maps[posix_class],
				3773	32 * sizeof(uschar));
				3774
				3775	/* If there is a second table, add or remove it as required. */
				3776
				3777	taboffset = posix_class_maps[posix_class + 1];
				3778	tabopt = posix_class_maps[posix_class + 2];
				3779
				3780	if (taboffset >= 0)
				3781	{
				3782	if (tabopt >= 0)
				3783	for (c = 0; c < 32; c++) pbits[c] \|= cbits[c + taboffset];
				3784	else
				3785	for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
				3786	}
				3787
				3788	/* Now see if we need to remove any special characters. An option
				3789	value of 1 removes vertical space and 2 removes underscore. */
				3790
				3791	if (tabopt < 0) tabopt = -tabopt;
				3792	if (tabopt == 1) pbits[1] &= ~0x3c;
				3793	else if (tabopt == 2) pbits[11] &= 0x7f;
				3794
				3795	/* Add the POSIX table or its complement into the main table that is
				3796	being built and we are done. */
				3797
				3798	if (local_negate)
				3799	for (c = 0; c < 32; c++) classbits[c] \|= ~pbits[c];
				3800	else
				3801	for (c = 0; c < 32; c++) classbits[c] \|= pbits[c];
				3802
				3803	ptr = tempptr + 1;
				3804	class_charcount = 10; /* Set > 1; assumes more than 1 per class */
				3805	continue; /* End of POSIX syntax handling */
				3806	}
				3807
				3808	/* Backslash may introduce a single character, or it may introduce one
				3809	of the specials, which just set a flag. The sequence \b is a special
				3810	case. Inside a class (and only there) it is treated as backspace. We
				3811	assume that other escapes have more than one character in them, so set
				3812	class_charcount bigger than one. Unrecognized escapes fall through and
				3813	are either treated as literal characters (by default), or are faulted if
				3814	PCRE_EXTRA is set. */
				3815
				3816	if (c == CHAR_BACKSLASH)
				3817	{
				3818	c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
				3819	if (*errorcodeptr != 0) goto FAILED;
				3820
				3821	if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
				3822	else if (-c == ESC_N) /* \N is not supported in a class */
				3823	{
				3824	*errorcodeptr = ERR71;
				3825	goto FAILED;
				3826	}
				3827	else if (-c == ESC_Q) /* Handle start of quoted string */
				3828	{
				3829	if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
				3830	{
				3831	ptr += 2; /* avoid empty string */
				3832	}
				3833	else inescq = TRUE;
				3834	continue;
				3835	}
				3836	else if (-c == ESC_E) continue; /* Ignore orphan \E */
				3837
				3838	if (c < 0)
				3839	{
				3840	register const uschar *cbits = cd->cbits;
				3841	class_charcount += 2; /* Greater than 1 is what matters */
				3842
				3843	switch (-c)
				3844	{
				3845	#ifdef SUPPORT_UCP
				3846	case ESC_du: /* These are the values given for \d etc */
				3847	case ESC_DU: /* when PCRE_UCP is set. We replace the */
				3848	case ESC_wu: /* escape sequence with an appropriate \p */
				3849	case ESC_WU: /* or \P to test Unicode properties instead */
				3850	case ESC_su: /* of the default ASCII testing. */
				3851	case ESC_SU:
				3852	nestptr = ptr;
				3853	ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
				3854	class_charcount -= 2; /* Undo! */
				3855	continue;
				3856	#endif
				3857	case ESC_d:
				3858	for (c = 0; c < 32; c++) classbits[c] \|= cbits[c+cbit_digit];
				3859	continue;
				3860
				3861	case ESC_D:
				3862	should_flip_negation = TRUE;
				3863	for (c = 0; c < 32; c++) classbits[c] \|= ~cbits[c+cbit_digit];
				3864	continue;
				3865
				3866	case ESC_w:
				3867	for (c = 0; c < 32; c++) classbits[c] \|= cbits[c+cbit_word];
				3868	continue;
				3869
				3870	case ESC_W:
				3871	should_flip_negation = TRUE;
				3872	for (c = 0; c < 32; c++) classbits[c] \|= ~cbits[c+cbit_word];
				3873	continue;
				3874
				3875	/* Perl 5.004 onwards omits VT from \s, but we must preserve it
				3876	if it was previously set by something earlier in the character
				3877	class. */
				3878
				3879	case ESC_s:
				3880	classbits[0] \|= cbits[cbit_space];
				3881	classbits[1] \|= cbits[cbit_space+1] & ~0x08;
				3882	for (c = 2; c < 32; c++) classbits[c] \|= cbits[c+cbit_space];
				3883	continue;
				3884
				3885	case ESC_S:
				3886	should_flip_negation = TRUE;
				3887	for (c = 0; c < 32; c++) classbits[c] \|= ~cbits[c+cbit_space];
				3888	classbits[1] \|= 0x08; /* Perl 5.004 onwards omits VT from \s */
				3889	continue;
				3890
				3891	case ESC_h:
				3892	SETBIT(classbits, 0x09); /* HT */
				3893	SETBIT(classbits, 0x20); /* SPACE */
				3894	SETBIT(classbits, 0xa0); /* NBSP */
				3895	#ifdef SUPPORT_UTF8
				3896	if (utf8)
				3897	{
				3898	class_utf8 = TRUE;
				3899	*class_utf8data++ = XCL_SINGLE;
				3900	class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
				3901	*class_utf8data++ = XCL_SINGLE;
				3902	class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
				3903	*class_utf8data++ = XCL_RANGE;
				3904	class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
				3905	class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
				3906	*class_utf8data++ = XCL_SINGLE;
				3907	class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
				3908	*class_utf8data++ = XCL_SINGLE;
				3909	class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
				3910	*class_utf8data++ = XCL_SINGLE;
				3911	class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
				3912	}
				3913	#endif
				3914	continue;
				3915
				3916	case ESC_H:
				3917	for (c = 0; c < 32; c++)
				3918	{
				3919	int x = 0xff;
				3920	switch (c)
				3921	{
				3922	case 0x09/8: x ^= 1 << (0x09%8); break;
				3923	case 0x20/8: x ^= 1 << (0x20%8); break;
				3924	case 0xa0/8: x ^= 1 << (0xa0%8); break;
				3925	default: break;
				3926	}
				3927	classbits[c] \|= x;
				3928	}
				3929
				3930	#ifdef SUPPORT_UTF8
				3931	if (utf8)
				3932	{
				3933	class_utf8 = TRUE;
				3934	*class_utf8data++ = XCL_RANGE;
				3935	class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
				3936	class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
				3937	*class_utf8data++ = XCL_RANGE;
				3938	class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
				3939	class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
				3940	*class_utf8data++ = XCL_RANGE;
				3941	class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
				3942	class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
				3943	*class_utf8data++ = XCL_RANGE;
				3944	class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
				3945	class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
				3946	*class_utf8data++ = XCL_RANGE;
				3947	class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
				3948	class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
				3949	*class_utf8data++ = XCL_RANGE;
				3950	class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
				3951	class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
				3952	*class_utf8data++ = XCL_RANGE;
				3953	class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
				3954	class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
				3955	}
				3956	#endif
				3957	continue;
				3958
				3959	case ESC_v:
				3960	SETBIT(classbits, 0x0a); /* LF */
				3961	SETBIT(classbits, 0x0b); /* VT */
				3962	SETBIT(classbits, 0x0c); /* FF */
				3963	SETBIT(classbits, 0x0d); /* CR */
				3964	SETBIT(classbits, 0x85); /* NEL */
				3965	#ifdef SUPPORT_UTF8
				3966	if (utf8)
				3967	{
				3968	class_utf8 = TRUE;
				3969	*class_utf8data++ = XCL_RANGE;
				3970	class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
				3971	class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
				3972	}
				3973	#endif
				3974	continue;
				3975
				3976	case ESC_V:
				3977	for (c = 0; c < 32; c++)
				3978	{
				3979	int x = 0xff;
				3980	switch (c)
				3981	{
				3982	case 0x0a/8: x ^= 1 << (0x0a%8);
				3983	x ^= 1 << (0x0b%8);
				3984	x ^= 1 << (0x0c%8);
				3985	x ^= 1 << (0x0d%8);
				3986	break;
				3987	case 0x85/8: x ^= 1 << (0x85%8); break;
				3988	default: break;
				3989	}
				3990	classbits[c] \|= x;
				3991	}
				3992
				3993	#ifdef SUPPORT_UTF8
				3994	if (utf8)
				3995	{
				3996	class_utf8 = TRUE;
				3997	*class_utf8data++ = XCL_RANGE;
				3998	class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
				3999	class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
				4000	*class_utf8data++ = XCL_RANGE;
				4001	class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
				4002	class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
				4003	}
				4004	#endif
				4005	continue;
				4006
				4007	#ifdef SUPPORT_UCP
				4008	case ESC_p:
				4009	case ESC_P:
				4010	{
				4011	BOOL negated;
				4012	int pdata;
				4013	int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
				4014	if (ptype < 0) goto FAILED;
				4015	class_utf8 = TRUE;
				4016	*class_utf8data++ = ((-c == ESC_p) != negated)?
				4017	XCL_PROP : XCL_NOTPROP;
				4018	*class_utf8data++ = ptype;
				4019	*class_utf8data++ = pdata;
				4020	class_charcount -= 2; /* Not a < 256 character */
				4021	continue;
				4022	}
				4023	#endif
				4024	/* Unrecognized escapes are faulted if PCRE is running in its
				4025	strict mode. By default, for compatibility with Perl, they are
				4026	treated as literals. */
				4027
				4028	default:
				4029	if ((options & PCRE_EXTRA) != 0)
				4030	{
				4031	*errorcodeptr = ERR7;
				4032	goto FAILED;
				4033	}
				4034	class_charcount -= 2; /* Undo the default count from above */
				4035	c = *ptr; /* Get the final character and fall through */
				4036	break;
				4037	}
				4038	}
				4039
				4040	/* Fall through if we have a single character (c >= 0). This may be
				4041	greater than 256 in UTF-8 mode. */
				4042
				4043	} /* End of backslash handling */
				4044
				4045	/* A single character may be followed by '-' to form a range. However,
				4046	Perl does not permit ']' to be the end of the range. A '-' character
				4047	at the end is treated as a literal. Perl ignores orphaned \E sequences
				4048	entirely. The code for handling \Q and \E is messy. */
				4049
				4050	CHECK_RANGE:
				4051	while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
				4052	{
				4053	inescq = FALSE;
				4054	ptr += 2;
				4055	}
				4056
				4057	oldptr = ptr;
				4058
				4059	/* Remember \r or \n */
				4060
				4061	if (c == CHAR_CR \|\| c == CHAR_NL) cd->external_flags \|= PCRE_HASCRORLF;
				4062
				4063	/* Check for range */
				4064
				4065	if (!inescq && ptr[1] == CHAR_MINUS)
				4066	{
				4067	int d;
				4068	ptr += 2;
				4069	while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
				4070
				4071	/* If we hit \Q (not followed by \E) at this point, go into escaped
				4072	mode. */
				4073
				4074	while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
				4075	{
				4076	ptr += 2;
				4077	if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
				4078	{ ptr += 2; continue; }
				4079	inescq = TRUE;
				4080	break;
				4081	}
				4082
				4083	if (*ptr == 0 \|\| (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
				4084	{
				4085	ptr = oldptr;
				4086	goto LONE_SINGLE_CHARACTER;
				4087	}
				4088
				4089	#ifdef SUPPORT_UTF8
				4090	if (utf8)
				4091	{ /* Braces are required because the */
				4092	GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
				4093	}
				4094	else
				4095	#endif
				4096	d = *ptr; /* Not UTF-8 mode */
				4097
				4098	/* The second part of a range can be a single-character escape, but
				4099	not any of the other escapes. Perl 5.6 treats a hyphen as a literal
				4100	in such circumstances. */
				4101
				4102	if (!inescq && d == CHAR_BACKSLASH)
				4103	{
				4104	d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
				4105	if (*errorcodeptr != 0) goto FAILED;
				4106
				4107	/* \b is backspace; any other special means the '-' was literal */
				4108
				4109	if (d < 0)
				4110	{
				4111	if (d == -ESC_b) d = CHAR_BS; else
				4112	{
				4113	ptr = oldptr;
				4114	goto LONE_SINGLE_CHARACTER; /* A few lines below */
				4115	}
				4116	}
				4117	}
				4118
				4119	/* Check that the two values are in the correct order. Optimize
				4120	one-character ranges */
				4121
				4122	if (d < c)
				4123	{
				4124	*errorcodeptr = ERR8;
				4125	goto FAILED;
				4126	}
				4127
				4128	if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
				4129
				4130	/* Remember \r or \n */
				4131
				4132	if (d == CHAR_CR \|\| d == CHAR_NL) cd->external_flags \|= PCRE_HASCRORLF;
				4133
				4134	/* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
				4135	matching, we have to use an XCLASS with extra data items. Caseless
				4136	matching for characters > 127 is available only if UCP support is
				4137	available. */
				4138
				4139	#ifdef SUPPORT_UTF8
				4140	if (utf8 && (d > 255 \|\| ((options & PCRE_CASELESS) != 0 && d > 127)))
				4141	{
				4142	class_utf8 = TRUE;
				4143
				4144	/* With UCP support, we can find the other case equivalents of
				4145	the relevant characters. There may be several ranges. Optimize how
				4146	they fit with the basic range. */
				4147
				4148	#ifdef SUPPORT_UCP
				4149	if ((options & PCRE_CASELESS) != 0)
				4150	{
				4151	unsigned int occ, ocd;
				4152	unsigned int cc = c;
				4153	unsigned int origd = d;
				4154	while (get_othercase_range(&cc, origd, &occ, &ocd))
				4155	{
				4156	if (occ >= (unsigned int)c &&
				4157	ocd <= (unsigned int)d)
				4158	continue; /* Skip embedded ranges */
				4159
				4160	if (occ < (unsigned int)c &&
				4161	ocd >= (unsigned int)c - 1) /* Extend the basic range */
				4162	{ /* if there is overlap, */
				4163	c = occ; /* noting that if occ < c */
				4164	continue; /* we can't have ocd > d */
				4165	} /* because a subrange is */
				4166	if (ocd > (unsigned int)d &&
				4167	occ <= (unsigned int)d + 1) /* always shorter than */
				4168	{ /* the basic range. */
				4169	d = ocd;
				4170	continue;
				4171	}
				4172
				4173	if (occ == ocd)
				4174	{
				4175	*class_utf8data++ = XCL_SINGLE;
				4176	}
				4177	else
				4178	{
				4179	*class_utf8data++ = XCL_RANGE;
				4180	class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
				4181	}
				4182	class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
				4183	}
				4184	}
				4185	#endif /* SUPPORT_UCP */
				4186
				4187	/* Now record the original range, possibly modified for UCP caseless
				4188	overlapping ranges. */
				4189
				4190	*class_utf8data++ = XCL_RANGE;
				4191	class_utf8data += _pcre_ord2utf8(c, class_utf8data);
				4192	class_utf8data += _pcre_ord2utf8(d, class_utf8data);
				4193
				4194	/* With UCP support, we are done. Without UCP support, there is no
				4195	caseless matching for UTF-8 characters > 127; we can use the bit map
				4196	for the smaller ones. */
				4197
				4198	#ifdef SUPPORT_UCP
				4199	continue; /* With next character in the class */
				4200	#else
				4201	if ((options & PCRE_CASELESS) == 0 \|\| c > 127) continue;
				4202
				4203	/* Adjust upper limit and fall through to set up the map */
				4204
				4205	d = 127;
				4206
				4207	#endif /* SUPPORT_UCP */
				4208	}
				4209	#endif /* SUPPORT_UTF8 */
				4210
				4211	/* We use the bit map for all cases when not in UTF-8 mode; else
				4212	ranges that lie entirely within 0-127 when there is UCP support; else
				4213	for partial ranges without UCP support. */
				4214
				4215	class_charcount += d - c + 1;
				4216	class_lastchar = d;
				4217
				4218	/* We can save a bit of time by skipping this in the pre-compile. */
				4219
				4220	if (lengthptr == NULL) for (; c <= d; c++)
				4221	{
				4222	classbits[c/8] \|= (1 << (c&7));
				4223	if ((options & PCRE_CASELESS) != 0)
				4224	{
				4225	int uc = cd->fcc[c]; /* flip case */
				4226	classbits[uc/8] \|= (1 << (uc&7));
				4227	}
				4228	}
				4229
				4230	continue; /* Go get the next char in the class */
				4231	}
				4232
				4233	/* Handle a lone single character - we can get here for a normal
				4234	non-escape char, or after \ that introduces a single character or for an
				4235	apparent range that isn't. */
				4236
				4237	LONE_SINGLE_CHARACTER:
				4238
				4239	/* Handle a character that cannot go in the bit map */
				4240
				4241	#ifdef SUPPORT_UTF8
				4242	if (utf8 && (c > 255 \|\| ((options & PCRE_CASELESS) != 0 && c > 127)))
				4243	{
				4244	class_utf8 = TRUE;
				4245	*class_utf8data++ = XCL_SINGLE;
				4246	class_utf8data += _pcre_ord2utf8(c, class_utf8data);
				4247
				4248	#ifdef SUPPORT_UCP
				4249	if ((options & PCRE_CASELESS) != 0)
				4250	{
				4251	unsigned int othercase;
				4252	if ((othercase = UCD_OTHERCASE(c)) != c)
				4253	{
				4254	*class_utf8data++ = XCL_SINGLE;
				4255	class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
				4256	}
				4257	}
				4258	#endif /* SUPPORT_UCP */
				4259
				4260	}
				4261	else
				4262	#endif /* SUPPORT_UTF8 */
				4263
				4264	/* Handle a single-byte character */
				4265	{
				4266	classbits[c/8] \|= (1 << (c&7));
				4267	if ((options & PCRE_CASELESS) != 0)
				4268	{
				4269	c = cd->fcc[c]; /* flip case */
				4270	classbits[c/8] \|= (1 << (c&7));
				4271	}
				4272	class_charcount++;
				4273	class_lastchar = c;
				4274	}
				4275	}
				4276
				4277	/* Loop until ']' reached. This "while" is the end of the "do" far above.
				4278	If we are at the end of an internal nested string, revert to the outer
				4279	string. */
				4280
				4281	while (((c = *(++ptr)) != 0 \|\|
				4282	(nestptr != NULL &&
				4283	(ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
				4284	(c != CHAR_RIGHT_SQUARE_BRACKET \|\| inescq));
				4285
				4286	/* Check for missing terminating ']' */
				4287
				4288	if (c == 0)
				4289	{
				4290	*errorcodeptr = ERR6;
				4291	goto FAILED;
				4292	}
				4293
				4294	/* If class_charcount is 1, we saw precisely one character whose value is
				4295	less than 256. As long as there were no characters >= 128 and there was no
				4296	use of \p or \P, in other words, no use of any XCLASS features, we can
				4297	optimize.
				4298
				4299	In UTF-8 mode, we can optimize the negative case only if there were no
				4300	characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
				4301	operate on single-byte characters only. This is an historical hangover.
				4302	Maybe one day we can tidy these opcodes to handle multi-byte characters.
				4303
				4304	The optimization throws away the bit map. We turn the item into a
				4305	1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
				4306	Note that OP_NOT[I] does not support multibyte characters. In the positive
				4307	case, it can cause firstbyte to be set. Otherwise, there can be no first
				4308	char if this item is first, whatever repeat count may follow. In the case
				4309	of reqbyte, save the previous value for reinstating. */
				4310
				4311	#ifdef SUPPORT_UTF8
				4312	if (class_charcount == 1 && !class_utf8 &&
				4313	(!utf8 \|\| !negate_class \|\| class_lastchar < 128))
				4314	#else
				4315	if (class_charcount == 1)
				4316	#endif
				4317	{
				4318	zeroreqbyte = reqbyte;
				4319
				4320	/* The OP_NOT[I] opcodes work on one-byte characters only. */
				4321
				4322	if (negate_class)
				4323	{
				4324	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
				4325	zerofirstbyte = firstbyte;
				4326	*code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
				4327	*code++ = class_lastchar;
				4328	break;
				4329	}
				4330
				4331	/* For a single, positive character, get the value into mcbuffer, and
				4332	then we can handle this with the normal one-character code. */
				4333
				4334	#ifdef SUPPORT_UTF8
				4335	if (utf8 && class_lastchar > 127)
				4336	mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
				4337	else
				4338	#endif
				4339	{
				4340	mcbuffer[0] = class_lastchar;
				4341	mclength = 1;
				4342	}
				4343	goto ONE_CHAR;
				4344	} /* End of 1-char optimization */
				4345
				4346	/* The general case - not the one-char optimization. If this is the first
				4347	thing in the branch, there can be no first char setting, whatever the
				4348	repeat count. Any reqbyte setting must remain unchanged after any kind of
				4349	repeat. */
				4350
				4351	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
				4352	zerofirstbyte = firstbyte;
				4353	zeroreqbyte = reqbyte;
				4354
				4355	/* If there are characters with values > 255, we have to compile an
				4356	extended class, with its own opcode, unless there was a negated special
				4357	such as \S in the class, and PCRE_UCP is not set, because in that case all
				4358	characters > 255 are in the class, so any that were explicitly given as
				4359	well can be ignored. If (when there are explicit characters > 255 that must
				4360	be listed) there are no characters < 256, we can omit the bitmap in the
				4361	actual compiled code. */
				4362
				4363	#ifdef SUPPORT_UTF8
				4364	if (class_utf8 && (!should_flip_negation \|\| (options & PCRE_UCP) != 0))
				4365	{
				4366	*class_utf8data++ = XCL_END; /* Marks the end of extra data */
				4367	*code++ = OP_XCLASS;
				4368	code += LINK_SIZE;
				4369	*code = negate_class? XCL_NOT : 0;
				4370
				4371	/* If the map is required, move up the extra data to make room for it;
				4372	otherwise just move the code pointer to the end of the extra data. */
				4373
				4374	if (class_charcount > 0)
				4375	{
				4376	*code++ \|= XCL_MAP;
				4377	memmove(code + 32, code, class_utf8data - code);
				4378	memcpy(code, classbits, 32);
				4379	code = class_utf8data + 32;
				4380	}
				4381	else code = class_utf8data;
				4382
				4383	/* Now fill in the complete length of the item */
				4384
				4385	PUT(previous, 1, (int)(code - previous));
				4386	break; /* End of class handling */
				4387	}
				4388	#endif
				4389
				4390	/* If there are no characters > 255, or they are all to be included or
				4391	excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
				4392	whole class was negated and whether there were negative specials such as \S
				4393	(non-UCP) in the class. Then copy the 32-byte map into the code vector,
				4394	negating it if necessary. */
				4395
				4396	*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
				4397	if (negate_class)
				4398	{
				4399	if (lengthptr == NULL) /* Save time in the pre-compile phase */
				4400	for (c = 0; c < 32; c++) code[c] = ~classbits[c];
				4401	}
				4402	else
				4403	{
				4404	memcpy(code, classbits, 32);
				4405	}
				4406	code += 32;
				4407	break;
				4408
				4409
				4410	/* ===================================================================*/
				4411	/* Various kinds of repeat; '{' is not necessarily a quantifier, but this
				4412	has been tested above. */
				4413
				4414	case CHAR_LEFT_CURLY_BRACKET:
				4415	if (!is_quantifier) goto NORMAL_CHAR;
				4416	ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
				4417	if (*errorcodeptr != 0) goto FAILED;
				4418	goto REPEAT;
				4419
				4420	case CHAR_ASTERISK:
				4421	repeat_min = 0;
				4422	repeat_max = -1;
				4423	goto REPEAT;
				4424
				4425	case CHAR_PLUS:
				4426	repeat_min = 1;
				4427	repeat_max = -1;
				4428	goto REPEAT;
				4429
				4430	case CHAR_QUESTION_MARK:
				4431	repeat_min = 0;
				4432	repeat_max = 1;
				4433
				4434	REPEAT:
				4435	if (previous == NULL)
				4436	{
				4437	*errorcodeptr = ERR9;
				4438	goto FAILED;
				4439	}
				4440
				4441	if (repeat_min == 0)
				4442	{
				4443	firstbyte = zerofirstbyte; /* Adjust for zero repeat */
				4444	reqbyte = zeroreqbyte; /* Ditto */
				4445	}
				4446
				4447	/* Remember whether this is a variable length repeat */
				4448
				4449	reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
				4450
				4451	op_type = 0; /* Default single-char op codes */
				4452	possessive_quantifier = FALSE; /* Default not possessive quantifier */
				4453
				4454	/* Save start of previous item, in case we have to move it up in order to
				4455	insert something before it. */
				4456
				4457	tempcode = previous;
				4458
				4459	/* If the next character is '+', we have a possessive quantifier. This
				4460	implies greediness, whatever the setting of the PCRE_UNGREEDY option.
				4461	If the next character is '?' this is a minimizing repeat, by default,
				4462	but if PCRE_UNGREEDY is set, it works the other way round. We change the
				4463	repeat type to the non-default. */
				4464
				4465	if (ptr[1] == CHAR_PLUS)
				4466	{
				4467	repeat_type = 0; /* Force greedy */
				4468	possessive_quantifier = TRUE;
				4469	ptr++;
				4470	}
				4471	else if (ptr[1] == CHAR_QUESTION_MARK)
				4472	{
				4473	repeat_type = greedy_non_default;
				4474	ptr++;
				4475	}
				4476	else repeat_type = greedy_default;
				4477
				4478	/* If previous was a recursion call, wrap it in atomic brackets so that
				4479	previous becomes the atomic group. All recursions were so wrapped in the
				4480	past, but it no longer happens for non-repeated recursions. In fact, the
				4481	repeated ones could be re-implemented independently so as not to need this,
				4482	but for the moment we rely on the code for repeating groups. */
				4483
				4484	if (*previous == OP_RECURSE)
				4485	{
				4486	memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
				4487	*previous = OP_ONCE;
				4488	PUT(previous, 1, 2 + 2*LINK_SIZE);
				4489	previous[2 + 2*LINK_SIZE] = OP_KET;
				4490	PUT(previous, 3 + 2LINK_SIZE, 2 + 2LINK_SIZE);
				4491	code += 2 + 2 * LINK_SIZE;
				4492	length_prevgroup = 3 + 3*LINK_SIZE;
				4493
				4494	/* When actually compiling, we need to check whether this was a forward
				4495	reference, and if so, adjust the offset. */
				4496
				4497	if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
				4498	{
				4499	int offset = GET(cd->hwm, -LINK_SIZE);
				4500	if (offset == previous + 1 - cd->start_code)
				4501	PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
				4502	}
				4503	}
				4504
				4505	/* Now handle repetition for the different types of item. */
				4506
				4507	/* If previous was a character match, abolish the item and generate a
				4508	repeat item instead. If a char item has a minimum of more than one, ensure
				4509	that it is set in reqbyte - it might not be if a sequence such as x{3} is
				4510	the first thing in a branch because the x will have gone into firstbyte
				4511	instead. */
				4512
				4513	if (previous == OP_CHAR \|\| previous == OP_CHARI)
				4514	{
				4515	op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
				4516
				4517	/* Deal with UTF-8 characters that take up more than one byte. It's
				4518	easier to write this out separately than try to macrify it. Use c to
				4519	hold the length of the character in bytes, plus 0x80 to flag that it's a
				4520	length rather than a small character. */
				4521
				4522	#ifdef SUPPORT_UTF8
				4523	if (utf8 && (code[-1] & 0x80) != 0)
				4524	{
				4525	uschar *lastchar = code - 1;
				4526	while((*lastchar & 0xc0) == 0x80) lastchar--;
				4527	c = (int)(code - lastchar); /* Length of UTF-8 character */
				4528	memcpy(utf8_char, lastchar, c); /* Save the char */
				4529	c \|= 0x80; /* Flag c as a length */
				4530	}
				4531	else
				4532	#endif
				4533
				4534	/* Handle the case of a single byte - either with no UTF8 support, or
				4535	with UTF-8 disabled, or for a UTF-8 character < 128. */
				4536
				4537	{
				4538	c = code[-1];
				4539	if (repeat_min > 1) reqbyte = c \| req_caseopt \| cd->req_varyopt;
				4540	}
				4541
				4542	/* If the repetition is unlimited, it pays to see if the next thing on
				4543	the line is something that cannot possibly match this character. If so,
				4544	automatically possessifying this item gains some performance in the case
				4545	where the match fails. */
				4546
				4547	if (!possessive_quantifier &&
				4548	repeat_max < 0 &&
				4549	check_auto_possessive(previous, utf8, ptr + 1, options, cd))
				4550	{
				4551	repeat_type = 0; /* Force greedy */
				4552	possessive_quantifier = TRUE;
				4553	}
				4554
				4555	goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
				4556	}
				4557
				4558	/* If previous was a single negated character ([^a] or similar), we use
				4559	one of the special opcodes, replacing it. The code is shared with single-
				4560	character repeats by setting op_type to add a suitable offset into
				4561	repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI
				4562	are currently used only for single-byte chars. */
				4563
				4564	else if (previous == OP_NOT \|\| previous == OP_NOTI)
				4565	{
				4566	op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;
				4567	c = previous[1];
				4568	if (!possessive_quantifier &&
				4569	repeat_max < 0 &&
				4570	check_auto_possessive(previous, utf8, ptr + 1, options, cd))
				4571	{
				4572	repeat_type = 0; /* Force greedy */
				4573	possessive_quantifier = TRUE;
				4574	}
				4575	goto OUTPUT_SINGLE_REPEAT;
				4576	}
				4577
				4578	/* If previous was a character type match (\d or similar), abolish it and
				4579	create a suitable repeat item. The code is shared with single-character
				4580	repeats by setting op_type to add a suitable offset into repeat_type. Note
				4581	that the Unicode property types will be present only when SUPPORT_UCP is
				4582	defined, but we don't wrap the little bits of code here because it just
				4583	makes it horribly messy. */
				4584
				4585	else if (*previous < OP_EODN)
				4586	{
				4587	uschar *oldcode;
				4588	int prop_type, prop_value;
				4589	op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
				4590	c = *previous;
				4591
				4592	if (!possessive_quantifier &&
				4593	repeat_max < 0 &&
				4594	check_auto_possessive(previous, utf8, ptr + 1, options, cd))
				4595	{
				4596	repeat_type = 0; /* Force greedy */
				4597	possessive_quantifier = TRUE;
				4598	}
				4599
				4600	OUTPUT_SINGLE_REPEAT:
				4601	if (previous == OP_PROP \|\| previous == OP_NOTPROP)
				4602	{
				4603	prop_type = previous[1];
				4604	prop_value = previous[2];
				4605	}
				4606	else prop_type = prop_value = -1;
				4607
				4608	oldcode = code;
				4609	code = previous; /* Usually overwrite previous item */
				4610
				4611	/* If the maximum is zero then the minimum must also be zero; Perl allows
				4612	this case, so we do too - by simply omitting the item altogether. */
				4613
				4614	if (repeat_max == 0) goto END_REPEAT;
				4615
				4616	/*--------------------------------------------------------------------*/
				4617	/* This code is obsolete from release 8.00; the restriction was finally
				4618	removed: */
				4619
				4620	/* All real repeats make it impossible to handle partial matching (maybe
				4621	one day we will be able to remove this restriction). */
				4622
				4623	/* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
				4624	/*--------------------------------------------------------------------*/
				4625
				4626	/* Combine the op_type with the repeat_type */
				4627
				4628	repeat_type += op_type;
				4629
				4630	/* A minimum of zero is handled either as the special case * or ?, or as
				4631	an UPTO, with the maximum given. */
				4632
				4633	if (repeat_min == 0)
				4634	{
				4635	if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
				4636	else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
				4637	else
				4638	{
				4639	*code++ = OP_UPTO + repeat_type;
				4640	PUT2INC(code, 0, repeat_max);
				4641	}
				4642	}
				4643
				4644	/* A repeat minimum of 1 is optimized into some special cases. If the
				4645	maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
				4646	left in place and, if the maximum is greater than 1, we use OP_UPTO with
				4647	one less than the maximum. */
				4648
				4649	else if (repeat_min == 1)
				4650	{
				4651	if (repeat_max == -1)
				4652	*code++ = OP_PLUS + repeat_type;
				4653	else
				4654	{
				4655	code = oldcode; /* leave previous item in place */
				4656	if (repeat_max == 1) goto END_REPEAT;
				4657	*code++ = OP_UPTO + repeat_type;
				4658	PUT2INC(code, 0, repeat_max - 1);
				4659	}
				4660	}
				4661
				4662	/* The case {n,n} is just an EXACT, while the general case {n,m} is
				4663	handled as an EXACT followed by an UPTO. */
				4664
				4665	else
				4666	{
				4667	code++ = OP_EXACT + op_type; / NB EXACT doesn't have repeat_type */
				4668	PUT2INC(code, 0, repeat_min);
				4669
				4670	/* If the maximum is unlimited, insert an OP_STAR. Before doing so,
				4671	we have to insert the character for the previous code. For a repeated
				4672	Unicode property match, there are two extra bytes that define the
				4673	required property. In UTF-8 mode, long characters have their length in
				4674	c, with the 0x80 bit as a flag. */
				4675
				4676	if (repeat_max < 0)
				4677	{
				4678	#ifdef SUPPORT_UTF8
				4679	if (utf8 && c >= 128)
				4680	{
				4681	memcpy(code, utf8_char, c & 7);
				4682	code += c & 7;
				4683	}
				4684	else
				4685	#endif
				4686	{
				4687	*code++ = c;
				4688	if (prop_type >= 0)
				4689	{
				4690	*code++ = prop_type;
				4691	*code++ = prop_value;
				4692	}
				4693	}
				4694	*code++ = OP_STAR + repeat_type;
				4695	}
				4696
				4697	/* Else insert an UPTO if the max is greater than the min, again
				4698	preceded by the character, for the previously inserted code. If the
				4699	UPTO is just for 1 instance, we can use QUERY instead. */
				4700
				4701	else if (repeat_max != repeat_min)
				4702	{
				4703	#ifdef SUPPORT_UTF8
				4704	if (utf8 && c >= 128)
				4705	{
				4706	memcpy(code, utf8_char, c & 7);
				4707	code += c & 7;
				4708	}
				4709	else
				4710	#endif
				4711	*code++ = c;
				4712	if (prop_type >= 0)
				4713	{
				4714	*code++ = prop_type;
				4715	*code++ = prop_value;
				4716	}
				4717	repeat_max -= repeat_min;
				4718
				4719	if (repeat_max == 1)
				4720	{
				4721	*code++ = OP_QUERY + repeat_type;
				4722	}
				4723	else
				4724	{
				4725	*code++ = OP_UPTO + repeat_type;
				4726	PUT2INC(code, 0, repeat_max);
				4727	}
				4728	}
				4729	}
				4730
				4731	/* The character or character type itself comes last in all cases. */
				4732
				4733	#ifdef SUPPORT_UTF8
				4734	if (utf8 && c >= 128)
				4735	{
				4736	memcpy(code, utf8_char, c & 7);
				4737	code += c & 7;
				4738	}
				4739	else
				4740	#endif
				4741	*code++ = c;
				4742
				4743	/* For a repeated Unicode property match, there are two extra bytes that
				4744	define the required property. */
				4745
				4746	#ifdef SUPPORT_UCP
				4747	if (prop_type >= 0)
				4748	{
				4749	*code++ = prop_type;
				4750	*code++ = prop_value;
				4751	}
				4752	#endif
				4753	}
				4754
				4755	/* If previous was a character class or a back reference, we put the repeat
				4756	stuff after it, but just skip the item if the repeat was {0,0}. */
				4757
				4758	else if (*previous == OP_CLASS \|\|
				4759	*previous == OP_NCLASS \|\|
				4760	#ifdef SUPPORT_UTF8
				4761	*previous == OP_XCLASS \|\|
				4762	#endif
				4763	*previous == OP_REF \|\|
				4764	*previous == OP_REFI)
				4765	{
				4766	if (repeat_max == 0)
				4767	{
				4768	code = previous;
				4769	goto END_REPEAT;
				4770	}
				4771
				4772	/*--------------------------------------------------------------------*/
				4773	/* This code is obsolete from release 8.00; the restriction was finally
				4774	removed: */
				4775
				4776	/* All real repeats make it impossible to handle partial matching (maybe
				4777	one day we will be able to remove this restriction). */
				4778
				4779	/* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
				4780	/*--------------------------------------------------------------------*/
				4781
				4782	if (repeat_min == 0 && repeat_max == -1)
				4783	*code++ = OP_CRSTAR + repeat_type;
				4784	else if (repeat_min == 1 && repeat_max == -1)
				4785	*code++ = OP_CRPLUS + repeat_type;
				4786	else if (repeat_min == 0 && repeat_max == 1)
				4787	*code++ = OP_CRQUERY + repeat_type;
				4788	else
				4789	{
				4790	*code++ = OP_CRRANGE + repeat_type;
				4791	PUT2INC(code, 0, repeat_min);
				4792	if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
				4793	PUT2INC(code, 0, repeat_max);
				4794	}
				4795	}
				4796
				4797	/* If previous was a bracket group, we may have to replicate it in certain
				4798	cases. Note that at this point we can encounter only the "basic" bracket
				4799	opcodes such as BRA and CBRA, as this is the place where they get converted
				4800	into the more special varieties such as BRAPOS and SBRA. A test for >=
				4801	OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
				4802	ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
				4803	repetition of assertions, but now it does, for Perl compatibility. */
				4804
				4805	else if (previous >= OP_ASSERT && previous <= OP_COND)
				4806	{
				4807	register int i;
				4808	int len = (int)(code - previous);
				4809	uschar *bralink = NULL;
				4810	uschar *brazeroptr = NULL;
				4811
				4812	/* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
				4813	we just ignore the repeat. */
				4814
				4815	if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
				4816	goto END_REPEAT;
				4817
				4818	/* There is no sense in actually repeating assertions. The only potential
				4819	use of repetition is in cases when the assertion is optional. Therefore,
				4820	if the minimum is greater than zero, just ignore the repeat. If the
				4821	maximum is not zero or one, set it to 1. */
				4822
				4823	if (previous < OP_ONCE) / Assertion */
				4824	{
				4825	if (repeat_min > 0) goto END_REPEAT;
				4826	if (repeat_max < 0 \|\| repeat_max > 1) repeat_max = 1;
				4827	}
				4828
				4829	/* The case of a zero minimum is special because of the need to stick
				4830	OP_BRAZERO in front of it, and because the group appears once in the
				4831	data, whereas in other cases it appears the minimum number of times. For
				4832	this reason, it is simplest to treat this case separately, as otherwise
				4833	the code gets far too messy. There are several special subcases when the
				4834	minimum is zero. */
				4835
				4836	if (repeat_min == 0)
				4837	{
				4838	/* If the maximum is also zero, we used to just omit the group from the
				4839	output altogether, like this:
				4840
				4841	** if (repeat_max == 0)
				4842	** {
				4843	** code = previous;
				4844	** goto END_REPEAT;
				4845	** }
				4846
				4847	However, that fails when a group or a subgroup within it is referenced
				4848	as a subroutine from elsewhere in the pattern, so now we stick in
				4849	OP_SKIPZERO in front of it so that it is skipped on execution. As we
				4850	don't have a list of which groups are referenced, we cannot do this
				4851	selectively.
				4852
				4853	If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
				4854	and do no more at this point. However, we do need to adjust any
				4855	OP_RECURSE calls inside the group that refer to the group itself or any
				4856	internal or forward referenced group, because the offset is from the
				4857	start of the whole regex. Temporarily terminate the pattern while doing
				4858	this. */
				4859
				4860	if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
				4861	{
				4862	*code = OP_END;
				4863	adjust_recurse(previous, 1, utf8, cd, save_hwm);
				4864	memmove(previous+1, previous, len);
				4865	code++;
				4866	if (repeat_max == 0)
				4867	{
				4868	*previous++ = OP_SKIPZERO;
				4869	goto END_REPEAT;
				4870	}
				4871	brazeroptr = previous; /* Save for possessive optimizing */
				4872	*previous++ = OP_BRAZERO + repeat_type;
				4873	}
				4874
				4875	/* If the maximum is greater than 1 and limited, we have to replicate
				4876	in a nested fashion, sticking OP_BRAZERO before each set of brackets.
				4877	The first one has to be handled carefully because it's the original
				4878	copy, which has to be moved up. The remainder can be handled by code
				4879	that is common with the non-zero minimum case below. We have to
				4880	adjust the value of repeat_max, since one less copy is required. Once
				4881	again, we may have to adjust any OP_RECURSE calls inside the group. */
				4882
				4883	else
				4884	{
				4885	int offset;
				4886	*code = OP_END;
				4887	adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
				4888	memmove(previous + 2 + LINK_SIZE, previous, len);
				4889	code += 2 + LINK_SIZE;
				4890	*previous++ = OP_BRAZERO + repeat_type;
				4891	*previous++ = OP_BRA;
				4892
				4893	/* We chain together the bracket offset fields that have to be
				4894	filled in later when the ends of the brackets are reached. */
				4895
				4896	offset = (bralink == NULL)? 0 : (int)(previous - bralink);
				4897	bralink = previous;
				4898	PUTINC(previous, 0, offset);
				4899	}
				4900
				4901	repeat_max--;
				4902	}
				4903
				4904	/* If the minimum is greater than zero, replicate the group as many
				4905	times as necessary, and adjust the maximum to the number of subsequent
				4906	copies that we need. If we set a first char from the group, and didn't
				4907	set a required char, copy the latter from the former. If there are any
				4908	forward reference subroutine calls in the group, there will be entries on
				4909	the workspace list; replicate these with an appropriate increment. */
				4910
				4911	else
				4912	{
				4913	if (repeat_min > 1)
				4914	{
				4915	/* In the pre-compile phase, we don't actually do the replication. We
				4916	just adjust the length as if we had. Do some paranoid checks for
				4917	potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
				4918	integer type when available, otherwise double. */
				4919
				4920	if (lengthptr != NULL)
				4921	{
				4922	int delta = (repeat_min - 1)*length_prevgroup;
				4923	if ((INT64_OR_DOUBLE)(repeat_min - 1)*
				4924	(INT64_OR_DOUBLE)length_prevgroup >
				4925	(INT64_OR_DOUBLE)INT_MAX \|\|
				4926	OFLOW_MAX - *lengthptr < delta)
				4927	{
				4928	*errorcodeptr = ERR20;
				4929	goto FAILED;
				4930	}
				4931	*lengthptr += delta;
				4932	}
				4933
				4934	/* This is compiling for real. If there is a set first byte for
				4935	the group, and we have not yet set a "required byte", set it. Make
				4936	sure there is enough workspace for copying forward references before
				4937	doing the copy. */
				4938
				4939	else
				4940	{
				4941	if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
				4942
				4943	for (i = 1; i < repeat_min; i++)
				4944	{
				4945	uschar *hc;
				4946	uschar *this_hwm = cd->hwm;
				4947	memcpy(code, previous, len);
				4948
				4949	while (cd->hwm > cd->start_workspace + cd->workspace_size -
				4950	WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
				4951	{
				4952	int save_offset = save_hwm - cd->start_workspace;
				4953	int this_offset = this_hwm - cd->start_workspace;
				4954	*errorcodeptr = expand_workspace(cd);
				4955	if (*errorcodeptr != 0) goto FAILED;
				4956	save_hwm = (uschar *)cd->start_workspace + save_offset;
				4957	this_hwm = (uschar *)cd->start_workspace + this_offset;
				4958	}
				4959
				4960	for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
				4961	{
				4962	PUT(cd->hwm, 0, GET(hc, 0) + len);
				4963	cd->hwm += LINK_SIZE;
				4964	}
				4965	save_hwm = this_hwm;
				4966	code += len;
				4967	}
				4968	}
				4969	}
				4970
				4971	if (repeat_max > 0) repeat_max -= repeat_min;
				4972	}
				4973
				4974	/* This code is common to both the zero and non-zero minimum cases. If
				4975	the maximum is limited, it replicates the group in a nested fashion,
				4976	remembering the bracket starts on a stack. In the case of a zero minimum,
				4977	the first one was set up above. In all cases the repeat_max now specifies
				4978	the number of additional copies needed. Again, we must remember to
				4979	replicate entries on the forward reference list. */
				4980
				4981	if (repeat_max >= 0)
				4982	{
				4983	/* In the pre-compile phase, we don't actually do the replication. We
				4984	just adjust the length as if we had. For each repetition we must add 1
				4985	to the length for BRAZERO and for all but the last repetition we must
				4986	add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
				4987	paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
				4988	a 64-bit integer type when available, otherwise double. */
				4989
				4990	if (lengthptr != NULL && repeat_max > 0)
				4991	{
				4992	int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
				4993	2 - 2LINK_SIZE; / Last one doesn't nest */
				4994	if ((INT64_OR_DOUBLE)repeat_max *
				4995	(INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
				4996	> (INT64_OR_DOUBLE)INT_MAX \|\|
				4997	OFLOW_MAX - *lengthptr < delta)
				4998	{
				4999	*errorcodeptr = ERR20;
				5000	goto FAILED;
				5001	}
				5002	*lengthptr += delta;
				5003	}
				5004
				5005	/* This is compiling for real */
				5006
				5007	else for (i = repeat_max - 1; i >= 0; i--)
				5008	{
				5009	uschar *hc;
				5010	uschar *this_hwm = cd->hwm;
				5011
				5012	*code++ = OP_BRAZERO + repeat_type;
				5013
				5014	/* All but the final copy start a new nesting, maintaining the
				5015	chain of brackets outstanding. */
				5016
				5017	if (i != 0)
				5018	{
				5019	int offset;
				5020	*code++ = OP_BRA;
				5021	offset = (bralink == NULL)? 0 : (int)(code - bralink);
				5022	bralink = code;
				5023	PUTINC(code, 0, offset);
				5024	}
				5025
				5026	memcpy(code, previous, len);
				5027
				5028	/* Ensure there is enough workspace for forward references before
				5029	copying them. */
				5030
				5031	while (cd->hwm > cd->start_workspace + cd->workspace_size -
				5032	WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
				5033	{
				5034	int save_offset = save_hwm - cd->start_workspace;
				5035	int this_offset = this_hwm - cd->start_workspace;
				5036	*errorcodeptr = expand_workspace(cd);
				5037	if (*errorcodeptr != 0) goto FAILED;
				5038	save_hwm = (uschar *)cd->start_workspace + save_offset;
				5039	this_hwm = (uschar *)cd->start_workspace + this_offset;
				5040	}
				5041
				5042	for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
				5043	{
				5044	PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
				5045	cd->hwm += LINK_SIZE;
				5046	}
				5047	save_hwm = this_hwm;
				5048	code += len;
				5049	}
				5050
				5051	/* Now chain through the pending brackets, and fill in their length
				5052	fields (which are holding the chain links pro tem). */
				5053
				5054	while (bralink != NULL)
				5055	{
				5056	int oldlinkoffset;
				5057	int offset = (int)(code - bralink + 1);
				5058	uschar *bra = code - offset;
				5059	oldlinkoffset = GET(bra, 1);
				5060	bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
				5061	*code++ = OP_KET;
				5062	PUTINC(code, 0, offset);
				5063	PUT(bra, 1, offset);
				5064	}
				5065	}
				5066
				5067	/* If the maximum is unlimited, set a repeater in the final copy. For
				5068	ONCE brackets, that's all we need to do. However, possessively repeated
				5069	ONCE brackets can be converted into non-capturing brackets, as the
				5070	behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
				5071	deal with possessive ONCEs specially.
				5072
				5073	Otherwise, when we are doing the actual compile phase, check to see
				5074	whether this group is one that could match an empty string. If so,
				5075	convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
				5076	that runtime checking can be done. [This check is also applied to ONCE
				5077	groups at runtime, but in a different way.]
				5078
				5079	Then, if the quantifier was possessive and the bracket is not a
				5080	conditional, we convert the BRA code to the POS form, and the KET code to
				5081	KETRPOS. (It turns out to be convenient at runtime to detect this kind of
				5082	subpattern at both the start and at the end.) The use of special opcodes
				5083	makes it possible to reduce greatly the stack usage in pcre_exec(). If
				5084	the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
				5085
				5086	Then, if the minimum number of matches is 1 or 0, cancel the possessive
				5087	flag so that the default action below, of wrapping everything inside
				5088	atomic brackets, does not happen. When the minimum is greater than 1,
				5089	there will be earlier copies of the group, and so we still have to wrap
				5090	the whole thing. */
				5091
				5092	else
				5093	{
				5094	uschar *ketcode = code - 1 - LINK_SIZE;
				5095	uschar *bracode = ketcode - GET(ketcode, 1);
				5096
				5097	/* Convert possessive ONCE brackets to non-capturing */
				5098
				5099	if ((bracode == OP_ONCE \|\| bracode == OP_ONCE_NC) &&
				5100	possessive_quantifier) *bracode = OP_BRA;
				5101
				5102	/* For non-possessive ONCE brackets, all we need to do is to
				5103	set the KET. */
				5104
				5105	if (bracode == OP_ONCE \|\| bracode == OP_ONCE_NC)
				5106	*ketcode = OP_KETRMAX + repeat_type;
				5107
				5108	/* Handle non-ONCE brackets and possessive ONCEs (which have been
				5109	converted to non-capturing above). */
				5110
				5111	else
				5112	{
				5113	/* In the compile phase, check for empty string matching. */
				5114
				5115	if (lengthptr == NULL)
				5116	{
				5117	uschar *scode = bracode;
				5118	do
				5119	{
				5120	if (could_be_empty_branch(scode, ketcode, utf8, cd))
				5121	{
				5122	*bracode += OP_SBRA - OP_BRA;
				5123	break;
				5124	}
				5125	scode += GET(scode, 1);
				5126	}
				5127	while (*scode == OP_ALT);
				5128	}
				5129
				5130	/* Handle possessive quantifiers. */
				5131
				5132	if (possessive_quantifier)
				5133	{
				5134	/* For COND brackets, we wrap the whole thing in a possessively
				5135	repeated non-capturing bracket, because we have not invented POS
				5136	versions of the COND opcodes. Because we are moving code along, we
				5137	must ensure that any pending recursive references are updated. */
				5138
				5139	if (bracode == OP_COND \|\| bracode == OP_SCOND)
				5140	{
				5141	int nlen = (int)(code - bracode);
				5142	*code = OP_END;
				5143	adjust_recurse(bracode, 1 + LINK_SIZE, utf8, cd, save_hwm);
				5144	memmove(bracode + 1+LINK_SIZE, bracode, nlen);
				5145	code += 1 + LINK_SIZE;
				5146	nlen += 1 + LINK_SIZE;
				5147	*bracode = OP_BRAPOS;
				5148	*code++ = OP_KETRPOS;
				5149	PUTINC(code, 0, nlen);
				5150	PUT(bracode, 1, nlen);
				5151	}
				5152
				5153	/* For non-COND brackets, we modify the BRA code and use KETRPOS. */
				5154
				5155	else
				5156	{
				5157	bracode += 1; / Switch to xxxPOS opcodes */
				5158	*ketcode = OP_KETRPOS;
				5159	}
				5160
				5161	/* If the minimum is zero, mark it as possessive, then unset the
				5162	possessive flag when the minimum is 0 or 1. */
				5163
				5164	if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
				5165	if (repeat_min < 2) possessive_quantifier = FALSE;
				5166	}
				5167
				5168	/* Non-possessive quantifier */
				5169
				5170	else *ketcode = OP_KETRMAX + repeat_type;
				5171	}
				5172	}
				5173	}
				5174
				5175	/* If previous is OP_FAIL, it was generated by an empty class [] in
				5176	JavaScript mode. The other ways in which OP_FAIL can be generated, that is
				5177	by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
				5178	error above. We can just ignore the repeat in JS case. */
				5179
				5180	else if (*previous == OP_FAIL) goto END_REPEAT;
				5181
				5182	/* Else there's some kind of shambles */
				5183
				5184	else
				5185	{
				5186	*errorcodeptr = ERR11;
				5187	goto FAILED;
				5188	}
				5189
				5190	/* If the character following a repeat is '+', or if certain optimization
				5191	tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
				5192	there are special alternative opcodes for this case. For anything else, we
				5193	wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
				5194	notation is just syntactic sugar, taken from Sun's Java package, but the
				5195	special opcodes can optimize it.
				5196
				5197	Some (but not all) possessively repeated subpatterns have already been
				5198	completely handled in the code just above. For them, possessive_quantifier
				5199	is always FALSE at this stage.
				5200
				5201	Note that the repeated item starts at tempcode, not at previous, which
				5202	might be the first part of a string whose (former) last char we repeated.
				5203
				5204	Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
				5205	an 'upto' may follow. We skip over an 'exact' item, and then test the
				5206	length of what remains before proceeding. */
				5207
				5208	if (possessive_quantifier)
				5209	{
				5210	int len;
				5211
				5212	if (*tempcode == OP_TYPEEXACT)
				5213	tempcode += _pcre_OP_lengths[*tempcode] +
				5214	((tempcode[3] == OP_PROP \|\| tempcode[3] == OP_NOTPROP)? 2 : 0);
				5215
				5216	else if (tempcode == OP_EXACT \|\| tempcode == OP_NOTEXACT)
				5217	{
				5218	tempcode += _pcre_OP_lengths[*tempcode];
				5219	#ifdef SUPPORT_UTF8
				5220	if (utf8 && tempcode[-1] >= 0xc0)
				5221	tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
				5222	#endif
				5223	}
				5224
				5225	len = (int)(code - tempcode);
				5226	if (len > 0) switch (*tempcode)
				5227	{
				5228	case OP_STAR: *tempcode = OP_POSSTAR; break;
				5229	case OP_PLUS: *tempcode = OP_POSPLUS; break;
				5230	case OP_QUERY: *tempcode = OP_POSQUERY; break;
				5231	case OP_UPTO: *tempcode = OP_POSUPTO; break;
				5232
				5233	case OP_STARI: *tempcode = OP_POSSTARI; break;
				5234	case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
				5235	case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
				5236	case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
				5237
				5238	case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
				5239	case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
				5240	case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
				5241	case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
				5242
				5243	case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
				5244	case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
				5245	case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
				5246	case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
				5247
				5248	case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
				5249	case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
				5250	case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
				5251	case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
				5252
				5253	/* Because we are moving code along, we must ensure that any
				5254	pending recursive references are updated. */
				5255
				5256	default:
				5257	*code = OP_END;
				5258	adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
				5259	memmove(tempcode + 1+LINK_SIZE, tempcode, len);
				5260	code += 1 + LINK_SIZE;
				5261	len += 1 + LINK_SIZE;
				5262	tempcode[0] = OP_ONCE;
				5263	*code++ = OP_KET;
				5264	PUTINC(code, 0, len);
				5265	PUT(tempcode, 1, len);
				5266	break;
				5267	}
				5268	}
				5269
				5270	/* In all cases we no longer have a previous item. We also set the
				5271	"follows varying string" flag for subsequently encountered reqbytes if
				5272	it isn't already set and we have just passed a varying length item. */
				5273
				5274	END_REPEAT:
				5275	previous = NULL;
				5276	cd->req_varyopt \|= reqvary;
				5277	break;
				5278
				5279
				5280	/* ===================================================================*/
				5281	/* Start of nested parenthesized sub-expression, or comment or lookahead or
				5282	lookbehind or option setting or condition or all the other extended
				5283	parenthesis forms. */
				5284
				5285	case CHAR_LEFT_PARENTHESIS:
				5286	newoptions = options;
				5287	skipbytes = 0;
				5288	bravalue = OP_CBRA;
				5289	save_hwm = cd->hwm;
				5290	reset_bracount = FALSE;
				5291
				5292	/* First deal with various "verbs" that can be introduced by '*'. */
				5293
				5294	if (*(++ptr) == CHAR_ASTERISK &&
				5295	((cd->ctypes[ptr[1]] & ctype_letter) != 0 \|\| ptr[1] == ':'))
				5296	{
				5297	int i, namelen;
				5298	int arglen = 0;
				5299	const char *vn = verbnames;
				5300	const uschar *name = ptr + 1;
				5301	const uschar *arg = NULL;
				5302	previous = NULL;
				5303	while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
				5304	namelen = (int)(ptr - name);
				5305
				5306	/* It appears that Perl allows any characters whatsoever, other than
				5307	a closing parenthesis, to appear in arguments, so we no longer insist on
				5308	letters, digits, and underscores. */
				5309
				5310	if (*ptr == CHAR_COLON)
				5311	{
				5312	arg = ++ptr;
				5313	while (ptr != 0 && ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
				5314	arglen = (int)(ptr - arg);
				5315	}
				5316
				5317	if (*ptr != CHAR_RIGHT_PARENTHESIS)
				5318	{
				5319	*errorcodeptr = ERR60;
				5320	goto FAILED;
				5321	}
				5322
				5323	/* Scan the table of verb names */
				5324
				5325	for (i = 0; i < verbcount; i++)
				5326	{
				5327	if (namelen == verbs[i].len &&
				5328	strncmp((char *)name, vn, namelen) == 0)
				5329	{
				5330	/* Check for open captures before ACCEPT and convert it to
				5331	ASSERT_ACCEPT if in an assertion. */
				5332
				5333	if (verbs[i].op == OP_ACCEPT)
				5334	{
				5335	open_capitem *oc;
				5336	if (arglen != 0)
				5337	{
				5338	*errorcodeptr = ERR59;
				5339	goto FAILED;
				5340	}
				5341	cd->had_accept = TRUE;
				5342	for (oc = cd->open_caps; oc != NULL; oc = oc->next)
				5343	{
				5344	*code++ = OP_CLOSE;
				5345	PUT2INC(code, 0, oc->number);
				5346	}
				5347	*code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
				5348
				5349	/* Do not set firstbyte after ACCEPT */
				5350	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
				5351	}
				5352
				5353	/* Handle other cases with/without an argument */
				5354
				5355	else if (arglen == 0)
				5356	{
				5357	if (verbs[i].op < 0) /* Argument is mandatory */
				5358	{
				5359	*errorcodeptr = ERR66;
				5360	goto FAILED;
				5361	}
				5362	*code = verbs[i].op;
				5363	if (*code++ == OP_THEN) cd->external_flags \|= PCRE_HASTHEN;
				5364	}
				5365
				5366	else
				5367	{
				5368	if (verbs[i].op_arg < 0) /* Argument is forbidden */
				5369	{
				5370	*errorcodeptr = ERR59;
				5371	goto FAILED;
				5372	}
				5373	*code = verbs[i].op_arg;
				5374	if (*code++ == OP_THEN_ARG) cd->external_flags \|= PCRE_HASTHEN;
				5375	*code++ = arglen;
				5376	memcpy(code, arg, arglen);
				5377	code += arglen;
				5378	*code++ = 0;
				5379	}
				5380
				5381	break; /* Found verb, exit loop */
				5382	}
				5383
				5384	vn += verbs[i].len + 1;
				5385	}
				5386
				5387	if (i < verbcount) continue; /* Successfully handled a verb */
				5388	errorcodeptr = ERR60; / Verb not recognized */
				5389	goto FAILED;
				5390	}
				5391
				5392	/* Deal with the extended parentheses; all are introduced by '?', and the
				5393	appearance of any of them means that this is not a capturing group. */
				5394
				5395	else if (*ptr == CHAR_QUESTION_MARK)
				5396	{
				5397	int i, set, unset, namelen;
				5398	int *optset;
				5399	const uschar *name;
				5400	uschar *slot;
				5401
				5402	switch (*(++ptr))
				5403	{
				5404	case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
				5405	ptr++;
				5406	while (ptr != 0 && ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
				5407	if (*ptr == 0)
				5408	{
				5409	*errorcodeptr = ERR18;
				5410	goto FAILED;
				5411	}
				5412	continue;
				5413
				5414
				5415	/* ------------------------------------------------------------ */
				5416	case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
				5417	reset_bracount = TRUE;
				5418	/* Fall through */
				5419
				5420	/* ------------------------------------------------------------ */
				5421	case CHAR_COLON: /* Non-capturing bracket */
				5422	bravalue = OP_BRA;
				5423	ptr++;
				5424	break;
				5425
				5426
				5427	/* ------------------------------------------------------------ */
				5428	case CHAR_LEFT_PARENTHESIS:
				5429	bravalue = OP_COND; /* Conditional group */
				5430
				5431	/* A condition can be an assertion, a number (referring to a numbered
				5432	group), a name (referring to a named group), or 'R', referring to
				5433	recursion. R<digits> and R&name are also permitted for recursion tests.
				5434
				5435	There are several syntaxes for testing a named group: (?(name)) is used
				5436	by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
				5437
				5438	There are two unfortunate ambiguities, caused by history. (a) 'R' can
				5439	be the recursive thing or the name 'R' (and similarly for 'R' followed
				5440	by digits), and (b) a number could be a name that consists of digits.
				5441	In both cases, we look for a name first; if not found, we try the other
				5442	cases. */
				5443
				5444	/* For conditions that are assertions, check the syntax, and then exit
				5445	the switch. This will take control down to where bracketed groups,
				5446	including assertions, are processed. */
				5447
				5448	if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN \|\|
				5449	ptr[2] == CHAR_EXCLAMATION_MARK \|\| ptr[2] == CHAR_LESS_THAN_SIGN))
				5450	break;
				5451
				5452	/* Most other conditions use OP_CREF (a couple change to OP_RREF
				5453	below), and all need to skip 3 bytes at the start of the group. */
				5454
				5455	code[1+LINK_SIZE] = OP_CREF;
				5456	skipbytes = 3;
				5457	refsign = -1;
				5458
				5459	/* Check for a test for recursion in a named group. */
				5460
				5461	if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
				5462	{
				5463	terminator = -1;
				5464	ptr += 2;
				5465	code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
				5466	}
				5467
				5468	/* Check for a test for a named group's having been set, using the Perl
				5469	syntax (?(<name>) or (?('name') */
				5470
				5471	else if (ptr[1] == CHAR_LESS_THAN_SIGN)
				5472	{
				5473	terminator = CHAR_GREATER_THAN_SIGN;
				5474	ptr++;
				5475	}
				5476	else if (ptr[1] == CHAR_APOSTROPHE)
				5477	{
				5478	terminator = CHAR_APOSTROPHE;
				5479	ptr++;
				5480	}
				5481	else
				5482	{
				5483	terminator = 0;
				5484	if (ptr[1] == CHAR_MINUS \|\| ptr[1] == CHAR_PLUS) refsign = *(++ptr);
				5485	}
				5486
				5487	/* We now expect to read a name; anything else is an error */
				5488
				5489	if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
				5490	{
				5491	ptr += 1; /* To get the right offset */
				5492	*errorcodeptr = ERR28;
				5493	goto FAILED;
				5494	}
				5495
				5496	/* Read the name, but also get it as a number if it's all digits */
				5497
				5498	recno = 0;
				5499	name = ++ptr;
				5500	while ((cd->ctypes[*ptr] & ctype_word) != 0)
				5501	{
				5502	if (recno >= 0)
				5503	recno = ((digitab[*ptr] & ctype_digit) != 0)?
				5504	recno * 10 + *ptr - CHAR_0 : -1;
				5505	ptr++;
				5506	}
				5507	namelen = (int)(ptr - name);
				5508
				5509	if ((terminator > 0 && *ptr++ != terminator) \|\|
				5510	*ptr++ != CHAR_RIGHT_PARENTHESIS)
				5511	{
				5512	ptr--; /* Error offset */
				5513	*errorcodeptr = ERR26;
				5514	goto FAILED;
				5515	}
				5516
				5517	/* Do no further checking in the pre-compile phase. */
				5518
				5519	if (lengthptr != NULL) break;
				5520
				5521	/* In the real compile we do the work of looking for the actual
				5522	reference. If the string started with "+" or "-" we require the rest to
				5523	be digits, in which case recno will be set. */
				5524
				5525	if (refsign > 0)
				5526	{
				5527	if (recno <= 0)
				5528	{
				5529	*errorcodeptr = ERR58;
				5530	goto FAILED;
				5531	}
				5532	recno = (refsign == CHAR_MINUS)?
				5533	cd->bracount - recno + 1 : recno +cd->bracount;
				5534	if (recno <= 0 \|\| recno > cd->final_bracount)
				5535	{
				5536	*errorcodeptr = ERR15;
				5537	goto FAILED;
				5538	}
				5539	PUT2(code, 2+LINK_SIZE, recno);
				5540	break;
				5541	}
				5542
				5543	/* Otherwise (did not start with "+" or "-"), start by looking for the
				5544	name. If we find a name, add one to the opcode to change OP_CREF or
				5545	OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
				5546	except they record that the reference was originally to a name. The
				5547	information is used to check duplicate names. */
				5548
				5549	slot = cd->name_table;
				5550	for (i = 0; i < cd->names_found; i++)
				5551	{
				5552	if (strncmp((char )name, (char )slot+2, namelen) == 0) break;
				5553	slot += cd->name_entry_size;
				5554	}
				5555
				5556	/* Found a previous named subpattern */
				5557
				5558	if (i < cd->names_found)
				5559	{
				5560	recno = GET2(slot, 0);
				5561	PUT2(code, 2+LINK_SIZE, recno);
				5562	code[1+LINK_SIZE]++;
				5563	}
				5564
				5565	/* Search the pattern for a forward reference */
				5566
				5567	else if ((i = find_parens(cd, name, namelen,
				5568	(options & PCRE_EXTENDED) != 0, utf8)) > 0)
				5569	{
				5570	PUT2(code, 2+LINK_SIZE, i);
				5571	code[1+LINK_SIZE]++;
				5572	}
				5573
				5574	/* If terminator == 0 it means that the name followed directly after
				5575	the opening parenthesis [e.g. (?(abc)...] and in this case there are
				5576	some further alternatives to try. For the cases where terminator != 0
				5577	[things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
				5578	now checked all the possibilities, so give an error. */
				5579
				5580	else if (terminator != 0)
				5581	{
				5582	*errorcodeptr = ERR15;
				5583	goto FAILED;
				5584	}
				5585
				5586	/* Check for (?(R) for recursion. Allow digits after R to specify a
				5587	specific group number. */
				5588
				5589	else if (*name == CHAR_R)
				5590	{
				5591	recno = 0;
				5592	for (i = 1; i < namelen; i++)
				5593	{
				5594	if ((digitab[name[i]] & ctype_digit) == 0)
				5595	{
				5596	*errorcodeptr = ERR15;
				5597	goto FAILED;
				5598	}
				5599	recno = recno * 10 + name[i] - CHAR_0;
				5600	}
				5601	if (recno == 0) recno = RREF_ANY;
				5602	code[1+LINK_SIZE] = OP_RREF; /* Change test type */
				5603	PUT2(code, 2+LINK_SIZE, recno);
				5604	}
				5605
				5606	/* Similarly, check for the (?(DEFINE) "condition", which is always
				5607	false. */
				5608
				5609	else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
				5610	{
				5611	code[1+LINK_SIZE] = OP_DEF;
				5612	skipbytes = 1;
				5613	}
				5614
				5615	/* Check for the "name" actually being a subpattern number. We are
				5616	in the second pass here, so final_bracount is set. */
				5617
				5618	else if (recno > 0 && recno <= cd->final_bracount)
				5619	{
				5620	PUT2(code, 2+LINK_SIZE, recno);
				5621	}
				5622
				5623	/* Either an unidentified subpattern, or a reference to (?(0) */
				5624
				5625	else
				5626	{
				5627	*errorcodeptr = (recno == 0)? ERR35: ERR15;
				5628	goto FAILED;
				5629	}
				5630	break;
				5631
				5632
				5633	/* ------------------------------------------------------------ */
				5634	case CHAR_EQUALS_SIGN: /* Positive lookahead */
				5635	bravalue = OP_ASSERT;
				5636	cd->assert_depth += 1;
				5637	ptr++;
				5638	break;
				5639
				5640
				5641	/* ------------------------------------------------------------ */
				5642	case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
				5643	ptr++;
				5644	if (ptr == CHAR_RIGHT_PARENTHESIS) / Optimize (?!) */
				5645	{
				5646	*code++ = OP_FAIL;
				5647	previous = NULL;
				5648	continue;
				5649	}
				5650	bravalue = OP_ASSERT_NOT;
				5651	cd->assert_depth += 1;
				5652	break;
				5653
				5654
				5655	/* ------------------------------------------------------------ */
				5656	case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
				5657	switch (ptr[1])
				5658	{
				5659	case CHAR_EQUALS_SIGN: /* Positive lookbehind */
				5660	bravalue = OP_ASSERTBACK;
				5661	cd->assert_depth += 1;
				5662	ptr += 2;
				5663	break;
				5664
				5665	case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
				5666	bravalue = OP_ASSERTBACK_NOT;
				5667	cd->assert_depth += 1;
				5668	ptr += 2;
				5669	break;
				5670
				5671	default: /* Could be name define, else bad */
				5672	if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
				5673	ptr++; /* Correct offset for error */
				5674	*errorcodeptr = ERR24;
				5675	goto FAILED;
				5676	}
				5677	break;
				5678
				5679
				5680	/* ------------------------------------------------------------ */
				5681	case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
				5682	bravalue = OP_ONCE;
				5683	ptr++;
				5684	break;
				5685
				5686
				5687	/* ------------------------------------------------------------ */
				5688	case CHAR_C: /* Callout - may be followed by digits; */
				5689	previous_callout = code; /* Save for later completion */
				5690	after_manual_callout = 1; /* Skip one item before completing */
				5691	*code++ = OP_CALLOUT;
				5692	{
				5693	int n = 0;
				5694	while ((digitab[*(++ptr)] & ctype_digit) != 0)
				5695	n = n * 10 + *ptr - CHAR_0;
				5696	if (*ptr != CHAR_RIGHT_PARENTHESIS)
				5697	{
				5698	*errorcodeptr = ERR39;
				5699	goto FAILED;
				5700	}
				5701	if (n > 255)
				5702	{
				5703	*errorcodeptr = ERR38;
				5704	goto FAILED;
				5705	}
				5706	*code++ = n;
				5707	PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
				5708	PUT(code, LINK_SIZE, 0); /* Default length */
				5709	code += 2 * LINK_SIZE;
				5710	}
				5711	previous = NULL;
				5712	continue;
				5713
				5714
				5715	/* ------------------------------------------------------------ */
				5716	case CHAR_P: /* Python-style named subpattern handling */
				5717	if (*(++ptr) == CHAR_EQUALS_SIGN \|\|
				5718	ptr == CHAR_GREATER_THAN_SIGN) / Reference or recursion */
				5719	{
				5720	is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
				5721	terminator = CHAR_RIGHT_PARENTHESIS;
				5722	goto NAMED_REF_OR_RECURSE;
				5723	}
				5724	else if (ptr != CHAR_LESS_THAN_SIGN) / Test for Python-style defn */
				5725	{
				5726	*errorcodeptr = ERR41;
				5727	goto FAILED;
				5728	}
				5729	/* Fall through to handle (?P< as (?< is handled */
				5730
				5731
				5732	/* ------------------------------------------------------------ */
				5733	DEFINE_NAME: /* Come here from (?< handling */
				5734	case CHAR_APOSTROPHE:
				5735	{
				5736	terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
				5737	CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
				5738	name = ++ptr;
				5739
				5740	while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
				5741	namelen = (int)(ptr - name);
				5742
				5743	/* In the pre-compile phase, just do a syntax check. */
				5744
				5745	if (lengthptr != NULL)
				5746	{
				5747	if (*ptr != terminator)
				5748	{
				5749	*errorcodeptr = ERR42;
				5750	goto FAILED;
				5751	}
				5752	if (cd->names_found >= MAX_NAME_COUNT)
				5753	{
				5754	*errorcodeptr = ERR49;
				5755	goto FAILED;
				5756	}
				5757	if (namelen + 3 > cd->name_entry_size)
				5758	{
				5759	cd->name_entry_size = namelen + 3;
				5760	if (namelen > MAX_NAME_SIZE)
				5761	{
				5762	*errorcodeptr = ERR48;
				5763	goto FAILED;
				5764	}
				5765	}
				5766	}
				5767
				5768	/* In the real compile, create the entry in the table, maintaining
				5769	alphabetical order. Duplicate names for different numbers are
				5770	permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
				5771	number are always OK. (An existing number can be re-used if (?\|
				5772	appears in the pattern.) In either event, a duplicate name results in
				5773	a duplicate entry in the table, even if the number is the same. This
				5774	is because the number of names, and hence the table size, is computed
				5775	in the pre-compile, and it affects various numbers and pointers which
				5776	would all have to be modified, and the compiled code moved down, if
				5777	duplicates with the same number were omitted from the table. This
				5778	doesn't seem worth the hassle. However, different names for the
				5779	same number are not permitted. */
				5780
				5781	else
				5782	{
				5783	BOOL dupname = FALSE;
				5784	slot = cd->name_table;
				5785
				5786	for (i = 0; i < cd->names_found; i++)
				5787	{
				5788	int crc = memcmp(name, slot+2, namelen);
				5789	if (crc == 0)
				5790	{
				5791	if (slot[2+namelen] == 0)
				5792	{
				5793	if (GET2(slot, 0) != cd->bracount + 1 &&
				5794	(options & PCRE_DUPNAMES) == 0)
				5795	{
				5796	*errorcodeptr = ERR43;
				5797	goto FAILED;
				5798	}
				5799	else dupname = TRUE;
				5800	}
				5801	else crc = -1; /* Current name is a substring */
				5802	}
				5803
				5804	/* Make space in the table and break the loop for an earlier
				5805	name. For a duplicate or later name, carry on. We do this for
				5806	duplicates so that in the simple case (when (?\| is not used) they
				5807	are in order of their numbers. */
				5808
				5809	if (crc < 0)
				5810	{
				5811	memmove(slot + cd->name_entry_size, slot,
				5812	(cd->names_found - i) * cd->name_entry_size);
				5813	break;
				5814	}
				5815
				5816	/* Continue the loop for a later or duplicate name */
				5817
				5818	slot += cd->name_entry_size;
				5819	}
				5820
				5821	/* For non-duplicate names, check for a duplicate number before
				5822	adding the new name. */
				5823
				5824	if (!dupname)
				5825	{
				5826	uschar *cslot = cd->name_table;
				5827	for (i = 0; i < cd->names_found; i++)
				5828	{
				5829	if (cslot != slot)
				5830	{
				5831	if (GET2(cslot, 0) == cd->bracount + 1)
				5832	{
				5833	*errorcodeptr = ERR65;
				5834	goto FAILED;
				5835	}
				5836	}
				5837	else i--;
				5838	cslot += cd->name_entry_size;
				5839	}
				5840	}
				5841
				5842	PUT2(slot, 0, cd->bracount + 1);
				5843	memcpy(slot + 2, name, namelen);
				5844	slot[2+namelen] = 0;
				5845	}
				5846	}
				5847
				5848	/* In both pre-compile and compile, count the number of names we've
				5849	encountered. */
				5850
				5851	cd->names_found++;
				5852	ptr++; /* Move past > or ' */
				5853	goto NUMBERED_GROUP;
				5854
				5855
				5856	/* ------------------------------------------------------------ */
				5857	case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
				5858	terminator = CHAR_RIGHT_PARENTHESIS;
				5859	is_recurse = TRUE;
				5860	/* Fall through */
				5861
				5862	/* We come here from the Python syntax above that handles both
				5863	references (?P=name) and recursion (?P>name), as well as falling
				5864	through from the Perl recursion syntax (?&name). We also come here from
				5865	the Perl \k<name> or \k'name' back reference syntax and the \k{name}
				5866	.NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
				5867
				5868	NAMED_REF_OR_RECURSE:
				5869	name = ++ptr;
				5870	while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
				5871	namelen = (int)(ptr - name);
				5872
				5873	/* In the pre-compile phase, do a syntax check. We used to just set
				5874	a dummy reference number, because it was not used in the first pass.
				5875	However, with the change of recursive back references to be atomic,
				5876	we have to look for the number so that this state can be identified, as
				5877	otherwise the incorrect length is computed. If it's not a backwards
				5878	reference, the dummy number will do. */
				5879
				5880	if (lengthptr != NULL)
				5881	{
				5882	const uschar *temp;
				5883
				5884	if (namelen == 0)
				5885	{
				5886	*errorcodeptr = ERR62;
				5887	goto FAILED;
				5888	}
				5889	if (*ptr != terminator)
				5890	{
				5891	*errorcodeptr = ERR42;
				5892	goto FAILED;
				5893	}
				5894	if (namelen > MAX_NAME_SIZE)
				5895	{
				5896	*errorcodeptr = ERR48;
				5897	goto FAILED;
				5898	}
				5899
				5900	/* The name table does not exist in the first pass, so we cannot
				5901	do a simple search as in the code below. Instead, we have to scan the
				5902	pattern to find the number. It is important that we scan it only as
				5903	far as we have got because the syntax of named subpatterns has not
				5904	been checked for the rest of the pattern, and find_parens() assumes
				5905	correct syntax. In any case, it's a waste of resources to scan
				5906	further. We stop the scan at the current point by temporarily
				5907	adjusting the value of cd->end_pattern. */
				5908
				5909	temp = cd->end_pattern;
				5910	cd->end_pattern = ptr;
				5911	recno = find_parens(cd, name, namelen,
				5912	(options & PCRE_EXTENDED) != 0, utf8);
				5913	cd->end_pattern = temp;
				5914	if (recno < 0) recno = 0; /* Forward ref; set dummy number */
				5915	}
				5916
				5917	/* In the real compile, seek the name in the table. We check the name
				5918	first, and then check that we have reached the end of the name in the
				5919	table. That way, if the name is longer than any in the table,
				5920	the comparison will fail without reading beyond the table entry. */
				5921
				5922	else
				5923	{
				5924	slot = cd->name_table;
				5925	for (i = 0; i < cd->names_found; i++)
				5926	{
				5927	if (strncmp((char )name, (char )slot+2, namelen) == 0 &&
				5928	slot[2+namelen] == 0)
				5929	break;
				5930	slot += cd->name_entry_size;
				5931	}
				5932
				5933	if (i < cd->names_found) /* Back reference */
				5934	{
				5935	recno = GET2(slot, 0);
				5936	}
				5937	else if ((recno = /* Forward back reference */
				5938	find_parens(cd, name, namelen,
				5939	(options & PCRE_EXTENDED) != 0, utf8)) <= 0)
				5940	{
				5941	*errorcodeptr = ERR15;
				5942	goto FAILED;
				5943	}
				5944	}
				5945
				5946	/* In both phases, we can now go to the code that handles numerical
				5947	recursion or backreferences. */
				5948
				5949	if (is_recurse) goto HANDLE_RECURSION;
				5950	else goto HANDLE_REFERENCE;
				5951
				5952
				5953	/* ------------------------------------------------------------ */
				5954	case CHAR_R: /* Recursion */
				5955	ptr++; /* Same as (?0) */
				5956	/* Fall through */
				5957
				5958
				5959	/* ------------------------------------------------------------ */
				5960	case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
				5961	case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
				5962	case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
				5963	{
				5964	const uschar *called;
				5965	terminator = CHAR_RIGHT_PARENTHESIS;
				5966
				5967	/* Come here from the \g<...> and \g'...' code (Oniguruma
				5968	compatibility). However, the syntax has been checked to ensure that
				5969	the ... are a (signed) number, so that neither ERR63 nor ERR29 will
				5970	be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
				5971	ever be taken. */
				5972
				5973	HANDLE_NUMERICAL_RECURSION:
				5974
				5975	if ((refsign = *ptr) == CHAR_PLUS)
				5976	{
				5977	ptr++;
				5978	if ((digitab[*ptr] & ctype_digit) == 0)
				5979	{
				5980	*errorcodeptr = ERR63;
				5981	goto FAILED;
				5982	}
				5983	}
				5984	else if (refsign == CHAR_MINUS)
				5985	{
				5986	if ((digitab[ptr[1]] & ctype_digit) == 0)
				5987	goto OTHER_CHAR_AFTER_QUERY;
				5988	ptr++;
				5989	}
				5990
				5991	recno = 0;
				5992	while((digitab[*ptr] & ctype_digit) != 0)
				5993	recno = recno * 10 + *ptr++ - CHAR_0;
				5994
				5995	if (*ptr != terminator)
				5996	{
				5997	*errorcodeptr = ERR29;
				5998	goto FAILED;
				5999	}
				6000
				6001	if (refsign == CHAR_MINUS)
				6002	{
				6003	if (recno == 0)
				6004	{
				6005	*errorcodeptr = ERR58;
				6006	goto FAILED;
				6007	}
				6008	recno = cd->bracount - recno + 1;
				6009	if (recno <= 0)
				6010	{
				6011	*errorcodeptr = ERR15;
				6012	goto FAILED;
				6013	}
				6014	}
				6015	else if (refsign == CHAR_PLUS)
				6016	{
				6017	if (recno == 0)
				6018	{
				6019	*errorcodeptr = ERR58;
				6020	goto FAILED;
				6021	}
				6022	recno += cd->bracount;
				6023	}
				6024
				6025	/* Come here from code above that handles a named recursion */
				6026
				6027	HANDLE_RECURSION:
				6028
				6029	previous = code;
				6030	called = cd->start_code;
				6031
				6032	/* When we are actually compiling, find the bracket that is being
				6033	referenced. Temporarily end the regex in case it doesn't exist before
				6034	this point. If we end up with a forward reference, first check that
				6035	the bracket does occur later so we can give the error (and position)
				6036	now. Then remember this forward reference in the workspace so it can
				6037	be filled in at the end. */
				6038
				6039	if (lengthptr == NULL)
				6040	{
				6041	*code = OP_END;
				6042	if (recno != 0)
				6043	called = _pcre_find_bracket(cd->start_code, utf8, recno);
				6044
				6045	/* Forward reference */
				6046
				6047	if (called == NULL)
				6048	{
				6049	if (find_parens(cd, NULL, recno,
				6050	(options & PCRE_EXTENDED) != 0, utf8) < 0)
				6051	{
				6052	*errorcodeptr = ERR15;
				6053	goto FAILED;
				6054	}
				6055
				6056	/* Fudge the value of "called" so that when it is inserted as an
				6057	offset below, what is actually inserted is the reference number
				6058	of the group. Then remember the forward reference. */
				6059
				6060	called = cd->start_code + recno;
				6061	if (cd->hwm >= cd->start_workspace + cd->workspace_size -
				6062	WORK_SIZE_SAFETY_MARGIN)
				6063	{
				6064	*errorcodeptr = expand_workspace(cd);
				6065	if (*errorcodeptr != 0) goto FAILED;
				6066	}
				6067	PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
				6068	}
				6069
				6070	/* If not a forward reference, and the subpattern is still open,
				6071	this is a recursive call. We check to see if this is a left
				6072	recursion that could loop for ever, and diagnose that case. We
				6073	must not, however, do this check if we are in a conditional
				6074	subpattern because the condition might be testing for recursion in
				6075	a pattern such as /(?(R)a+\|(?R)b)/, which is perfectly valid.
				6076	Forever loops are also detected at runtime, so those that occur in
				6077	conditional subpatterns will be picked up then. */
				6078
				6079	else if (GET(called, 1) == 0 && cond_depth <= 0 &&
				6080	could_be_empty(called, code, bcptr, utf8, cd))
				6081	{
				6082	*errorcodeptr = ERR40;
				6083	goto FAILED;
				6084	}
				6085	}
				6086
				6087	/* Insert the recursion/subroutine item. It does not have a set first
				6088	byte (relevant if it is repeated, because it will then be wrapped
				6089	with ONCE brackets). */
				6090
				6091	*code = OP_RECURSE;
				6092	PUT(code, 1, (int)(called - cd->start_code));
				6093	code += 1 + LINK_SIZE;
				6094	groupsetfirstbyte = FALSE;
				6095	}
				6096
				6097	/* Can't determine a first byte now */
				6098
				6099	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
				6100	continue;
				6101
				6102
				6103	/* ------------------------------------------------------------ */
				6104	default: /* Other characters: check option setting */
				6105	OTHER_CHAR_AFTER_QUERY:
				6106	set = unset = 0;
				6107	optset = &set;
				6108
				6109	while (ptr != CHAR_RIGHT_PARENTHESIS && ptr != CHAR_COLON)
				6110	{
				6111	switch (*ptr++)
				6112	{
				6113	case CHAR_MINUS: optset = &unset; break;
				6114
				6115	case CHAR_J: /* Record that it changed in the external options */
				6116	*optset \|= PCRE_DUPNAMES;
				6117	cd->external_flags \|= PCRE_JCHANGED;
				6118	break;
				6119
				6120	case CHAR_i: *optset \|= PCRE_CASELESS; break;
				6121	case CHAR_m: *optset \|= PCRE_MULTILINE; break;
				6122	case CHAR_s: *optset \|= PCRE_DOTALL; break;
				6123	case CHAR_x: *optset \|= PCRE_EXTENDED; break;
				6124	case CHAR_U: *optset \|= PCRE_UNGREEDY; break;
				6125	case CHAR_X: *optset \|= PCRE_EXTRA; break;
				6126
				6127	default: *errorcodeptr = ERR12;
				6128	ptr--; /* Correct the offset */
				6129	goto FAILED;
				6130	}
				6131	}
				6132
				6133	/* Set up the changed option bits, but don't change anything yet. */
				6134
				6135	newoptions = (options \| set) & (~unset);
				6136
				6137	/* If the options ended with ')' this is not the start of a nested
				6138	group with option changes, so the options change at this level. If this
				6139	item is right at the start of the pattern, the options can be
				6140	abstracted and made external in the pre-compile phase, and ignored in
				6141	the compile phase. This can be helpful when matching -- for instance in
				6142	caseless checking of required bytes.
				6143
				6144	If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
				6145	definitely not at the start of the pattern because something has been
				6146	compiled. In the pre-compile phase, however, the code pointer can have
				6147	that value after the start, because it gets reset as code is discarded
				6148	during the pre-compile. However, this can happen only at top level - if
				6149	we are within parentheses, the starting BRA will still be present. At
				6150	any parenthesis level, the length value can be used to test if anything
				6151	has been compiled at that level. Thus, a test for both these conditions
				6152	is necessary to ensure we correctly detect the start of the pattern in
				6153	both phases.
				6154
				6155	If we are not at the pattern start, reset the greedy defaults and the
				6156	case value for firstbyte and reqbyte. */
				6157
				6158	if (*ptr == CHAR_RIGHT_PARENTHESIS)
				6159	{
				6160	if (code == cd->start_code + 1 + LINK_SIZE &&
				6161	(lengthptr == NULL \|\| lengthptr == 2 + 2LINK_SIZE))
				6162	{
				6163	cd->external_options = newoptions;
				6164	}
				6165	else
				6166	{
				6167	greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
				6168	greedy_non_default = greedy_default ^ 1;
				6169	req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
				6170	}
				6171
				6172	/* Change options at this level, and pass them back for use
				6173	in subsequent branches. */
				6174
				6175	*optionsptr = options = newoptions;
				6176	previous = NULL; /* This item can't be repeated */
				6177	continue; /* It is complete */
				6178	}
				6179
				6180	/* If the options ended with ':' we are heading into a nested group
				6181	with possible change of options. Such groups are non-capturing and are
				6182	not assertions of any kind. All we need to do is skip over the ':';
				6183	the newoptions value is handled below. */
				6184
				6185	bravalue = OP_BRA;
				6186	ptr++;
				6187	} /* End of switch for character following (? */
				6188	} /* End of (? handling */
				6189
				6190	/* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
				6191	is set, all unadorned brackets become non-capturing and behave like (?:...)
				6192	brackets. */
				6193
				6194	else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
				6195	{
				6196	bravalue = OP_BRA;
				6197	}
				6198
				6199	/* Else we have a capturing group. */
				6200
				6201	else
				6202	{
				6203	NUMBERED_GROUP:
				6204	cd->bracount += 1;
				6205	PUT2(code, 1+LINK_SIZE, cd->bracount);
				6206	skipbytes = 2;
				6207	}
				6208
				6209	/* Process nested bracketed regex. Assertions used not to be repeatable,
				6210	but this was changed for Perl compatibility, so all kinds can now be
				6211	repeated. We copy code into a non-register variable (tempcode) in order to
				6212	be able to pass its address because some compilers complain otherwise. */
				6213
				6214	previous = code; /* For handling repetition */
				6215	*code = bravalue;
				6216	tempcode = code;
				6217	tempreqvary = cd->req_varyopt; /* Save value before bracket */
				6218	tempbracount = cd->bracount; /* Save value before bracket */
				6219	length_prevgroup = 0; /* Initialize for pre-compile phase */
				6220
				6221	if (!compile_regex(
				6222	newoptions, /* The complete new option state */
				6223	&tempcode, /* Where to put code (updated) */
				6224	&ptr, /* Input pointer (updated) */
				6225	errorcodeptr, /* Where to put an error message */
				6226	(bravalue == OP_ASSERTBACK \|\|
				6227	bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
				6228	reset_bracount, /* True if (?\| group */
				6229	skipbytes, /* Skip over bracket number */
				6230	cond_depth +
				6231	((bravalue == OP_COND)?1:0), /* Depth of condition subpatterns */
				6232	&subfirstbyte, /* For possible first char */
				6233	&subreqbyte, /* For possible last char */
				6234	bcptr, /* Current branch chain */
				6235	cd, /* Tables block */
				6236	(lengthptr == NULL)? NULL : /* Actual compile phase */
				6237	&length_prevgroup /* Pre-compile phase */
				6238	))
				6239	goto FAILED;
				6240
				6241	/* If this was an atomic group and there are no capturing groups within it,
				6242	generate OP_ONCE_NC instead of OP_ONCE. */
				6243
				6244	if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
				6245	*code = OP_ONCE_NC;
				6246
				6247	if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
				6248	cd->assert_depth -= 1;
				6249
				6250	/* At the end of compiling, code is still pointing to the start of the
				6251	group, while tempcode has been updated to point past the end of the group.
				6252	The pattern pointer (ptr) is on the bracket.
				6253
				6254	If this is a conditional bracket, check that there are no more than
				6255	two branches in the group, or just one if it's a DEFINE group. We do this
				6256	in the real compile phase, not in the pre-pass, where the whole group may
				6257	not be available. */
				6258
				6259	if (bravalue == OP_COND && lengthptr == NULL)
				6260	{
				6261	uschar *tc = code;
				6262	int condcount = 0;
				6263
				6264	do {
				6265	condcount++;
				6266	tc += GET(tc,1);
				6267	}
				6268	while (*tc != OP_KET);
				6269
				6270	/* A DEFINE group is never obeyed inline (the "condition" is always
				6271	false). It must have only one branch. */
				6272
				6273	if (code[LINK_SIZE+1] == OP_DEF)
				6274	{
				6275	if (condcount > 1)
				6276	{
				6277	*errorcodeptr = ERR54;
				6278	goto FAILED;
				6279	}
				6280	bravalue = OP_DEF; /* Just a flag to suppress char handling below */
				6281	}
				6282
				6283	/* A "normal" conditional group. If there is just one branch, we must not
				6284	make use of its firstbyte or reqbyte, because this is equivalent to an
				6285	empty second branch. */
				6286
				6287	else
				6288	{
				6289	if (condcount > 2)
				6290	{
				6291	*errorcodeptr = ERR27;
				6292	goto FAILED;
				6293	}
				6294	if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
				6295	}
				6296	}
				6297
				6298	/* Error if hit end of pattern */
				6299
				6300	if (*ptr != CHAR_RIGHT_PARENTHESIS)
				6301	{
				6302	*errorcodeptr = ERR14;
				6303	goto FAILED;
				6304	}
				6305
				6306	/* In the pre-compile phase, update the length by the length of the group,
				6307	less the brackets at either end. Then reduce the compiled code to just a
				6308	set of non-capturing brackets so that it doesn't use much memory if it is
				6309	duplicated by a quantifier.*/
				6310
				6311	if (lengthptr != NULL)
				6312	{
				6313	if (OFLOW_MAX - lengthptr < length_prevgroup - 2 - 2LINK_SIZE)
				6314	{
				6315	*errorcodeptr = ERR20;
				6316	goto FAILED;
				6317	}
				6318	lengthptr += length_prevgroup - 2 - 2LINK_SIZE;
				6319	code++; /* This already contains bravalue */
				6320	PUTINC(code, 0, 1 + LINK_SIZE);
				6321	*code++ = OP_KET;
				6322	PUTINC(code, 0, 1 + LINK_SIZE);
				6323	break; /* No need to waste time with special character handling */
				6324	}
				6325
				6326	/* Otherwise update the main code pointer to the end of the group. */
				6327
				6328	code = tempcode;
				6329
				6330	/* For a DEFINE group, required and first character settings are not
				6331	relevant. */
				6332
				6333	if (bravalue == OP_DEF) break;
				6334
				6335	/* Handle updating of the required and first characters for other types of
				6336	group. Update for normal brackets of all kinds, and conditions with two
				6337	branches (see code above). If the bracket is followed by a quantifier with
				6338	zero repeat, we have to back off. Hence the definition of zeroreqbyte and
				6339	zerofirstbyte outside the main loop so that they can be accessed for the
				6340	back off. */
				6341
				6342	zeroreqbyte = reqbyte;
				6343	zerofirstbyte = firstbyte;
				6344	groupsetfirstbyte = FALSE;
				6345
				6346	if (bravalue >= OP_ONCE)
				6347	{
				6348	/* If we have not yet set a firstbyte in this branch, take it from the
				6349	subpattern, remembering that it was set here so that a repeat of more
				6350	than one can replicate it as reqbyte if necessary. If the subpattern has
				6351	no firstbyte, set "none" for the whole branch. In both cases, a zero
				6352	repeat forces firstbyte to "none". */
				6353
				6354	if (firstbyte == REQ_UNSET)
				6355	{
				6356	if (subfirstbyte >= 0)
				6357	{
				6358	firstbyte = subfirstbyte;
				6359	groupsetfirstbyte = TRUE;
				6360	}
				6361	else firstbyte = REQ_NONE;
				6362	zerofirstbyte = REQ_NONE;
				6363	}
				6364
				6365	/* If firstbyte was previously set, convert the subpattern's firstbyte
				6366	into reqbyte if there wasn't one, using the vary flag that was in
				6367	existence beforehand. */
				6368
				6369	else if (subfirstbyte >= 0 && subreqbyte < 0)
				6370	subreqbyte = subfirstbyte \| tempreqvary;
				6371
				6372	/* If the subpattern set a required byte (or set a first byte that isn't
				6373	really the first byte - see above), set it. */
				6374
				6375	if (subreqbyte >= 0) reqbyte = subreqbyte;
				6376	}
				6377
				6378	/* For a forward assertion, we take the reqbyte, if set. This can be
				6379	helpful if the pattern that follows the assertion doesn't set a different
				6380	char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
				6381	for an assertion, however because it leads to incorrect effect for patterns
				6382	such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
				6383	of a firstbyte. This is overcome by a scan at the end if there's no
				6384	firstbyte, looking for an asserted first char. */
				6385
				6386	else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
				6387	break; /* End of processing '(' */
				6388
				6389
				6390	/* ===================================================================*/
				6391	/* Handle metasequences introduced by \. For ones like \d, the ESC_ values
				6392	are arranged to be the negation of the corresponding OP_values in the
				6393	default case when PCRE_UCP is not set. For the back references, the values
				6394	are ESC_REF plus the reference number. Only back references and those types
				6395	that consume a character may be repeated. We can test for values between
				6396	ESC_b and ESC_Z for the latter; this may have to change if any new ones are
				6397	ever created. */
				6398
				6399	case CHAR_BACKSLASH:
				6400	tempptr = ptr;
				6401	c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
				6402	if (*errorcodeptr != 0) goto FAILED;
				6403
				6404	if (c < 0)
				6405	{
				6406	if (-c == ESC_Q) /* Handle start of quoted string */
				6407	{
				6408	if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
				6409	ptr += 2; /* avoid empty string */
				6410	else inescq = TRUE;
				6411	continue;
				6412	}
				6413
				6414	if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
				6415
				6416	/* For metasequences that actually match a character, we disable the
				6417	setting of a first character if it hasn't already been set. */
				6418
				6419	if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
				6420	firstbyte = REQ_NONE;
				6421
				6422	/* Set values to reset to if this is followed by a zero repeat. */
				6423
				6424	zerofirstbyte = firstbyte;
				6425	zeroreqbyte = reqbyte;
				6426
				6427	/* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
				6428	is a subroutine call by number (Oniguruma syntax). In fact, the value
				6429	-ESC_g is returned only for these cases. So we don't need to check for <
				6430	or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
				6431	-ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
				6432	that is a synonym for a named back reference). */
				6433
				6434	if (-c == ESC_g)
				6435	{
				6436	const uschar *p;
				6437	save_hwm = cd->hwm; /* Normally this is set when '(' is read */
				6438	terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
				6439	CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
				6440
				6441	/* These two statements stop the compiler for warning about possibly
				6442	unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
				6443	fact, because we actually check for a number below, the paths that
				6444	would actually be in error are never taken. */
				6445
				6446	skipbytes = 0;
				6447	reset_bracount = FALSE;
				6448
				6449	/* Test for a name */
				6450
				6451	if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
				6452	{
				6453	BOOL isnumber = TRUE;
				6454	for (p = ptr + 1; p != 0 && p != terminator; p++)
				6455	{
				6456	if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
				6457	if ((cd->ctypes[*p] & ctype_word) == 0) break;
				6458	}
				6459	if (*p != terminator)
				6460	{
				6461	*errorcodeptr = ERR57;
				6462	break;
				6463	}
				6464	if (isnumber)
				6465	{
				6466	ptr++;
				6467	goto HANDLE_NUMERICAL_RECURSION;
				6468	}
				6469	is_recurse = TRUE;
				6470	goto NAMED_REF_OR_RECURSE;
				6471	}
				6472
				6473	/* Test a signed number in angle brackets or quotes. */
				6474
				6475	p = ptr + 2;
				6476	while ((digitab[*p] & ctype_digit) != 0) p++;
				6477	if (*p != terminator)
				6478	{
				6479	*errorcodeptr = ERR57;
				6480	break;
				6481	}
				6482	ptr++;
				6483	goto HANDLE_NUMERICAL_RECURSION;
				6484	}
				6485
				6486	/* \k<name> or \k'name' is a back reference by name (Perl syntax).
				6487	We also support \k{name} (.NET syntax). */
				6488
				6489	if (-c == ESC_k)
				6490	{
				6491	if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
				6492	ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
				6493	{
				6494	*errorcodeptr = ERR69;
				6495	break;
				6496	}
				6497	is_recurse = FALSE;
				6498	terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
				6499	CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
				6500	CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
				6501	goto NAMED_REF_OR_RECURSE;
				6502	}
				6503
				6504	/* Back references are handled specially; must disable firstbyte if
				6505	not set to cope with cases like (?=(\w+))\1: which would otherwise set
				6506	':' later. */
				6507
				6508	if (-c >= ESC_REF)
				6509	{
				6510	open_capitem *oc;
				6511	recno = -c - ESC_REF;
				6512
				6513	HANDLE_REFERENCE: /* Come here from named backref handling */
				6514	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
				6515	previous = code;
				6516	*code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
				6517	PUT2INC(code, 0, recno);
				6518	cd->backref_map \|= (recno < 32)? (1 << recno) : 1;
				6519	if (recno > cd->top_backref) cd->top_backref = recno;
				6520
				6521	/* Check to see if this back reference is recursive, that it, it
				6522	is inside the group that it references. A flag is set so that the
				6523	group can be made atomic. */
				6524
				6525	for (oc = cd->open_caps; oc != NULL; oc = oc->next)
				6526	{
				6527	if (oc->number == recno)
				6528	{
				6529	oc->flag = TRUE;
				6530	break;
				6531	}
				6532	}
				6533	}
				6534
				6535	/* So are Unicode property matches, if supported. */
				6536
				6537	#ifdef SUPPORT_UCP
				6538	else if (-c == ESC_P \|\| -c == ESC_p)
				6539	{
				6540	BOOL negated;
				6541	int pdata;
				6542	int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
				6543	if (ptype < 0) goto FAILED;
				6544	previous = code;
				6545	*code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
				6546	*code++ = ptype;
				6547	*code++ = pdata;
				6548	}
				6549	#else
				6550
				6551	/* If Unicode properties are not supported, \X, \P, and \p are not
				6552	allowed. */
				6553
				6554	else if (-c == ESC_X \|\| -c == ESC_P \|\| -c == ESC_p)
				6555	{
				6556	*errorcodeptr = ERR45;
				6557	goto FAILED;
				6558	}
				6559	#endif
				6560
				6561	/* For the rest (including \X when Unicode properties are supported), we
				6562	can obtain the OP value by negating the escape value in the default
				6563	situation when PCRE_UCP is not set. When it is set, we substitute
				6564	Unicode property tests. */
				6565
				6566	else
				6567	{
				6568	#ifdef SUPPORT_UCP
				6569	if (-c >= ESC_DU && -c <= ESC_wu)
				6570	{
				6571	nestptr = ptr + 1; /* Where to resume */
				6572	ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
				6573	}
				6574	else
				6575	#endif
				6576	/* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
				6577	so that it works in DFA mode and in lookbehinds. */
				6578
				6579	{
				6580	previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
				6581	*code++ = (!utf8 && c == -ESC_C)? OP_ALLANY : -c;
				6582	}
				6583	}
				6584	continue;
				6585	}
				6586
				6587	/* We have a data character whose value is in c. In UTF-8 mode it may have
				6588	a value > 127. We set its representation in the length/buffer, and then
				6589	handle it as a data character. */
				6590
				6591	#ifdef SUPPORT_UTF8
				6592	if (utf8 && c > 127)
				6593	mclength = _pcre_ord2utf8(c, mcbuffer);
				6594	else
				6595	#endif
				6596
				6597	{
				6598	mcbuffer[0] = c;
				6599	mclength = 1;
				6600	}
				6601	goto ONE_CHAR;
				6602
				6603
				6604	/* ===================================================================*/
				6605	/* Handle a literal character. It is guaranteed not to be whitespace or #
				6606	when the extended flag is set. If we are in UTF-8 mode, it may be a
				6607	multi-byte literal character. */
				6608
				6609	default:
				6610	NORMAL_CHAR:
				6611	mclength = 1;
				6612	mcbuffer[0] = c;
				6613
				6614	#ifdef SUPPORT_UTF8
				6615	if (utf8 && c >= 0xc0)
				6616	{
				6617	while ((ptr[1] & 0xc0) == 0x80)
				6618	mcbuffer[mclength++] = *(++ptr);
				6619	}
				6620	#endif
				6621
				6622	/* At this point we have the character's bytes in mcbuffer, and the length
				6623	in mclength. When not in UTF-8 mode, the length is always 1. */
				6624
				6625	ONE_CHAR:
				6626	previous = code;
				6627	*code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
				6628	for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
				6629
				6630	/* Remember if \r or \n were seen */
				6631
				6632	if (mcbuffer[0] == CHAR_CR \|\| mcbuffer[0] == CHAR_NL)
				6633	cd->external_flags \|= PCRE_HASCRORLF;
				6634
				6635	/* Set the first and required bytes appropriately. If no previous first
				6636	byte, set it from this character, but revert to none on a zero repeat.
				6637	Otherwise, leave the firstbyte value alone, and don't change it on a zero
				6638	repeat. */
				6639
				6640	if (firstbyte == REQ_UNSET)
				6641	{
				6642	zerofirstbyte = REQ_NONE;
				6643	zeroreqbyte = reqbyte;
				6644
				6645	/* If the character is more than one byte long, we can set firstbyte
				6646	only if it is not to be matched caselessly. */
				6647
				6648	if (mclength == 1 \|\| req_caseopt == 0)
				6649	{
				6650	firstbyte = mcbuffer[0] \| req_caseopt;
				6651	if (mclength != 1) reqbyte = code[-1] \| cd->req_varyopt;
				6652	}
				6653	else firstbyte = reqbyte = REQ_NONE;
				6654	}
				6655
				6656	/* firstbyte was previously set; we can set reqbyte only if the length is
				6657	1 or the matching is caseful. */
				6658
				6659	else
				6660	{
				6661	zerofirstbyte = firstbyte;
				6662	zeroreqbyte = reqbyte;
				6663	if (mclength == 1 \|\| req_caseopt == 0)
				6664	reqbyte = code[-1] \| req_caseopt \| cd->req_varyopt;
				6665	}
				6666
				6667	break; /* End of literal character handling */
				6668	}
				6669	} /* end of big loop */
				6670
				6671
				6672	/* Control never reaches here by falling through, only by a goto for all the
				6673	error states. Pass back the position in the pattern so that it can be displayed
				6674	to the user for diagnosing the error. */
				6675
				6676	FAILED:
				6677	*ptrptr = ptr;
				6678	return FALSE;
				6679	}
				6680
				6681
				6682
				6683
				6684	/*************************************************
				6685	* Compile sequence of alternatives *
				6686	*************************************************/
				6687
				6688	/* On entry, ptr is pointing past the bracket character, but on return it
				6689	points to the closing bracket, or vertical bar, or end of string. The code
				6690	variable is pointing at the byte into which the BRA operator has been stored.
				6691	This function is used during the pre-compile phase when we are trying to find
				6692	out the amount of memory needed, as well as during the real compile phase. The
				6693	value of lengthptr distinguishes the two phases.
				6694
				6695	Arguments:
				6696	options option bits, including any changes for this subpattern
				6697	codeptr -> the address of the current code pointer
				6698	ptrptr -> the address of the current pattern pointer
				6699	errorcodeptr -> pointer to error code variable
				6700	lookbehind TRUE if this is a lookbehind assertion
				6701	reset_bracount TRUE to reset the count for each branch
				6702	skipbytes skip this many bytes at start (for brackets and OP_COND)
				6703	cond_depth depth of nesting for conditional subpatterns
				6704	firstbyteptr place to put the first required character, or a negative number
				6705	reqbyteptr place to put the last required character, or a negative number
				6706	bcptr pointer to the chain of currently open branches
				6707	cd points to the data block with tables pointers etc.
				6708	lengthptr NULL during the real compile phase
				6709	points to length accumulator during pre-compile phase
				6710
				6711	Returns: TRUE on success
				6712	*/
				6713
				6714	static BOOL
				6715	compile_regex(int options, uschar codeptr, const uschar ptrptr,
				6716	int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
				6717	int cond_depth, int firstbyteptr, int reqbyteptr, branch_chain *bcptr,
				6718	compile_data cd, int lengthptr)
				6719	{
				6720	const uschar ptr = ptrptr;
				6721	uschar code = codeptr;
				6722	uschar *last_branch = code;
				6723	uschar *start_bracket = code;
				6724	uschar *reverse_count = NULL;
				6725	open_capitem capitem;
				6726	int capnumber = 0;
				6727	int firstbyte, reqbyte;
				6728	int branchfirstbyte, branchreqbyte;
				6729	int length;
				6730	int orig_bracount;
				6731	int max_bracount;
				6732	branch_chain bc;
				6733
				6734	bc.outer = bcptr;
				6735	bc.current_branch = code;
				6736
				6737	firstbyte = reqbyte = REQ_UNSET;
				6738
				6739	/* Accumulate the length for use in the pre-compile phase. Start with the
				6740	length of the BRA and KET and any extra bytes that are required at the
				6741	beginning. We accumulate in a local variable to save frequent testing of
				6742	lenthptr for NULL. We cannot do this by looking at the value of code at the
				6743	start and end of each alternative, because compiled items are discarded during
				6744	the pre-compile phase so that the work space is not exceeded. */
				6745
				6746	length = 2 + 2*LINK_SIZE + skipbytes;
				6747
				6748	/* WARNING: If the above line is changed for any reason, you must also change
				6749	the code that abstracts option settings at the start of the pattern and makes
				6750	them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
				6751	pre-compile phase to find out whether anything has yet been compiled or not. */
				6752
				6753	/* If this is a capturing subpattern, add to the chain of open capturing items
				6754	so that we can detect them if (*ACCEPT) is encountered. This is also used to
				6755	detect groups that contain recursive back references to themselves. Note that
				6756	only OP_CBRA need be tested here; changing this opcode to one of its variants,
				6757	e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
				6758
				6759	if (*code == OP_CBRA)
				6760	{
				6761	capnumber = GET2(code, 1 + LINK_SIZE);
				6762	capitem.number = capnumber;
				6763	capitem.next = cd->open_caps;
				6764	capitem.flag = FALSE;
				6765	cd->open_caps = &capitem;
				6766	}
				6767
				6768	/* Offset is set zero to mark that this bracket is still open */
				6769
				6770	PUT(code, 1, 0);
				6771	code += 1 + LINK_SIZE + skipbytes;
				6772
				6773	/* Loop for each alternative branch */
				6774
				6775	orig_bracount = max_bracount = cd->bracount;
				6776	for (;;)
				6777	{
				6778	/* For a (?\| group, reset the capturing bracket count so that each branch
				6779	uses the same numbers. */
				6780
				6781	if (reset_bracount) cd->bracount = orig_bracount;
				6782
				6783	/* Set up dummy OP_REVERSE if lookbehind assertion */
				6784
				6785	if (lookbehind)
				6786	{
				6787	*code++ = OP_REVERSE;
				6788	reverse_count = code;
				6789	PUTINC(code, 0, 0);
				6790	length += 1 + LINK_SIZE;
				6791	}
				6792
				6793	/* Now compile the branch; in the pre-compile phase its length gets added
				6794	into the length. */
				6795
				6796	if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
				6797	&branchreqbyte, &bc, cond_depth, cd,
				6798	(lengthptr == NULL)? NULL : &length))
				6799	{
				6800	*ptrptr = ptr;
				6801	return FALSE;
				6802	}
				6803
				6804	/* Keep the highest bracket count in case (?\| was used and some branch
				6805	has fewer than the rest. */
				6806
				6807	if (cd->bracount > max_bracount) max_bracount = cd->bracount;
				6808
				6809	/* In the real compile phase, there is some post-processing to be done. */
				6810
				6811	if (lengthptr == NULL)
				6812	{
				6813	/* If this is the first branch, the firstbyte and reqbyte values for the
				6814	branch become the values for the regex. */
				6815
				6816	if (*last_branch != OP_ALT)
				6817	{
				6818	firstbyte = branchfirstbyte;
				6819	reqbyte = branchreqbyte;
				6820	}
				6821
				6822	/* If this is not the first branch, the first char and reqbyte have to
				6823	match the values from all the previous branches, except that if the
				6824	previous value for reqbyte didn't have REQ_VARY set, it can still match,
				6825	and we set REQ_VARY for the regex. */
				6826
				6827	else
				6828	{
				6829	/* If we previously had a firstbyte, but it doesn't match the new branch,
				6830	we have to abandon the firstbyte for the regex, but if there was
				6831	previously no reqbyte, it takes on the value of the old firstbyte. */
				6832
				6833	if (firstbyte >= 0 && firstbyte != branchfirstbyte)
				6834	{
				6835	if (reqbyte < 0) reqbyte = firstbyte;
				6836	firstbyte = REQ_NONE;
				6837	}
				6838
				6839	/* If we (now or from before) have no firstbyte, a firstbyte from the
				6840	branch becomes a reqbyte if there isn't a branch reqbyte. */
				6841
				6842	if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
				6843	branchreqbyte = branchfirstbyte;
				6844
				6845	/* Now ensure that the reqbytes match */
				6846
				6847	if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
				6848	reqbyte = REQ_NONE;
				6849	else reqbyte \|= branchreqbyte; /* To "or" REQ_VARY */
				6850	}
				6851
				6852	/* If lookbehind, check that this branch matches a fixed-length string, and
				6853	put the length into the OP_REVERSE item. Temporarily mark the end of the
				6854	branch with OP_END. If the branch contains OP_RECURSE, the result is -3
				6855	because there may be forward references that we can't check here. Set a
				6856	flag to cause another lookbehind check at the end. Why not do it all at the
				6857	end? Because common, erroneous checks are picked up here and the offset of
				6858	the problem can be shown. */
				6859
				6860	if (lookbehind)
				6861	{
				6862	int fixed_length;
				6863	*code = OP_END;
				6864	fixed_length = find_fixedlength(last_branch, (options & PCRE_UTF8) != 0,
				6865	FALSE, cd);
				6866	DPRINTF(("fixed length = %d\n", fixed_length));
				6867	if (fixed_length == -3)
				6868	{
				6869	cd->check_lookbehind = TRUE;
				6870	}
				6871	else if (fixed_length < 0)
				6872	{
				6873	*errorcodeptr = (fixed_length == -2)? ERR36 :
				6874	(fixed_length == -4)? ERR70: ERR25;
				6875	*ptrptr = ptr;
				6876	return FALSE;
				6877	}
				6878	else { PUT(reverse_count, 0, fixed_length); }
				6879	}
				6880	}
				6881
				6882	/* Reached end of expression, either ')' or end of pattern. In the real
				6883	compile phase, go back through the alternative branches and reverse the chain
				6884	of offsets, with the field in the BRA item now becoming an offset to the
				6885	first alternative. If there are no alternatives, it points to the end of the
				6886	group. The length in the terminating ket is always the length of the whole
				6887	bracketed item. Return leaving the pointer at the terminating char. */
				6888
				6889	if (*ptr != CHAR_VERTICAL_LINE)
				6890	{
				6891	if (lengthptr == NULL)
				6892	{
				6893	int branch_length = (int)(code - last_branch);
				6894	do
				6895	{
				6896	int prev_length = GET(last_branch, 1);
				6897	PUT(last_branch, 1, branch_length);
				6898	branch_length = prev_length;
				6899	last_branch -= branch_length;
				6900	}
				6901	while (branch_length > 0);
				6902	}
				6903
				6904	/* Fill in the ket */
				6905
				6906	*code = OP_KET;
				6907	PUT(code, 1, (int)(code - start_bracket));
				6908	code += 1 + LINK_SIZE;
				6909
				6910	/* If it was a capturing subpattern, check to see if it contained any
				6911	recursive back references. If so, we must wrap it in atomic brackets.
				6912	In any event, remove the block from the chain. */
				6913
				6914	if (capnumber > 0)
				6915	{
				6916	if (cd->open_caps->flag)
				6917	{
				6918	memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
				6919	code - start_bracket);
				6920	*start_bracket = OP_ONCE;
				6921	code += 1 + LINK_SIZE;
				6922	PUT(start_bracket, 1, (int)(code - start_bracket));
				6923	*code = OP_KET;
				6924	PUT(code, 1, (int)(code - start_bracket));
				6925	code += 1 + LINK_SIZE;
				6926	length += 2 + 2*LINK_SIZE;
				6927	}
				6928	cd->open_caps = cd->open_caps->next;
				6929	}
				6930
				6931	/* Retain the highest bracket number, in case resetting was used. */
				6932
				6933	cd->bracount = max_bracount;
				6934
				6935	/* Set values to pass back */
				6936
				6937	*codeptr = code;
				6938	*ptrptr = ptr;
				6939	*firstbyteptr = firstbyte;
				6940	*reqbyteptr = reqbyte;
				6941	if (lengthptr != NULL)
				6942	{
				6943	if (OFLOW_MAX - *lengthptr < length)
				6944	{
				6945	*errorcodeptr = ERR20;
				6946	return FALSE;
				6947	}
				6948	*lengthptr += length;
				6949	}
				6950	return TRUE;
				6951	}
				6952
				6953	/* Another branch follows. In the pre-compile phase, we can move the code
				6954	pointer back to where it was for the start of the first branch. (That is,
				6955	pretend that each branch is the only one.)
				6956
				6957	In the real compile phase, insert an ALT node. Its length field points back
				6958	to the previous branch while the bracket remains open. At the end the chain
				6959	is reversed. It's done like this so that the start of the bracket has a
				6960	zero offset until it is closed, making it possible to detect recursion. */
				6961
				6962	if (lengthptr != NULL)
				6963	{
				6964	code = *codeptr + 1 + LINK_SIZE + skipbytes;
				6965	length += 1 + LINK_SIZE;
				6966	}
				6967	else
				6968	{
				6969	*code = OP_ALT;
				6970	PUT(code, 1, (int)(code - last_branch));
				6971	bc.current_branch = last_branch = code;
				6972	code += 1 + LINK_SIZE;
				6973	}
				6974
				6975	ptr++;
				6976	}
				6977	/* Control never reaches here */
				6978	}
				6979
				6980
				6981
				6982
				6983	/*************************************************
				6984	* Check for anchored expression *
				6985	*************************************************/
				6986
				6987	/* Try to find out if this is an anchored regular expression. Consider each
				6988	alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
				6989	all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
				6990	it's anchored. However, if this is a multiline pattern, then only OP_SOD will
				6991	be found, because ^ generates OP_CIRCM in that mode.
				6992
				6993	We can also consider a regex to be anchored if OP_SOM starts all its branches.
				6994	This is the code for \G, which means "match at start of match position, taking
				6995	into account the match offset".
				6996
				6997	A branch is also implicitly anchored if it starts with .* and DOTALL is set,
				6998	because that will try the rest of the pattern at all possible matching points,
				6999	so there is no point trying again.... er ....
				7000
				7001	.... except when the .* appears inside capturing parentheses, and there is a
				7002	subsequent back reference to those parentheses. We haven't enough information
				7003	to catch that case precisely.
				7004
				7005	At first, the best we could do was to detect when .* was in capturing brackets
				7006	and the highest back reference was greater than or equal to that level.
				7007	However, by keeping a bitmap of the first 31 back references, we can catch some
				7008	of the more common cases more precisely.
				7009
				7010	Arguments:
				7011	code points to start of expression (the bracket)
				7012	bracket_map a bitmap of which brackets we are inside while testing; this
				7013	handles up to substring 31; after that we just have to take
				7014	the less precise approach
				7015	backref_map the back reference bitmap
				7016
				7017	Returns: TRUE or FALSE
				7018	*/
				7019
				7020	static BOOL
				7021	is_anchored(register const uschar *code, unsigned int bracket_map,
				7022	unsigned int backref_map)
				7023	{
				7024	do {
				7025	const uschar scode = first_significant_code(code + _pcre_OP_lengths[code],
				7026	FALSE);
				7027	register int op = *scode;
				7028
				7029	/* Non-capturing brackets */
				7030
				7031	if (op == OP_BRA \|\| op == OP_BRAPOS \|\|
				7032	op == OP_SBRA \|\| op == OP_SBRAPOS)
				7033	{
				7034	if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
				7035	}
				7036
				7037	/* Capturing brackets */
				7038
				7039	else if (op == OP_CBRA \|\| op == OP_CBRAPOS \|\|
				7040	op == OP_SCBRA \|\| op == OP_SCBRAPOS)
				7041	{
				7042	int n = GET2(scode, 1+LINK_SIZE);
				7043	int new_map = bracket_map \| ((n < 32)? (1 << n) : 1);
				7044	if (!is_anchored(scode, new_map, backref_map)) return FALSE;
				7045	}
				7046
				7047	/* Other brackets */
				7048
				7049	else if (op == OP_ASSERT \|\| op == OP_ONCE \|\| op == OP_ONCE_NC \|\|
				7050	op == OP_COND)
				7051	{
				7052	if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
				7053	}
				7054
				7055	/* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
				7056	it isn't in brackets that are or may be referenced. */
				7057
				7058	else if ((op == OP_TYPESTAR \|\| op == OP_TYPEMINSTAR \|\|
				7059	op == OP_TYPEPOSSTAR))
				7060	{
				7061	if (scode[1] != OP_ALLANY \|\| (bracket_map & backref_map) != 0)
				7062	return FALSE;
				7063	}
				7064
				7065	/* Check for explicit anchoring */
				7066
				7067	else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
				7068	code += GET(code, 1);
				7069	}
				7070	while (code == OP_ALT); / Loop for each alternative */
				7071	return TRUE;
				7072	}
				7073
				7074
				7075
				7076	/*************************************************
				7077	* Check for starting with ^ or .* *
				7078	*************************************************/
				7079
				7080	/* This is called to find out if every branch starts with ^ or .* so that
				7081	"first char" processing can be done to speed things up in multiline
				7082	matching and for non-DOTALL patterns that start with .* (which must start at
				7083	the beginning or after \n). As in the case of is_anchored() (see above), we
				7084	have to take account of back references to capturing brackets that contain .*
				7085	because in that case we can't make the assumption.
				7086
				7087	Arguments:
				7088	code points to start of expression (the bracket)
				7089	bracket_map a bitmap of which brackets we are inside while testing; this
				7090	handles up to substring 31; after that we just have to take
				7091	the less precise approach
				7092	backref_map the back reference bitmap
				7093
				7094	Returns: TRUE or FALSE
				7095	*/
				7096
				7097	static BOOL
				7098	is_startline(const uschar *code, unsigned int bracket_map,
				7099	unsigned int backref_map)
				7100	{
				7101	do {
				7102	const uschar scode = first_significant_code(code + _pcre_OP_lengths[code],
				7103	FALSE);
				7104	register int op = *scode;
				7105
				7106	/* If we are at the start of a conditional assertion group, both the
				7107	conditional assertion and what follows the condition must satisfy the test
				7108	for start of line. Other kinds of condition fail. Note that there may be an
				7109	auto-callout at the start of a condition. */
				7110
				7111	if (op == OP_COND)
				7112	{
				7113	scode += 1 + LINK_SIZE;
				7114	if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
				7115	switch (*scode)
				7116	{
				7117	case OP_CREF:
				7118	case OP_NCREF:
				7119	case OP_RREF:
				7120	case OP_NRREF:
				7121	case OP_DEF:
				7122	return FALSE;
				7123
				7124	default: /* Assertion */
				7125	if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
				7126	do scode += GET(scode, 1); while (*scode == OP_ALT);
				7127	scode += 1 + LINK_SIZE;
				7128	break;
				7129	}
				7130	scode = first_significant_code(scode, FALSE);
				7131	op = *scode;
				7132	}
				7133
				7134	/* Non-capturing brackets */
				7135
				7136	if (op == OP_BRA \|\| op == OP_BRAPOS \|\|
				7137	op == OP_SBRA \|\| op == OP_SBRAPOS)
				7138	{
				7139	if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
				7140	}
				7141
				7142	/* Capturing brackets */
				7143
				7144	else if (op == OP_CBRA \|\| op == OP_CBRAPOS \|\|
				7145	op == OP_SCBRA \|\| op == OP_SCBRAPOS)
				7146	{
				7147	int n = GET2(scode, 1+LINK_SIZE);
				7148	int new_map = bracket_map \| ((n < 32)? (1 << n) : 1);
				7149	if (!is_startline(scode, new_map, backref_map)) return FALSE;
				7150	}
				7151
				7152	/* Other brackets */
				7153
				7154	else if (op == OP_ASSERT \|\| op == OP_ONCE \|\| op == OP_ONCE_NC)
				7155	{
				7156	if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
				7157	}
				7158
				7159	/* .* means "start at start or after \n" if it isn't in brackets that
				7160	may be referenced. */
				7161
				7162	else if (op == OP_TYPESTAR \|\| op == OP_TYPEMINSTAR \|\| op == OP_TYPEPOSSTAR)
				7163	{
				7164	if (scode[1] != OP_ANY \|\| (bracket_map & backref_map) != 0) return FALSE;
				7165	}
				7166
				7167	/* Check for explicit circumflex */
				7168
				7169	else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
				7170
				7171	/* Move on to the next alternative */
				7172
				7173	code += GET(code, 1);
				7174	}
				7175	while (code == OP_ALT); / Loop for each alternative */
				7176	return TRUE;
				7177	}
				7178
				7179
				7180
				7181	/*************************************************
				7182	* Check for asserted fixed first char *
				7183	*************************************************/
				7184
				7185	/* During compilation, the "first char" settings from forward assertions are
				7186	discarded, because they can cause conflicts with actual literals that follow.
				7187	However, if we end up without a first char setting for an unanchored pattern,
				7188	it is worth scanning the regex to see if there is an initial asserted first
				7189	char. If all branches start with the same asserted char, or with a bracket all
				7190	of whose alternatives start with the same asserted char (recurse ad lib), then
				7191	we return that char, otherwise -1.
				7192
				7193	Arguments:
				7194	code points to start of expression (the bracket)
				7195	inassert TRUE if in an assertion
				7196
				7197	Returns: -1 or the fixed first char
				7198	*/
				7199
				7200	static int
				7201	find_firstassertedchar(const uschar *code, BOOL inassert)
				7202	{
				7203	register int c = -1;
				7204	do {
				7205	int d;
				7206	int xl = (code == OP_CBRA \|\| code == OP_SCBRA \|\|
				7207	code == OP_CBRAPOS \|\| code == OP_SCBRAPOS)? 2:0;
				7208	const uschar *scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
				7209	register int op = *scode;
				7210
				7211	switch(op)
				7212	{
				7213	default:
				7214	return -1;
				7215
				7216	case OP_BRA:
				7217	case OP_BRAPOS:
				7218	case OP_CBRA:
				7219	case OP_SCBRA:
				7220	case OP_CBRAPOS:
				7221	case OP_SCBRAPOS:
				7222	case OP_ASSERT:
				7223	case OP_ONCE:
				7224	case OP_ONCE_NC:
				7225	case OP_COND:
				7226	if ((d = find_firstassertedchar(scode, op == OP_ASSERT)) < 0)
				7227	return -1;
				7228	if (c < 0) c = d; else if (c != d) return -1;
				7229	break;
				7230
				7231	case OP_EXACT:
				7232	scode += 2;
				7233	/* Fall through */
				7234
				7235	case OP_CHAR:
				7236	case OP_PLUS:
				7237	case OP_MINPLUS:
				7238	case OP_POSPLUS:
				7239	if (!inassert) return -1;
				7240	if (c < 0) c = scode[1];
				7241	else if (c != scode[1]) return -1;
				7242	break;
				7243
				7244	case OP_EXACTI:
				7245	scode += 2;
				7246	/* Fall through */
				7247
				7248	case OP_CHARI:
				7249	case OP_PLUSI:
				7250	case OP_MINPLUSI:
				7251	case OP_POSPLUSI:
				7252	if (!inassert) return -1;
				7253	if (c < 0) c = scode[1] \| REQ_CASELESS;
				7254	else if (c != scode[1]) return -1;
				7255	break;
				7256	}
				7257
				7258	code += GET(code, 1);
				7259	}
				7260	while (*code == OP_ALT);
				7261	return c;
				7262	}
				7263
				7264
				7265
				7266	/*************************************************
				7267	* Compile a Regular Expression *
				7268	*************************************************/
				7269
				7270	/* This function takes a string and returns a pointer to a block of store
				7271	holding a compiled version of the expression. The original API for this
				7272	function had no error code return variable; it is retained for backwards
				7273	compatibility. The new function is given a new name.
				7274
				7275	Arguments:
				7276	pattern the regular expression
				7277	options various option bits
				7278	errorcodeptr pointer to error code variable (pcre_compile2() only)
				7279	can be NULL if you don't want a code value
				7280	errorptr pointer to pointer to error text
				7281	erroroffset ptr offset in pattern where error was detected
				7282	tables pointer to character tables or NULL
				7283
				7284	Returns: pointer to compiled data block, or NULL on error,
				7285	with errorptr and erroroffset set
				7286	*/
				7287
				7288	PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
				7289	pcre_compile(const char pattern, int options, const char *errorptr,
				7290	int erroroffset, const unsigned char tables)
				7291	{
				7292	return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
				7293	}
				7294
				7295
				7296	PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
				7297	pcre_compile2(const char pattern, int options, int errorcodeptr,
				7298	const char *errorptr, int erroroffset, const unsigned char *tables)
				7299	{
				7300	real_pcre *re;
				7301	int length = 1; /* For final END opcode */
				7302	int firstbyte, reqbyte, newline;
				7303	int errorcode = 0;
				7304	int skipatstart = 0;
				7305	BOOL utf8;
				7306	size_t size;
				7307	uschar *code;
				7308	const uschar *codestart;
				7309	const uschar *ptr;
				7310	compile_data compile_block;
				7311	compile_data *cd = &compile_block;
				7312
				7313	/* This space is used for "compiling" into during the first phase, when we are
				7314	computing the amount of memory that is needed. Compiled items are thrown away
				7315	as soon as possible, so that a fairly large buffer should be sufficient for
				7316	this purpose. The same space is used in the second phase for remembering where
				7317	to fill in forward references to subpatterns. That may overflow, in which case
				7318	new memory is obtained from malloc(). */
				7319
				7320	uschar cworkspace[COMPILE_WORK_SIZE];
				7321
				7322	/* Set this early so that early errors get offset 0. */
				7323
				7324	ptr = (const uschar *)pattern;
				7325
				7326	/* We can't pass back an error message if errorptr is NULL; I guess the best we
				7327	can do is just return NULL, but we can set a code value if there is a code
				7328	pointer. */
				7329
				7330	if (errorptr == NULL)
				7331	{
				7332	if (errorcodeptr != NULL) *errorcodeptr = 99;
				7333	return NULL;
				7334	}
				7335
				7336	*errorptr = NULL;
				7337	if (errorcodeptr != NULL) *errorcodeptr = ERR0;
				7338
				7339	/* However, we can give a message for this error */
				7340
				7341	if (erroroffset == NULL)
				7342	{
				7343	errorcode = ERR16;
				7344	goto PCRE_EARLY_ERROR_RETURN2;
				7345	}
				7346
				7347	*erroroffset = 0;
				7348
				7349	/* Set up pointers to the individual character tables */
				7350
				7351	if (tables == NULL) tables = _pcre_default_tables;
				7352	cd->lcc = tables + lcc_offset;
				7353	cd->fcc = tables + fcc_offset;
				7354	cd->cbits = tables + cbits_offset;
				7355	cd->ctypes = tables + ctypes_offset;
				7356
				7357	/* Check that all undefined public option bits are zero */
				7358
				7359	if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
				7360	{
				7361	errorcode = ERR17;
				7362	goto PCRE_EARLY_ERROR_RETURN;
				7363	}
				7364
				7365	/* Check for global one-time settings at the start of the pattern, and remember
				7366	the offset for later. */
				7367
				7368	while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
				7369	ptr[skipatstart+1] == CHAR_ASTERISK)
				7370	{
				7371	int newnl = 0;
				7372	int newbsr = 0;
				7373
				7374	if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
				7375	{ skipatstart += 7; options \|= PCRE_UTF8; continue; }
				7376	else if (strncmp((char *)(ptr+skipatstart+2), STRING_UCP_RIGHTPAR, 4) == 0)
				7377	{ skipatstart += 6; options \|= PCRE_UCP; continue; }
				7378	else if (strncmp((char *)(ptr+skipatstart+2), STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
				7379	{ skipatstart += 15; options \|= PCRE_NO_START_OPTIMIZE; continue; }
				7380
				7381	if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
				7382	{ skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
				7383	else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3) == 0)
				7384	{ skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
				7385	else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5) == 0)
				7386	{ skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
				7387	else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
				7388	{ skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
				7389	else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
				7390	{ skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
				7391
				7392	else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
				7393	{ skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
				7394	else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
				7395	{ skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
				7396
				7397	if (newnl != 0)
				7398	options = (options & ~PCRE_NEWLINE_BITS) \| newnl;
				7399	else if (newbsr != 0)
				7400	options = (options & ~(PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE)) \| newbsr;
				7401	else break;
				7402	}
				7403
				7404	utf8 = (options & PCRE_UTF8) != 0;
				7405
				7406	/* Can't support UTF8 unless PCRE has been compiled to include the code. The
				7407	return of an error code from _pcre_valid_utf8() is a new feature, introduced in
				7408	release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
				7409	not used here. */
				7410
				7411	#ifdef SUPPORT_UTF8
				7412	if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
				7413	(errorcode = _pcre_valid_utf8((USPTR)pattern, -1, erroroffset)) != 0)
				7414	{
				7415	errorcode = ERR44;
				7416	goto PCRE_EARLY_ERROR_RETURN2;
				7417	}
				7418	#else
				7419	if (utf8)
				7420	{
				7421	errorcode = ERR32;
				7422	goto PCRE_EARLY_ERROR_RETURN;
				7423	}
				7424	#endif
				7425
				7426	/* Can't support UCP unless PCRE has been compiled to include the code. */
				7427
				7428	#ifndef SUPPORT_UCP
				7429	if ((options & PCRE_UCP) != 0)
				7430	{
				7431	errorcode = ERR67;
				7432	goto PCRE_EARLY_ERROR_RETURN;
				7433	}
				7434	#endif
				7435
				7436	/* Check validity of \R options. */
				7437
				7438	if ((options & (PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE)) ==
				7439	(PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE))
				7440	{
				7441	errorcode = ERR56;
				7442	goto PCRE_EARLY_ERROR_RETURN;
				7443	}
				7444
				7445	/* Handle different types of newline. The three bits give seven cases. The
				7446	current code allows for fixed one- or two-byte sequences, plus "any" and
				7447	"anycrlf". */
				7448
				7449	switch (options & PCRE_NEWLINE_BITS)
				7450	{
				7451	case 0: newline = NEWLINE; break; /* Build-time default */
				7452	case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
				7453	case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
				7454	case PCRE_NEWLINE_CR+
				7455	PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) \| CHAR_NL; break;
				7456	case PCRE_NEWLINE_ANY: newline = -1; break;
				7457	case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
				7458	default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
				7459	}
				7460
				7461	if (newline == -2)
				7462	{
				7463	cd->nltype = NLTYPE_ANYCRLF;
				7464	}
				7465	else if (newline < 0)
				7466	{
				7467	cd->nltype = NLTYPE_ANY;
				7468	}
				7469	else
				7470	{
				7471	cd->nltype = NLTYPE_FIXED;
				7472	if (newline > 255)
				7473	{
				7474	cd->nllen = 2;
				7475	cd->nl[0] = (newline >> 8) & 255;
				7476	cd->nl[1] = newline & 255;
				7477	}
				7478	else
				7479	{
				7480	cd->nllen = 1;
				7481	cd->nl[0] = newline;
				7482	}
				7483	}
				7484
				7485	/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
				7486	references to help in deciding whether (.*) can be treated as anchored or not.
				7487	*/
				7488
				7489	cd->top_backref = 0;
				7490	cd->backref_map = 0;
				7491
				7492	/* Reflect pattern for debugging output */
				7493
				7494	DPRINTF(("------------------------------------------------------------------\n"));
				7495	DPRINTF(("%s\n", pattern));
				7496
				7497	/* Pretend to compile the pattern while actually just accumulating the length
				7498	of memory required. This behaviour is triggered by passing a non-NULL final
				7499	argument to compile_regex(). We pass a block of workspace (cworkspace) for it
				7500	to compile parts of the pattern into; the compiled code is discarded when it is
				7501	no longer needed, so hopefully this workspace will never overflow, though there
				7502	is a test for its doing so. */
				7503
				7504	cd->bracount = cd->final_bracount = 0;
				7505	cd->names_found = 0;
				7506	cd->name_entry_size = 0;
				7507	cd->name_table = NULL;
				7508	cd->start_code = cworkspace;
				7509	cd->hwm = cworkspace;
				7510	cd->start_workspace = cworkspace;
				7511	cd->workspace_size = COMPILE_WORK_SIZE;
				7512	cd->start_pattern = (const uschar *)pattern;
				7513	cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
				7514	cd->req_varyopt = 0;
				7515	cd->external_options = options;
				7516	cd->external_flags = 0;
				7517	cd->open_caps = NULL;
				7518
				7519	/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
				7520	don't need to look at the result of the function here. The initial options have
				7521	been put into the cd block so that they can be changed if an option setting is
				7522	found within the regex right at the beginning. Bringing initial option settings
				7523	outside can help speed up starting point checks. */
				7524
				7525	ptr += skipatstart;
				7526	code = cworkspace;
				7527	*code = OP_BRA;
				7528	(void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
				7529	FALSE, 0, 0, &firstbyte, &reqbyte, NULL, cd, &length);
				7530	if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
				7531
				7532	DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
				7533	cd->hwm - cworkspace));
				7534
				7535	if (length > MAX_PATTERN_SIZE)
				7536	{
				7537	errorcode = ERR20;
				7538	goto PCRE_EARLY_ERROR_RETURN;
				7539	}
				7540
				7541	/* Compute the size of data block needed and get it, either from malloc or
				7542	externally provided function. Integer overflow should no longer be possible
				7543	because nowadays we limit the maximum value of cd->names_found and
				7544	cd->name_entry_size. */
				7545
				7546	size = length + sizeof(real_pcre) + cd->names_found * cd->name_entry_size;
				7547	re = (real_pcre *)(pcre_malloc)(size);
				7548
				7549	if (re == NULL)
				7550	{
				7551	errorcode = ERR21;
				7552	goto PCRE_EARLY_ERROR_RETURN;
				7553	}
				7554
				7555	/* Put in the magic number, and save the sizes, initial options, internal
				7556	flags, and character table pointer. NULL is used for the default character
				7557	tables. The nullpad field is at the end; it's there to help in the case when a
				7558	regex compiled on a system with 4-byte pointers is run on another with 8-byte
				7559	pointers. */
				7560
				7561	re->magic_number = MAGIC_NUMBER;
				7562	re->size = (int)size;
				7563	re->options = cd->external_options;
				7564	re->flags = cd->external_flags;
				7565	re->dummy1 = 0;
				7566	re->first_byte = 0;
				7567	re->req_byte = 0;
				7568	re->name_table_offset = sizeof(real_pcre);
				7569	re->name_entry_size = cd->name_entry_size;
				7570	re->name_count = cd->names_found;
				7571	re->ref_count = 0;
				7572	re->tables = (tables == _pcre_default_tables)? NULL : tables;
				7573	re->nullpad = NULL;
				7574
				7575	/* The starting points of the name/number translation table and of the code are
				7576	passed around in the compile data block. The start/end pattern and initial
				7577	options are already set from the pre-compile phase, as is the name_entry_size
				7578	field. Reset the bracket count and the names_found field. Also reset the hwm
				7579	field; this time it's used for remembering forward references to subpatterns.
				7580	*/
				7581
				7582	cd->final_bracount = cd->bracount; /* Save for checking forward references */
				7583	cd->assert_depth = 0;
				7584	cd->bracount = 0;
				7585	cd->names_found = 0;
				7586	cd->name_table = (uschar *)re + re->name_table_offset;
				7587	codestart = cd->name_table + re->name_entry_size * re->name_count;
				7588	cd->start_code = codestart;
				7589	cd->hwm = (uschar *)(cd->start_workspace);
				7590	cd->req_varyopt = 0;
				7591	cd->had_accept = FALSE;
				7592	cd->check_lookbehind = FALSE;
				7593	cd->open_caps = NULL;
				7594
				7595	/* Set up a starting, non-extracting bracket, then compile the expression. On
				7596	error, errorcode will be set non-zero, so we don't need to look at the result
				7597	of the function here. */
				7598
				7599	ptr = (const uschar *)pattern + skipatstart;
				7600	code = (uschar *)codestart;
				7601	*code = OP_BRA;
				7602	(void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
				7603	&firstbyte, &reqbyte, NULL, cd, NULL);
				7604	re->top_bracket = cd->bracount;
				7605	re->top_backref = cd->top_backref;
				7606	re->flags = cd->external_flags;
				7607
				7608	if (cd->had_accept) reqbyte = REQ_NONE; /* Must disable after (ACCEPT) /
				7609
				7610	/* If not reached end of pattern on success, there's an excess bracket. */
				7611
				7612	if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
				7613
				7614	/* Fill in the terminating state and check for disastrous overflow, but
				7615	if debugging, leave the test till after things are printed out. */
				7616
				7617	*code++ = OP_END;
				7618
				7619	#ifndef PCRE_DEBUG
				7620	if (code - codestart > length) errorcode = ERR23;
				7621	#endif
				7622
				7623	/* Fill in any forward references that are required. There may be repeated
				7624	references; optimize for them, as searching a large regex takes time. */
				7625
				7626	if (cd->hwm > cd->start_workspace)
				7627	{
				7628	int prev_recno = -1;
				7629	const uschar *groupptr = NULL;
				7630	while (errorcode == 0 && cd->hwm > cd->start_workspace)
				7631	{
				7632	int offset, recno;
				7633	cd->hwm -= LINK_SIZE;
				7634	offset = GET(cd->hwm, 0);
				7635	recno = GET(codestart, offset);
				7636	if (recno != prev_recno)
				7637	{
				7638	groupptr = _pcre_find_bracket(codestart, utf8, recno);
				7639	prev_recno = recno;
				7640	}
				7641	if (groupptr == NULL) errorcode = ERR53;
				7642	else PUT(((uschar *)codestart), offset, (int)(groupptr - codestart));
				7643	}
				7644	}
				7645
				7646	/* If the workspace had to be expanded, free the new memory. */
				7647
				7648	if (cd->workspace_size > COMPILE_WORK_SIZE)
				7649	(pcre_free)((void *)cd->start_workspace);
				7650
				7651	/* Give an error if there's back reference to a non-existent capturing
				7652	subpattern. */
				7653
				7654	if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
				7655
				7656	/* If there were any lookbehind assertions that contained OP_RECURSE
				7657	(recursions or subroutine calls), a flag is set for them to be checked here,
				7658	because they may contain forward references. Actual recursions can't be fixed
				7659	length, but subroutine calls can. It is done like this so that those without
				7660	OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The
				7661	exceptional ones forgo this. We scan the pattern to check that they are fixed
				7662	length, and set their lengths. */
				7663
				7664	if (cd->check_lookbehind)
				7665	{
				7666	uschar cc = (uschar )codestart;
				7667
				7668	/* Loop, searching for OP_REVERSE items, and process those that do not have
				7669	their length set. (Actually, it will also re-process any that have a length
				7670	of zero, but that is a pathological case, and it does no harm.) When we find
				7671	one, we temporarily terminate the branch it is in while we scan it. */
				7672
				7673	for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1);
				7674	cc != NULL;
				7675	cc = (uschar *)_pcre_find_bracket(cc, utf8, -1))
				7676	{
				7677	if (GET(cc, 1) == 0)
				7678	{
				7679	int fixed_length;
				7680	uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
				7681	int end_op = *be;
				7682	*be = OP_END;
				7683	fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
				7684	cd);
				7685	*be = end_op;
				7686	DPRINTF(("fixed length = %d\n", fixed_length));
				7687	if (fixed_length < 0)
				7688	{
				7689	errorcode = (fixed_length == -2)? ERR36 :
				7690	(fixed_length == -4)? ERR70 : ERR25;
				7691	break;
				7692	}
				7693	PUT(cc, 1, fixed_length);
				7694	}
				7695	cc += 1 + LINK_SIZE;
				7696	}
				7697	}
				7698
				7699	/* Failed to compile, or error while post-processing */
				7700
				7701	if (errorcode != 0)
				7702	{
				7703	(pcre_free)(re);
				7704	PCRE_EARLY_ERROR_RETURN:
				7705	erroroffset = (int)(ptr - (const uschar )pattern);
				7706	PCRE_EARLY_ERROR_RETURN2:
				7707	*errorptr = find_error_text(errorcode);
				7708	if (errorcodeptr != NULL) *errorcodeptr = errorcode;
				7709	return NULL;
				7710	}
				7711
				7712	/* If the anchored option was not passed, set the flag if we can determine that
				7713	the pattern is anchored by virtue of ^ characters or \A or anything else (such
				7714	as starting with .* when DOTALL is set).
				7715
				7716	Otherwise, if we know what the first byte has to be, save it, because that
				7717	speeds up unanchored matches no end. If not, see if we can set the
				7718	PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
				7719	start with ^. and also when all branches start with .* for non-DOTALL matches.
				7720	*/
				7721
				7722	if ((re->options & PCRE_ANCHORED) == 0)
				7723	{
				7724	if (is_anchored(codestart, 0, cd->backref_map))
				7725	re->options \|= PCRE_ANCHORED;
				7726	else
				7727	{
				7728	if (firstbyte < 0)
				7729	firstbyte = find_firstassertedchar(codestart, FALSE);
				7730	if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
				7731	{
				7732	int ch = firstbyte & 255;
				7733	re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
				7734	cd->fcc[ch] == ch)? ch : firstbyte;
				7735	re->flags \|= PCRE_FIRSTSET;
				7736	}
				7737	else if (is_startline(codestart, 0, cd->backref_map))
				7738	re->flags \|= PCRE_STARTLINE;
				7739	}
				7740	}
				7741
				7742	/* For an anchored pattern, we use the "required byte" only if it follows a
				7743	variable length item in the regex. Remove the caseless flag for non-caseable
				7744	bytes. */
				7745
				7746	if (reqbyte >= 0 &&
				7747	((re->options & PCRE_ANCHORED) == 0 \|\| (reqbyte & REQ_VARY) != 0))
				7748	{
				7749	int ch = reqbyte & 255;
				7750	re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
				7751	cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
				7752	re->flags \|= PCRE_REQCHSET;
				7753	}
				7754
				7755	/* Print out the compiled data if debugging is enabled. This is never the
				7756	case when building a production library. */
				7757
				7758	#ifdef PCRE_DEBUG
				7759	printf("Length = %d top_bracket = %d top_backref = %d\n",
				7760	length, re->top_bracket, re->top_backref);
				7761
				7762	printf("Options=%08x\n", re->options);
				7763
				7764	if ((re->flags & PCRE_FIRSTSET) != 0)
				7765	{
				7766	int ch = re->first_byte & 255;
				7767	const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
				7768	"" : " (caseless)";
				7769	if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
				7770	else printf("First char = \\x%02x%s\n", ch, caseless);
				7771	}
				7772
				7773	if ((re->flags & PCRE_REQCHSET) != 0)
				7774	{
				7775	int ch = re->req_byte & 255;
				7776	const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
				7777	"" : " (caseless)";
				7778	if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
				7779	else printf("Req char = \\x%02x%s\n", ch, caseless);
				7780	}
				7781
				7782	pcre_printint(re, stdout, TRUE);
				7783
				7784	/* This check is done here in the debugging case so that the code that
				7785	was compiled can be seen. */
				7786
				7787	if (code - codestart > length)
				7788	{
				7789	(pcre_free)(re);
				7790	*errorptr = find_error_text(ERR23);
				7791	erroroffset = ptr - (uschar )pattern;
				7792	if (errorcodeptr != NULL) *errorcodeptr = ERR23;
				7793	return NULL;
				7794	}
				7795	#endif /* PCRE_DEBUG */
				7796
				7797	return (pcre *)re;
				7798	}
				7799
				7800	/* End of pcre_compile.c */