blob: 46498d590cf2bea42dbf2c8b8719ffa582ebf2da [file] [log] [blame]
Tristan Matthews04616462013-11-14 16:09:34 -05001/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40
41/* This module contains pcre_exec(), the externally visible function that does
42pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43possible. There are also some static supporting functions. */
44
45#ifdef HAVE_CONFIG_H
46#include "config.h"
47#endif
48
49#define NLBLOCK md /* Block containing newline information */
50#define PSSTART start_subject /* Field containing processed string start */
51#define PSEND end_subject /* Field containing processed string end */
52
53#include "pcre_internal.h"
54
55/* Undefine some potentially clashing cpp symbols */
56
57#undef min
58#undef max
59
60/* Values for setting in md->match_function_type to indicate two special types
61of call to match(). We do it this way to save on using another stack variable,
62as stack usage is to be discouraged. */
63
64#define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65#define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67/* Non-error returns from the match() function. Error returns are externally
68defined PCRE_ERROR_xxx codes, which are all negative. */
69
70#define MATCH_MATCH 1
71#define MATCH_NOMATCH 0
72
73/* Special internal returns from the match() function. Make them sufficiently
74negative to avoid the external error codes. */
75
76#define MATCH_ACCEPT (-999) /* Treated like MATCH_MATCH when MARK-type opcodes record md->mark */
77#define MATCH_COMMIT (-998) /* Returned by OP_COMMIT when the rest of the pattern fails */
78#define MATCH_KETRPOS (-997)
79#define MATCH_ONCE (-996) /* Seen by a capturing bracket when backing up through an atomic group */
80#define MATCH_PRUNE (-995) /* Returned by OP_PRUNE and OP_PRUNE_ARG on failure */
81#define MATCH_SKIP (-994) /* Returned by OP_SKIP; md->start_match_ptr holds the subject position */
82#define MATCH_SKIP_ARG (-993) /* Returned by OP_SKIP_ARG; md->start_match_ptr points to the skip name */
83#define MATCH_THEN (-992) /* Returned by OP_THEN and OP_THEN_ARG; md->start_match_ptr holds the opcode address */
84
85/* Maximum number of ints of offset to save on the stack for recursive calls.
86If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87because the offset vector is always a multiple of 3 long. */
88
89#define REC_STACK_SAVE_MAX 30
90
91/* Min and max values for the common repeats; for the maxima, 0 => infinity */
92
/* Lower bound for each of the six common repeat forms. */
static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

/* Corresponding upper bounds; a value of 0 means "no upper limit". */
static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
95
96
97
98#ifdef PCRE_DEBUG
99/*************************************************
100* Debugging function to print chars *
101*************************************************/
102
103/* Print a sequence of chars in printable format, stopping at the end of the
104subject if requested.
105
106Arguments:
107 p points to characters
108 length number to print
109 is_subject TRUE if printing from within md->start_subject
110 md pointer to matching data block, if is_subject is TRUE
111
112Returns: nothing
113*/
114
115static void
116pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
117{
118unsigned int c;
119if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120while (length-- > 0)
121 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122}
123#endif
124
125
126
127/*************************************************
128* Match a back-reference *
129*************************************************/
130
131/* Normally, if a back reference hasn't been set, the length that is passed is
132negative, so the match always fails. However, in JavaScript compatibility mode,
133the length passed is zero. Note that in caseless UTF-8 mode, the number of
134subject bytes matched may be different to the number of reference bytes.
135
136Arguments:
137 offset index into the offset vector
138 eptr pointer into the subject
139 length length of reference to be matched (number of bytes)
140 md points to match data block
141 caseless TRUE if caseless
142
143Returns: < 0 if not matched, otherwise the number of subject bytes matched
144*/
145
146static int
147match_ref(int offset, register USPTR eptr, int length, match_data *md,
148 BOOL caseless)
149{
150USPTR eptr_start = eptr;
151register USPTR p = md->start_subject + md->offset_vector[offset];
152
153#ifdef PCRE_DEBUG
154if (eptr >= md->end_subject)
155 printf("matching subject <null>");
156else
157 {
158 printf("matching subject ");
159 pchars(eptr, length, TRUE, md);
160 }
161printf(" against backref ");
162pchars(p, length, FALSE, md);
163printf("\n");
164#endif
165
166/* Always fail if reference not set (and not JavaScript compatible). */
167
168if (length < 0) return -1;
169
170/* Separate the caseless case for speed. In UTF-8 mode we can only do this
171properly if Unicode properties are supported. Otherwise, we can check only
172ASCII characters. */
173
174if (caseless)
175 {
176#ifdef SUPPORT_UTF8
177#ifdef SUPPORT_UCP
178 if (md->utf8)
179 {
180 /* Match characters up to the end of the reference. NOTE: the number of
181 bytes matched may differ, because there are some characters whose upper and
182 lower case versions code as different numbers of bytes. For example, U+023A
183 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
184 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
185 the latter. It is important, therefore, to check the length along the
186 reference, not along the subject (earlier code did this wrong). */
187
188 USPTR endptr = p + length;
189 while (p < endptr)
190 {
191 int c, d;
192 if (eptr >= md->end_subject) return -1;
193 GETCHARINC(c, eptr);
194 GETCHARINC(d, p);
195 if (c != d && c != UCD_OTHERCASE(d)) return -1;
196 }
197 }
198 else
199#endif
200#endif
201
202 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
203 is no UCP support. */
204 {
205 if (eptr + length > md->end_subject) return -1;
206 while (length-- > 0)
207 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
208 }
209 }
210
211/* In the caseful case, we can just compare the bytes, whether or not we
212are in UTF-8 mode. */
213
214else
215 {
216 if (eptr + length > md->end_subject) return -1;
217 while (length-- > 0) if (*p++ != *eptr++) return -1;
218 }
219
220return (int)(eptr - eptr_start);
221}
222
223
224
225/***************************************************************************
226****************************************************************************
227 RECURSION IN THE match() FUNCTION
228
229The match() function is highly recursive, though not every recursive call
230increases the recursive depth. Nevertheless, some regular expressions can cause
231it to recurse to a great depth. I was writing for Unix, so I just let it call
232itself recursively. This uses the stack for saving everything that has to be
233saved for a recursive call. On Unix, the stack can be large, and this works
234fine.
235
236It turns out that on some non-Unix-like systems there are problems with
237programs that use a lot of stack. (This despite the fact that every last chip
238has oodles of memory these days, and techniques for extending the stack have
239been known for decades.) So....
240
241There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
242calls by keeping local variables that need to be preserved in blocks of memory
243obtained from malloc() instead of on the stack. Macros are used to
244achieve this so that the actual code doesn't look very different to what it
245always used to.
246
247The original heap-recursive code used longjmp(). However, it seems that this
248can be very slow on some operating systems. Following a suggestion from Stan
249Switzer, the use of longjmp() has been abolished, at the cost of having to
250provide a unique number for each call to RMATCH. There is no way of generating
251a sequence of numbers at compile time in C. I have given them names, to make
252them stand out more clearly.
253
254Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
255FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
256tests. Furthermore, not using longjmp() means that local dynamic variables
257don't have indeterminate values; this has meant that the frame size can be
258reduced because the result can be "passed back" by straight setting of the
259variable instead of being passed in the frame.
260****************************************************************************
261***************************************************************************/
262
263/* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
264below must be updated in sync. */
265
/* One identifier per RMATCH call site, numbered consecutively from 1. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,
  RM9,     RM10, RM11, RM12, RM13, RM14, RM15, RM16,
  RM17,    RM18, RM19, RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30, RM31, RM32,
  RM33,    RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41,    RM42, RM43, RM44, RM45, RM46, RM47, RM48,
  RM49,    RM50, RM51, RM52, RM53, RM54, RM55, RM56,
  RM57,    RM58, RM59, RM60, RM61, RM62, RM63, RM64,
  RM65,    RM66
};
273
274/* These versions of the macros use the stack, as normal. There are debugging
275versions and production versions. Note that the "rw" argument of RMATCH isn't
276actually used in this definition. */
277
278#ifndef NO_RECURSE
279#define REGISTER register
280
#ifndef PCRE_DEBUG

/* Production flavour: RMATCH is a plain recursive call of match() whose result
is left in rrc, and RRETURN is a plain return. */

#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra

#else

/* Debugging flavour: the same call and return, with a trace of the source
lines involved written to stdout. */

#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }

#endif
298
299#else
300
301
302/* These versions of the macros manage a private stack on the heap. Note that
303the "rd" argument of RMATCH isn't actually used in this definition. It's the md
304argument of match(), which never changes. */
305
306#define REGISTER
307
/* Simulated recursive call. A fresh frame is obtained and chained to the
current one, the "arguments" are stored in it, the resumption point rw is noted
in the current frame, and control goes to HEAP_RECURSE. The label L_<rw> marks
the place where execution continues once the callee has returned. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *rm_frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
  if (rm_frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  rm_frame->Xprevframe = frame;\
  rm_frame->Xrdepth = frame->Xrdepth + 1;\
  rm_frame->Xeptr = ra;\
  rm_frame->Xecode = rb;\
  rm_frame->Xmstart = mstart;\
  rm_frame->Xoffset_top = rc;\
  rm_frame->Xeptrb = re;\
  frame->Xwhere = rw;\
  frame = rm_frame;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
326
/* Simulated return. The current frame is released and its parent becomes
current again. If there is a parent, the result is handed over in rrc and
control goes to HEAP_RETURN; otherwise this really is the outermost level and
the function returns.

The value is evaluated once, before the frame is switched, so that an
expression referring to frame-held variables is taken from the frame that is
returning. The macro also copes with frame being NULL, which is the situation
when it is used to report that the very first frame could not be allocated. */

#define RRETURN(ra)\
  {\
  const int rreturn_value = (ra);\
  heapframe *oldframe = frame;\
  if (oldframe != NULL)\
    {\
    frame = oldframe->Xprevframe;\
    (pcre_stack_free)(oldframe);\
    }\
  if (frame != NULL)\
    {\
    rrc = rreturn_value;\
    goto HEAP_RETURN;\
    }\
  return rreturn_value;\
  }
339
340
341/* Structure for remembering the local variables in a private frame */
342
343typedef struct heapframe {
344 struct heapframe *Xprevframe;
345
346 /* Function arguments that may change */
347
348 USPTR Xeptr;
349 const uschar *Xecode;
350 USPTR Xmstart;
351 int Xoffset_top;
352 eptrblock *Xeptrb;
353 unsigned int Xrdepth;
354
355 /* Function local variables */
356
357 USPTR Xcallpat;
358#ifdef SUPPORT_UTF8
359 USPTR Xcharptr;
360#endif
361 USPTR Xdata;
362 USPTR Xnext;
363 USPTR Xpp;
364 USPTR Xprev;
365 USPTR Xsaved_eptr;
366
367 recursion_info Xnew_recursive;
368
369 BOOL Xcur_is_word;
370 BOOL Xcondition;
371 BOOL Xprev_is_word;
372
373#ifdef SUPPORT_UCP
374 int Xprop_type;
375 int Xprop_value;
376 int Xprop_fail_result;
377 int Xoclength;
378 uschar Xocchars[8];
379#endif
380
381 int Xcodelink;
382 int Xctype;
383 unsigned int Xfc;
384 int Xfi;
385 int Xlength;
386 int Xmax;
387 int Xmin;
388 int Xnumber;
389 int Xoffset;
390 int Xop;
391 int Xsave_capture_last;
392 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
393 int Xstacksave[REC_STACK_SAVE_MAX];
394
395 eptrblock Xnewptrb;
396
397 /* Where to jump back to */
398
399 int Xwhere;
400
401} heapframe;
402
403#endif
404
405
406/***************************************************************************
407***************************************************************************/
408
409
410
411/*************************************************
412* Match from current position *
413*************************************************/
414
415/* This function is called recursively in many circumstances. Whenever it
416returns a negative (error) response, the outer incarnation must also return the
417same response. */
418
419/* These macros pack up tests that are used for partial matching, and which
420appear several times in the code. We set the "hit end" flag if the pointer is
421at the end of the subject and also past the start of the subject (i.e.
422something has been matched). For hard partial matching, we then return
423immediately. The second one is used when we already know we are past the end of
424the subject. */
425
/* Partial-match test for a point where eptr may or may not be at the end of
the subject: note that the end was hit after at least one character was
inspected, and give up at once for hard partial matching. */

#define CHECK_PARTIAL()\
  if (eptr >= md->end_subject && \
      eptr > md->start_used_ptr && \
      md->partial != 0) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
433
/* As CHECK_PARTIAL, but without the end-of-subject comparison, for places
where that has already been established. */

#define SCHECK_PARTIAL()\
  if (eptr > md->start_used_ptr && md->partial != 0) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
440
441
442/* Performance note: It might be tempting to extract commonly used fields from
443the md structure (e.g. utf8, end_subject) into individual variables to improve
444performance. Tests using gcc on a SPARC disproved this; in the first case, it
445made performance worse.
446
447Arguments:
448 eptr pointer to current character in subject
449 ecode pointer to current position in compiled code
450 mstart pointer to the current match start position (can be modified
451 by encountering \K)
452 offset_top current top pointer
453 md pointer to "static" info for the match
454 eptrb pointer to chain of blocks containing eptr at start of
455 brackets - for testing for empty matches
456 rdepth the recursion depth
457
458Returns: MATCH_MATCH if matched ) these values are >= 0
459 MATCH_NOMATCH if failed to match )
460 a negative MATCH_xxx value for PRUNE, SKIP, etc
461 a negative PCRE_ERROR_xxx value if aborted by an error condition
462 (e.g. stopped by repeated call or recursion limit)
463*/
464
465static int
466match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
467 int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
468{
469/* These variables do not need to be preserved over recursion in this function,
470so they can be ordinary variables in all cases. Mark some of them with
471"register" because they are used a lot in loops. */
472
473register int rrc; /* Returns from recursive calls */
474register int i; /* Used for loops not involving calls to RMATCH() */
475register unsigned int c; /* Character values not kept over RMATCH() calls */
476register BOOL utf8; /* Local copy of UTF-8 flag for speed */
477
478BOOL minimize, possessive; /* Quantifier options */
479BOOL caseless;
480int condcode;
481
482/* When recursion is not being used, all "local" variables that have to be
483preserved over calls to RMATCH() are part of a "frame" which is obtained from
484heap storage. Set up the top-level frame here; others are obtained from the
485heap whenever RMATCH() does a "recursion". See the macro definitions above. */
486
487#ifdef NO_RECURSE
488heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
489if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
490frame->Xprevframe = NULL; /* Marks the top level */
491
492/* Copy in the original argument variables */
493
494frame->Xeptr = eptr;
495frame->Xecode = ecode;
496frame->Xmstart = mstart;
497frame->Xoffset_top = offset_top;
498frame->Xeptrb = eptrb;
499frame->Xrdepth = rdepth;
500
501/* This is where control jumps back to to effect "recursion" */
502
503HEAP_RECURSE:
504
505/* Macros make the argument variables come from the current frame */
506
507#define eptr frame->Xeptr
508#define ecode frame->Xecode
509#define mstart frame->Xmstart
510#define offset_top frame->Xoffset_top
511#define eptrb frame->Xeptrb
512#define rdepth frame->Xrdepth
513
514/* Ditto for the local variables */
515
516#ifdef SUPPORT_UTF8
517#define charptr frame->Xcharptr
518#endif
519#define callpat frame->Xcallpat
520#define codelink frame->Xcodelink
521#define data frame->Xdata
522#define next frame->Xnext
523#define pp frame->Xpp
524#define prev frame->Xprev
525#define saved_eptr frame->Xsaved_eptr
526
527#define new_recursive frame->Xnew_recursive
528
529#define cur_is_word frame->Xcur_is_word
530#define condition frame->Xcondition
531#define prev_is_word frame->Xprev_is_word
532
533#ifdef SUPPORT_UCP
534#define prop_type frame->Xprop_type
535#define prop_value frame->Xprop_value
536#define prop_fail_result frame->Xprop_fail_result
537#define oclength frame->Xoclength
538#define occhars frame->Xocchars
539#endif
540
541#define ctype frame->Xctype
542#define fc frame->Xfc
543#define fi frame->Xfi
544#define length frame->Xlength
545#define max frame->Xmax
546#define min frame->Xmin
547#define number frame->Xnumber
548#define offset frame->Xoffset
549#define op frame->Xop
550#define save_capture_last frame->Xsave_capture_last
551#define save_offset1 frame->Xsave_offset1
552#define save_offset2 frame->Xsave_offset2
553#define save_offset3 frame->Xsave_offset3
554#define stacksave frame->Xstacksave
555
556#define newptrb frame->Xnewptrb
557
558/* When recursion is being used, local variables are allocated on the stack and
559get preserved during recursion in the normal way. In this environment, fi and
560i, and fc and c, can be the same variables. */
561
562#else /* NO_RECURSE not defined */
563#define fi i
564#define fc c
565
566/* Many of the following variables are used only in small blocks of the code.
567My normal style of coding would have declared them within each of those blocks.
568However, in order to accommodate the version of this code that uses an external
569"stack" implemented on the heap, it is easier to declare them all here, so the
570declarations can be cut out in a block. The only declarations within blocks
571below are for variables that do not have to be preserved over a recursive call
572to RMATCH(). */
573
574#ifdef SUPPORT_UTF8
575const uschar *charptr;
576#endif
577const uschar *callpat;
578const uschar *data;
579const uschar *next;
580USPTR pp;
581const uschar *prev;
582USPTR saved_eptr;
583
584recursion_info new_recursive;
585
586BOOL cur_is_word;
587BOOL condition;
588BOOL prev_is_word;
589
590#ifdef SUPPORT_UCP
591int prop_type;
592int prop_value;
593int prop_fail_result;
594int oclength;
595uschar occhars[8];
596#endif
597
598int codelink;
599int ctype;
600int length;
601int max;
602int min;
603int number;
604int offset;
605int op;
606int save_capture_last;
607int save_offset1, save_offset2, save_offset3;
608int stacksave[REC_STACK_SAVE_MAX];
609
610eptrblock newptrb;
611#endif /* NO_RECURSE */
612
613/* To save space on the stack and in the heap frame, I have doubled up on some
614of the local variables that are used only in localised parts of the code, but
615still need to be preserved over recursive calls of match(). These macros define
616the alternative names that are used. */
617
618#define allow_zero cur_is_word
619#define cbegroup condition
620#define code_offset codelink
621#define condassert condition
622#define matched_once prev_is_word
623
624/* These statements are here to stop the compiler complaining about uninitialized
625variables. */
626
627#ifdef SUPPORT_UCP
628prop_value = 0;
629prop_fail_result = 0;
630#endif
631
632
633/* This label is used for tail recursion, which is used in a few cases even
634when NO_RECURSE is not defined, in order to reduce the amount of stack that is
635used. Thanks to Ian Taylor for noticing this possibility and sending the
636original patch. */
637
638TAIL_RECURSE:
639
640/* OK, now we can get on with the real code of the function. Recursive calls
641are specified by the macro RMATCH and RRETURN is used to return. When
642NO_RECURSE is *not* defined, these just turn into a recursive call to match()
643and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
644defined). However, RMATCH isn't like a function call because it's quite a
645complicated macro. It has to be used in one particular way. This shouldn't,
646however, impact performance when true recursion is being used. */
647
648#ifdef SUPPORT_UTF8
649utf8 = md->utf8; /* Local copy of the flag */
650#else
651utf8 = FALSE;
652#endif
653
654/* First check that we haven't called match() too many times, or that we
655haven't exceeded the recursive call limit. */
656
657if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
658if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
659
660/* At the start of a group with an unlimited repeat that may match an empty
661string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
662done this way to save having to use another function argument, which would take
663up space on the stack. See also MATCH_CONDASSERT below.
664
665When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
666such remembered pointers, to be checked when we hit the closing ket, in order
667to break infinite loops that match no characters. When match() is called in
668other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
669NOT be used with tail recursion, because the memory block that is used is on
670the stack, so a new one may be required for each match(). */
671
672if (md->match_function_type == MATCH_CBEGROUP)
673 {
674 newptrb.epb_saved_eptr = eptr;
675 newptrb.epb_prev = eptrb;
676 eptrb = &newptrb;
677 md->match_function_type = 0;
678 }
679
680/* Now start processing the opcodes. */
681
682for (;;)
683 {
684 minimize = possessive = FALSE;
685 op = *ecode;
686
687 switch(op)
688 {
689 case OP_MARK:
690 md->nomatch_mark = ecode + 2;
691 md->mark = NULL; /* In case previously set by assertion */
692 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
693 eptrb, RM55);
694 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
695 md->mark == NULL) md->mark = ecode + 2;
696
697 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
698 argument, and we must check whether that argument matches this MARK's
699 argument. It is passed back in md->start_match_ptr (an overloading of that
700 variable). If it does match, we reset that variable to the current subject
701 position and return MATCH_SKIP. Otherwise, pass back the return code
702 unaltered. */
703
704 else if (rrc == MATCH_SKIP_ARG &&
705 strcmp((char *)(ecode + 2), (char *)(md->start_match_ptr)) == 0)
706 {
707 md->start_match_ptr = eptr;
708 RRETURN(MATCH_SKIP);
709 }
710 RRETURN(rrc);
711
712 case OP_FAIL:
713 RRETURN(MATCH_NOMATCH);
714
715 /* COMMIT overrides PRUNE, SKIP, and THEN */
716
717 case OP_COMMIT:
718 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
719 eptrb, RM52);
720 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
721 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
722 rrc != MATCH_THEN)
723 RRETURN(rrc);
724 RRETURN(MATCH_COMMIT);
725
726 /* PRUNE overrides THEN */
727
728 case OP_PRUNE:
729 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
730 eptrb, RM51);
731 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
732 RRETURN(MATCH_PRUNE);
733
734 case OP_PRUNE_ARG:
735 md->nomatch_mark = ecode + 2;
736 md->mark = NULL; /* In case previously set by assertion */
737 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
738 eptrb, RM56);
739 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
740 md->mark == NULL) md->mark = ecode + 2;
741 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
742 RRETURN(MATCH_PRUNE);
743
744 /* SKIP overrides PRUNE and THEN */
745
746 case OP_SKIP:
747 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
748 eptrb, RM53);
749 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
750 RRETURN(rrc);
751 md->start_match_ptr = eptr; /* Pass back current position */
752 RRETURN(MATCH_SKIP);
753
754 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
755 nomatch_mark. There is a flag that disables this opcode when re-matching a
756 pattern that ended with a SKIP for which there was not a matching MARK. */
757
758 case OP_SKIP_ARG:
759 if (md->ignore_skip_arg)
760 {
761 ecode += _pcre_OP_lengths[*ecode] + ecode[1];
762 break;
763 }
764 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
765 eptrb, RM57);
766 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
767 RRETURN(rrc);
768
769 /* Pass back the current skip name by overloading md->start_match_ptr and
770 returning the special MATCH_SKIP_ARG return code. This will either be
771 caught by a matching MARK, or get to the top, where it causes a rematch
772 with the md->ignore_skip_arg flag set. */
773
774 md->start_match_ptr = ecode + 2;
775 RRETURN(MATCH_SKIP_ARG);
776
777 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
778 the branch in which it occurs can be determined. Overload the start of
779 match pointer to do this. */
780
781 case OP_THEN:
782 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
783 eptrb, RM54);
784 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
785 md->start_match_ptr = ecode;
786 RRETURN(MATCH_THEN);
787
788 case OP_THEN_ARG:
789 md->nomatch_mark = ecode + 2;
790 md->mark = NULL; /* In case previously set by assertion */
791 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top,
792 md, eptrb, RM58);
793 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
794 md->mark == NULL) md->mark = ecode + 2;
795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
796 md->start_match_ptr = ecode;
797 RRETURN(MATCH_THEN);
798
799 /* Handle an atomic group that does not contain any capturing parentheses.
800 This can be handled like an assertion. Prior to 8.13, all atomic groups
801 were handled this way. In 8.13, the code was changed as below for ONCE, so
802 that backups pass through the group and thereby reset captured values.
803 However, this uses a lot more stack, so in 8.20, atomic groups that do not
804 contain any captures generate OP_ONCE_NC, which can be handled in the old,
805 less stack intensive way.
806
807 Check the alternative branches in turn - the matching won't pass the KET
808 for this kind of subpattern. If any one branch matches, we carry on as at
809 the end of a normal bracket, leaving the subject pointer, but resetting
810 the start-of-match value in case it was changed by \K. */
811
812 case OP_ONCE_NC:
813 prev = ecode;
814 saved_eptr = eptr;
815 do
816 {
817 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
818 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
819 {
820 mstart = md->start_match_ptr;
821 break;
822 }
823 if (rrc == MATCH_THEN)
824 {
825 next = ecode + GET(ecode,1);
826 if (md->start_match_ptr < next &&
827 (*ecode == OP_ALT || *next == OP_ALT))
828 rrc = MATCH_NOMATCH;
829 }
830
831 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
832 ecode += GET(ecode,1);
833 }
834 while (*ecode == OP_ALT);
835
836 /* If hit the end of the group (which could be repeated), fail */
837
838 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
839
840 /* Continue as from after the group, updating the offsets high water
841 mark, since extracts may have been taken. */
842
843 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
844
845 offset_top = md->end_offset_top;
846 eptr = md->end_match_ptr;
847
848 /* For a non-repeating ket, just continue at this level. This also
849 happens for a repeating ket if no characters were matched in the group.
850 This is the forcible breaking of infinite loops as implemented in Perl
851 5.005. */
852
853 if (*ecode == OP_KET || eptr == saved_eptr)
854 {
855 ecode += 1+LINK_SIZE;
856 break;
857 }
858
859 /* The repeating kets try the rest of the pattern or restart from the
860 preceding bracket, in the appropriate order. The second "call" of match()
861 uses tail recursion, to avoid using another stack frame. */
862
863 if (*ecode == OP_KETRMIN)
864 {
865 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
866 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
867 ecode = prev;
868 goto TAIL_RECURSE;
869 }
870 else /* OP_KETRMAX */
871 {
872 md->match_function_type = MATCH_CBEGROUP;
873 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
874 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
875 ecode += 1 + LINK_SIZE;
876 goto TAIL_RECURSE;
877 }
878 /* Control never gets here */
879
880 /* Handle a capturing bracket, other than those that are possessive with an
881 unlimited repeat. If there is space in the offset vector, save the current
882 subject position in the working slot at the top of the vector. We mustn't
883 change the current values of the data slot, because they may be set from a
884 previous iteration of this group, and be referred to by a reference inside
885 the group. A failure to match might occur after the group has succeeded,
886 if something later on doesn't match. For this reason, we need to restore
887 the working value and also the values of the final offsets, in case they
888 were set by a previous iteration of the same bracket.
889
890 If there isn't enough space in the offset vector, treat this as if it were
891 a non-capturing bracket. Don't worry about setting the flag for the error
892 case here; that is handled in the code for KET. */
893
894 case OP_CBRA:
895 case OP_SCBRA:
896 number = GET2(ecode, 1+LINK_SIZE);
897 offset = number << 1;
898
899#ifdef PCRE_DEBUG
900 printf("start bracket %d\n", number);
901 printf("subject=");
902 pchars(eptr, 16, TRUE, md);
903 printf("\n");
904#endif
905
906 if (offset < md->offset_max)
907 {
908 save_offset1 = md->offset_vector[offset];
909 save_offset2 = md->offset_vector[offset+1];
910 save_offset3 = md->offset_vector[md->offset_end - number];
911 save_capture_last = md->capture_last;
912
913 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
914 md->offset_vector[md->offset_end - number] =
915 (int)(eptr - md->start_subject);
916
917 for (;;)
918 {
919 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
920 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
921 eptrb, RM1);
922 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
923
924 /* If we backed up to a THEN, check whether it is within the current
925 branch by comparing the address of the THEN that is passed back with
926 the end of the branch. If it is within the current branch, and the
927 branch is one of two or more alternatives (it either starts or ends
928 with OP_ALT), we have reached the limit of THEN's action, so convert
929 the return code to NOMATCH, which will cause normal backtracking to
930 happen from now on. Otherwise, THEN is passed back to an outer
931 alternative. This implements Perl's treatment of parenthesized groups,
932 where a group not containing | does not affect the current alternative,
933 that is, (X) is NOT the same as (X|(*F)). */
934
935 if (rrc == MATCH_THEN)
936 {
937 next = ecode + GET(ecode,1);
938 if (md->start_match_ptr < next &&
939 (*ecode == OP_ALT || *next == OP_ALT))
940 rrc = MATCH_NOMATCH;
941 }
942
943 /* Anything other than NOMATCH is passed back. */
944
945 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
946 md->capture_last = save_capture_last;
947 ecode += GET(ecode, 1);
948 if (*ecode != OP_ALT) break;
949 }
950
951 DPRINTF(("bracket %d failed\n", number));
952 md->offset_vector[offset] = save_offset1;
953 md->offset_vector[offset+1] = save_offset2;
954 md->offset_vector[md->offset_end - number] = save_offset3;
955
956 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
957
958 RRETURN(rrc);
959 }
960
961 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
962 as a non-capturing bracket. */
963
964 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
965 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
966
967 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
968
969 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
970 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
971
972 /* Non-capturing or atomic group, except for possessive with unlimited
973 repeat and ONCE group with no captures. Loop for all the alternatives.
974
975 When we get to the final alternative within the brackets, we used to return
976 the result of a recursive call to match() whatever happened so it was
977 possible to reduce stack usage by turning this into a tail recursion,
978 except in the case of a possibly empty group. However, now that there is
979 the possibility of (*THEN) occurring in the final alternative, this
980 optimization is no longer always possible.
981
982 We can optimize if we know there are no (*THEN)s in the pattern; at present
983 this is the best that can be done.
984
985 MATCH_ONCE is returned when the end of an atomic group is successfully
986 reached, but subsequent matching fails. It passes back up the tree (causing
987 captured values to be reset) until the original atomic group level is
988 reached. This is tested by comparing md->once_target with the start of the
989 group. At this point, the return is converted into MATCH_NOMATCH so that
990 previous backup points can be taken. */
991
992 case OP_ONCE:
993 case OP_BRA:
994 case OP_SBRA:
995 DPRINTF(("start non-capturing bracket\n"));
996
997 for (;;)
998 {
999 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1000
1001 /* If this is not a possibly empty group, and there are no (*THEN)s in
1002 the pattern, and this is the final alternative, optimize as described
1003 above. */
1004
1005 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1006 {
1007 ecode += _pcre_OP_lengths[*ecode];
1008 goto TAIL_RECURSE;
1009 }
1010
1011 /* In all other cases, we have to make another call to match(). */
1012
1013 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
1014 RM2);
1015
1016 /* See comment in the code for capturing groups above about handling
1017 THEN. */
1018
1019 if (rrc == MATCH_THEN)
1020 {
1021 next = ecode + GET(ecode,1);
1022 if (md->start_match_ptr < next &&
1023 (*ecode == OP_ALT || *next == OP_ALT))
1024 rrc = MATCH_NOMATCH;
1025 }
1026
1027 if (rrc != MATCH_NOMATCH)
1028 {
1029 if (rrc == MATCH_ONCE)
1030 {
1031 const uschar *scode = ecode;
1032 if (*scode != OP_ONCE) /* If not at start, find it */
1033 {
1034 while (*scode == OP_ALT) scode += GET(scode, 1);
1035 scode -= GET(scode, 1);
1036 }
1037 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1038 }
1039 RRETURN(rrc);
1040 }
1041 ecode += GET(ecode, 1);
1042 if (*ecode != OP_ALT) break;
1043 }
1044
1045 RRETURN(MATCH_NOMATCH);
1046
1047 /* Handle possessive capturing brackets with an unlimited repeat. We come
1048 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1049 handled similarly to the normal case above. However, the matching is
1050 different. The end of these brackets will always be OP_KETRPOS, which
1051 returns MATCH_KETRPOS without going further in the pattern. By this means
1052 we can handle the group by iteration rather than recursion, thereby
1053 reducing the amount of stack needed. */
1054
1055 case OP_CBRAPOS:
1056 case OP_SCBRAPOS:
1057 allow_zero = FALSE;
1058
1059 POSSESSIVE_CAPTURE:
1060 number = GET2(ecode, 1+LINK_SIZE);
1061 offset = number << 1;
1062
1063#ifdef PCRE_DEBUG
1064 printf("start possessive bracket %d\n", number);
1065 printf("subject=");
1066 pchars(eptr, 16, TRUE, md);
1067 printf("\n");
1068#endif
1069
1070 if (offset < md->offset_max)
1071 {
1072 matched_once = FALSE;
1073 code_offset = (int)(ecode - md->start_code);
1074
1075 save_offset1 = md->offset_vector[offset];
1076 save_offset2 = md->offset_vector[offset+1];
1077 save_offset3 = md->offset_vector[md->offset_end - number];
1078 save_capture_last = md->capture_last;
1079
1080 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1081
1082 /* Each time round the loop, save the current subject position for use
1083 when the group matches. For MATCH_MATCH, the group has matched, so we
1084 restart it with a new subject starting position, remembering that we had
1085 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1086 usual. If we haven't matched any alternatives in any iteration, check to
1087 see if a previous iteration matched. If so, the group has matched;
1088 continue from afterwards. Otherwise it has failed; restore the previous
1089 capture values before returning NOMATCH. */
1090
1091 for (;;)
1092 {
1093 md->offset_vector[md->offset_end - number] =
1094 (int)(eptr - md->start_subject);
1095 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1096 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1097 eptrb, RM63);
1098 if (rrc == MATCH_KETRPOS)
1099 {
1100 offset_top = md->end_offset_top;
1101 eptr = md->end_match_ptr;
1102 ecode = md->start_code + code_offset;
1103 save_capture_last = md->capture_last;
1104 matched_once = TRUE;
1105 continue;
1106 }
1107
1108 /* See comment in the code for capturing groups above about handling
1109 THEN. */
1110
1111 if (rrc == MATCH_THEN)
1112 {
1113 next = ecode + GET(ecode,1);
1114 if (md->start_match_ptr < next &&
1115 (*ecode == OP_ALT || *next == OP_ALT))
1116 rrc = MATCH_NOMATCH;
1117 }
1118
1119 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1120 md->capture_last = save_capture_last;
1121 ecode += GET(ecode, 1);
1122 if (*ecode != OP_ALT) break;
1123 }
1124
1125 if (!matched_once)
1126 {
1127 md->offset_vector[offset] = save_offset1;
1128 md->offset_vector[offset+1] = save_offset2;
1129 md->offset_vector[md->offset_end - number] = save_offset3;
1130 }
1131
1132 if (allow_zero || matched_once)
1133 {
1134 ecode += 1 + LINK_SIZE;
1135 break;
1136 }
1137
1138 RRETURN(MATCH_NOMATCH);
1139 }
1140
1141 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1142 as a non-capturing bracket. */
1143
1144 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1145 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1146
1147 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1148
1149 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1150 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1151
1152 /* Non-capturing possessive bracket with unlimited repeat. We come here
1153 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1154 without the capturing complication. It is written out separately for speed
1155 and cleanliness. */
1156
1157 case OP_BRAPOS:
1158 case OP_SBRAPOS:
1159 allow_zero = FALSE;
1160
1161 POSSESSIVE_NON_CAPTURE:
1162 matched_once = FALSE;
1163 code_offset = (int)(ecode - md->start_code);
1164
1165 for (;;)
1166 {
1167 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1168 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1169 eptrb, RM48);
1170 if (rrc == MATCH_KETRPOS)
1171 {
1172 offset_top = md->end_offset_top;
1173 eptr = md->end_match_ptr;
1174 ecode = md->start_code + code_offset;
1175 matched_once = TRUE;
1176 continue;
1177 }
1178
1179 /* See comment in the code for capturing groups above about handling
1180 THEN. */
1181
1182 if (rrc == MATCH_THEN)
1183 {
1184 next = ecode + GET(ecode,1);
1185 if (md->start_match_ptr < next &&
1186 (*ecode == OP_ALT || *next == OP_ALT))
1187 rrc = MATCH_NOMATCH;
1188 }
1189
1190 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1191 ecode += GET(ecode, 1);
1192 if (*ecode != OP_ALT) break;
1193 }
1194
1195 if (matched_once || allow_zero)
1196 {
1197 ecode += 1 + LINK_SIZE;
1198 break;
1199 }
1200 RRETURN(MATCH_NOMATCH);
1201
1202 /* Control never reaches here. */
1203
1204 /* Conditional group: compilation checked that there are no more than
1205 two branches. If the condition is false, skipping the first branch takes us
1206 past the end if there is only one branch, but that's OK because that is
1207 exactly what going to the ket would do. */
1208
1209 case OP_COND:
1210 case OP_SCOND:
1211 codelink = GET(ecode, 1);
1212
1213 /* Because of the way auto-callout works during compile, a callout item is
1214 inserted between OP_COND and an assertion condition. */
1215
1216 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1217 {
1218 if (pcre_callout != NULL)
1219 {
1220 pcre_callout_block cb;
1221 cb.version = 2; /* Version 2 of the callout block */
1222 cb.callout_number = ecode[LINK_SIZE+2];
1223 cb.offset_vector = md->offset_vector;
1224 cb.subject = (PCRE_SPTR)md->start_subject;
1225 cb.subject_length = (int)(md->end_subject - md->start_subject);
1226 cb.start_match = (int)(mstart - md->start_subject);
1227 cb.current_position = (int)(eptr - md->start_subject);
1228 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1229 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1230 cb.capture_top = offset_top/2;
1231 cb.capture_last = md->capture_last;
1232 cb.callout_data = md->callout_data;
1233 cb.mark = md->nomatch_mark;
1234 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1235 if (rrc < 0) RRETURN(rrc);
1236 }
1237 ecode += _pcre_OP_lengths[OP_CALLOUT];
1238 }
1239
1240 condcode = ecode[LINK_SIZE+1];
1241
1242 /* Now see what the actual condition is */
1243
1244 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1245 {
1246 if (md->recursive == NULL) /* Not recursing => FALSE */
1247 {
1248 condition = FALSE;
1249 ecode += GET(ecode, 1);
1250 }
1251 else
1252 {
1253 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1254 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1255
1256 /* If the test is for recursion into a specific subpattern, and it is
1257 false, but the test was set up by name, scan the table to see if the
1258 name refers to any other numbers, and test them. The condition is true
1259 if any one is set. */
1260
1261 if (!condition && condcode == OP_NRREF)
1262 {
1263 uschar *slotA = md->name_table;
1264 for (i = 0; i < md->name_count; i++)
1265 {
1266 if (GET2(slotA, 0) == recno) break;
1267 slotA += md->name_entry_size;
1268 }
1269
1270 /* Found a name for the number - there can be only one; duplicate
1271 names for different numbers are allowed, but not vice versa. First
1272 scan down for duplicates. */
1273
1274 if (i < md->name_count)
1275 {
1276 uschar *slotB = slotA;
1277 while (slotB > md->name_table)
1278 {
1279 slotB -= md->name_entry_size;
1280 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1281 {
1282 condition = GET2(slotB, 0) == md->recursive->group_num;
1283 if (condition) break;
1284 }
1285 else break;
1286 }
1287
1288 /* Scan up for duplicates */
1289
1290 if (!condition)
1291 {
1292 slotB = slotA;
1293 for (i++; i < md->name_count; i++)
1294 {
1295 slotB += md->name_entry_size;
1296 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1297 {
1298 condition = GET2(slotB, 0) == md->recursive->group_num;
1299 if (condition) break;
1300 }
1301 else break;
1302 }
1303 }
1304 }
1305 }
1306
1307 /* Choose branch according to the condition */
1308
1309 ecode += condition? 3 : GET(ecode, 1);
1310 }
1311 }
1312
1313 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1314 {
1315 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1316 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1317
1318 /* If the numbered capture is unset, but the reference was by name,
1319 scan the table to see if the name refers to any other numbers, and test
1320 them. The condition is true if any one is set. This is tediously similar
1321 to the code above, but not close enough to try to amalgamate. */
1322
1323 if (!condition && condcode == OP_NCREF)
1324 {
1325 int refno = offset >> 1;
1326 uschar *slotA = md->name_table;
1327
1328 for (i = 0; i < md->name_count; i++)
1329 {
1330 if (GET2(slotA, 0) == refno) break;
1331 slotA += md->name_entry_size;
1332 }
1333
1334 /* Found a name for the number - there can be only one; duplicate names
1335 for different numbers are allowed, but not vice versa. First scan down
1336 for duplicates. */
1337
1338 if (i < md->name_count)
1339 {
1340 uschar *slotB = slotA;
1341 while (slotB > md->name_table)
1342 {
1343 slotB -= md->name_entry_size;
1344 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1345 {
1346 offset = GET2(slotB, 0) << 1;
1347 condition = offset < offset_top &&
1348 md->offset_vector[offset] >= 0;
1349 if (condition) break;
1350 }
1351 else break;
1352 }
1353
1354 /* Scan up for duplicates */
1355
1356 if (!condition)
1357 {
1358 slotB = slotA;
1359 for (i++; i < md->name_count; i++)
1360 {
1361 slotB += md->name_entry_size;
1362 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1363 {
1364 offset = GET2(slotB, 0) << 1;
1365 condition = offset < offset_top &&
1366 md->offset_vector[offset] >= 0;
1367 if (condition) break;
1368 }
1369 else break;
1370 }
1371 }
1372 }
1373 }
1374
1375 /* Choose branch according to the condition */
1376
1377 ecode += condition? 3 : GET(ecode, 1);
1378 }
1379
1380 else if (condcode == OP_DEF) /* DEFINE - always false */
1381 {
1382 condition = FALSE;
1383 ecode += GET(ecode, 1);
1384 }
1385
1386 /* The condition is an assertion. Call match() to evaluate it - setting
1387 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1388 an assertion. */
1389
1390 else
1391 {
1392 md->match_function_type = MATCH_CONDASSERT;
1393 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1394 if (rrc == MATCH_MATCH)
1395 {
1396 if (md->end_offset_top > offset_top)
1397 offset_top = md->end_offset_top; /* Captures may have happened */
1398 condition = TRUE;
1399 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1400 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1401 }
1402
1403 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1404 assertion; it is therefore treated as NOMATCH. */
1405
1406 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1407 {
1408 RRETURN(rrc); /* Need braces because of following else */
1409 }
1410 else
1411 {
1412 condition = FALSE;
1413 ecode += codelink;
1414 }
1415 }
1416
1417 /* We are now at the branch that is to be obeyed. As there is only one, can
1418 use tail recursion to avoid using another stack frame, except when there is
1419 unlimited repeat of a possibly empty group. In the latter case, a recursive
1420 call to match() is always required, unless the second alternative doesn't
1421 exist, in which case we can just plough on. Note that, for compatibility
1422 with Perl, the | in a conditional group is NOT treated as creating two
1423 alternatives. If a THEN is encountered in the branch, it propagates out to
1424 the enclosing alternative (unless nested in a deeper set of alternatives,
1425 of course). */
1426
1427 if (condition || *ecode == OP_ALT)
1428 {
1429 if (op != OP_SCOND)
1430 {
1431 ecode += 1 + LINK_SIZE;
1432 goto TAIL_RECURSE;
1433 }
1434
1435 md->match_function_type = MATCH_CBEGROUP;
1436 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1437 RRETURN(rrc);
1438 }
1439
1440 /* Condition false & no alternative; continue after the group. */
1441
1442 else
1443 {
1444 ecode += 1 + LINK_SIZE;
1445 }
1446 break;
1447
1448
1449 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1450 to close any currently open capturing brackets. */
1451
1452 case OP_CLOSE:
1453 number = GET2(ecode, 1);
1454 offset = number << 1;
1455
1456#ifdef PCRE_DEBUG
1457 printf("end bracket %d at *ACCEPT", number);
1458 printf("\n");
1459#endif
1460
1461 md->capture_last = number;
1462 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1463 {
1464 md->offset_vector[offset] =
1465 md->offset_vector[md->offset_end - number];
1466 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1467 if (offset_top <= offset) offset_top = offset + 2;
1468 }
1469 ecode += 3;
1470 break;
1471
1472
1473 /* End of the pattern, either real or forced. */
1474
1475 case OP_END:
1476 case OP_ACCEPT:
1477 case OP_ASSERT_ACCEPT:
1478
1479 /* If we have matched an empty string, fail if not in an assertion and not
1480 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1481 is set and we have matched at the start of the subject. In both cases,
1482 backtracking will then try other alternatives, if any. */
1483
1484 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1485 md->recursive == NULL &&
1486 (md->notempty ||
1487 (md->notempty_atstart &&
1488 mstart == md->start_subject + md->start_offset)))
1489 RRETURN(MATCH_NOMATCH);
1490
1491 /* Otherwise, we have a match. */
1492
1493 md->end_match_ptr = eptr; /* Record where we ended */
1494 md->end_offset_top = offset_top; /* and how many extracts were taken */
1495 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1496
1497 /* For some reason, the macros don't work properly if an expression is
1498 given as the argument to RRETURN when the heap is in use. */
1499
1500 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1501 RRETURN(rrc);
1502
1503 /* Assertion brackets. Check the alternative branches in turn - the
1504 matching won't pass the KET for an assertion. If any one branch matches,
1505 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1506 start of each branch to move the current point backwards, so the code at
1507 this level is identical to the lookahead case. When the assertion is part
1508 of a condition, we want to return immediately afterwards. The caller of
1509 this incarnation of the match() function will have set MATCH_CONDASSERT in
1510 md->match_function_type, and one of these opcodes will be the first opcode
1511 that is processed. We use a local variable that is preserved over calls to
1512 match() to remember this case. */
1513
1514 case OP_ASSERT:
1515 case OP_ASSERTBACK:
1516 if (md->match_function_type == MATCH_CONDASSERT)
1517 {
1518 condassert = TRUE;
1519 md->match_function_type = 0;
1520 }
1521 else condassert = FALSE;
1522
1523 do
1524 {
1525 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1526 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1527 {
1528 mstart = md->start_match_ptr; /* In case \K reset it */
1529 break;
1530 }
1531
1532 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1533 as NOMATCH. */
1534
1535 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1536 ecode += GET(ecode, 1);
1537 }
1538 while (*ecode == OP_ALT);
1539
1540 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1541
1542 /* If checking an assertion for a condition, return MATCH_MATCH. */
1543
1544 if (condassert) RRETURN(MATCH_MATCH);
1545
1546 /* Continue from after the assertion, updating the offsets high water
1547 mark, since extracts may have been taken during the assertion. */
1548
1549 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1550 ecode += 1 + LINK_SIZE;
1551 offset_top = md->end_offset_top;
1552 continue;
1553
1554 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1555 PRUNE, or COMMIT means we must assume failure without checking subsequent
1556 branches. */
1557
1558 case OP_ASSERT_NOT:
1559 case OP_ASSERTBACK_NOT:
1560 if (md->match_function_type == MATCH_CONDASSERT)
1561 {
1562 condassert = TRUE;
1563 md->match_function_type = 0;
1564 }
1565 else condassert = FALSE;
1566
1567 do
1568 {
1569 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1570 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1571 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1572 {
1573 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1574 break;
1575 }
1576
1577 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1578 as NOMATCH. */
1579
1580 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1581 ecode += GET(ecode,1);
1582 }
1583 while (*ecode == OP_ALT);
1584
1585 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1586
1587 ecode += 1 + LINK_SIZE;
1588 continue;
1589
1590 /* Move the subject pointer back. This occurs only at the start of
1591 each branch of a lookbehind assertion. If we are too close to the start to
1592 move back, this match function fails. When working with UTF-8 we move
1593 back a number of characters, not bytes. */
1594
1595 case OP_REVERSE:
1596#ifdef SUPPORT_UTF8
1597 if (utf8)
1598 {
1599 i = GET(ecode, 1);
1600 while (i-- > 0)
1601 {
1602 eptr--;
1603 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1604 BACKCHAR(eptr);
1605 }
1606 }
1607 else
1608#endif
1609
1610 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1611
1612 {
1613 eptr -= GET(ecode, 1);
1614 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1615 }
1616
1617 /* Save the earliest consulted character, then skip to next op code */
1618
1619 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1620 ecode += 1 + LINK_SIZE;
1621 break;
1622
1623 /* The callout item calls an external function, if one is provided, passing
1624 details of the match so far. This is mainly for debugging, though the
1625 function is able to force a failure. */
1626
1627 case OP_CALLOUT:
1628 if (pcre_callout != NULL)
1629 {
1630 pcre_callout_block cb;
1631 cb.version = 2; /* Version 2 of the callout block */
1632 cb.callout_number = ecode[1];
1633 cb.offset_vector = md->offset_vector;
1634 cb.subject = (PCRE_SPTR)md->start_subject;
1635 cb.subject_length = (int)(md->end_subject - md->start_subject);
1636 cb.start_match = (int)(mstart - md->start_subject);
1637 cb.current_position = (int)(eptr - md->start_subject);
1638 cb.pattern_position = GET(ecode, 2);
1639 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1640 cb.capture_top = offset_top/2;
1641 cb.capture_last = md->capture_last;
1642 cb.callout_data = md->callout_data;
1643 cb.mark = md->nomatch_mark;
1644 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1645 if (rrc < 0) RRETURN(rrc);
1646 }
1647 ecode += 2 + 2*LINK_SIZE;
1648 break;
1649
1650 /* Recursion either matches the current regex, or some subexpression. The
1651 offset data is the offset to the starting bracket from the start of the
1652 whole pattern. (This is so that it works from duplicated subpatterns.)
1653
1654 The state of the capturing groups is preserved over recursion, and
1655 re-instated afterwards. We don't know how many are started and not yet
1656 finished (offset_top records the completed total) so we just have to save
1657 all the potential data. There may be up to 65535 such values, which is too
1658 large to put on the stack, but using malloc for small numbers seems
1659 expensive. As a compromise, the stack is used when there are no more than
1660 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1661
1662 There are also other values that have to be saved. We use a chained
1663 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1664 for the original version of this logic. It has, however, been hacked around
1665 a lot, so he is not to blame for the current way it works. */
1666
1667 case OP_RECURSE:
1668 {
1669 recursion_info *ri;
1670 int recno;
1671
1672 callpat = md->start_code + GET(ecode, 1);
1673 recno = (callpat == md->start_code)? 0 :
1674 GET2(callpat, 1 + LINK_SIZE);
1675
1676 /* Check for repeating a recursion without advancing the subject pointer.
1677 This should catch convoluted mutual recursions. (Some simple cases are
1678 caught at compile time.) */
1679
1680 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1681 if (recno == ri->group_num && eptr == ri->subject_position)
1682 RRETURN(PCRE_ERROR_RECURSELOOP);
1683
1684 /* Add to "recursing stack" */
1685
1686 new_recursive.group_num = recno;
1687 new_recursive.subject_position = eptr;
1688 new_recursive.prevrec = md->recursive;
1689 md->recursive = &new_recursive;
1690
1691 /* Where to continue from afterwards */
1692
1693 ecode += 1 + LINK_SIZE;
1694
1695 /* Now save the offset data */
1696
1697 new_recursive.saved_max = md->offset_end;
1698 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1699 new_recursive.offset_save = stacksave;
1700 else
1701 {
1702 new_recursive.offset_save =
1703 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1704 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1705 }
1706 memcpy(new_recursive.offset_save, md->offset_vector,
1707 new_recursive.saved_max * sizeof(int));
1708
1709 /* OK, now we can do the recursion. After processing each alternative,
1710 restore the offset data. If there were nested recursions, md->recursive
1711 might be changed, so reset it before looping. */
1712
1713 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1714 cbegroup = (*callpat >= OP_SBRA);
1715 do
1716 {
1717 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1718 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1719 md, eptrb, RM6);
1720 memcpy(md->offset_vector, new_recursive.offset_save,
1721 new_recursive.saved_max * sizeof(int));
1722 md->recursive = new_recursive.prevrec;
1723 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1724 {
1725 DPRINTF(("Recursion matched\n"));
1726 if (new_recursive.offset_save != stacksave)
1727 (pcre_free)(new_recursive.offset_save);
1728
1729 /* Set where we got to in the subject, and reset the start in case
1730 it was changed by \K. This *is* propagated back out of a recursion,
1731 for Perl compatibility. */
1732
1733 eptr = md->end_match_ptr;
1734 mstart = md->start_match_ptr;
1735 goto RECURSION_MATCHED; /* Exit loop; end processing */
1736 }
1737
1738 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1739 as NOMATCH. */
1740
1741 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1742 {
1743 DPRINTF(("Recursion gave error %d\n", rrc));
1744 if (new_recursive.offset_save != stacksave)
1745 (pcre_free)(new_recursive.offset_save);
1746 RRETURN(rrc);
1747 }
1748
1749 md->recursive = &new_recursive;
1750 callpat += GET(callpat, 1);
1751 }
1752 while (*callpat == OP_ALT);
1753
1754 DPRINTF(("Recursion didn't match\n"));
1755 md->recursive = new_recursive.prevrec;
1756 if (new_recursive.offset_save != stacksave)
1757 (pcre_free)(new_recursive.offset_save);
1758 RRETURN(MATCH_NOMATCH);
1759 }
1760
1761 RECURSION_MATCHED:
1762 break;
1763
1764 /* An alternation is the end of a branch; scan along to find the end of the
1765 bracketed group and go to there. */
1766
1767 case OP_ALT:
1768 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1769 break;
1770
1771 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1772 indicating that it may occur zero times. It may repeat infinitely, or not
1773 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1774 with fixed upper repeat limits are compiled as a number of copies, with the
1775 optional ones preceded by BRAZERO or BRAMINZERO. */
1776
1777 case OP_BRAZERO:
1778 next = ecode + 1;
1779 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1780 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1781 do next += GET(next, 1); while (*next == OP_ALT);
1782 ecode = next + 1 + LINK_SIZE;
1783 break;
1784
1785 case OP_BRAMINZERO:
1786 next = ecode + 1;
1787 do next += GET(next, 1); while (*next == OP_ALT);
1788 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1789 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1790 ecode++;
1791 break;
1792
1793 case OP_SKIPZERO:
1794 next = ecode+1;
1795 do next += GET(next,1); while (*next == OP_ALT);
1796 ecode = next + 1 + LINK_SIZE;
1797 break;
1798
1799 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1800 here; just jump to the group, with allow_zero set TRUE. */
1801
1802 case OP_BRAPOSZERO:
1803 op = *(++ecode);
1804 allow_zero = TRUE;
1805 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1806 goto POSSESSIVE_NON_CAPTURE;
1807
1808 /* End of a group, repeated or non-repeating. */
1809
1810 case OP_KET:
1811 case OP_KETRMIN:
1812 case OP_KETRMAX:
1813 case OP_KETRPOS:
1814 prev = ecode - GET(ecode, 1);
1815
1816 /* If this was a group that remembered the subject start, in order to break
1817 infinite repeats of empty string matches, retrieve the subject start from
1818 the chain. Otherwise, set it NULL. */
1819
1820 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1821 {
1822 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1823 eptrb = eptrb->epb_prev; /* Backup to previous group */
1824 }
1825 else saved_eptr = NULL;
1826
1827 /* If we are at the end of an assertion group or a non-capturing atomic
1828 group, stop matching and return MATCH_MATCH, but record the current high
1829 water mark for use by positive assertions. We also need to record the match
1830 start in case it was changed by \K. */
1831
1832 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1833 *prev == OP_ONCE_NC)
1834 {
1835 md->end_match_ptr = eptr; /* For ONCE_NC */
1836 md->end_offset_top = offset_top;
1837 md->start_match_ptr = mstart;
1838 RRETURN(MATCH_MATCH); /* Sets md->mark */
1839 }
1840
1841 /* For capturing groups we have to check the group number back at the start
1842 and if necessary complete handling an extraction by setting the offsets and
1843 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1844 into group 0, so it won't be picked up here. Instead, we catch it when the
1845 OP_END is reached. Other recursion is handled here. We just have to record
1846 the current subject position and start match pointer and give a MATCH
1847 return. */
1848
1849 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1850 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1851 {
1852 number = GET2(prev, 1+LINK_SIZE);
1853 offset = number << 1;
1854
1855#ifdef PCRE_DEBUG
1856 printf("end bracket %d", number);
1857 printf("\n");
1858#endif
1859
1860 /* Handle a recursively called group. */
1861
1862 if (md->recursive != NULL && md->recursive->group_num == number)
1863 {
1864 md->end_match_ptr = eptr;
1865 md->start_match_ptr = mstart;
1866 RRETURN(MATCH_MATCH);
1867 }
1868
1869 /* Deal with capturing */
1870
1871 md->capture_last = number;
1872 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1873 {
1874 /* If offset is greater than offset_top, it means that we are
1875 "skipping" a capturing group, and that group's offsets must be marked
1876 unset. In earlier versions of PCRE, all the offsets were unset at the
1877 start of matching, but this doesn't work because atomic groups and
1878 assertions can cause a value to be set that should later be unset.
1879 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1880 part of the atomic group, but this is not on the final matching path,
1881 so must be unset when 2 is set. (If there is no group 2, there is no
1882 problem, because offset_top will then be 2, indicating no capture.) */
1883
1884 if (offset > offset_top)
1885 {
1886 register int *iptr = md->offset_vector + offset_top;
1887 register int *iend = md->offset_vector + offset;
1888 while (iptr < iend) *iptr++ = -1;
1889 }
1890
1891 /* Now make the extraction */
1892
1893 md->offset_vector[offset] =
1894 md->offset_vector[md->offset_end - number];
1895 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1896 if (offset_top <= offset) offset_top = offset + 2;
1897 }
1898 }
1899
1900 /* For an ordinary non-repeating ket, just continue at this level. This
1901 also happens for a repeating ket if no characters were matched in the
1902 group. This is the forcible breaking of infinite loops as implemented in
1903 Perl 5.005. For a non-repeating atomic group that includes captures,
1904 establish a backup point by processing the rest of the pattern at a lower
1905 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1906 original OP_ONCE level, thereby bypassing intermediate backup points, but
1907 resetting any captures that happened along the way. */
1908
1909 if (*ecode == OP_KET || eptr == saved_eptr)
1910 {
1911 if (*prev == OP_ONCE)
1912 {
1913 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1914 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1915 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1916 RRETURN(MATCH_ONCE);
1917 }
1918 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1919 break;
1920 }
1921
1922 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1923 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1924 at a time from the outer level, thus saving stack. */
1925
1926 if (*ecode == OP_KETRPOS)
1927 {
1928 md->end_match_ptr = eptr;
1929 md->end_offset_top = offset_top;
1930 RRETURN(MATCH_KETRPOS);
1931 }
1932
1933 /* The normal repeating kets try the rest of the pattern or restart from
1934 the preceding bracket, in the appropriate order. In the second case, we can
1935 use tail recursion to avoid using another stack frame, unless we have an
1936 atomic group or an unlimited repeat of a group that can match an empty
1937 string. */
1938
1939 if (*ecode == OP_KETRMIN)
1940 {
1941 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1942 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1943 if (*prev == OP_ONCE)
1944 {
1945 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1946 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1947 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1948 RRETURN(MATCH_ONCE);
1949 }
1950 if (*prev >= OP_SBRA) /* Could match an empty string */
1951 {
1952 md->match_function_type = MATCH_CBEGROUP;
1953 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1954 RRETURN(rrc);
1955 }
1956 ecode = prev;
1957 goto TAIL_RECURSE;
1958 }
1959 else /* OP_KETRMAX */
1960 {
1961 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1962 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1963 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1964 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1965 if (*prev == OP_ONCE)
1966 {
1967 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1968 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1969 md->once_target = prev;
1970 RRETURN(MATCH_ONCE);
1971 }
1972 ecode += 1 + LINK_SIZE;
1973 goto TAIL_RECURSE;
1974 }
1975 /* Control never gets here */
1976
1977 /* Not multiline mode: start of subject assertion, unless notbol. */
1978
1979 case OP_CIRC:
1980 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1981
1982 /* Start of subject assertion */
1983
1984 case OP_SOD:
1985 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1986 ecode++;
1987 break;
1988
1989 /* Multiline mode: start of subject unless notbol, or after any newline. */
1990
1991 case OP_CIRCM:
1992 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1993 if (eptr != md->start_subject &&
1994 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1995 RRETURN(MATCH_NOMATCH);
1996 ecode++;
1997 break;
1998
1999 /* Start of match assertion */
2000
2001 case OP_SOM:
2002 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2003 ecode++;
2004 break;
2005
2006 /* Reset the start of match point */
2007
2008 case OP_SET_SOM:
2009 mstart = eptr;
2010 ecode++;
2011 break;
2012
2013 /* Multiline mode: assert before any newline, or before end of subject
2014 unless noteol is set. */
2015
2016 case OP_DOLLM:
2017 if (eptr < md->end_subject)
2018 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
2019 else
2020 {
2021 if (md->noteol) RRETURN(MATCH_NOMATCH);
2022 SCHECK_PARTIAL();
2023 }
2024 ecode++;
2025 break;
2026
2027 /* Not multiline mode: assert before a terminating newline or before end of
2028 subject unless noteol is set. */
2029
2030 case OP_DOLL:
2031 if (md->noteol) RRETURN(MATCH_NOMATCH);
2032 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2033
2034 /* ... else fall through for endonly */
2035
2036 /* End of subject assertion (\z) */
2037
2038 case OP_EOD:
2039 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2040 SCHECK_PARTIAL();
2041 ecode++;
2042 break;
2043
2044 /* End of subject or ending \n assertion (\Z) */
2045
2046 case OP_EODN:
2047 ASSERT_NL_OR_EOS:
2048 if (eptr < md->end_subject &&
2049 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2050 RRETURN(MATCH_NOMATCH);
2051
2052 /* Either at end of string or \n before end. */
2053
2054 SCHECK_PARTIAL();
2055 ecode++;
2056 break;
2057
2058 /* Word boundary assertions */
2059
2060 case OP_NOT_WORD_BOUNDARY:
2061 case OP_WORD_BOUNDARY:
2062 {
2063
2064 /* Find out if the previous and current characters are "word" characters.
2065 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2066 be "non-word" characters. Remember the earliest consulted character for
2067 partial matching. */
2068
2069#ifdef SUPPORT_UTF8
2070 if (utf8)
2071 {
2072 /* Get status of previous character */
2073
2074 if (eptr == md->start_subject) prev_is_word = FALSE; else
2075 {
2076 USPTR lastptr = eptr - 1;
2077 while((*lastptr & 0xc0) == 0x80) lastptr--;
2078 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2079 GETCHAR(c, lastptr);
2080#ifdef SUPPORT_UCP
2081 if (md->use_ucp)
2082 {
2083 if (c == '_') prev_is_word = TRUE; else
2084 {
2085 int cat = UCD_CATEGORY(c);
2086 prev_is_word = (cat == ucp_L || cat == ucp_N);
2087 }
2088 }
2089 else
2090#endif
2091 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2092 }
2093
2094 /* Get status of next character */
2095
2096 if (eptr >= md->end_subject)
2097 {
2098 SCHECK_PARTIAL();
2099 cur_is_word = FALSE;
2100 }
2101 else
2102 {
2103 GETCHAR(c, eptr);
2104#ifdef SUPPORT_UCP
2105 if (md->use_ucp)
2106 {
2107 if (c == '_') cur_is_word = TRUE; else
2108 {
2109 int cat = UCD_CATEGORY(c);
2110 cur_is_word = (cat == ucp_L || cat == ucp_N);
2111 }
2112 }
2113 else
2114#endif
2115 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2116 }
2117 }
2118 else
2119#endif
2120
2121 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2122 consistency with the behaviour of \w we do use it in this case. */
2123
2124 {
2125 /* Get status of previous character */
2126
2127 if (eptr == md->start_subject) prev_is_word = FALSE; else
2128 {
2129 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2130#ifdef SUPPORT_UCP
2131 if (md->use_ucp)
2132 {
2133 c = eptr[-1];
2134 if (c == '_') prev_is_word = TRUE; else
2135 {
2136 int cat = UCD_CATEGORY(c);
2137 prev_is_word = (cat == ucp_L || cat == ucp_N);
2138 }
2139 }
2140 else
2141#endif
2142 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2143 }
2144
2145 /* Get status of next character */
2146
2147 if (eptr >= md->end_subject)
2148 {
2149 SCHECK_PARTIAL();
2150 cur_is_word = FALSE;
2151 }
2152 else
2153#ifdef SUPPORT_UCP
2154 if (md->use_ucp)
2155 {
2156 c = *eptr;
2157 if (c == '_') cur_is_word = TRUE; else
2158 {
2159 int cat = UCD_CATEGORY(c);
2160 cur_is_word = (cat == ucp_L || cat == ucp_N);
2161 }
2162 }
2163 else
2164#endif
2165 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2166 }
2167
2168 /* Now see if the situation is what we want */
2169
2170 if ((*ecode++ == OP_WORD_BOUNDARY)?
2171 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2172 RRETURN(MATCH_NOMATCH);
2173 }
2174 break;
2175
2176 /* Match a single character type; inline for speed */
2177
2178 case OP_ANY:
2179 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2180 /* Fall through */
2181
2182 case OP_ALLANY:
2183 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2184 { /* not be updated before SCHECK_PARTIAL. */
2185 SCHECK_PARTIAL();
2186 RRETURN(MATCH_NOMATCH);
2187 }
2188 eptr++;
2189 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2190 ecode++;
2191 break;
2192
2193 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2194 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2195
2196 case OP_ANYBYTE:
2197 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2198 { /* not be updated before SCHECK_PARTIAL. */
2199 SCHECK_PARTIAL();
2200 RRETURN(MATCH_NOMATCH);
2201 }
2202 eptr++;
2203 ecode++;
2204 break;
2205
2206 case OP_NOT_DIGIT:
2207 if (eptr >= md->end_subject)
2208 {
2209 SCHECK_PARTIAL();
2210 RRETURN(MATCH_NOMATCH);
2211 }
2212 GETCHARINCTEST(c, eptr);
2213 if (
2214#ifdef SUPPORT_UTF8
2215 c < 256 &&
2216#endif
2217 (md->ctypes[c] & ctype_digit) != 0
2218 )
2219 RRETURN(MATCH_NOMATCH);
2220 ecode++;
2221 break;
2222
2223 case OP_DIGIT:
2224 if (eptr >= md->end_subject)
2225 {
2226 SCHECK_PARTIAL();
2227 RRETURN(MATCH_NOMATCH);
2228 }
2229 GETCHARINCTEST(c, eptr);
2230 if (
2231#ifdef SUPPORT_UTF8
2232 c >= 256 ||
2233#endif
2234 (md->ctypes[c] & ctype_digit) == 0
2235 )
2236 RRETURN(MATCH_NOMATCH);
2237 ecode++;
2238 break;
2239
2240 case OP_NOT_WHITESPACE:
2241 if (eptr >= md->end_subject)
2242 {
2243 SCHECK_PARTIAL();
2244 RRETURN(MATCH_NOMATCH);
2245 }
2246 GETCHARINCTEST(c, eptr);
2247 if (
2248#ifdef SUPPORT_UTF8
2249 c < 256 &&
2250#endif
2251 (md->ctypes[c] & ctype_space) != 0
2252 )
2253 RRETURN(MATCH_NOMATCH);
2254 ecode++;
2255 break;
2256
2257 case OP_WHITESPACE:
2258 if (eptr >= md->end_subject)
2259 {
2260 SCHECK_PARTIAL();
2261 RRETURN(MATCH_NOMATCH);
2262 }
2263 GETCHARINCTEST(c, eptr);
2264 if (
2265#ifdef SUPPORT_UTF8
2266 c >= 256 ||
2267#endif
2268 (md->ctypes[c] & ctype_space) == 0
2269 )
2270 RRETURN(MATCH_NOMATCH);
2271 ecode++;
2272 break;
2273
2274 case OP_NOT_WORDCHAR:
2275 if (eptr >= md->end_subject)
2276 {
2277 SCHECK_PARTIAL();
2278 RRETURN(MATCH_NOMATCH);
2279 }
2280 GETCHARINCTEST(c, eptr);
2281 if (
2282#ifdef SUPPORT_UTF8
2283 c < 256 &&
2284#endif
2285 (md->ctypes[c] & ctype_word) != 0
2286 )
2287 RRETURN(MATCH_NOMATCH);
2288 ecode++;
2289 break;
2290
2291 case OP_WORDCHAR:
2292 if (eptr >= md->end_subject)
2293 {
2294 SCHECK_PARTIAL();
2295 RRETURN(MATCH_NOMATCH);
2296 }
2297 GETCHARINCTEST(c, eptr);
2298 if (
2299#ifdef SUPPORT_UTF8
2300 c >= 256 ||
2301#endif
2302 (md->ctypes[c] & ctype_word) == 0
2303 )
2304 RRETURN(MATCH_NOMATCH);
2305 ecode++;
2306 break;
2307
2308 case OP_ANYNL:
2309 if (eptr >= md->end_subject)
2310 {
2311 SCHECK_PARTIAL();
2312 RRETURN(MATCH_NOMATCH);
2313 }
2314 GETCHARINCTEST(c, eptr);
2315 switch(c)
2316 {
2317 default: RRETURN(MATCH_NOMATCH);
2318
2319 case 0x000d:
2320 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2321 break;
2322
2323 case 0x000a:
2324 break;
2325
2326 case 0x000b:
2327 case 0x000c:
2328 case 0x0085:
2329 case 0x2028:
2330 case 0x2029:
2331 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2332 break;
2333 }
2334 ecode++;
2335 break;
2336
2337 case OP_NOT_HSPACE:
2338 if (eptr >= md->end_subject)
2339 {
2340 SCHECK_PARTIAL();
2341 RRETURN(MATCH_NOMATCH);
2342 }
2343 GETCHARINCTEST(c, eptr);
2344 switch(c)
2345 {
2346 default: break;
2347 case 0x09: /* HT */
2348 case 0x20: /* SPACE */
2349 case 0xa0: /* NBSP */
2350 case 0x1680: /* OGHAM SPACE MARK */
2351 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2352 case 0x2000: /* EN QUAD */
2353 case 0x2001: /* EM QUAD */
2354 case 0x2002: /* EN SPACE */
2355 case 0x2003: /* EM SPACE */
2356 case 0x2004: /* THREE-PER-EM SPACE */
2357 case 0x2005: /* FOUR-PER-EM SPACE */
2358 case 0x2006: /* SIX-PER-EM SPACE */
2359 case 0x2007: /* FIGURE SPACE */
2360 case 0x2008: /* PUNCTUATION SPACE */
2361 case 0x2009: /* THIN SPACE */
2362 case 0x200A: /* HAIR SPACE */
2363 case 0x202f: /* NARROW NO-BREAK SPACE */
2364 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2365 case 0x3000: /* IDEOGRAPHIC SPACE */
2366 RRETURN(MATCH_NOMATCH);
2367 }
2368 ecode++;
2369 break;
2370
2371 case OP_HSPACE:
2372 if (eptr >= md->end_subject)
2373 {
2374 SCHECK_PARTIAL();
2375 RRETURN(MATCH_NOMATCH);
2376 }
2377 GETCHARINCTEST(c, eptr);
2378 switch(c)
2379 {
2380 default: RRETURN(MATCH_NOMATCH);
2381 case 0x09: /* HT */
2382 case 0x20: /* SPACE */
2383 case 0xa0: /* NBSP */
2384 case 0x1680: /* OGHAM SPACE MARK */
2385 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2386 case 0x2000: /* EN QUAD */
2387 case 0x2001: /* EM QUAD */
2388 case 0x2002: /* EN SPACE */
2389 case 0x2003: /* EM SPACE */
2390 case 0x2004: /* THREE-PER-EM SPACE */
2391 case 0x2005: /* FOUR-PER-EM SPACE */
2392 case 0x2006: /* SIX-PER-EM SPACE */
2393 case 0x2007: /* FIGURE SPACE */
2394 case 0x2008: /* PUNCTUATION SPACE */
2395 case 0x2009: /* THIN SPACE */
2396 case 0x200A: /* HAIR SPACE */
2397 case 0x202f: /* NARROW NO-BREAK SPACE */
2398 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2399 case 0x3000: /* IDEOGRAPHIC SPACE */
2400 break;
2401 }
2402 ecode++;
2403 break;
2404
2405 case OP_NOT_VSPACE:
2406 if (eptr >= md->end_subject)
2407 {
2408 SCHECK_PARTIAL();
2409 RRETURN(MATCH_NOMATCH);
2410 }
2411 GETCHARINCTEST(c, eptr);
2412 switch(c)
2413 {
2414 default: break;
2415 case 0x0a: /* LF */
2416 case 0x0b: /* VT */
2417 case 0x0c: /* FF */
2418 case 0x0d: /* CR */
2419 case 0x85: /* NEL */
2420 case 0x2028: /* LINE SEPARATOR */
2421 case 0x2029: /* PARAGRAPH SEPARATOR */
2422 RRETURN(MATCH_NOMATCH);
2423 }
2424 ecode++;
2425 break;
2426
2427 case OP_VSPACE:
2428 if (eptr >= md->end_subject)
2429 {
2430 SCHECK_PARTIAL();
2431 RRETURN(MATCH_NOMATCH);
2432 }
2433 GETCHARINCTEST(c, eptr);
2434 switch(c)
2435 {
2436 default: RRETURN(MATCH_NOMATCH);
2437 case 0x0a: /* LF */
2438 case 0x0b: /* VT */
2439 case 0x0c: /* FF */
2440 case 0x0d: /* CR */
2441 case 0x85: /* NEL */
2442 case 0x2028: /* LINE SEPARATOR */
2443 case 0x2029: /* PARAGRAPH SEPARATOR */
2444 break;
2445 }
2446 ecode++;
2447 break;
2448
2449#ifdef SUPPORT_UCP
2450 /* Check the next character by Unicode property. We will get here only
2451 if the support is in the binary; otherwise a compile-time error occurs. */
2452
2453 case OP_PROP:
2454 case OP_NOTPROP:
2455 if (eptr >= md->end_subject)
2456 {
2457 SCHECK_PARTIAL();
2458 RRETURN(MATCH_NOMATCH);
2459 }
2460 GETCHARINCTEST(c, eptr);
2461 {
2462 const ucd_record *prop = GET_UCD(c);
2463
2464 switch(ecode[1])
2465 {
2466 case PT_ANY:
2467 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2468 break;
2469
2470 case PT_LAMP:
2471 if ((prop->chartype == ucp_Lu ||
2472 prop->chartype == ucp_Ll ||
2473 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2474 RRETURN(MATCH_NOMATCH);
2475 break;
2476
2477 case PT_GC:
2478 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2479 RRETURN(MATCH_NOMATCH);
2480 break;
2481
2482 case PT_PC:
2483 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2484 RRETURN(MATCH_NOMATCH);
2485 break;
2486
2487 case PT_SC:
2488 if ((ecode[2] != prop->script) == (op == OP_PROP))
2489 RRETURN(MATCH_NOMATCH);
2490 break;
2491
2492 /* These are specials */
2493
2494 case PT_ALNUM:
2495 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2496 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2497 RRETURN(MATCH_NOMATCH);
2498 break;
2499
2500 case PT_SPACE: /* Perl space */
2501 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2502 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2503 == (op == OP_NOTPROP))
2504 RRETURN(MATCH_NOMATCH);
2505 break;
2506
2507 case PT_PXSPACE: /* POSIX space */
2508 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2509 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2510 c == CHAR_FF || c == CHAR_CR)
2511 == (op == OP_NOTPROP))
2512 RRETURN(MATCH_NOMATCH);
2513 break;
2514
2515 case PT_WORD:
2516 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2517 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2518 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2519 RRETURN(MATCH_NOMATCH);
2520 break;
2521
2522 /* This should never occur */
2523
2524 default:
2525 RRETURN(PCRE_ERROR_INTERNAL);
2526 }
2527
2528 ecode += 3;
2529 }
2530 break;
2531
2532 /* Match an extended Unicode sequence. We will get here only if the support
2533 is in the binary; otherwise a compile-time error occurs. */
2534
2535 case OP_EXTUNI:
2536 if (eptr >= md->end_subject)
2537 {
2538 SCHECK_PARTIAL();
2539 RRETURN(MATCH_NOMATCH);
2540 }
2541 GETCHARINCTEST(c, eptr);
2542 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2543 while (eptr < md->end_subject)
2544 {
2545 int len = 1;
2546 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2547 if (UCD_CATEGORY(c) != ucp_M) break;
2548 eptr += len;
2549 }
2550 ecode++;
2551 break;
2552#endif
2553
2554
2555 /* Match a back reference, possibly repeatedly. Look past the end of the
2556 item to see if there is repeat information following. The code is similar
2557 to that for character classes, but repeated for efficiency. Then obey
2558 similar code to character type repeats - written out again for speed.
2559 However, if the referenced string is the empty string, always treat
2560 it as matched, any number of times (otherwise there could be infinite
2561 loops). */
2562
2563 case OP_REF:
2564 case OP_REFI:
2565 caseless = op == OP_REFI;
2566 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2567 ecode += 3;
2568
2569 /* If the reference is unset, there are two possibilities:
2570
2571 (a) In the default, Perl-compatible state, set the length negative;
2572 this ensures that every attempt at a match fails. We can't just fail
2573 here, because of the possibility of quantifiers with zero minima.
2574
2575 (b) If the JavaScript compatibility flag is set, set the length to zero
2576 so that the back reference matches an empty string.
2577
2578 Otherwise, set the length to the length of what was matched by the
2579 referenced subpattern. */
2580
2581 if (offset >= offset_top || md->offset_vector[offset] < 0)
2582 length = (md->jscript_compat)? 0 : -1;
2583 else
2584 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2585
2586 /* Set up for repetition, or handle the non-repeated case */
2587
2588 switch (*ecode)
2589 {
2590 case OP_CRSTAR:
2591 case OP_CRMINSTAR:
2592 case OP_CRPLUS:
2593 case OP_CRMINPLUS:
2594 case OP_CRQUERY:
2595 case OP_CRMINQUERY:
2596 c = *ecode++ - OP_CRSTAR;
2597 minimize = (c & 1) != 0;
2598 min = rep_min[c]; /* Pick up values from tables; */
2599 max = rep_max[c]; /* zero for max => infinity */
2600 if (max == 0) max = INT_MAX;
2601 break;
2602
2603 case OP_CRRANGE:
2604 case OP_CRMINRANGE:
2605 minimize = (*ecode == OP_CRMINRANGE);
2606 min = GET2(ecode, 1);
2607 max = GET2(ecode, 3);
2608 if (max == 0) max = INT_MAX;
2609 ecode += 5;
2610 break;
2611
2612 default: /* No repeat follows */
2613 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2614 {
2615 CHECK_PARTIAL();
2616 RRETURN(MATCH_NOMATCH);
2617 }
2618 eptr += length;
2619 continue; /* With the main loop */
2620 }
2621
2622 /* Handle repeated back references. If the length of the reference is
2623 zero, just continue with the main loop. */
2624
2625 if (length == 0) continue;
2626
2627 /* First, ensure the minimum number of matches are present. We get back
2628 the length of the reference string explicitly rather than passing the
2629 address of eptr, so that eptr can be a register variable. */
2630
2631 for (i = 1; i <= min; i++)
2632 {
2633 int slength;
2634 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2635 {
2636 CHECK_PARTIAL();
2637 RRETURN(MATCH_NOMATCH);
2638 }
2639 eptr += slength;
2640 }
2641
2642 /* If min = max, continue at the same level without recursion.
2643 They are not both allowed to be zero. */
2644
2645 if (min == max) continue;
2646
2647 /* If minimizing, keep trying and advancing the pointer */
2648
2649 if (minimize)
2650 {
2651 for (fi = min;; fi++)
2652 {
2653 int slength;
2654 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2655 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2656 if (fi >= max) RRETURN(MATCH_NOMATCH);
2657 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2658 {
2659 CHECK_PARTIAL();
2660 RRETURN(MATCH_NOMATCH);
2661 }
2662 eptr += slength;
2663 }
2664 /* Control never gets here */
2665 }
2666
2667 /* If maximizing, find the longest string and work backwards */
2668
2669 else
2670 {
2671 pp = eptr;
2672 for (i = min; i < max; i++)
2673 {
2674 int slength;
2675 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2676 {
2677 CHECK_PARTIAL();
2678 break;
2679 }
2680 eptr += slength;
2681 }
2682 while (eptr >= pp)
2683 {
2684 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2685 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2686 eptr -= length;
2687 }
2688 RRETURN(MATCH_NOMATCH);
2689 }
2690 /* Control never gets here */
2691
2692 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2693 used when all the characters in the class have values in the range 0-255,
2694 and either the matching is caseful, or the characters are in the range
2695 0-127 when UTF-8 processing is enabled. The only difference between
2696 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2697 encountered.
2698
2699 First, look past the end of the item to see if there is repeat information
2700 following. Then obey similar code to character type repeats - written out
2701 again for speed. */
2702
2703 case OP_NCLASS:
2704 case OP_CLASS:
2705 {
2706 data = ecode + 1; /* Save for matching */
2707 ecode += 33; /* Advance past the item */
2708
2709 switch (*ecode)
2710 {
2711 case OP_CRSTAR:
2712 case OP_CRMINSTAR:
2713 case OP_CRPLUS:
2714 case OP_CRMINPLUS:
2715 case OP_CRQUERY:
2716 case OP_CRMINQUERY:
2717 c = *ecode++ - OP_CRSTAR;
2718 minimize = (c & 1) != 0;
2719 min = rep_min[c]; /* Pick up values from tables; */
2720 max = rep_max[c]; /* zero for max => infinity */
2721 if (max == 0) max = INT_MAX;
2722 break;
2723
2724 case OP_CRRANGE:
2725 case OP_CRMINRANGE:
2726 minimize = (*ecode == OP_CRMINRANGE);
2727 min = GET2(ecode, 1);
2728 max = GET2(ecode, 3);
2729 if (max == 0) max = INT_MAX;
2730 ecode += 5;
2731 break;
2732
2733 default: /* No repeat follows */
2734 min = max = 1;
2735 break;
2736 }
2737
2738 /* First, ensure the minimum number of matches are present. */
2739
2740#ifdef SUPPORT_UTF8
2741 /* UTF-8 mode */
2742 if (utf8)
2743 {
2744 for (i = 1; i <= min; i++)
2745 {
2746 if (eptr >= md->end_subject)
2747 {
2748 SCHECK_PARTIAL();
2749 RRETURN(MATCH_NOMATCH);
2750 }
2751 GETCHARINC(c, eptr);
2752 if (c > 255)
2753 {
2754 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2755 }
2756 else
2757 {
2758 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2759 }
2760 }
2761 }
2762 else
2763#endif
2764 /* Not UTF-8 mode */
2765 {
2766 for (i = 1; i <= min; i++)
2767 {
2768 if (eptr >= md->end_subject)
2769 {
2770 SCHECK_PARTIAL();
2771 RRETURN(MATCH_NOMATCH);
2772 }
2773 c = *eptr++;
2774 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2775 }
2776 }
2777
2778 /* If max == min we can continue with the main loop without the
2779 need to recurse. */
2780
2781 if (min == max) continue;
2782
2783 /* If minimizing, keep testing the rest of the expression and advancing
2784 the pointer while it matches the class. */
2785
2786 if (minimize)
2787 {
2788#ifdef SUPPORT_UTF8
2789 /* UTF-8 mode */
2790 if (utf8)
2791 {
2792 for (fi = min;; fi++)
2793 {
2794 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2796 if (fi >= max) RRETURN(MATCH_NOMATCH);
2797 if (eptr >= md->end_subject)
2798 {
2799 SCHECK_PARTIAL();
2800 RRETURN(MATCH_NOMATCH);
2801 }
2802 GETCHARINC(c, eptr);
2803 if (c > 255)
2804 {
2805 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2806 }
2807 else
2808 {
2809 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2810 }
2811 }
2812 }
2813 else
2814#endif
2815 /* Not UTF-8 mode */
2816 {
2817 for (fi = min;; fi++)
2818 {
2819 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2820 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2821 if (fi >= max) RRETURN(MATCH_NOMATCH);
2822 if (eptr >= md->end_subject)
2823 {
2824 SCHECK_PARTIAL();
2825 RRETURN(MATCH_NOMATCH);
2826 }
2827 c = *eptr++;
2828 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2829 }
2830 }
2831 /* Control never gets here */
2832 }
2833
2834 /* If maximizing, find the longest possible run, then work backwards. */
2835
2836 else
2837 {
2838 pp = eptr;
2839
2840#ifdef SUPPORT_UTF8
2841 /* UTF-8 mode */
2842 if (utf8)
2843 {
2844 for (i = min; i < max; i++)
2845 {
2846 int len = 1;
2847 if (eptr >= md->end_subject)
2848 {
2849 SCHECK_PARTIAL();
2850 break;
2851 }
2852 GETCHARLEN(c, eptr, len);
2853 if (c > 255)
2854 {
2855 if (op == OP_CLASS) break;
2856 }
2857 else
2858 {
2859 if ((data[c/8] & (1 << (c&7))) == 0) break;
2860 }
2861 eptr += len;
2862 }
2863 for (;;)
2864 {
2865 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2866 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2867 if (eptr-- == pp) break; /* Stop if tried at original pos */
2868 BACKCHAR(eptr);
2869 }
2870 }
2871 else
2872#endif
2873 /* Not UTF-8 mode */
2874 {
2875 for (i = min; i < max; i++)
2876 {
2877 if (eptr >= md->end_subject)
2878 {
2879 SCHECK_PARTIAL();
2880 break;
2881 }
2882 c = *eptr;
2883 if ((data[c/8] & (1 << (c&7))) == 0) break;
2884 eptr++;
2885 }
2886 while (eptr >= pp)
2887 {
2888 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2889 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2890 eptr--;
2891 }
2892 }
2893
2894 RRETURN(MATCH_NOMATCH);
2895 }
2896 }
2897 /* Control never gets here */
2898
2899
2900 /* Match an extended character class. This opcode is encountered only
2901 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2902 mode, because Unicode properties are supported in non-UTF-8 mode. */
2903
2904#ifdef SUPPORT_UTF8
2905 case OP_XCLASS:
2906 {
2907 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2908 ecode += GET(ecode, 1); /* Advance past the item */
2909
2910 switch (*ecode)
2911 {
2912 case OP_CRSTAR:
2913 case OP_CRMINSTAR:
2914 case OP_CRPLUS:
2915 case OP_CRMINPLUS:
2916 case OP_CRQUERY:
2917 case OP_CRMINQUERY:
2918 c = *ecode++ - OP_CRSTAR;
2919 minimize = (c & 1) != 0;
2920 min = rep_min[c]; /* Pick up values from tables; */
2921 max = rep_max[c]; /* zero for max => infinity */
2922 if (max == 0) max = INT_MAX;
2923 break;
2924
2925 case OP_CRRANGE:
2926 case OP_CRMINRANGE:
2927 minimize = (*ecode == OP_CRMINRANGE);
2928 min = GET2(ecode, 1);
2929 max = GET2(ecode, 3);
2930 if (max == 0) max = INT_MAX;
2931 ecode += 5;
2932 break;
2933
2934 default: /* No repeat follows */
2935 min = max = 1;
2936 break;
2937 }
2938
2939 /* First, ensure the minimum number of matches are present. */
2940
2941 for (i = 1; i <= min; i++)
2942 {
2943 if (eptr >= md->end_subject)
2944 {
2945 SCHECK_PARTIAL();
2946 RRETURN(MATCH_NOMATCH);
2947 }
2948 GETCHARINCTEST(c, eptr);
2949 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2950 }
2951
2952 /* If max == min we can continue with the main loop without the
2953 need to recurse. */
2954
2955 if (min == max) continue;
2956
2957 /* If minimizing, keep testing the rest of the expression and advancing
2958 the pointer while it matches the class. */
2959
2960 if (minimize)
2961 {
2962 for (fi = min;; fi++)
2963 {
2964 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2965 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2966 if (fi >= max) RRETURN(MATCH_NOMATCH);
2967 if (eptr >= md->end_subject)
2968 {
2969 SCHECK_PARTIAL();
2970 RRETURN(MATCH_NOMATCH);
2971 }
2972 GETCHARINCTEST(c, eptr);
2973 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2974 }
2975 /* Control never gets here */
2976 }
2977
2978 /* If maximizing, find the longest possible run, then work backwards. */
2979
2980 else
2981 {
2982 pp = eptr;
2983 for (i = min; i < max; i++)
2984 {
2985 int len = 1;
2986 if (eptr >= md->end_subject)
2987 {
2988 SCHECK_PARTIAL();
2989 break;
2990 }
2991 GETCHARLENTEST(c, eptr, len);
2992 if (!_pcre_xclass(c, data)) break;
2993 eptr += len;
2994 }
2995 for(;;)
2996 {
2997 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2998 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2999 if (eptr-- == pp) break; /* Stop if tried at original pos */
3000 if (utf8) BACKCHAR(eptr);
3001 }
3002 RRETURN(MATCH_NOMATCH);
3003 }
3004
3005 /* Control never gets here */
3006 }
3007#endif /* End of XCLASS */
3008
3009 /* Match a single character, casefully */
3010
3011 case OP_CHAR:
3012#ifdef SUPPORT_UTF8
3013 if (utf8)
3014 {
3015 length = 1;
3016 ecode++;
3017 GETCHARLEN(fc, ecode, length);
3018 if (length > md->end_subject - eptr)
3019 {
3020 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3021 RRETURN(MATCH_NOMATCH);
3022 }
3023 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3024 }
3025 else
3026#endif
3027
3028 /* Non-UTF-8 mode */
3029 {
3030 if (md->end_subject - eptr < 1)
3031 {
3032 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3033 RRETURN(MATCH_NOMATCH);
3034 }
3035 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3036 ecode += 2;
3037 }
3038 break;
3039
3040 /* Match a single character, caselessly. If we are at the end of the
3041 subject, give up immediately. */
3042
3043 case OP_CHARI:
3044 if (eptr >= md->end_subject)
3045 {
3046 SCHECK_PARTIAL();
3047 RRETURN(MATCH_NOMATCH);
3048 }
3049
3050#ifdef SUPPORT_UTF8
3051 if (utf8)
3052 {
3053 length = 1;
3054 ecode++;
3055 GETCHARLEN(fc, ecode, length);
3056
3057 /* If the pattern character's value is < 128, we have only one byte, and
3058 we know that its other case must also be one byte long, so we can use the
3059 fast lookup table. We know that there is at least one byte left in the
3060 subject. */
3061
3062 if (fc < 128)
3063 {
3064 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3065 }
3066
3067 /* Otherwise we must pick up the subject character. Note that we cannot
3068 use the value of "length" to check for sufficient bytes left, because the
3069 other case of the character may have more or fewer bytes. */
3070
3071 else
3072 {
3073 unsigned int dc;
3074 GETCHARINC(dc, eptr);
3075 ecode += length;
3076
3077 /* If we have Unicode property support, we can use it to test the other
3078 case of the character, if there is one. */
3079
3080 if (fc != dc)
3081 {
3082#ifdef SUPPORT_UCP
3083 if (dc != UCD_OTHERCASE(fc))
3084#endif
3085 RRETURN(MATCH_NOMATCH);
3086 }
3087 }
3088 }
3089 else
3090#endif /* SUPPORT_UTF8 */
3091
3092 /* Non-UTF-8 mode */
3093 {
3094 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3095 ecode += 2;
3096 }
3097 break;
3098
3099 /* Match a single character repeatedly. */
3100
3101 case OP_EXACT:
3102 case OP_EXACTI:
3103 min = max = GET2(ecode, 1);
3104 ecode += 3;
3105 goto REPEATCHAR;
3106
3107 case OP_POSUPTO:
3108 case OP_POSUPTOI:
3109 possessive = TRUE;
3110 /* Fall through */
3111
3112 case OP_UPTO:
3113 case OP_UPTOI:
3114 case OP_MINUPTO:
3115 case OP_MINUPTOI:
3116 min = 0;
3117 max = GET2(ecode, 1);
3118 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3119 ecode += 3;
3120 goto REPEATCHAR;
3121
3122 case OP_POSSTAR:
3123 case OP_POSSTARI:
3124 possessive = TRUE;
3125 min = 0;
3126 max = INT_MAX;
3127 ecode++;
3128 goto REPEATCHAR;
3129
3130 case OP_POSPLUS:
3131 case OP_POSPLUSI:
3132 possessive = TRUE;
3133 min = 1;
3134 max = INT_MAX;
3135 ecode++;
3136 goto REPEATCHAR;
3137
3138 case OP_POSQUERY:
3139 case OP_POSQUERYI:
3140 possessive = TRUE;
3141 min = 0;
3142 max = 1;
3143 ecode++;
3144 goto REPEATCHAR;
3145
3146 case OP_STAR:
3147 case OP_STARI:
3148 case OP_MINSTAR:
3149 case OP_MINSTARI:
3150 case OP_PLUS:
3151 case OP_PLUSI:
3152 case OP_MINPLUS:
3153 case OP_MINPLUSI:
3154 case OP_QUERY:
3155 case OP_QUERYI:
3156 case OP_MINQUERY:
3157 case OP_MINQUERYI:
3158 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3159 minimize = (c & 1) != 0;
3160 min = rep_min[c]; /* Pick up values from tables; */
3161 max = rep_max[c]; /* zero for max => infinity */
3162 if (max == 0) max = INT_MAX;
3163
3164 /* Common code for all repeated single-character matches. */
3165
3166 REPEATCHAR:
3167#ifdef SUPPORT_UTF8
3168 if (utf8)
3169 {
3170 length = 1;
3171 charptr = ecode;
3172 GETCHARLEN(fc, ecode, length);
3173 ecode += length;
3174
3175 /* Handle multibyte character matching specially here. There is
3176 support for caseless matching if UCP support is present. */
3177
3178 if (length > 1)
3179 {
3180#ifdef SUPPORT_UCP
3181 unsigned int othercase;
3182 if (op >= OP_STARI && /* Caseless */
3183 (othercase = UCD_OTHERCASE(fc)) != fc)
3184 oclength = _pcre_ord2utf8(othercase, occhars);
3185 else oclength = 0;
3186#endif /* SUPPORT_UCP */
3187
3188 for (i = 1; i <= min; i++)
3189 {
3190 if (eptr <= md->end_subject - length &&
3191 memcmp(eptr, charptr, length) == 0) eptr += length;
3192#ifdef SUPPORT_UCP
3193 else if (oclength > 0 &&
3194 eptr <= md->end_subject - oclength &&
3195 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3196#endif /* SUPPORT_UCP */
3197 else
3198 {
3199 CHECK_PARTIAL();
3200 RRETURN(MATCH_NOMATCH);
3201 }
3202 }
3203
3204 if (min == max) continue;
3205
3206 if (minimize)
3207 {
3208 for (fi = min;; fi++)
3209 {
3210 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3211 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3212 if (fi >= max) RRETURN(MATCH_NOMATCH);
3213 if (eptr <= md->end_subject - length &&
3214 memcmp(eptr, charptr, length) == 0) eptr += length;
3215#ifdef SUPPORT_UCP
3216 else if (oclength > 0 &&
3217 eptr <= md->end_subject - oclength &&
3218 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3219#endif /* SUPPORT_UCP */
3220 else
3221 {
3222 CHECK_PARTIAL();
3223 RRETURN(MATCH_NOMATCH);
3224 }
3225 }
3226 /* Control never gets here */
3227 }
3228
3229 else /* Maximize */
3230 {
3231 pp = eptr;
3232 for (i = min; i < max; i++)
3233 {
3234 if (eptr <= md->end_subject - length &&
3235 memcmp(eptr, charptr, length) == 0) eptr += length;
3236#ifdef SUPPORT_UCP
3237 else if (oclength > 0 &&
3238 eptr <= md->end_subject - oclength &&
3239 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3240#endif /* SUPPORT_UCP */
3241 else
3242 {
3243 CHECK_PARTIAL();
3244 break;
3245 }
3246 }
3247
3248 if (possessive) continue;
3249
3250 for(;;)
3251 {
3252 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3253 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3254 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3255#ifdef SUPPORT_UCP
3256 eptr--;
3257 BACKCHAR(eptr);
3258#else /* without SUPPORT_UCP */
3259 eptr -= length;
3260#endif /* SUPPORT_UCP */
3261 }
3262 }
3263 /* Control never gets here */
3264 }
3265
3266 /* If the length of a UTF-8 character is 1, we fall through here, and
3267 obey the code as for non-UTF-8 characters below, though in this case the
3268 value of fc will always be < 128. */
3269 }
3270 else
3271#endif /* SUPPORT_UTF8 */
3272
3273 /* When not in UTF-8 mode, load a single-byte character. */
3274
3275 fc = *ecode++;
3276
3277 /* The value of fc at this point is always less than 256, though we may or
3278 may not be in UTF-8 mode. The code is duplicated for the caseless and
3279 caseful cases, for speed, since matching characters is likely to be quite
3280 common. First, ensure the minimum number of matches are present. If min =
3281 max, continue at the same level without recursing. Otherwise, if
3282 minimizing, keep trying the rest of the expression and advancing one
3283 matching character if failing, up to the maximum. Alternatively, if
3284 maximizing, find the maximum number of characters and work backwards. */
3285
3286 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3287 max, eptr));
3288
3289 if (op >= OP_STARI) /* Caseless */
3290 {
3291 fc = md->lcc[fc];
3292 for (i = 1; i <= min; i++)
3293 {
3294 if (eptr >= md->end_subject)
3295 {
3296 SCHECK_PARTIAL();
3297 RRETURN(MATCH_NOMATCH);
3298 }
3299 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3300 }
3301 if (min == max) continue;
3302 if (minimize)
3303 {
3304 for (fi = min;; fi++)
3305 {
3306 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3307 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3308 if (fi >= max) RRETURN(MATCH_NOMATCH);
3309 if (eptr >= md->end_subject)
3310 {
3311 SCHECK_PARTIAL();
3312 RRETURN(MATCH_NOMATCH);
3313 }
3314 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3315 }
3316 /* Control never gets here */
3317 }
3318 else /* Maximize */
3319 {
3320 pp = eptr;
3321 for (i = min; i < max; i++)
3322 {
3323 if (eptr >= md->end_subject)
3324 {
3325 SCHECK_PARTIAL();
3326 break;
3327 }
3328 if (fc != md->lcc[*eptr]) break;
3329 eptr++;
3330 }
3331
3332 if (possessive) continue;
3333
3334 while (eptr >= pp)
3335 {
3336 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3337 eptr--;
3338 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3339 }
3340 RRETURN(MATCH_NOMATCH);
3341 }
3342 /* Control never gets here */
3343 }
3344
3345 /* Caseful comparisons (fc is a single byte; multibyte was handled above) */
3346
3347 else
3348 {
3349 for (i = 1; i <= min; i++)
3350 {
3351 if (eptr >= md->end_subject)
3352 {
3353 SCHECK_PARTIAL();
3354 RRETURN(MATCH_NOMATCH);
3355 }
3356 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3357 }
3358
3359 if (min == max) continue;
3360
3361 if (minimize)
3362 {
3363 for (fi = min;; fi++)
3364 {
3365 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3366 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3367 if (fi >= max) RRETURN(MATCH_NOMATCH);
3368 if (eptr >= md->end_subject)
3369 {
3370 SCHECK_PARTIAL();
3371 RRETURN(MATCH_NOMATCH);
3372 }
3373 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3374 }
3375 /* Control never gets here */
3376 }
3377 else /* Maximize */
3378 {
3379 pp = eptr;
3380 for (i = min; i < max; i++)
3381 {
3382 if (eptr >= md->end_subject)
3383 {
3384 SCHECK_PARTIAL();
3385 break;
3386 }
3387 if (fc != *eptr) break;
3388 eptr++;
3389 }
3390 if (possessive) continue;
3391
3392 while (eptr >= pp)
3393 {
3394 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3395 eptr--;
3396 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3397 }
3398 RRETURN(MATCH_NOMATCH);
3399 }
3400 }
3401 /* Control never gets here */
3402
3403 /* Match a negated single one-byte character. The character we are
3404 checking can be multibyte. */
3405
3406 case OP_NOT:
3407 case OP_NOTI:
3408 if (eptr >= md->end_subject)
3409 {
3410 SCHECK_PARTIAL();
3411 RRETURN(MATCH_NOMATCH);
3412 }
3413 ecode++;
3414 GETCHARINCTEST(c, eptr);
3415 if (op == OP_NOTI) /* The caseless case */
3416 {
3417#ifdef SUPPORT_UTF8
3418 if (c < 256)
3419#endif
3420 c = md->lcc[c];
3421 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
3422 }
3423 else /* Caseful */
3424 {
3425 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
3426 }
3427 break;
3428
3429 /* Match a negated single one-byte character repeatedly. This is almost a
3430 repeat of the code for a repeated single character, but I haven't found a
3431 nice way of commoning these up that doesn't require a test of the
3432 positive/negative option for each character match. Maybe that wouldn't add
3433 very much to the time taken, but character matching *is* what this is all
3434 about... */
3435
3436 case OP_NOTEXACT:
3437 case OP_NOTEXACTI:
3438 min = max = GET2(ecode, 1);
3439 ecode += 3;
3440 goto REPEATNOTCHAR;
3441
3442 case OP_NOTUPTO:
3443 case OP_NOTUPTOI:
3444 case OP_NOTMINUPTO:
3445 case OP_NOTMINUPTOI:
3446 min = 0;
3447 max = GET2(ecode, 1);
3448 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3449 ecode += 3;
3450 goto REPEATNOTCHAR;
3451
3452 case OP_NOTPOSSTAR:
3453 case OP_NOTPOSSTARI:
3454 possessive = TRUE;
3455 min = 0;
3456 max = INT_MAX;
3457 ecode++;
3458 goto REPEATNOTCHAR;
3459
3460 case OP_NOTPOSPLUS:
3461 case OP_NOTPOSPLUSI:
3462 possessive = TRUE;
3463 min = 1;
3464 max = INT_MAX;
3465 ecode++;
3466 goto REPEATNOTCHAR;
3467
3468 case OP_NOTPOSQUERY:
3469 case OP_NOTPOSQUERYI:
3470 possessive = TRUE;
3471 min = 0;
3472 max = 1;
3473 ecode++;
3474 goto REPEATNOTCHAR;
3475
3476 case OP_NOTPOSUPTO:
3477 case OP_NOTPOSUPTOI:
3478 possessive = TRUE;
3479 min = 0;
3480 max = GET2(ecode, 1);
3481 ecode += 3;
3482 goto REPEATNOTCHAR;
3483
3484 case OP_NOTSTAR:
3485 case OP_NOTSTARI:
3486 case OP_NOTMINSTAR:
3487 case OP_NOTMINSTARI:
3488 case OP_NOTPLUS:
3489 case OP_NOTPLUSI:
3490 case OP_NOTMINPLUS:
3491 case OP_NOTMINPLUSI:
3492 case OP_NOTQUERY:
3493 case OP_NOTQUERYI:
3494 case OP_NOTMINQUERY:
3495 case OP_NOTMINQUERYI:
3496 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3497 minimize = (c & 1) != 0;
3498 min = rep_min[c]; /* Pick up values from tables; */
3499 max = rep_max[c]; /* zero for max => infinity */
3500 if (max == 0) max = INT_MAX;
3501
3502 /* Common code for all repeated single-byte matches. */
3503
3504 REPEATNOTCHAR:
3505 fc = *ecode++;
3506
3507 /* The code is duplicated for the caseless and caseful cases, for speed,
3508 since matching characters is likely to be quite common. First, ensure the
3509 minimum number of matches are present. If min = max, continue at the same
3510 level without recursing. Otherwise, if minimizing, keep trying the rest of
3511 the expression and advancing one matching character if failing, up to the
3512 maximum. Alternatively, if maximizing, find the maximum number of
3513 characters and work backwards. */
3514
3515 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3516 max, eptr));
3517
3518 if (op >= OP_NOTSTARI) /* Caseless */
3519 {
3520 fc = md->lcc[fc];
3521
3522#ifdef SUPPORT_UTF8
3523 /* UTF-8 mode */
3524 if (utf8)
3525 {
3526 register unsigned int d;
3527 for (i = 1; i <= min; i++)
3528 {
3529 if (eptr >= md->end_subject)
3530 {
3531 SCHECK_PARTIAL();
3532 RRETURN(MATCH_NOMATCH);
3533 }
3534 GETCHARINC(d, eptr);
3535 if (d < 256) d = md->lcc[d];
3536 if (fc == d) RRETURN(MATCH_NOMATCH);
3537 }
3538 }
3539 else
3540#endif
3541
3542 /* Not UTF-8 mode */
3543 {
3544 for (i = 1; i <= min; i++)
3545 {
3546 if (eptr >= md->end_subject)
3547 {
3548 SCHECK_PARTIAL();
3549 RRETURN(MATCH_NOMATCH);
3550 }
3551 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3552 }
3553 }
3554
3555 if (min == max) continue;
3556
3557 if (minimize)
3558 {
3559#ifdef SUPPORT_UTF8
3560 /* UTF-8 mode */
3561 if (utf8)
3562 {
3563 register unsigned int d;
3564 for (fi = min;; fi++)
3565 {
3566 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3567 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3568 if (fi >= max) RRETURN(MATCH_NOMATCH);
3569 if (eptr >= md->end_subject)
3570 {
3571 SCHECK_PARTIAL();
3572 RRETURN(MATCH_NOMATCH);
3573 }
3574 GETCHARINC(d, eptr);
3575 if (d < 256) d = md->lcc[d];
3576 if (fc == d) RRETURN(MATCH_NOMATCH);
3577 }
3578 }
3579 else
3580#endif
3581 /* Not UTF-8 mode */
3582 {
3583 for (fi = min;; fi++)
3584 {
3585 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3586 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3587 if (fi >= max) RRETURN(MATCH_NOMATCH);
3588 if (eptr >= md->end_subject)
3589 {
3590 SCHECK_PARTIAL();
3591 RRETURN(MATCH_NOMATCH);
3592 }
3593 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3594 }
3595 }
3596 /* Control never gets here */
3597 }
3598
3599 /* Maximize case */
3600
3601 else
3602 {
3603 pp = eptr;
3604
3605#ifdef SUPPORT_UTF8
3606 /* UTF-8 mode */
3607 if (utf8)
3608 {
3609 register unsigned int d;
3610 for (i = min; i < max; i++)
3611 {
3612 int len = 1;
3613 if (eptr >= md->end_subject)
3614 {
3615 SCHECK_PARTIAL();
3616 break;
3617 }
3618 GETCHARLEN(d, eptr, len);
3619 if (d < 256) d = md->lcc[d];
3620 if (fc == d) break;
3621 eptr += len;
3622 }
3623 if (possessive) continue;
3624 for(;;)
3625 {
3626 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3627 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3628 if (eptr-- == pp) break; /* Stop if tried at original pos */
3629 BACKCHAR(eptr);
3630 }
3631 }
3632 else
3633#endif
3634 /* Not UTF-8 mode */
3635 {
3636 for (i = min; i < max; i++)
3637 {
3638 if (eptr >= md->end_subject)
3639 {
3640 SCHECK_PARTIAL();
3641 break;
3642 }
3643 if (fc == md->lcc[*eptr]) break;
3644 eptr++;
3645 }
3646 if (possessive) continue;
3647 while (eptr >= pp)
3648 {
3649 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3650 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3651 eptr--;
3652 }
3653 }
3654
3655 RRETURN(MATCH_NOMATCH);
3656 }
3657 /* Control never gets here */
3658 }
3659
3660 /* Caseful comparisons */
3661
3662 else
3663 {
3664#ifdef SUPPORT_UTF8
3665 /* UTF-8 mode */
3666 if (utf8)
3667 {
3668 register unsigned int d;
3669 for (i = 1; i <= min; i++)
3670 {
3671 if (eptr >= md->end_subject)
3672 {
3673 SCHECK_PARTIAL();
3674 RRETURN(MATCH_NOMATCH);
3675 }
3676 GETCHARINC(d, eptr);
3677 if (fc == d) RRETURN(MATCH_NOMATCH);
3678 }
3679 }
3680 else
3681#endif
3682 /* Not UTF-8 mode */
3683 {
3684 for (i = 1; i <= min; i++)
3685 {
3686 if (eptr >= md->end_subject)
3687 {
3688 SCHECK_PARTIAL();
3689 RRETURN(MATCH_NOMATCH);
3690 }
3691 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3692 }
3693 }
3694
3695 if (min == max) continue;
3696
3697 if (minimize)
3698 {
3699#ifdef SUPPORT_UTF8
3700 /* UTF-8 mode */
3701 if (utf8)
3702 {
3703 register unsigned int d;
3704 for (fi = min;; fi++)
3705 {
3706 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3707 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3708 if (fi >= max) RRETURN(MATCH_NOMATCH);
3709 if (eptr >= md->end_subject)
3710 {
3711 SCHECK_PARTIAL();
3712 RRETURN(MATCH_NOMATCH);
3713 }
3714 GETCHARINC(d, eptr);
3715 if (fc == d) RRETURN(MATCH_NOMATCH);
3716 }
3717 }
3718 else
3719#endif
3720 /* Not UTF-8 mode */
3721 {
3722 for (fi = min;; fi++)
3723 {
3724 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3725 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3726 if (fi >= max) RRETURN(MATCH_NOMATCH);
3727 if (eptr >= md->end_subject)
3728 {
3729 SCHECK_PARTIAL();
3730 RRETURN(MATCH_NOMATCH);
3731 }
3732 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3733 }
3734 }
3735 /* Control never gets here */
3736 }
3737
3738 /* Maximize case */
3739
3740 else
3741 {
3742 pp = eptr;
3743
3744#ifdef SUPPORT_UTF8
3745 /* UTF-8 mode */
3746 if (utf8)
3747 {
3748 register unsigned int d;
3749 for (i = min; i < max; i++)
3750 {
3751 int len = 1;
3752 if (eptr >= md->end_subject)
3753 {
3754 SCHECK_PARTIAL();
3755 break;
3756 }
3757 GETCHARLEN(d, eptr, len);
3758 if (fc == d) break;
3759 eptr += len;
3760 }
3761 if (possessive) continue;
3762 for(;;)
3763 {
3764 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3765 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3766 if (eptr-- == pp) break; /* Stop if tried at original pos */
3767 BACKCHAR(eptr);
3768 }
3769 }
3770 else
3771#endif
3772 /* Not UTF-8 mode */
3773 {
3774 for (i = min; i < max; i++)
3775 {
3776 if (eptr >= md->end_subject)
3777 {
3778 SCHECK_PARTIAL();
3779 break;
3780 }
3781 if (fc == *eptr) break;
3782 eptr++;
3783 }
3784 if (possessive) continue;
3785 while (eptr >= pp)
3786 {
3787 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3788 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3789 eptr--;
3790 }
3791 }
3792
3793 RRETURN(MATCH_NOMATCH);
3794 }
3795 }
3796 /* Control never gets here */
3797
3798 /* Match a single character type repeatedly; several different opcodes
3799 share code. This is very similar to the code for single characters, but we
3800 repeat it in the interests of efficiency. */
3801
3802 case OP_TYPEEXACT:
3803 min = max = GET2(ecode, 1);
3804 minimize = TRUE;
3805 ecode += 3;
3806 goto REPEATTYPE;
3807
3808 case OP_TYPEUPTO:
3809 case OP_TYPEMINUPTO:
3810 min = 0;
3811 max = GET2(ecode, 1);
3812 minimize = *ecode == OP_TYPEMINUPTO;
3813 ecode += 3;
3814 goto REPEATTYPE;
3815
3816 case OP_TYPEPOSSTAR:
3817 possessive = TRUE;
3818 min = 0;
3819 max = INT_MAX;
3820 ecode++;
3821 goto REPEATTYPE;
3822
3823 case OP_TYPEPOSPLUS:
3824 possessive = TRUE;
3825 min = 1;
3826 max = INT_MAX;
3827 ecode++;
3828 goto REPEATTYPE;
3829
3830 case OP_TYPEPOSQUERY:
3831 possessive = TRUE;
3832 min = 0;
3833 max = 1;
3834 ecode++;
3835 goto REPEATTYPE;
3836
3837 case OP_TYPEPOSUPTO:
3838 possessive = TRUE;
3839 min = 0;
3840 max = GET2(ecode, 1);
3841 ecode += 3;
3842 goto REPEATTYPE;
3843
3844 case OP_TYPESTAR:
3845 case OP_TYPEMINSTAR:
3846 case OP_TYPEPLUS:
3847 case OP_TYPEMINPLUS:
3848 case OP_TYPEQUERY:
3849 case OP_TYPEMINQUERY:
3850 c = *ecode++ - OP_TYPESTAR;
3851 minimize = (c & 1) != 0;
3852 min = rep_min[c]; /* Pick up values from tables; */
3853 max = rep_max[c]; /* zero for max => infinity */
3854 if (max == 0) max = INT_MAX;
3855
3856 /* Common code for all repeated single character type matches. Note that
3857 in UTF-8 mode, '.' and the negated, \h, \v and \R types can match multibyte
3858 characters; only \d, \s and \w are restricted to one-byte characters. */
3859
3860 REPEATTYPE:
3861 ctype = *ecode++; /* Code for the character type */
3862
3863#ifdef SUPPORT_UCP
3864 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3865 {
3866 prop_fail_result = ctype == OP_NOTPROP;
3867 prop_type = *ecode++;
3868 prop_value = *ecode++;
3869 }
3870 else prop_type = -1;
3871#endif
3872
3873 /* First, ensure the minimum number of matches are present. Use inline
3874 code for maximizing the speed, and do the type test once at the start
3875 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3876 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3877 and single-bytes. */
3878
3879 if (min > 0)
3880 {
3881#ifdef SUPPORT_UCP
3882 if (prop_type >= 0)
3883 {
3884 switch(prop_type)
3885 {
3886 case PT_ANY:
3887 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3888 for (i = 1; i <= min; i++)
3889 {
3890 if (eptr >= md->end_subject)
3891 {
3892 SCHECK_PARTIAL();
3893 RRETURN(MATCH_NOMATCH);
3894 }
3895 GETCHARINCTEST(c, eptr);
3896 }
3897 break;
3898
3899 case PT_LAMP:
3900 for (i = 1; i <= min; i++)
3901 {
3902 int chartype;
3903 if (eptr >= md->end_subject)
3904 {
3905 SCHECK_PARTIAL();
3906 RRETURN(MATCH_NOMATCH);
3907 }
3908 GETCHARINCTEST(c, eptr);
3909 chartype = UCD_CHARTYPE(c);
3910 if ((chartype == ucp_Lu ||
3911 chartype == ucp_Ll ||
3912 chartype == ucp_Lt) == prop_fail_result)
3913 RRETURN(MATCH_NOMATCH);
3914 }
3915 break;
3916
3917 case PT_GC:
3918 for (i = 1; i <= min; i++)
3919 {
3920 if (eptr >= md->end_subject)
3921 {
3922 SCHECK_PARTIAL();
3923 RRETURN(MATCH_NOMATCH);
3924 }
3925 GETCHARINCTEST(c, eptr);
3926 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3927 RRETURN(MATCH_NOMATCH);
3928 }
3929 break;
3930
3931 case PT_PC:
3932 for (i = 1; i <= min; i++)
3933 {
3934 if (eptr >= md->end_subject)
3935 {
3936 SCHECK_PARTIAL();
3937 RRETURN(MATCH_NOMATCH);
3938 }
3939 GETCHARINCTEST(c, eptr);
3940 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3941 RRETURN(MATCH_NOMATCH);
3942 }
3943 break;
3944
3945 case PT_SC:
3946 for (i = 1; i <= min; i++)
3947 {
3948 if (eptr >= md->end_subject)
3949 {
3950 SCHECK_PARTIAL();
3951 RRETURN(MATCH_NOMATCH);
3952 }
3953 GETCHARINCTEST(c, eptr);
3954 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3955 RRETURN(MATCH_NOMATCH);
3956 }
3957 break;
3958
3959 case PT_ALNUM:
3960 for (i = 1; i <= min; i++)
3961 {
3962 int category;
3963 if (eptr >= md->end_subject)
3964 {
3965 SCHECK_PARTIAL();
3966 RRETURN(MATCH_NOMATCH);
3967 }
3968 GETCHARINCTEST(c, eptr);
3969 category = UCD_CATEGORY(c);
3970 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3971 RRETURN(MATCH_NOMATCH);
3972 }
3973 break;
3974
3975 case PT_SPACE: /* Perl space */
3976 for (i = 1; i <= min; i++)
3977 {
3978 if (eptr >= md->end_subject)
3979 {
3980 SCHECK_PARTIAL();
3981 RRETURN(MATCH_NOMATCH);
3982 }
3983 GETCHARINCTEST(c, eptr);
3984 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3985 c == CHAR_FF || c == CHAR_CR)
3986 == prop_fail_result)
3987 RRETURN(MATCH_NOMATCH);
3988 }
3989 break;
3990
3991 case PT_PXSPACE: /* POSIX space */
3992 for (i = 1; i <= min; i++)
3993 {
3994 if (eptr >= md->end_subject)
3995 {
3996 SCHECK_PARTIAL();
3997 RRETURN(MATCH_NOMATCH);
3998 }
3999 GETCHARINCTEST(c, eptr);
4000 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4001 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4002 == prop_fail_result)
4003 RRETURN(MATCH_NOMATCH);
4004 }
4005 break;
4006
4007 case PT_WORD:
4008 for (i = 1; i <= min; i++)
4009 {
4010 int category;
4011 if (eptr >= md->end_subject)
4012 {
4013 SCHECK_PARTIAL();
4014 RRETURN(MATCH_NOMATCH);
4015 }
4016 GETCHARINCTEST(c, eptr);
4017 category = UCD_CATEGORY(c);
4018 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4019 == prop_fail_result)
4020 RRETURN(MATCH_NOMATCH);
4021 }
4022 break;
4023
4024 /* This should not occur */
4025
4026 default:
4027 RRETURN(PCRE_ERROR_INTERNAL);
4028 }
4029 }
4030
4031 /* Match extended Unicode sequences. We will get here only if the
4032 support is in the binary; otherwise a compile-time error occurs. */
4033
4034 else if (ctype == OP_EXTUNI)
4035 {
4036 for (i = 1; i <= min; i++)
4037 {
4038 if (eptr >= md->end_subject)
4039 {
4040 SCHECK_PARTIAL();
4041 RRETURN(MATCH_NOMATCH);
4042 }
4043 GETCHARINCTEST(c, eptr);
4044 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4045 while (eptr < md->end_subject)
4046 {
4047 int len = 1;
4048 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4049 if (UCD_CATEGORY(c) != ucp_M) break;
4050 eptr += len;
4051 }
4052 }
4053 }
4054
4055 else
4056#endif /* SUPPORT_UCP */
4057
4058/* Handle all other cases when the coding is UTF-8 */
4059
4060#ifdef SUPPORT_UTF8
4061 if (utf8) switch(ctype)
4062 {
4063 case OP_ANY:
4064 for (i = 1; i <= min; i++)
4065 {
4066 if (eptr >= md->end_subject)
4067 {
4068 SCHECK_PARTIAL();
4069 RRETURN(MATCH_NOMATCH);
4070 }
4071 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4072 eptr++;
4073 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4074 }
4075 break;
4076
4077 case OP_ALLANY:
4078 for (i = 1; i <= min; i++)
4079 {
4080 if (eptr >= md->end_subject)
4081 {
4082 SCHECK_PARTIAL();
4083 RRETURN(MATCH_NOMATCH);
4084 }
4085 eptr++;
4086 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4087 }
4088 break;
4089
4090 case OP_ANYBYTE:
4091 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4092 eptr += min;
4093 break;
4094
4095 case OP_ANYNL:
4096 for (i = 1; i <= min; i++)
4097 {
4098 if (eptr >= md->end_subject)
4099 {
4100 SCHECK_PARTIAL();
4101 RRETURN(MATCH_NOMATCH);
4102 }
4103 GETCHARINC(c, eptr);
4104 switch(c)
4105 {
4106 default: RRETURN(MATCH_NOMATCH);
4107
4108 case 0x000d:
4109 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4110 break;
4111
4112 case 0x000a:
4113 break;
4114
4115 case 0x000b:
4116 case 0x000c:
4117 case 0x0085:
4118 case 0x2028:
4119 case 0x2029:
4120 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4121 break;
4122 }
4123 }
4124 break;
4125
4126 case OP_NOT_HSPACE:
4127 for (i = 1; i <= min; i++)
4128 {
4129 if (eptr >= md->end_subject)
4130 {
4131 SCHECK_PARTIAL();
4132 RRETURN(MATCH_NOMATCH);
4133 }
4134 GETCHARINC(c, eptr);
4135 switch(c)
4136 {
4137 default: break;
4138 case 0x09: /* HT */
4139 case 0x20: /* SPACE */
4140 case 0xa0: /* NBSP */
4141 case 0x1680: /* OGHAM SPACE MARK */
4142 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4143 case 0x2000: /* EN QUAD */
4144 case 0x2001: /* EM QUAD */
4145 case 0x2002: /* EN SPACE */
4146 case 0x2003: /* EM SPACE */
4147 case 0x2004: /* THREE-PER-EM SPACE */
4148 case 0x2005: /* FOUR-PER-EM SPACE */
4149 case 0x2006: /* SIX-PER-EM SPACE */
4150 case 0x2007: /* FIGURE SPACE */
4151 case 0x2008: /* PUNCTUATION SPACE */
4152 case 0x2009: /* THIN SPACE */
4153 case 0x200A: /* HAIR SPACE */
4154 case 0x202f: /* NARROW NO-BREAK SPACE */
4155 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4156 case 0x3000: /* IDEOGRAPHIC SPACE */
4157 RRETURN(MATCH_NOMATCH);
4158 }
4159 }
4160 break;
4161
4162 case OP_HSPACE:
4163 for (i = 1; i <= min; i++)
4164 {
4165 if (eptr >= md->end_subject)
4166 {
4167 SCHECK_PARTIAL();
4168 RRETURN(MATCH_NOMATCH);
4169 }
4170 GETCHARINC(c, eptr);
4171 switch(c)
4172 {
4173 default: RRETURN(MATCH_NOMATCH);
4174 case 0x09: /* HT */
4175 case 0x20: /* SPACE */
4176 case 0xa0: /* NBSP */
4177 case 0x1680: /* OGHAM SPACE MARK */
4178 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4179 case 0x2000: /* EN QUAD */
4180 case 0x2001: /* EM QUAD */
4181 case 0x2002: /* EN SPACE */
4182 case 0x2003: /* EM SPACE */
4183 case 0x2004: /* THREE-PER-EM SPACE */
4184 case 0x2005: /* FOUR-PER-EM SPACE */
4185 case 0x2006: /* SIX-PER-EM SPACE */
4186 case 0x2007: /* FIGURE SPACE */
4187 case 0x2008: /* PUNCTUATION SPACE */
4188 case 0x2009: /* THIN SPACE */
4189 case 0x200A: /* HAIR SPACE */
4190 case 0x202f: /* NARROW NO-BREAK SPACE */
4191 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4192 case 0x3000: /* IDEOGRAPHIC SPACE */
4193 break;
4194 }
4195 }
4196 break;
4197
4198 case OP_NOT_VSPACE:
4199 for (i = 1; i <= min; i++)
4200 {
4201 if (eptr >= md->end_subject)
4202 {
4203 SCHECK_PARTIAL();
4204 RRETURN(MATCH_NOMATCH);
4205 }
4206 GETCHARINC(c, eptr);
4207 switch(c)
4208 {
4209 default: break;
4210 case 0x0a: /* LF */
4211 case 0x0b: /* VT */
4212 case 0x0c: /* FF */
4213 case 0x0d: /* CR */
4214 case 0x85: /* NEL */
4215 case 0x2028: /* LINE SEPARATOR */
4216 case 0x2029: /* PARAGRAPH SEPARATOR */
4217 RRETURN(MATCH_NOMATCH);
4218 }
4219 }
4220 break;
4221
4222 case OP_VSPACE:
4223 for (i = 1; i <= min; i++)
4224 {
4225 if (eptr >= md->end_subject)
4226 {
4227 SCHECK_PARTIAL();
4228 RRETURN(MATCH_NOMATCH);
4229 }
4230 GETCHARINC(c, eptr);
4231 switch(c)
4232 {
4233 default: RRETURN(MATCH_NOMATCH);
4234 case 0x0a: /* LF */
4235 case 0x0b: /* VT */
4236 case 0x0c: /* FF */
4237 case 0x0d: /* CR */
4238 case 0x85: /* NEL */
4239 case 0x2028: /* LINE SEPARATOR */
4240 case 0x2029: /* PARAGRAPH SEPARATOR */
4241 break;
4242 }
4243 }
4244 break;
4245
4246 case OP_NOT_DIGIT:
4247 for (i = 1; i <= min; i++)
4248 {
4249 if (eptr >= md->end_subject)
4250 {
4251 SCHECK_PARTIAL();
4252 RRETURN(MATCH_NOMATCH);
4253 }
4254 GETCHARINC(c, eptr);
4255 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4256 RRETURN(MATCH_NOMATCH);
4257 }
4258 break;
4259
4260 case OP_DIGIT:
4261 for (i = 1; i <= min; i++)
4262 {
4263 if (eptr >= md->end_subject)
4264 {
4265 SCHECK_PARTIAL();
4266 RRETURN(MATCH_NOMATCH);
4267 }
4268 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4269 RRETURN(MATCH_NOMATCH);
4270 /* No need to skip more bytes - we know it's a 1-byte character */
4271 }
4272 break;
4273
4274 case OP_NOT_WHITESPACE:
4275 for (i = 1; i <= min; i++)
4276 {
4277 if (eptr >= md->end_subject)
4278 {
4279 SCHECK_PARTIAL();
4280 RRETURN(MATCH_NOMATCH);
4281 }
4282 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4283 RRETURN(MATCH_NOMATCH);
4284 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4285 }
4286 break;
4287
4288 case OP_WHITESPACE:
4289 for (i = 1; i <= min; i++)
4290 {
4291 if (eptr >= md->end_subject)
4292 {
4293 SCHECK_PARTIAL();
4294 RRETURN(MATCH_NOMATCH);
4295 }
4296 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4297 RRETURN(MATCH_NOMATCH);
4298 /* No need to skip more bytes - we know it's a 1-byte character */
4299 }
4300 break;
4301
4302 case OP_NOT_WORDCHAR:
4303 for (i = 1; i <= min; i++)
4304 {
4305 if (eptr >= md->end_subject)
4306 {
4307 SCHECK_PARTIAL();
4308 RRETURN(MATCH_NOMATCH);
4309 }
4310 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4311 RRETURN(MATCH_NOMATCH);
4312 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4313 }
4314 break;
4315
4316 case OP_WORDCHAR:
4317 for (i = 1; i <= min; i++)
4318 {
4319 if (eptr >= md->end_subject)
4320 {
4321 SCHECK_PARTIAL();
4322 RRETURN(MATCH_NOMATCH);
4323 }
4324 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4325 RRETURN(MATCH_NOMATCH);
4326 /* No need to skip more bytes - we know it's a 1-byte character */
4327 }
4328 break;
4329
4330 default:
4331 RRETURN(PCRE_ERROR_INTERNAL);
4332 } /* End switch(ctype) */
4333
4334 else
4335#endif /* SUPPORT_UTF8 */
4336
4337 /* Code for the non-UTF-8 case for minimum matching of operators other
4338 than OP_PROP and OP_NOTPROP. */
4339
4340 switch(ctype)
4341 {
4342 case OP_ANY:
4343 for (i = 1; i <= min; i++)
4344 {
4345 if (eptr >= md->end_subject)
4346 {
4347 SCHECK_PARTIAL();
4348 RRETURN(MATCH_NOMATCH);
4349 }
4350 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4351 eptr++;
4352 }
4353 break;
4354
4355 case OP_ALLANY:
4356 if (eptr > md->end_subject - min)
4357 {
4358 SCHECK_PARTIAL();
4359 RRETURN(MATCH_NOMATCH);
4360 }
4361 eptr += min;
4362 break;
4363
4364 case OP_ANYBYTE:
4365 if (eptr > md->end_subject - min)
4366 {
4367 SCHECK_PARTIAL();
4368 RRETURN(MATCH_NOMATCH);
4369 }
4370 eptr += min;
4371 break;
4372
4373 case OP_ANYNL:
4374 for (i = 1; i <= min; i++)
4375 {
4376 if (eptr >= md->end_subject)
4377 {
4378 SCHECK_PARTIAL();
4379 RRETURN(MATCH_NOMATCH);
4380 }
4381 switch(*eptr++)
4382 {
4383 default: RRETURN(MATCH_NOMATCH);
4384
4385 case 0x000d:
4386 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4387 break;
4388
4389 case 0x000a:
4390 break;
4391
4392 case 0x000b:
4393 case 0x000c:
4394 case 0x0085:
4395 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4396 break;
4397 }
4398 }
4399 break;
4400
4401 case OP_NOT_HSPACE:
4402 for (i = 1; i <= min; i++)
4403 {
4404 if (eptr >= md->end_subject)
4405 {
4406 SCHECK_PARTIAL();
4407 RRETURN(MATCH_NOMATCH);
4408 }
4409 switch(*eptr++)
4410 {
4411 default: break;
4412 case 0x09: /* HT */
4413 case 0x20: /* SPACE */
4414 case 0xa0: /* NBSP */
4415 RRETURN(MATCH_NOMATCH);
4416 }
4417 }
4418 break;
4419
4420 case OP_HSPACE:
4421 for (i = 1; i <= min; i++)
4422 {
4423 if (eptr >= md->end_subject)
4424 {
4425 SCHECK_PARTIAL();
4426 RRETURN(MATCH_NOMATCH);
4427 }
4428 switch(*eptr++)
4429 {
4430 default: RRETURN(MATCH_NOMATCH);
4431 case 0x09: /* HT */
4432 case 0x20: /* SPACE */
4433 case 0xa0: /* NBSP */
4434 break;
4435 }
4436 }
4437 break;
4438
4439 case OP_NOT_VSPACE:
4440 for (i = 1; i <= min; i++)
4441 {
4442 if (eptr >= md->end_subject)
4443 {
4444 SCHECK_PARTIAL();
4445 RRETURN(MATCH_NOMATCH);
4446 }
4447 switch(*eptr++)
4448 {
4449 default: break;
4450 case 0x0a: /* LF */
4451 case 0x0b: /* VT */
4452 case 0x0c: /* FF */
4453 case 0x0d: /* CR */
4454 case 0x85: /* NEL */
4455 RRETURN(MATCH_NOMATCH);
4456 }
4457 }
4458 break;
4459
4460 case OP_VSPACE:
4461 for (i = 1; i <= min; i++)
4462 {
4463 if (eptr >= md->end_subject)
4464 {
4465 SCHECK_PARTIAL();
4466 RRETURN(MATCH_NOMATCH);
4467 }
4468 switch(*eptr++)
4469 {
4470 default: RRETURN(MATCH_NOMATCH);
4471 case 0x0a: /* LF */
4472 case 0x0b: /* VT */
4473 case 0x0c: /* FF */
4474 case 0x0d: /* CR */
4475 case 0x85: /* NEL */
4476 break;
4477 }
4478 }
4479 break;
4480
4481 case OP_NOT_DIGIT:
4482 for (i = 1; i <= min; i++)
4483 {
4484 if (eptr >= md->end_subject)
4485 {
4486 SCHECK_PARTIAL();
4487 RRETURN(MATCH_NOMATCH);
4488 }
4489 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
4490 }
4491 break;
4492
4493 case OP_DIGIT:
4494 for (i = 1; i <= min; i++)
4495 {
4496 if (eptr >= md->end_subject)
4497 {
4498 SCHECK_PARTIAL();
4499 RRETURN(MATCH_NOMATCH);
4500 }
4501 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
4502 }
4503 break;
4504
4505 case OP_NOT_WHITESPACE:
4506 for (i = 1; i <= min; i++)
4507 {
4508 if (eptr >= md->end_subject)
4509 {
4510 SCHECK_PARTIAL();
4511 RRETURN(MATCH_NOMATCH);
4512 }
4513 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
4514 }
4515 break;
4516
4517 case OP_WHITESPACE:
4518 for (i = 1; i <= min; i++)
4519 {
4520 if (eptr >= md->end_subject)
4521 {
4522 SCHECK_PARTIAL();
4523 RRETURN(MATCH_NOMATCH);
4524 }
4525 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
4526 }
4527 break;
4528
4529 case OP_NOT_WORDCHAR:
4530 for (i = 1; i <= min; i++)
4531 {
4532 if (eptr >= md->end_subject)
4533 {
4534 SCHECK_PARTIAL();
4535 RRETURN(MATCH_NOMATCH);
4536 }
4537 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4538 RRETURN(MATCH_NOMATCH);
4539 }
4540 break;
4541
4542 case OP_WORDCHAR:
4543 for (i = 1; i <= min; i++)
4544 {
4545 if (eptr >= md->end_subject)
4546 {
4547 SCHECK_PARTIAL();
4548 RRETURN(MATCH_NOMATCH);
4549 }
4550 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4551 RRETURN(MATCH_NOMATCH);
4552 }
4553 break;
4554
4555 default:
4556 RRETURN(PCRE_ERROR_INTERNAL);
4557 }
4558 }
4559
4560 /* If min = max, continue at the same level without recursing */
4561
4562 if (min == max) continue;
4563
4564 /* If minimizing, we have to test the rest of the pattern before each
4565 subsequent match. Again, separate the UTF-8 case for speed, and also
4566 separate the UCP cases. */
4567
4568 if (minimize)
4569 {
4570#ifdef SUPPORT_UCP
4571 if (prop_type >= 0)
4572 {
4573 switch(prop_type)
4574 {
4575 case PT_ANY:
4576 for (fi = min;; fi++)
4577 {
4578 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4579 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4580 if (fi >= max) RRETURN(MATCH_NOMATCH);
4581 if (eptr >= md->end_subject)
4582 {
4583 SCHECK_PARTIAL();
4584 RRETURN(MATCH_NOMATCH);
4585 }
4586 GETCHARINCTEST(c, eptr);
4587 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4588 }
4589 /* Control never gets here */
4590
4591 case PT_LAMP:
4592 for (fi = min;; fi++)
4593 {
4594 int chartype;
4595 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4596 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4597 if (fi >= max) RRETURN(MATCH_NOMATCH);
4598 if (eptr >= md->end_subject)
4599 {
4600 SCHECK_PARTIAL();
4601 RRETURN(MATCH_NOMATCH);
4602 }
4603 GETCHARINCTEST(c, eptr);
4604 chartype = UCD_CHARTYPE(c);
4605 if ((chartype == ucp_Lu ||
4606 chartype == ucp_Ll ||
4607 chartype == ucp_Lt) == prop_fail_result)
4608 RRETURN(MATCH_NOMATCH);
4609 }
4610 /* Control never gets here */
4611
4612 case PT_GC:
4613 for (fi = min;; fi++)
4614 {
4615 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4616 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4617 if (fi >= max) RRETURN(MATCH_NOMATCH);
4618 if (eptr >= md->end_subject)
4619 {
4620 SCHECK_PARTIAL();
4621 RRETURN(MATCH_NOMATCH);
4622 }
4623 GETCHARINCTEST(c, eptr);
4624 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4625 RRETURN(MATCH_NOMATCH);
4626 }
4627 /* Control never gets here */
4628
4629 case PT_PC:
4630 for (fi = min;; fi++)
4631 {
4632 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4633 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4634 if (fi >= max) RRETURN(MATCH_NOMATCH);
4635 if (eptr >= md->end_subject)
4636 {
4637 SCHECK_PARTIAL();
4638 RRETURN(MATCH_NOMATCH);
4639 }
4640 GETCHARINCTEST(c, eptr);
4641 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4642 RRETURN(MATCH_NOMATCH);
4643 }
4644 /* Control never gets here */
4645
4646 case PT_SC:
4647 for (fi = min;; fi++)
4648 {
4649 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4650 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4651 if (fi >= max) RRETURN(MATCH_NOMATCH);
4652 if (eptr >= md->end_subject)
4653 {
4654 SCHECK_PARTIAL();
4655 RRETURN(MATCH_NOMATCH);
4656 }
4657 GETCHARINCTEST(c, eptr);
4658 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4659 RRETURN(MATCH_NOMATCH);
4660 }
4661 /* Control never gets here */
4662
4663 case PT_ALNUM:
4664 for (fi = min;; fi++)
4665 {
4666 int category;
4667 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4668 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4669 if (fi >= max) RRETURN(MATCH_NOMATCH);
4670 if (eptr >= md->end_subject)
4671 {
4672 SCHECK_PARTIAL();
4673 RRETURN(MATCH_NOMATCH);
4674 }
4675 GETCHARINCTEST(c, eptr);
4676 category = UCD_CATEGORY(c);
4677 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4678 RRETURN(MATCH_NOMATCH);
4679 }
4680 /* Control never gets here */
4681
4682 case PT_SPACE: /* Perl space */
4683 for (fi = min;; fi++)
4684 {
4685 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4686 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4687 if (fi >= max) RRETURN(MATCH_NOMATCH);
4688 if (eptr >= md->end_subject)
4689 {
4690 SCHECK_PARTIAL();
4691 RRETURN(MATCH_NOMATCH);
4692 }
4693 GETCHARINCTEST(c, eptr);
4694 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4695 c == CHAR_FF || c == CHAR_CR)
4696 == prop_fail_result)
4697 RRETURN(MATCH_NOMATCH);
4698 }
4699 /* Control never gets here */
4700
4701 case PT_PXSPACE: /* POSIX space */
4702 for (fi = min;; fi++)
4703 {
4704 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4705 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4706 if (fi >= max) RRETURN(MATCH_NOMATCH);
4707 if (eptr >= md->end_subject)
4708 {
4709 SCHECK_PARTIAL();
4710 RRETURN(MATCH_NOMATCH);
4711 }
4712 GETCHARINCTEST(c, eptr);
4713 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4714 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4715 == prop_fail_result)
4716 RRETURN(MATCH_NOMATCH);
4717 }
4718 /* Control never gets here */
4719
4720 case PT_WORD:
4721 for (fi = min;; fi++)
4722 {
4723 int category;
4724 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4725 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4726 if (fi >= max) RRETURN(MATCH_NOMATCH);
4727 if (eptr >= md->end_subject)
4728 {
4729 SCHECK_PARTIAL();
4730 RRETURN(MATCH_NOMATCH);
4731 }
4732 GETCHARINCTEST(c, eptr);
4733 category = UCD_CATEGORY(c);
4734 if ((category == ucp_L ||
4735 category == ucp_N ||
4736 c == CHAR_UNDERSCORE)
4737 == prop_fail_result)
4738 RRETURN(MATCH_NOMATCH);
4739 }
4740 /* Control never gets here */
4741
4742 /* This should never occur */
4743
4744 default:
4745 RRETURN(PCRE_ERROR_INTERNAL);
4746 }
4747 }
4748
4749 /* Match extended Unicode sequences. We will get here only if the
4750 support is in the binary; otherwise a compile-time error occurs. */
4751
4752 else if (ctype == OP_EXTUNI)
4753 {
4754 for (fi = min;; fi++)
4755 {
4756 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4757 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4758 if (fi >= max) RRETURN(MATCH_NOMATCH);
4759 if (eptr >= md->end_subject)
4760 {
4761 SCHECK_PARTIAL();
4762 RRETURN(MATCH_NOMATCH);
4763 }
4764 GETCHARINCTEST(c, eptr);
4765 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4766 while (eptr < md->end_subject)
4767 {
4768 int len = 1;
4769 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4770 if (UCD_CATEGORY(c) != ucp_M) break;
4771 eptr += len;
4772 }
4773 }
4774 }
4775 else
4776#endif /* SUPPORT_UCP */
4777
4778#ifdef SUPPORT_UTF8
4779 /* UTF-8 mode */
4780 if (utf8)
4781 {
4782 for (fi = min;; fi++)
4783 {
4784 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4786 if (fi >= max) RRETURN(MATCH_NOMATCH);
4787 if (eptr >= md->end_subject)
4788 {
4789 SCHECK_PARTIAL();
4790 RRETURN(MATCH_NOMATCH);
4791 }
4792 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4793 RRETURN(MATCH_NOMATCH);
4794 GETCHARINC(c, eptr);
4795 switch(ctype)
4796 {
4797 case OP_ANY: /* This is the non-NL case */
4798 case OP_ALLANY:
4799 case OP_ANYBYTE:
4800 break;
4801
4802 case OP_ANYNL:
4803 switch(c)
4804 {
4805 default: RRETURN(MATCH_NOMATCH);
4806 case 0x000d:
4807 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4808 break;
4809 case 0x000a:
4810 break;
4811
4812 case 0x000b:
4813 case 0x000c:
4814 case 0x0085:
4815 case 0x2028:
4816 case 0x2029:
4817 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4818 break;
4819 }
4820 break;
4821
4822 case OP_NOT_HSPACE:
4823 switch(c)
4824 {
4825 default: break;
4826 case 0x09: /* HT */
4827 case 0x20: /* SPACE */
4828 case 0xa0: /* NBSP */
4829 case 0x1680: /* OGHAM SPACE MARK */
4830 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4831 case 0x2000: /* EN QUAD */
4832 case 0x2001: /* EM QUAD */
4833 case 0x2002: /* EN SPACE */
4834 case 0x2003: /* EM SPACE */
4835 case 0x2004: /* THREE-PER-EM SPACE */
4836 case 0x2005: /* FOUR-PER-EM SPACE */
4837 case 0x2006: /* SIX-PER-EM SPACE */
4838 case 0x2007: /* FIGURE SPACE */
4839 case 0x2008: /* PUNCTUATION SPACE */
4840 case 0x2009: /* THIN SPACE */
4841 case 0x200A: /* HAIR SPACE */
4842 case 0x202f: /* NARROW NO-BREAK SPACE */
4843 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4844 case 0x3000: /* IDEOGRAPHIC SPACE */
4845 RRETURN(MATCH_NOMATCH);
4846 }
4847 break;
4848
4849 case OP_HSPACE:
4850 switch(c)
4851 {
4852 default: RRETURN(MATCH_NOMATCH);
4853 case 0x09: /* HT */
4854 case 0x20: /* SPACE */
4855 case 0xa0: /* NBSP */
4856 case 0x1680: /* OGHAM SPACE MARK */
4857 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4858 case 0x2000: /* EN QUAD */
4859 case 0x2001: /* EM QUAD */
4860 case 0x2002: /* EN SPACE */
4861 case 0x2003: /* EM SPACE */
4862 case 0x2004: /* THREE-PER-EM SPACE */
4863 case 0x2005: /* FOUR-PER-EM SPACE */
4864 case 0x2006: /* SIX-PER-EM SPACE */
4865 case 0x2007: /* FIGURE SPACE */
4866 case 0x2008: /* PUNCTUATION SPACE */
4867 case 0x2009: /* THIN SPACE */
4868 case 0x200A: /* HAIR SPACE */
4869 case 0x202f: /* NARROW NO-BREAK SPACE */
4870 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4871 case 0x3000: /* IDEOGRAPHIC SPACE */
4872 break;
4873 }
4874 break;
4875
4876 case OP_NOT_VSPACE:
4877 switch(c)
4878 {
4879 default: break;
4880 case 0x0a: /* LF */
4881 case 0x0b: /* VT */
4882 case 0x0c: /* FF */
4883 case 0x0d: /* CR */
4884 case 0x85: /* NEL */
4885 case 0x2028: /* LINE SEPARATOR */
4886 case 0x2029: /* PARAGRAPH SEPARATOR */
4887 RRETURN(MATCH_NOMATCH);
4888 }
4889 break;
4890
4891 case OP_VSPACE:
4892 switch(c)
4893 {
4894 default: RRETURN(MATCH_NOMATCH);
4895 case 0x0a: /* LF */
4896 case 0x0b: /* VT */
4897 case 0x0c: /* FF */
4898 case 0x0d: /* CR */
4899 case 0x85: /* NEL */
4900 case 0x2028: /* LINE SEPARATOR */
4901 case 0x2029: /* PARAGRAPH SEPARATOR */
4902 break;
4903 }
4904 break;
4905
4906 case OP_NOT_DIGIT:
4907 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4908 RRETURN(MATCH_NOMATCH);
4909 break;
4910
4911 case OP_DIGIT:
4912 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4913 RRETURN(MATCH_NOMATCH);
4914 break;
4915
4916 case OP_NOT_WHITESPACE:
4917 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4918 RRETURN(MATCH_NOMATCH);
4919 break;
4920
4921 case OP_WHITESPACE:
4922 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4923 RRETURN(MATCH_NOMATCH);
4924 break;
4925
4926 case OP_NOT_WORDCHAR:
4927 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4928 RRETURN(MATCH_NOMATCH);
4929 break;
4930
4931 case OP_WORDCHAR:
4932 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4933 RRETURN(MATCH_NOMATCH);
4934 break;
4935
4936 default:
4937 RRETURN(PCRE_ERROR_INTERNAL);
4938 }
4939 }
4940 }
4941 else
4942#endif
4943 /* Not UTF-8 mode */
4944 {
4945 for (fi = min;; fi++)
4946 {
4947 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4948 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4949 if (fi >= max) RRETURN(MATCH_NOMATCH);
4950 if (eptr >= md->end_subject)
4951 {
4952 SCHECK_PARTIAL();
4953 RRETURN(MATCH_NOMATCH);
4954 }
4955 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4956 RRETURN(MATCH_NOMATCH);
4957 c = *eptr++;
4958 switch(ctype)
4959 {
4960 case OP_ANY: /* This is the non-NL case */
4961 case OP_ALLANY:
4962 case OP_ANYBYTE:
4963 break;
4964
4965 case OP_ANYNL:
4966 switch(c)
4967 {
4968 default: RRETURN(MATCH_NOMATCH);
4969 case 0x000d:
4970 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4971 break;
4972
4973 case 0x000a:
4974 break;
4975
4976 case 0x000b:
4977 case 0x000c:
4978 case 0x0085:
4979 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4980 break;
4981 }
4982 break;
4983
4984 case OP_NOT_HSPACE:
4985 switch(c)
4986 {
4987 default: break;
4988 case 0x09: /* HT */
4989 case 0x20: /* SPACE */
4990 case 0xa0: /* NBSP */
4991 RRETURN(MATCH_NOMATCH);
4992 }
4993 break;
4994
4995 case OP_HSPACE:
4996 switch(c)
4997 {
4998 default: RRETURN(MATCH_NOMATCH);
4999 case 0x09: /* HT */
5000 case 0x20: /* SPACE */
5001 case 0xa0: /* NBSP */
5002 break;
5003 }
5004 break;
5005
5006 case OP_NOT_VSPACE:
5007 switch(c)
5008 {
5009 default: break;
5010 case 0x0a: /* LF */
5011 case 0x0b: /* VT */
5012 case 0x0c: /* FF */
5013 case 0x0d: /* CR */
5014 case 0x85: /* NEL */
5015 RRETURN(MATCH_NOMATCH);
5016 }
5017 break;
5018
5019 case OP_VSPACE:
5020 switch(c)
5021 {
5022 default: RRETURN(MATCH_NOMATCH);
5023 case 0x0a: /* LF */
5024 case 0x0b: /* VT */
5025 case 0x0c: /* FF */
5026 case 0x0d: /* CR */
5027 case 0x85: /* NEL */
5028 break;
5029 }
5030 break;
5031
5032 case OP_NOT_DIGIT:
5033 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5034 break;
5035
5036 case OP_DIGIT:
5037 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5038 break;
5039
5040 case OP_NOT_WHITESPACE:
5041 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5042 break;
5043
5044 case OP_WHITESPACE:
5045 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5046 break;
5047
5048 case OP_NOT_WORDCHAR:
5049 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5050 break;
5051
5052 case OP_WORDCHAR:
5053 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5054 break;
5055
5056 default:
5057 RRETURN(PCRE_ERROR_INTERNAL);
5058 }
5059 }
5060 }
5061 /* Control never gets here */
5062 }
5063
5064 /* If maximizing, it is worth using inline code for speed, doing the type
5065 test once at the start (i.e. keep it out of the loop). Again, keep the
5066 UTF-8 and UCP stuff separate. */
5067
5068 else
5069 {
5070 pp = eptr; /* Remember where we started */
5071
5072#ifdef SUPPORT_UCP
5073 if (prop_type >= 0)
5074 {
5075 switch(prop_type)
5076 {
5077 case PT_ANY:
5078 for (i = min; i < max; i++)
5079 {
5080 int len = 1;
5081 if (eptr >= md->end_subject)
5082 {
5083 SCHECK_PARTIAL();
5084 break;
5085 }
5086 GETCHARLENTEST(c, eptr, len);
5087 if (prop_fail_result) break;
5088 eptr+= len;
5089 }
5090 break;
5091
5092 case PT_LAMP:
5093 for (i = min; i < max; i++)
5094 {
5095 int chartype;
5096 int len = 1;
5097 if (eptr >= md->end_subject)
5098 {
5099 SCHECK_PARTIAL();
5100 break;
5101 }
5102 GETCHARLENTEST(c, eptr, len);
5103 chartype = UCD_CHARTYPE(c);
5104 if ((chartype == ucp_Lu ||
5105 chartype == ucp_Ll ||
5106 chartype == ucp_Lt) == prop_fail_result)
5107 break;
5108 eptr+= len;
5109 }
5110 break;
5111
5112 case PT_GC:
5113 for (i = min; i < max; i++)
5114 {
5115 int len = 1;
5116 if (eptr >= md->end_subject)
5117 {
5118 SCHECK_PARTIAL();
5119 break;
5120 }
5121 GETCHARLENTEST(c, eptr, len);
5122 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5123 eptr+= len;
5124 }
5125 break;
5126
5127 case PT_PC:
5128 for (i = min; i < max; i++)
5129 {
5130 int len = 1;
5131 if (eptr >= md->end_subject)
5132 {
5133 SCHECK_PARTIAL();
5134 break;
5135 }
5136 GETCHARLENTEST(c, eptr, len);
5137 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5138 eptr+= len;
5139 }
5140 break;
5141
5142 case PT_SC:
5143 for (i = min; i < max; i++)
5144 {
5145 int len = 1;
5146 if (eptr >= md->end_subject)
5147 {
5148 SCHECK_PARTIAL();
5149 break;
5150 }
5151 GETCHARLENTEST(c, eptr, len);
5152 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5153 eptr+= len;
5154 }
5155 break;
5156
5157 case PT_ALNUM:
5158 for (i = min; i < max; i++)
5159 {
5160 int category;
5161 int len = 1;
5162 if (eptr >= md->end_subject)
5163 {
5164 SCHECK_PARTIAL();
5165 break;
5166 }
5167 GETCHARLENTEST(c, eptr, len);
5168 category = UCD_CATEGORY(c);
5169 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5170 break;
5171 eptr+= len;
5172 }
5173 break;
5174
5175 case PT_SPACE: /* Perl space */
5176 for (i = min; i < max; i++)
5177 {
5178 int len = 1;
5179 if (eptr >= md->end_subject)
5180 {
5181 SCHECK_PARTIAL();
5182 break;
5183 }
5184 GETCHARLENTEST(c, eptr, len);
5185 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5186 c == CHAR_FF || c == CHAR_CR)
5187 == prop_fail_result)
5188 break;
5189 eptr+= len;
5190 }
5191 break;
5192
5193 case PT_PXSPACE: /* POSIX space */
5194 for (i = min; i < max; i++)
5195 {
5196 int len = 1;
5197 if (eptr >= md->end_subject)
5198 {
5199 SCHECK_PARTIAL();
5200 break;
5201 }
5202 GETCHARLENTEST(c, eptr, len);
5203 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5204 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5205 == prop_fail_result)
5206 break;
5207 eptr+= len;
5208 }
5209 break;
5210
5211 case PT_WORD:
5212 for (i = min; i < max; i++)
5213 {
5214 int category;
5215 int len = 1;
5216 if (eptr >= md->end_subject)
5217 {
5218 SCHECK_PARTIAL();
5219 break;
5220 }
5221 GETCHARLENTEST(c, eptr, len);
5222 category = UCD_CATEGORY(c);
5223 if ((category == ucp_L || category == ucp_N ||
5224 c == CHAR_UNDERSCORE) == prop_fail_result)
5225 break;
5226 eptr+= len;
5227 }
5228 break;
5229
5230 default:
5231 RRETURN(PCRE_ERROR_INTERNAL);
5232 }
5233
5234 /* eptr is now past the end of the maximum run */
5235
5236 if (possessive) continue;
5237 for(;;)
5238 {
5239 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5240 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5241 if (eptr-- == pp) break; /* Stop if tried at original pos */
5242 if (utf8) BACKCHAR(eptr);
5243 }
5244 }
5245
5246 /* Match extended Unicode sequences. We will get here only if the
5247 support is in the binary; otherwise a compile-time error occurs. */
5248
5249 else if (ctype == OP_EXTUNI)
5250 {
5251 for (i = min; i < max; i++)
5252 {
5253 int len = 1;
5254 if (eptr >= md->end_subject)
5255 {
5256 SCHECK_PARTIAL();
5257 break;
5258 }
5259 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5260 if (UCD_CATEGORY(c) == ucp_M) break;
5261 eptr += len;
5262 while (eptr < md->end_subject)
5263 {
5264 len = 1;
5265 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5266 if (UCD_CATEGORY(c) != ucp_M) break;
5267 eptr += len;
5268 }
5269 }
5270
5271 /* eptr is now past the end of the maximum run */
5272
5273 if (possessive) continue;
5274
5275 for(;;)
5276 {
5277 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5278 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5279 if (eptr-- == pp) break; /* Stop if tried at original pos */
5280 for (;;) /* Move back over one extended */
5281 {
5282 if (!utf8) c = *eptr; else
5283 {
5284 BACKCHAR(eptr);
5285 GETCHAR(c, eptr);
5286 }
5287 if (UCD_CATEGORY(c) != ucp_M) break;
5288 eptr--;
5289 }
5290 }
5291 }
5292
5293 else
5294#endif /* SUPPORT_UCP */
5295
5296#ifdef SUPPORT_UTF8
5297 /* UTF-8 mode */
5298
5299 if (utf8)
5300 {
5301 switch(ctype)
5302 {
5303 case OP_ANY:
5304 if (max < INT_MAX)
5305 {
5306 for (i = min; i < max; i++)
5307 {
5308 if (eptr >= md->end_subject)
5309 {
5310 SCHECK_PARTIAL();
5311 break;
5312 }
5313 if (IS_NEWLINE(eptr)) break;
5314 eptr++;
5315 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5316 }
5317 }
5318
5319 /* Handle unlimited UTF-8 repeat */
5320
5321 else
5322 {
5323 for (i = min; i < max; i++)
5324 {
5325 if (eptr >= md->end_subject)
5326 {
5327 SCHECK_PARTIAL();
5328 break;
5329 }
5330 if (IS_NEWLINE(eptr)) break;
5331 eptr++;
5332 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5333 }
5334 }
5335 break;
5336
5337 case OP_ALLANY:
5338 if (max < INT_MAX)
5339 {
5340 for (i = min; i < max; i++)
5341 {
5342 if (eptr >= md->end_subject)
5343 {
5344 SCHECK_PARTIAL();
5345 break;
5346 }
5347 eptr++;
5348 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5349 }
5350 }
5351 else
5352 {
5353 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5354 SCHECK_PARTIAL();
5355 }
5356 break;
5357
5358 /* The byte case is the same as non-UTF8 */
5359
5360 case OP_ANYBYTE:
5361 c = max - min;
5362 if (c > (unsigned int)(md->end_subject - eptr))
5363 {
5364 eptr = md->end_subject;
5365 SCHECK_PARTIAL();
5366 }
5367 else eptr += c;
5368 break;
5369
5370 case OP_ANYNL:
5371 for (i = min; i < max; i++)
5372 {
5373 int len = 1;
5374 if (eptr >= md->end_subject)
5375 {
5376 SCHECK_PARTIAL();
5377 break;
5378 }
5379 GETCHARLEN(c, eptr, len);
5380 if (c == 0x000d)
5381 {
5382 if (++eptr >= md->end_subject) break;
5383 if (*eptr == 0x000a) eptr++;
5384 }
5385 else
5386 {
5387 if (c != 0x000a &&
5388 (md->bsr_anycrlf ||
5389 (c != 0x000b && c != 0x000c &&
5390 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5391 break;
5392 eptr += len;
5393 }
5394 }
5395 break;
5396
5397 case OP_NOT_HSPACE:
5398 case OP_HSPACE:
5399 for (i = min; i < max; i++)
5400 {
5401 BOOL gotspace;
5402 int len = 1;
5403 if (eptr >= md->end_subject)
5404 {
5405 SCHECK_PARTIAL();
5406 break;
5407 }
5408 GETCHARLEN(c, eptr, len);
5409 switch(c)
5410 {
5411 default: gotspace = FALSE; break;
5412 case 0x09: /* HT */
5413 case 0x20: /* SPACE */
5414 case 0xa0: /* NBSP */
5415 case 0x1680: /* OGHAM SPACE MARK */
5416 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5417 case 0x2000: /* EN QUAD */
5418 case 0x2001: /* EM QUAD */
5419 case 0x2002: /* EN SPACE */
5420 case 0x2003: /* EM SPACE */
5421 case 0x2004: /* THREE-PER-EM SPACE */
5422 case 0x2005: /* FOUR-PER-EM SPACE */
5423 case 0x2006: /* SIX-PER-EM SPACE */
5424 case 0x2007: /* FIGURE SPACE */
5425 case 0x2008: /* PUNCTUATION SPACE */
5426 case 0x2009: /* THIN SPACE */
5427 case 0x200A: /* HAIR SPACE */
5428 case 0x202f: /* NARROW NO-BREAK SPACE */
5429 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5430 case 0x3000: /* IDEOGRAPHIC SPACE */
5431 gotspace = TRUE;
5432 break;
5433 }
5434 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5435 eptr += len;
5436 }
5437 break;
5438
5439 case OP_NOT_VSPACE:
5440 case OP_VSPACE:
5441 for (i = min; i < max; i++)
5442 {
5443 BOOL gotspace;
5444 int len = 1;
5445 if (eptr >= md->end_subject)
5446 {
5447 SCHECK_PARTIAL();
5448 break;
5449 }
5450 GETCHARLEN(c, eptr, len);
5451 switch(c)
5452 {
5453 default: gotspace = FALSE; break;
5454 case 0x0a: /* LF */
5455 case 0x0b: /* VT */
5456 case 0x0c: /* FF */
5457 case 0x0d: /* CR */
5458 case 0x85: /* NEL */
5459 case 0x2028: /* LINE SEPARATOR */
5460 case 0x2029: /* PARAGRAPH SEPARATOR */
5461 gotspace = TRUE;
5462 break;
5463 }
5464 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5465 eptr += len;
5466 }
5467 break;
5468
5469 case OP_NOT_DIGIT:
5470 for (i = min; i < max; i++)
5471 {
5472 int len = 1;
5473 if (eptr >= md->end_subject)
5474 {
5475 SCHECK_PARTIAL();
5476 break;
5477 }
5478 GETCHARLEN(c, eptr, len);
5479 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5480 eptr+= len;
5481 }
5482 break;
5483
5484 case OP_DIGIT:
5485 for (i = min; i < max; i++)
5486 {
5487 int len = 1;
5488 if (eptr >= md->end_subject)
5489 {
5490 SCHECK_PARTIAL();
5491 break;
5492 }
5493 GETCHARLEN(c, eptr, len);
5494 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5495 eptr+= len;
5496 }
5497 break;
5498
5499 case OP_NOT_WHITESPACE:
5500 for (i = min; i < max; i++)
5501 {
5502 int len = 1;
5503 if (eptr >= md->end_subject)
5504 {
5505 SCHECK_PARTIAL();
5506 break;
5507 }
5508 GETCHARLEN(c, eptr, len);
5509 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5510 eptr+= len;
5511 }
5512 break;
5513
5514 case OP_WHITESPACE:
5515 for (i = min; i < max; i++)
5516 {
5517 int len = 1;
5518 if (eptr >= md->end_subject)
5519 {
5520 SCHECK_PARTIAL();
5521 break;
5522 }
5523 GETCHARLEN(c, eptr, len);
5524 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5525 eptr+= len;
5526 }
5527 break;
5528
5529 case OP_NOT_WORDCHAR:
5530 for (i = min; i < max; i++)
5531 {
5532 int len = 1;
5533 if (eptr >= md->end_subject)
5534 {
5535 SCHECK_PARTIAL();
5536 break;
5537 }
5538 GETCHARLEN(c, eptr, len);
5539 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5540 eptr+= len;
5541 }
5542 break;
5543
5544 case OP_WORDCHAR:
5545 for (i = min; i < max; i++)
5546 {
5547 int len = 1;
5548 if (eptr >= md->end_subject)
5549 {
5550 SCHECK_PARTIAL();
5551 break;
5552 }
5553 GETCHARLEN(c, eptr, len);
5554 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5555 eptr+= len;
5556 }
5557 break;
5558
5559 default:
5560 RRETURN(PCRE_ERROR_INTERNAL);
5561 }
5562
5563 /* eptr is now past the end of the maximum run. If possessive, we are
5564 done (no backing up). Otherwise, match at this position; anything other
5565 than no match is immediately returned. For nomatch, back up one
5566 character, unless we are matching \R and the last thing matched was
5567 \r\n, in which case, back up two bytes. */
5568
5569 if (possessive) continue;
5570 for(;;)
5571 {
5572 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5573 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5574 if (eptr-- == pp) break; /* Stop if tried at original pos */
5575 BACKCHAR(eptr);
5576 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5577 eptr[-1] == '\r') eptr--;
5578 }
5579 }
5580 else
5581#endif /* SUPPORT_UTF8 */
5582
5583 /* Not UTF-8 mode */
5584 {
5585 switch(ctype)
5586 {
5587 case OP_ANY:
5588 for (i = min; i < max; i++)
5589 {
5590 if (eptr >= md->end_subject)
5591 {
5592 SCHECK_PARTIAL();
5593 break;
5594 }
5595 if (IS_NEWLINE(eptr)) break;
5596 eptr++;
5597 }
5598 break;
5599
5600 case OP_ALLANY:
5601 case OP_ANYBYTE:
5602 c = max - min;
5603 if (c > (unsigned int)(md->end_subject - eptr))
5604 {
5605 eptr = md->end_subject;
5606 SCHECK_PARTIAL();
5607 }
5608 else eptr += c;
5609 break;
5610
5611 case OP_ANYNL:
5612 for (i = min; i < max; i++)
5613 {
5614 if (eptr >= md->end_subject)
5615 {
5616 SCHECK_PARTIAL();
5617 break;
5618 }
5619 c = *eptr;
5620 if (c == 0x000d)
5621 {
5622 if (++eptr >= md->end_subject) break;
5623 if (*eptr == 0x000a) eptr++;
5624 }
5625 else
5626 {
5627 if (c != 0x000a &&
5628 (md->bsr_anycrlf ||
5629 (c != 0x000b && c != 0x000c && c != 0x0085)))
5630 break;
5631 eptr++;
5632 }
5633 }
5634 break;
5635
5636 case OP_NOT_HSPACE:
5637 for (i = min; i < max; i++)
5638 {
5639 if (eptr >= md->end_subject)
5640 {
5641 SCHECK_PARTIAL();
5642 break;
5643 }
5644 c = *eptr;
5645 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5646 eptr++;
5647 }
5648 break;
5649
5650 case OP_HSPACE:
5651 for (i = min; i < max; i++)
5652 {
5653 if (eptr >= md->end_subject)
5654 {
5655 SCHECK_PARTIAL();
5656 break;
5657 }
5658 c = *eptr;
5659 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5660 eptr++;
5661 }
5662 break;
5663
5664 case OP_NOT_VSPACE:
5665 for (i = min; i < max; i++)
5666 {
5667 if (eptr >= md->end_subject)
5668 {
5669 SCHECK_PARTIAL();
5670 break;
5671 }
5672 c = *eptr;
5673 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5674 break;
5675 eptr++;
5676 }
5677 break;
5678
5679 case OP_VSPACE:
5680 for (i = min; i < max; i++)
5681 {
5682 if (eptr >= md->end_subject)
5683 {
5684 SCHECK_PARTIAL();
5685 break;
5686 }
5687 c = *eptr;
5688 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5689 break;
5690 eptr++;
5691 }
5692 break;
5693
5694 case OP_NOT_DIGIT:
5695 for (i = min; i < max; i++)
5696 {
5697 if (eptr >= md->end_subject)
5698 {
5699 SCHECK_PARTIAL();
5700 break;
5701 }
5702 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5703 eptr++;
5704 }
5705 break;
5706
5707 case OP_DIGIT:
5708 for (i = min; i < max; i++)
5709 {
5710 if (eptr >= md->end_subject)
5711 {
5712 SCHECK_PARTIAL();
5713 break;
5714 }
5715 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5716 eptr++;
5717 }
5718 break;
5719
5720 case OP_NOT_WHITESPACE:
5721 for (i = min; i < max; i++)
5722 {
5723 if (eptr >= md->end_subject)
5724 {
5725 SCHECK_PARTIAL();
5726 break;
5727 }
5728 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5729 eptr++;
5730 }
5731 break;
5732
5733 case OP_WHITESPACE:
5734 for (i = min; i < max; i++)
5735 {
5736 if (eptr >= md->end_subject)
5737 {
5738 SCHECK_PARTIAL();
5739 break;
5740 }
5741 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5742 eptr++;
5743 }
5744 break;
5745
5746 case OP_NOT_WORDCHAR:
5747 for (i = min; i < max; i++)
5748 {
5749 if (eptr >= md->end_subject)
5750 {
5751 SCHECK_PARTIAL();
5752 break;
5753 }
5754 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5755 eptr++;
5756 }
5757 break;
5758
5759 case OP_WORDCHAR:
5760 for (i = min; i < max; i++)
5761 {
5762 if (eptr >= md->end_subject)
5763 {
5764 SCHECK_PARTIAL();
5765 break;
5766 }
5767 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5768 eptr++;
5769 }
5770 break;
5771
5772 default:
5773 RRETURN(PCRE_ERROR_INTERNAL);
5774 }
5775
5776 /* eptr is now past the end of the maximum run. If possessive, we are
5777 done (no backing up). Otherwise, match at this position; anything other
5778 than no match is immediately returned. For nomatch, back up one
5779 character (byte), unless we are matching \R and the last thing matched
5780 was \r\n, in which case, back up two bytes. */
5781
5782 if (possessive) continue;
5783 while (eptr >= pp)
5784 {
5785 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5787 eptr--;
5788 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5789 eptr[-1] == '\r') eptr--;
5790 }
5791 }
5792
5793 /* Get here if we can't make it match with any permitted repetitions */
5794
5795 RRETURN(MATCH_NOMATCH);
5796 }
5797 /* Control never gets here */
5798
5799 /* There's been some horrible disaster. Arrival here can only mean there is
5800 something seriously wrong in the code above or the OP_xxx definitions. */
5801
5802 default:
5803 DPRINTF(("Unknown opcode %d\n", *ecode));
5804 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5805 }
5806
5807 /* Do not stick any code in here without much thought; it is assumed
5808 that "continue" in the code above comes out to here to repeat the main
5809 loop. */
5810
5811 } /* End of main loop */
5812/* Control never reaches here */
5813
5814
5815/* When compiling to use the heap rather than the stack for recursive calls to
5816match(), the RRETURN() macro jumps here. The number that is saved in
5817frame->Xwhere indicates which label we actually want to return to. */
5818
5819#ifdef NO_RECURSE
5820#define LBL(val) case val: goto L_RM##val;
5821HEAP_RETURN:
5822switch (frame->Xwhere)
5823 {
5824 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5825 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5826 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5827 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5828 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5829 LBL(65) LBL(66)
5830#ifdef SUPPORT_UTF8
5831 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5832 LBL(32) LBL(34) LBL(42) LBL(46)
5833#ifdef SUPPORT_UCP
5834 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5835 LBL(59) LBL(60) LBL(61) LBL(62)
5836#endif /* SUPPORT_UCP */
5837#endif /* SUPPORT_UTF8 */
5838 default:
5839 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5840 return PCRE_ERROR_INTERNAL;
5841 }
5842#undef LBL
5843#endif /* NO_RECURSE */
5844}
5845
5846
5847/***************************************************************************
5848****************************************************************************
5849 RECURSION IN THE match() FUNCTION
5850
5851Undefine all the macros that were defined above to handle this. */
5852
5853#ifdef NO_RECURSE
5854#undef eptr
5855#undef ecode
5856#undef mstart
5857#undef offset_top
5858#undef eptrb
5859#undef flags
5860
5861#undef callpat
5862#undef charptr
5863#undef data
5864#undef next
5865#undef pp
5866#undef prev
5867#undef saved_eptr
5868
5869#undef new_recursive
5870
5871#undef cur_is_word
5872#undef condition
5873#undef prev_is_word
5874
5875#undef ctype
5876#undef length
5877#undef max
5878#undef min
5879#undef number
5880#undef offset
5881#undef op
5882#undef save_capture_last
5883#undef save_offset1
5884#undef save_offset2
5885#undef save_offset3
5886#undef stacksave
5887
5888#undef newptrb
5889
5890#endif
5891
5892/* These two are defined as macros in both cases */
5893
5894#undef fc
5895#undef fi
5896
5897/***************************************************************************
5898***************************************************************************/
5899
5900
5901
5902/*************************************************
5903* Execute a Regular Expression *
5904*************************************************/
5905
5906/* This function applies a compiled re to a subject string and picks out
5907portions of the string if it matches. Two elements in the vector are set for
5908each substring: the offsets to the start and end of the substring.
5909
5910Arguments:
5911 argument_re points to the compiled expression
5912 extra_data points to extra data or is NULL
5913 subject points to the subject string
5914 length length of subject string (may contain binary zeros)
5915 start_offset where to start in the subject string
5916 options option bits
5917 offsets points to a vector of ints to be filled in with offsets
5918 offsetcount the number of elements in the vector
5919
5920Returns: > 0 => success; value is the number of elements filled in
5921 = 0 => success, but offsets is not big enough
5922 -1 => failed to match
5923 < -1 => some kind of unexpected problem
5924*/
5925
5926PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5927pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5928 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5929 int offsetcount)
5930{
5931int rc, ocount, arg_offset_max;
5932int first_byte = -1;
5933int req_byte = -1;
5934int req_byte2 = -1;
5935int newline;
5936BOOL using_temporary_offsets = FALSE;
5937BOOL anchored;
5938BOOL startline;
5939BOOL firstline;
5940BOOL first_byte_caseless = FALSE;
5941BOOL req_byte_caseless = FALSE;
5942BOOL utf8;
5943match_data match_block;
5944match_data *md = &match_block;
5945const uschar *tables;
5946const uschar *start_bits = NULL;
5947USPTR start_match = (USPTR)subject + start_offset;
5948USPTR end_subject;
5949USPTR start_partial = NULL;
5950USPTR req_byte_ptr = start_match - 1;
5951
5952pcre_study_data internal_study;
5953const pcre_study_data *study;
5954
5955real_pcre internal_re;
5956const real_pcre *external_re = (const real_pcre *)argument_re;
5957const real_pcre *re = external_re;
5958
5959/* Plausibility checks */
5960
5961if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5962if (re == NULL || subject == NULL ||
5963 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5964if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5965if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5966
5967/* These two settings are used in the code for checking a UTF-8 string that
5968follows immediately afterwards. Other values in the md block are used only
5969during "normal" pcre_exec() processing, not when the JIT support is in use,
5970so they are set up later. */
5971
5972utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5973md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5974 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5975
5976/* Check a UTF-8 string if required. Pass back the character offset and error
5977code for an invalid string if a results vector is available. */
5978
5979#ifdef SUPPORT_UTF8
5980if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5981 {
5982 int erroroffset;
5983 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5984 if (errorcode != 0)
5985 {
5986 if (offsetcount >= 2)
5987 {
5988 offsets[0] = erroroffset;
5989 offsets[1] = errorcode;
5990 }
5991 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
5992 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5993 }
5994
5995 /* Check that a start_offset points to the start of a UTF-8 character. */
5996 if (start_offset > 0 && start_offset < length &&
5997 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
5998 return PCRE_ERROR_BADUTF8_OFFSET;
5999 }
6000#endif
6001
6002/* If the pattern was successfully studied with JIT support, run the JIT
6003executable instead of the rest of this function. Most options must be set at
6004compile time for the JIT code to be usable. Fallback to the normal code path if
6005an unsupported flag is set. In particular, JIT does not support partial
6006matching. */
6007
6008#ifdef SUPPORT_JIT
6009if (extra_data != NULL
6010 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6011 && extra_data->executable_jit != NULL
6012 && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
6013 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6014 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6015 return _pcre_jit_exec(re, extra_data->executable_jit, subject, length,
6016 start_offset, options, ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6017 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6018#endif
6019
6020/* Carry on with non-JIT matching. This information is for finding all the
6021numbers associated with a given name, for condition testing. */
6022
6023md->name_table = (uschar *)re + re->name_table_offset;
6024md->name_count = re->name_count;
6025md->name_entry_size = re->name_entry_size;
6026
6027/* Fish out the optional data from the extra_data structure, first setting
6028the default values. */
6029
6030study = NULL;
6031md->match_limit = MATCH_LIMIT;
6032md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6033md->callout_data = NULL;
6034
6035/* The table pointer is always in native byte order. */
6036
6037tables = external_re->tables;
6038
6039if (extra_data != NULL)
6040 {
6041 register unsigned int flags = extra_data->flags;
6042 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6043 study = (const pcre_study_data *)extra_data->study_data;
6044 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6045 md->match_limit = extra_data->match_limit;
6046 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6047 md->match_limit_recursion = extra_data->match_limit_recursion;
6048 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6049 md->callout_data = extra_data->callout_data;
6050 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6051 }
6052
6053/* If the exec call supplied NULL for tables, use the inbuilt ones. This
6054is a feature that makes it possible to save compiled regex and re-use them
6055in other programs later. */
6056
6057if (tables == NULL) tables = _pcre_default_tables;
6058
6059/* Check that the first field in the block is the magic number. If it is not,
6060test for a regex that was compiled on a host of opposite endianness. If this is
6061the case, flipped values are put in internal_re and internal_study if there was
6062study data too. */
6063
6064if (re->magic_number != MAGIC_NUMBER)
6065 {
6066 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
6067 if (re == NULL) return PCRE_ERROR_BADMAGIC;
6068 if (study != NULL) study = &internal_study;
6069 }
6070
6071/* Set up other data */
6072
6073anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6074startline = (re->flags & PCRE_STARTLINE) != 0;
6075firstline = (re->options & PCRE_FIRSTLINE) != 0;
6076
6077/* The code starts after the real_pcre block and the capture name table. */
6078
6079md->start_code = (const uschar *)external_re + re->name_table_offset +
6080 re->name_count * re->name_entry_size;
6081
6082md->start_subject = (USPTR)subject;
6083md->start_offset = start_offset;
6084md->end_subject = md->start_subject + length;
6085end_subject = md->end_subject;
6086
6087md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6088md->use_ucp = (re->options & PCRE_UCP) != 0;
6089md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6090md->ignore_skip_arg = FALSE;
6091
6092/* Some options are unpacked into BOOL variables in the hope that testing
6093them will be faster than individual option bits. */
6094
6095md->notbol = (options & PCRE_NOTBOL) != 0;
6096md->noteol = (options & PCRE_NOTEOL) != 0;
6097md->notempty = (options & PCRE_NOTEMPTY) != 0;
6098md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6099
6100md->hitend = FALSE;
6101md->mark = md->nomatch_mark = NULL; /* In case never set */
6102
6103md->recursive = NULL; /* No recursion at top level */
6104md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6105
6106md->lcc = tables + lcc_offset;
6107md->ctypes = tables + ctypes_offset;
6108
6109/* Handle different \R options. */
6110
6111switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6112 {
6113 case 0:
6114 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6115 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6116 else
6117#ifdef BSR_ANYCRLF
6118 md->bsr_anycrlf = TRUE;
6119#else
6120 md->bsr_anycrlf = FALSE;
6121#endif
6122 break;
6123
6124 case PCRE_BSR_ANYCRLF:
6125 md->bsr_anycrlf = TRUE;
6126 break;
6127
6128 case PCRE_BSR_UNICODE:
6129 md->bsr_anycrlf = FALSE;
6130 break;
6131
6132 default: return PCRE_ERROR_BADNEWLINE;
6133 }
6134
6135/* Handle different types of newline. The three bits give eight cases. If
6136nothing is set at run time, whatever was used at compile time applies. */
6137
6138switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6139 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6140 {
6141 case 0: newline = NEWLINE; break; /* Compile-time default */
6142 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6143 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6144 case PCRE_NEWLINE_CR+
6145 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6146 case PCRE_NEWLINE_ANY: newline = -1; break;
6147 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6148 default: return PCRE_ERROR_BADNEWLINE;
6149 }
6150
6151if (newline == -2)
6152 {
6153 md->nltype = NLTYPE_ANYCRLF;
6154 }
6155else if (newline < 0)
6156 {
6157 md->nltype = NLTYPE_ANY;
6158 }
6159else
6160 {
6161 md->nltype = NLTYPE_FIXED;
6162 if (newline > 255)
6163 {
6164 md->nllen = 2;
6165 md->nl[0] = (newline >> 8) & 255;
6166 md->nl[1] = newline & 255;
6167 }
6168 else
6169 {
6170 md->nllen = 1;
6171 md->nl[0] = newline;
6172 }
6173 }
6174
6175/* Partial matching was originally supported only for a restricted set of
6176regexes; from release 8.00 there are no restrictions, but the bits are still
6177defined (though never set). So there's no harm in leaving this code. */
6178
6179if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6180 return PCRE_ERROR_BADPARTIAL;
6181
6182/* If the expression has got more back references than the offsets supplied can
6183hold, we get a temporary chunk of working store to use during the matching.
6184Otherwise, we can use the vector supplied, rounding down its size to a multiple
6185of 3. */
6186
6187ocount = offsetcount - (offsetcount % 3);
6188arg_offset_max = (2*ocount)/3;
6189
6190if (re->top_backref > 0 && re->top_backref >= ocount/3)
6191 {
6192 ocount = re->top_backref * 3 + 3;
6193 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6194 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6195 using_temporary_offsets = TRUE;
6196 DPRINTF(("Got memory to hold back references\n"));
6197 }
6198else md->offset_vector = offsets;
6199
6200md->offset_end = ocount;
6201md->offset_max = (2*ocount)/3;
6202md->offset_overflow = FALSE;
6203md->capture_last = -1;
6204
6205/* Reset the working variable associated with each extraction. These should
6206never be used unless previously set, but they get saved and restored, and so we
6207initialize them to avoid reading uninitialized locations. Also, unset the
6208offsets for the matched string. This is really just for tidiness with callouts,
6209in case they inspect these fields. */
6210
6211if (md->offset_vector != NULL)
6212 {
6213 register int *iptr = md->offset_vector + ocount;
6214 register int *iend = iptr - re->top_bracket;
6215 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6216 while (--iptr >= iend) *iptr = -1;
6217 md->offset_vector[0] = md->offset_vector[1] = -1;
6218 }
6219
6220/* Set up the first character to match, if available. The first_byte value is
6221never set for an anchored regular expression, but the anchoring may be forced
6222at run time, so we have to test for anchoring. The first char may be unset for
6223an unanchored pattern, of course. If there's no first char and the pattern was
6224studied, there may be a bitmap of possible first characters. */
6225
6226if (!anchored)
6227 {
6228 if ((re->flags & PCRE_FIRSTSET) != 0)
6229 {
6230 first_byte = re->first_byte & 255;
6231 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6232 first_byte = md->lcc[first_byte];
6233 }
6234 else
6235 if (!startline && study != NULL &&
6236 (study->flags & PCRE_STUDY_MAPPED) != 0)
6237 start_bits = study->start_bits;
6238 }
6239
6240/* For anchored or unanchored matches, there may be a "last known required
6241character" set. */
6242
6243if ((re->flags & PCRE_REQCHSET) != 0)
6244 {
6245 req_byte = re->req_byte & 255;
6246 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6247 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6248 }
6249
6250
6251
6252
6253/* ==========================================================================*/
6254
6255/* Loop for handling unanchored repeated matching attempts; for anchored regexs
6256the loop runs just once. */
6257
6258for(;;)
6259 {
6260 USPTR save_end_subject = end_subject;
6261 USPTR new_start_match;
6262
6263 /* If firstline is TRUE, the start of the match is constrained to the first
6264 line of a multiline string. That is, the match must be before or at the first
6265 newline. Implement this by temporarily adjusting end_subject so that we stop
6266 scanning at a newline. If the match fails at the newline, later code breaks
6267 this loop. */
6268
6269 if (firstline)
6270 {
6271 USPTR t = start_match;
6272#ifdef SUPPORT_UTF8
6273 if (utf8)
6274 {
6275 while (t < md->end_subject && !IS_NEWLINE(t))
6276 {
6277 t++;
6278 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6279 }
6280 }
6281 else
6282#endif
6283 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6284 end_subject = t;
6285 }
6286
6287 /* There are some optimizations that avoid running the match if a known
6288 starting point is not found, or if a known later character is not present.
6289 However, there is an option that disables these, for testing and for ensuring
6290 that all callouts do actually occur. The option can be set in the regex by
6291 (*NO_START_OPT) or passed in match-time options. */
6292
6293 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6294 {
6295 /* Advance to a unique first byte if there is one. */
6296
6297 if (first_byte >= 0)
6298 {
6299 if (first_byte_caseless)
6300 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6301 start_match++;
6302 else
6303 while (start_match < end_subject && *start_match != first_byte)
6304 start_match++;
6305 }
6306
6307 /* Or to just after a linebreak for a multiline match */
6308
6309 else if (startline)
6310 {
6311 if (start_match > md->start_subject + start_offset)
6312 {
6313#ifdef SUPPORT_UTF8
6314 if (utf8)
6315 {
6316 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6317 {
6318 start_match++;
6319 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6320 start_match++;
6321 }
6322 }
6323 else
6324#endif
6325 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6326 start_match++;
6327
6328 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6329 and we are now at a LF, advance the match position by one more character.
6330 */
6331
6332 if (start_match[-1] == CHAR_CR &&
6333 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6334 start_match < end_subject &&
6335 *start_match == CHAR_NL)
6336 start_match++;
6337 }
6338 }
6339
6340 /* Or to a non-unique first byte after study */
6341
6342 else if (start_bits != NULL)
6343 {
6344 while (start_match < end_subject)
6345 {
6346 register unsigned int c = *start_match;
6347 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6348 {
6349 start_match++;
6350#ifdef SUPPORT_UTF8
6351 if (utf8)
6352 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6353 start_match++;
6354#endif
6355 }
6356 else break;
6357 }
6358 }
6359 } /* Starting optimizations */
6360
6361 /* Restore fudged end_subject */
6362
6363 end_subject = save_end_subject;
6364
6365 /* The following two optimizations are disabled for partial matching or if
6366 disabling is explicitly requested. */
6367
6368 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6369 {
6370 /* If the pattern was studied, a minimum subject length may be set. This is
6371 a lower bound; no actual string of that length may actually match the
6372 pattern. Although the value is, strictly, in characters, we treat it as
6373 bytes to avoid spending too much time in this optimization. */
6374
6375 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6376 (pcre_uint32)(end_subject - start_match) < study->minlength)
6377 {
6378 rc = MATCH_NOMATCH;
6379 break;
6380 }
6381
6382 /* If req_byte is set, we know that that character must appear in the
6383 subject for the match to succeed. If the first character is set, req_byte
6384 must be later in the subject; otherwise the test starts at the match point.
6385 This optimization can save a huge amount of backtracking in patterns with
6386 nested unlimited repeats that aren't going to match. Writing separate code
6387 for cased/caseless versions makes it go faster, as does using an
6388 autoincrement and backing off on a match.
6389
6390 HOWEVER: when the subject string is very, very long, searching to its end
6391 can take a long time, and give bad performance on quite ordinary patterns.
6392 This showed up when somebody was matching something like /^\d+C/ on a
6393 32-megabyte string... so we don't do this when the string is sufficiently
6394 long. */
6395
6396 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6397 {
6398 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6399
6400 /* We don't need to repeat the search if we haven't yet reached the
6401 place we found it at last time. */
6402
6403 if (p > req_byte_ptr)
6404 {
6405 if (req_byte_caseless)
6406 {
6407 while (p < end_subject)
6408 {
6409 register int pp = *p++;
6410 if (pp == req_byte || pp == req_byte2) { p--; break; }
6411 }
6412 }
6413 else
6414 {
6415 while (p < end_subject)
6416 {
6417 if (*p++ == req_byte) { p--; break; }
6418 }
6419 }
6420
6421 /* If we can't find the required character, break the matching loop,
6422 forcing a match failure. */
6423
6424 if (p >= end_subject)
6425 {
6426 rc = MATCH_NOMATCH;
6427 break;
6428 }
6429
6430 /* If we have found the required character, save the point where we
6431 found it, so that we don't search again next time round the loop if
6432 the start hasn't passed this character yet. */
6433
6434 req_byte_ptr = p;
6435 }
6436 }
6437 }
6438
6439#ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6440 printf(">>>> Match against: ");
6441 pchars(start_match, end_subject - start_match, TRUE, md);
6442 printf("\n");
6443#endif
6444
6445 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6446 first starting point for which a partial match was found. */
6447
6448 md->start_match_ptr = start_match;
6449 md->start_used_ptr = start_match;
6450 md->match_call_count = 0;
6451 md->match_function_type = 0;
6452 md->end_offset_top = 0;
6453 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6454 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6455
6456 switch(rc)
6457 {
6458 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6459 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6460 entirely. The only way we can do that is to re-do the match at the same
6461 point, with a flag to force SKIP with an argument to be ignored. Just
6462 treating this case as NOMATCH does not work because it does not check other
6463 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6464
6465 case MATCH_SKIP_ARG:
6466 new_start_match = start_match;
6467 md->ignore_skip_arg = TRUE;
6468 break;
6469
6470 /* SKIP passes back the next starting point explicitly, but if it is the
6471 same as the match we have just done, treat it as NOMATCH. */
6472
6473 case MATCH_SKIP:
6474 if (md->start_match_ptr != start_match)
6475 {
6476 new_start_match = md->start_match_ptr;
6477 break;
6478 }
6479 /* Fall through */
6480
6481 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6482 exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */
6483
6484 case MATCH_NOMATCH:
6485 case MATCH_PRUNE:
6486 case MATCH_THEN:
6487 md->ignore_skip_arg = FALSE;
6488 new_start_match = start_match + 1;
6489#ifdef SUPPORT_UTF8
6490 if (utf8)
6491 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6492 new_start_match++;
6493#endif
6494 break;
6495
6496 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6497
6498 case MATCH_COMMIT:
6499 rc = MATCH_NOMATCH;
6500 goto ENDLOOP;
6501
6502 /* Any other return is either a match, or some kind of error. */
6503
6504 default:
6505 goto ENDLOOP;
6506 }
6507
6508 /* Control reaches here for the various types of "no match at this point"
6509 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6510
6511 rc = MATCH_NOMATCH;
6512
6513 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6514 newline in the subject (though it may continue over the newline). Therefore,
6515 if we have just failed to match, starting at a newline, do not continue. */
6516
6517 if (firstline && IS_NEWLINE(start_match)) break;
6518
6519 /* Advance to new matching position */
6520
6521 start_match = new_start_match;
6522
6523 /* Break the loop if the pattern is anchored or if we have passed the end of
6524 the subject. */
6525
6526 if (anchored || start_match > end_subject) break;
6527
6528 /* If we have just passed a CR and we are now at a LF, and the pattern does
6529 not contain any explicit matches for \r or \n, and the newline option is CRLF
6530 or ANY or ANYCRLF, advance the match position by one more character. */
6531
6532 if (start_match[-1] == CHAR_CR &&
6533 start_match < end_subject &&
6534 *start_match == CHAR_NL &&
6535 (re->flags & PCRE_HASCRORLF) == 0 &&
6536 (md->nltype == NLTYPE_ANY ||
6537 md->nltype == NLTYPE_ANYCRLF ||
6538 md->nllen == 2))
6539 start_match++;
6540
6541 md->mark = NULL; /* Reset for start of next match attempt */
6542 } /* End of for(;;) "bumpalong" loop */
6543
6544/* ==========================================================================*/
6545
6546/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6547conditions is true:
6548
6549(1) The pattern is anchored or the match was failed by (*COMMIT);
6550
6551(2) We are past the end of the subject;
6552
6553(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6554 this option requests that a match occur at or before the first newline in
6555 the subject.
6556
6557When we have a match and the offset vector is big enough to deal with any
6558backreferences, captured substring offsets will already be set up. In the case
6559where we had to get some local store to hold offsets for backreference
6560processing, copy those that we can. In this case there need not be overflow if
6561certain parts of the pattern were not used, even though there are more
6562capturing parentheses than vector slots. */
6563
6564ENDLOOP:
6565
6566if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6567 {
6568 if (using_temporary_offsets)
6569 {
6570 if (arg_offset_max >= 4)
6571 {
6572 memcpy(offsets + 2, md->offset_vector + 2,
6573 (arg_offset_max - 2) * sizeof(int));
6574 DPRINTF(("Copied offsets from temporary memory\n"));
6575 }
6576 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6577 DPRINTF(("Freeing temporary memory\n"));
6578 (pcre_free)(md->offset_vector);
6579 }
6580
6581 /* Set the return code to the number of captured strings, or 0 if there were
6582 too many to fit into the vector. */
6583
6584 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6585 0 : md->end_offset_top/2;
6586
6587 /* If there is space in the offset vector, set any unused pairs at the end of
6588 the pattern to -1 for backwards compatibility. It is documented that this
6589 happens. In earlier versions, the whole set of potential capturing offsets
6590 was set to -1 each time round the loop, but this is handled differently now.
6591 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6592 those at the end that need unsetting here. We can't just unset them all at
6593 the start of the whole thing because they may get set in one branch that is
6594 not the final matching branch. */
6595
6596 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6597 {
6598 register int *iptr, *iend;
6599 int resetcount = 2 + re->top_bracket * 2;
6600 if (resetcount > offsetcount) resetcount = ocount;
6601 iptr = offsets + md->end_offset_top;
6602 iend = offsets + resetcount;
6603 while (iptr < iend) *iptr++ = -1;
6604 }
6605
6606 /* If there is space, set up the whole thing as substring 0. The value of
6607 md->start_match_ptr might be modified if \K was encountered on the success
6608 matching path. */
6609
6610 if (offsetcount < 2) rc = 0; else
6611 {
6612 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6613 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6614 }
6615
6616 /* Return MARK data if requested */
6617
6618 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6619 *(extra_data->mark) = (unsigned char *)(md->mark);
6620 DPRINTF((">>>> returning %d\n", rc));
6621 return rc;
6622 }
6623
6624/* Control gets here if there has been an error, or if the overall match
6625attempt has failed at all permitted starting positions. */
6626
6627if (using_temporary_offsets)
6628 {
6629 DPRINTF(("Freeing temporary memory\n"));
6630 (pcre_free)(md->offset_vector);
6631 }
6632
6633/* For anything other than nomatch or partial match, just return the code. */
6634
6635if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6636 {
6637 DPRINTF((">>>> error: returning %d\n", rc));
6638 return rc;
6639 }
6640
6641/* Handle partial matches - disable any mark data */
6642
6643if (start_partial != NULL)
6644 {
6645 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6646 md->mark = NULL;
6647 if (offsetcount > 1)
6648 {
6649 offsets[0] = (int)(start_partial - (USPTR)subject);
6650 offsets[1] = (int)(end_subject - (USPTR)subject);
6651 }
6652 rc = PCRE_ERROR_PARTIAL;
6653 }
6654
6655/* This is the classic nomatch case */
6656
6657else
6658 {
6659 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6660 rc = PCRE_ERROR_NOMATCH;
6661 }
6662
6663/* Return the MARK data if it has been requested. */
6664
6665if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6666 *(extra_data->mark) = (unsigned char *)(md->nomatch_mark);
6667return rc;
6668}
6669
6670/* End of pcre_exec.c */