forked from LeenkxTeam/LNXSDK
		
	
		
			
	
	
		
			1366 lines
		
	
	
		
			41 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
		
		
			
		
	
	
			1366 lines
		
	
	
		
			41 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
|  | /*************************************************
 | ||
|  | *      Perl-Compatible Regular Expressions       * | ||
|  | *************************************************/ | ||
|  | 
 | ||
|  | /* PCRE is a library of functions to support regular expressions whose syntax
 | ||
|  | and semantics are as close as possible to those of the Perl 5 language. | ||
|  | 
 | ||
|  |                        Written by Philip Hazel | ||
|  |      Original API code Copyright (c) 1997-2012 University of Cambridge | ||
|  |           New API code Copyright (c) 2016-2022 University of Cambridge | ||
|  | 
 | ||
|  | ----------------------------------------------------------------------------- | ||
|  | Redistribution and use in source and binary forms, with or without | ||
|  | modification, are permitted provided that the following conditions are met: | ||
|  | 
 | ||
|  |     * Redistributions of source code must retain the above copyright notice, | ||
|  |       this list of conditions and the following disclaimer. | ||
|  | 
 | ||
|  |     * Redistributions in binary form must reproduce the above copyright | ||
|  |       notice, this list of conditions and the following disclaimer in the | ||
|  |       documentation and/or other materials provided with the distribution. | ||
|  | 
 | ||
|  |     * Neither the name of the University of Cambridge nor the names of its | ||
|  |       contributors may be used to endorse or promote products derived from | ||
|  |       this software without specific prior written permission. | ||
|  | 
 | ||
|  | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
|  | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
|  | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
|  | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | ||
|  | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
|  | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | ||
|  | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
|  | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | ||
|  | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
|  | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | ||
|  | POSSIBILITY OF SUCH DAMAGE. | ||
|  | ----------------------------------------------------------------------------- | ||
|  | */ | ||
|  | 
 | ||
|  | /* This module contains functions that scan a compiled pattern and change
 | ||
|  | repeats into possessive repeats where possible. */ | ||
|  | 
 | ||
|  | 
 | ||
|  | #ifdef HAVE_CONFIG_H
 | ||
|  | #include "config.h"
 | ||
|  | #endif
 | ||
|  | 
 | ||
|  | 
 | ||
|  | #include "pcre2_internal.h"
 | ||
|  | 
 | ||
|  | 
 | ||
|  | /*************************************************
 | ||
|  | *        Tables for auto-possessification        * | ||
|  | *************************************************/ | ||
|  | 
 | ||
|  | /* This table is used to check whether auto-possessification is possible
 | ||
|  | between adjacent character-type opcodes. The left-hand (repeated) opcode is | ||
|  | used to select the row, and the right-hand opcode is used to select the column. | ||
|  | A value of 1 means that auto-possessification is OK. For example, the second | ||
|  | value in the first row means that \D+\d can be turned into \D++\d. | ||
|  | 
 | ||
|  | The Unicode property types (\P and \p) have to be present to fill out the table | ||
|  | because of what their opcode values are, but the table values should always be | ||
|  | zero because property types are handled separately in the code. The last four | ||
|  | columns apply to items that cannot be repeated, so there is no need to have | ||
|  | rows for them. Note that OP_DIGIT etc. are generated only when PCRE2_UCP is | ||
|  | *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ | ||
|  | 
 | ||
/* Dimensions of autoposstab: rows cover the opcodes FIRST_AUTOTAB_OP up to
LAST_AUTOTAB_LEFT_OP and columns cover FIRST_AUTOTAB_OP up to
LAST_AUTOTAB_RIGHT_OP, both ranges inclusive. */
|  | #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
 | ||
|  | #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
 | ||
|  | 
 | ||
/* Each row is labelled on the right with its left-hand (repeated) item; the
header line labels the columns with the right-hand item. */
|  | static const uint8_t autoposstab[APTROWS][APTCOLS] = { | ||
|  | /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */ | ||
|  |   { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */ | ||
|  |   { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */ | ||
|  |   { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */ | ||
|  |   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */ | ||
|  |   { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */ | ||
|  |   { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */ | ||
|  |   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */ | ||
|  |   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */ | ||
|  |   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */ | ||
|  |   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */ | ||
|  |   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */ | ||
|  |   { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */ | ||
|  |   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */ | ||
|  |   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */ | ||
|  |   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */ | ||
|  |   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */ | ||
|  |   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */ | ||
|  | }; | ||
|  | 
 | ||
|  | #ifdef SUPPORT_UNICODE
 | ||
|  | /* This table is used to check whether auto-possessification is possible
 | ||
|  | between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The | ||
|  | left-hand (repeated) opcode is used to select the row, and the right-hand | ||
|  | opcode is used to select the column. The values are as follows: | ||
|  | 
 | ||
|  |   0   Always return FALSE (never auto-possessify) | ||
|  |   1   Character groups are distinct (possessify if both are OP_PROP) | ||
|  |   2   Check character categories in the same group (general or particular) | ||
|  |   3   TRUE if the two opcodes are not the same (PROP vs NOTPROP) | ||
|  | 
 | ||
|  |   4   Check left general category vs right particular category | ||
|  |   5   Check right general category vs left particular category | ||
|  | 
 | ||
|  |   6   Left alphanum vs right general category | ||
|  |   7   Left space vs right general category | ||
|  |   8   Left word vs right general category | ||
|  | 
 | ||
|  |   9   Right alphanum vs left general category | ||
|  |  10   Right space vs left general category | ||
|  |  11   Right word vs left general category | ||
|  | 
 | ||
|  |  12   Left alphanum vs right particular category | ||
|  |  13   Left space vs right particular category | ||
|  |  14   Left word vs right particular category | ||
|  | 
 | ||
|  |  15   Right alphanum vs left particular category | ||
|  |  16   Right space vs left particular category | ||
|  |  17   Right word vs left particular category | ||
|  | */ | ||
|  | 
 | ||
/* Square table of PT_TABSIZE x PT_TABSIZE action codes. The trailing comment
on each row names the property type selecting that row; the header line names
the property type selecting each column. */
|  | static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = { | ||
|  | /* ANY LAMP GC  PC  SC  SCX ALNUM SPACE PXSPACE WORD CLIST UCNC BIDICL BOOL */ | ||
|  |   { 0,  0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_ANY */ | ||
|  |   { 0,  3,  0,  0,  0,   0,    3,    1,      1,   0,    0,   0,    0,    0 },  /* PT_LAMP */ | ||
|  |   { 0,  0,  2,  4,  0,   0,    9,   10,     10,  11,    0,   0,    0,    0 },  /* PT_GC */ | ||
|  |   { 0,  0,  5,  2,  0,   0,   15,   16,     16,  17,    0,   0,    0,    0 },  /* PT_PC */ | ||
|  |   { 0,  0,  0,  0,  2,   2,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_SC */ | ||
|  |   { 0,  0,  0,  0,  2,   2,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_SCX */ | ||
|  |   { 0,  3,  6, 12,  0,   0,    3,    1,      1,   0,    0,   0,    0,    0 },  /* PT_ALNUM */ | ||
|  |   { 0,  1,  7, 13,  0,   0,    1,    3,      3,   1,    0,   0,    0,    0 },  /* PT_SPACE */ | ||
|  |   { 0,  1,  7, 13,  0,   0,    1,    3,      3,   1,    0,   0,    0,    0 },  /* PT_PXSPACE */ | ||
|  |   { 0,  0,  8, 14,  0,   0,    0,    1,      1,   3,    0,   0,    0,    0 },  /* PT_WORD */ | ||
|  |   { 0,  0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_CLIST */ | ||
|  |   { 0,  0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   3,    0,    0 },  /* PT_UCNC */ | ||
|  |   { 0,  0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_BIDICL */ | ||
|  |   { 0,  0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 }   /* PT_BOOL */ | ||
|  | }; | ||
|  | 
 | ||
|  | /* This table is used to check whether auto-possessification is possible
 | ||
|  | between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one | ||
|  | specifies a general category and the other specifies a particular category. The | ||
|  | row is selected by the general category and the column by the particular | ||
|  | category. The value is 1 if the particular category is not part of the general | ||
|  | category. */ | ||
|  | 
 | ||
/* Seven rows, one per general category (C, L, M, N, P, S, Z as labelled on the
right), by thirty columns, one per particular category (labelled in the header
line). A 0 marks a particular category that belongs to the row's general
category. */
|  | static const uint8_t catposstab[7][30] = { | ||
|  | /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */ | ||
|  |   { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */ | ||
|  |   { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */ | ||
|  |   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */ | ||
|  |   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */ | ||
|  |   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */ | ||
|  |   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */ | ||
|  |   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */ | ||
|  | }; | ||
|  | 
 | ||
|  | /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
 | ||
|  | a general or particular category. The properties in each row are those | ||
|  | that apply to the character set in question. Duplication means that a little | ||
|  | unnecessary work is done when checking, but this keeps things much simpler | ||
|  | because they can all use the same code. For more details see the comment where | ||
|  | this table is used. | ||
|  | 
 | ||
|  | Note: SPACE and PXSPACE used to be different because Perl excluded VT from | ||
|  | "space", but from Perl 5.18 it's included, so both categories are treated the | ||
|  | same here. */ | ||
|  | 
 | ||
/* Three rows of four ucp_ values each; the trailing comment on each row names
the character set (ALNUM, SPACE/PXSPACE, WORD) the row describes. */
|  | static const uint8_t posspropstab[3][4] = { | ||
|  |   { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */ | ||
|  |   { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */ | ||
|  |   { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */ | ||
|  | }; | ||
|  | #endif  /* SUPPORT_UNICODE */
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | #ifdef SUPPORT_UNICODE
 | ||
|  | /*************************************************
 | ||
|  | *        Check a character and a property        * | ||
|  | *************************************************/ | ||
|  | 
 | ||
|  | /* This function is called by compare_opcodes() when a property item is
 | ||
|  | adjacent to a fixed character. | ||
|  | 
 | ||
|  | Arguments: | ||
|  |   c            the character | ||
|  |   ptype        the property type | ||
|  |   pdata        the data for the type | ||
|  |   negated      TRUE if it's a negated property (\P or \p{^) | ||
|  | 
 | ||
|  | Returns:       TRUE if auto-possessifying is OK | ||
|  | */ | ||
|  | 
 | ||
|  | static BOOL | ||
|  | check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata, | ||
|  |   BOOL negated) | ||
|  | { | ||
|  | BOOL ok; | ||
|  | const uint32_t *p; | ||
|  | const ucd_record *prop = GET_UCD(c); | ||
|  | 
 | ||
|  | switch(ptype) | ||
|  |   { | ||
|  |   case PT_LAMP: | ||
|  |   return (prop->chartype == ucp_Lu || | ||
|  |           prop->chartype == ucp_Ll || | ||
|  |           prop->chartype == ucp_Lt) == negated; | ||
|  | 
 | ||
|  |   case PT_GC: | ||
|  |   return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated; | ||
|  | 
 | ||
|  |   case PT_PC: | ||
|  |   return (pdata == prop->chartype) == negated; | ||
|  | 
 | ||
|  |   case PT_SC: | ||
|  |   return (pdata == prop->script) == negated; | ||
|  | 
 | ||
|  |   case PT_SCX: | ||
|  |   ok = (pdata == prop->script | ||
|  |         || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0); | ||
|  |   return ok == negated; | ||
|  | 
 | ||
|  |   /* These are specials */ | ||
|  | 
 | ||
|  |   case PT_ALNUM: | ||
|  |   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || | ||
|  |           PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated; | ||
|  | 
 | ||
|  |   /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
 | ||
|  |   means that Perl space and POSIX space are now identical. PCRE was changed | ||
|  |   at release 8.34. */ | ||
|  | 
 | ||
|  |   case PT_SPACE:    /* Perl space */ | ||
|  |   case PT_PXSPACE:  /* POSIX space */ | ||
|  |   switch(c) | ||
|  |     { | ||
|  |     HSPACE_CASES: | ||
|  |     VSPACE_CASES: | ||
|  |     return negated; | ||
|  | 
 | ||
|  |     default: | ||
|  |     return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated; | ||
|  |     } | ||
|  |   break;  /* Control never reaches here */ | ||
|  | 
 | ||
|  |   case PT_WORD: | ||
|  |   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || | ||
|  |           PRIV(ucp_gentype)[prop->chartype] == ucp_N || | ||
|  |           c == CHAR_UNDERSCORE) == negated; | ||
|  | 
 | ||
|  |   case PT_CLIST: | ||
|  |   p = PRIV(ucd_caseless_sets) + prop->caseset; | ||
|  |   for (;;) | ||
|  |     { | ||
|  |     if (c < *p) return !negated; | ||
|  |     if (c == *p++) return negated; | ||
|  |     } | ||
|  |   break;  /* Control never reaches here */ | ||
|  | 
 | ||
|  |   /* Haven't yet thought these through. */ | ||
|  | 
 | ||
|  |   case PT_BIDICL: | ||
|  |   return FALSE; | ||
|  | 
 | ||
|  |   case PT_BOOL: | ||
|  |   return FALSE; | ||
|  |   } | ||
|  | 
 | ||
|  | return FALSE; | ||
|  | } | ||
|  | #endif  /* SUPPORT_UNICODE */
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | /*************************************************
 | ||
|  | *        Base opcode of repeated opcodes         * | ||
|  | *************************************************/ | ||
|  | 
 | ||
|  | /* Returns the base opcode for repeated single character type opcodes. If the
 | ||
|  | opcode is not a repeated character type, it returns with the original value. | ||
|  | 
 | ||
|  | Arguments:  c opcode | ||
|  | Returns:    base opcode for the type | ||
|  | */ | ||
|  | 
 | ||
|  | static PCRE2_UCHAR | ||
|  | get_repeat_base(PCRE2_UCHAR c) | ||
|  | { | ||
|  | return (c > OP_TYPEPOSUPTO)? c : | ||
|  |        (c >= OP_TYPESTAR)?   OP_TYPESTAR : | ||
|  |        (c >= OP_NOTSTARI)?   OP_NOTSTARI : | ||
|  |        (c >= OP_NOTSTAR)?    OP_NOTSTAR : | ||
|  |        (c >= OP_STARI)?      OP_STARI : | ||
|  |                              OP_STAR; | ||
|  | } | ||
|  | 
 | ||
|  | 
 | ||
|  | /*************************************************
 | ||
|  | *        Fill the character property list        * | ||
|  | *************************************************/ | ||
|  | 
 | ||
|  | /* Checks whether the code points to an opcode that can take part in auto-
 | ||
|  | possessification, and if so, fills a list with its properties. | ||
|  | 
 | ||
|  | Arguments: | ||
|  |   code        points to start of expression | ||
|  |   utf         TRUE if in UTF mode | ||
|  |   ucp         TRUE if in UCP mode | ||
|  |   fcc         points to the case-flipping table | ||
|  |   list        points to output list | ||
|  |               list[0] will be filled with the opcode | ||
|  |               list[1] will be non-zero if this opcode | ||
|  |                 can match an empty character string | ||
|  |               list[2..7] depends on the opcode | ||
|  | 
 | ||
|  | Returns:      points to the start of the next opcode if *code is accepted | ||
|  |               NULL if *code is not accepted | ||
|  | */ | ||
|  | 
 | ||
|  | static PCRE2_SPTR | ||
|  | get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc, | ||
|  |   uint32_t *list) | ||
|  | { | ||
|  | PCRE2_UCHAR c = *code; | ||
|  | PCRE2_UCHAR base; | ||
|  | PCRE2_SPTR end; | ||
|  | uint32_t chr; | ||
|  | 
 | ||
|  | #ifdef SUPPORT_UNICODE
 | ||
|  | uint32_t *clist_dest; | ||
|  | const uint32_t *clist_src; | ||
|  | #else
 | ||
|  | (void)utf;    /* Suppress "unused parameter" compiler warnings */ | ||
|  | (void)ucp; | ||
|  | #endif
 | ||
|  | 
 | ||
|  | list[0] = c; | ||
|  | list[1] = FALSE; | ||
|  | code++; | ||
|  | 
 | ||
|  | if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) | ||
|  |   { | ||
|  |   base = get_repeat_base(c); | ||
|  |   c -= (base - OP_STAR); | ||
|  | 
 | ||
|  |   if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO) | ||
|  |     code += IMM2_SIZE; | ||
|  | 
 | ||
|  |   list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && | ||
|  |              c != OP_POSPLUS); | ||
|  | 
 | ||
|  |   switch(base) | ||
|  |     { | ||
|  |     case OP_STAR: | ||
|  |     list[0] = OP_CHAR; | ||
|  |     break; | ||
|  | 
 | ||
|  |     case OP_STARI: | ||
|  |     list[0] = OP_CHARI; | ||
|  |     break; | ||
|  | 
 | ||
|  |     case OP_NOTSTAR: | ||
|  |     list[0] = OP_NOT; | ||
|  |     break; | ||
|  | 
 | ||
|  |     case OP_NOTSTARI: | ||
|  |     list[0] = OP_NOTI; | ||
|  |     break; | ||
|  | 
 | ||
|  |     case OP_TYPESTAR: | ||
|  |     list[0] = *code; | ||
|  |     code++; | ||
|  |     break; | ||
|  |     } | ||
|  |   c = list[0]; | ||
|  |   } | ||
|  | 
 | ||
|  | switch(c) | ||
|  |   { | ||
|  |   case OP_NOT_DIGIT: | ||
|  |   case OP_DIGIT: | ||
|  |   case OP_NOT_WHITESPACE: | ||
|  |   case OP_WHITESPACE: | ||
|  |   case OP_NOT_WORDCHAR: | ||
|  |   case OP_WORDCHAR: | ||
|  |   case OP_ANY: | ||
|  |   case OP_ALLANY: | ||
|  |   case OP_ANYNL: | ||
|  |   case OP_NOT_HSPACE: | ||
|  |   case OP_HSPACE: | ||
|  |   case OP_NOT_VSPACE: | ||
|  |   case OP_VSPACE: | ||
|  |   case OP_EXTUNI: | ||
|  |   case OP_EODN: | ||
|  |   case OP_EOD: | ||
|  |   case OP_DOLL: | ||
|  |   case OP_DOLLM: | ||
|  |   return code; | ||
|  | 
 | ||
|  |   case OP_CHAR: | ||
|  |   case OP_NOT: | ||
|  |   GETCHARINCTEST(chr, code); | ||
|  |   list[2] = chr; | ||
|  |   list[3] = NOTACHAR; | ||
|  |   return code; | ||
|  | 
 | ||
|  |   case OP_CHARI: | ||
|  |   case OP_NOTI: | ||
|  |   list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT; | ||
|  |   GETCHARINCTEST(chr, code); | ||
|  |   list[2] = chr; | ||
|  | 
 | ||
|  | #ifdef SUPPORT_UNICODE
 | ||
|  |   if (chr < 128 || (chr < 256 && !utf && !ucp)) | ||
|  |     list[3] = fcc[chr]; | ||
|  |   else | ||
|  |     list[3] = UCD_OTHERCASE(chr); | ||
|  | #elif defined SUPPORT_WIDE_CHARS
 | ||
|  |   list[3] = (chr < 256) ? fcc[chr] : chr; | ||
|  | #else
 | ||
|  |   list[3] = fcc[chr]; | ||
|  | #endif
 | ||
|  | 
 | ||
|  |   /* The othercase might be the same value. */ | ||
|  | 
 | ||
|  |   if (chr == list[3]) | ||
|  |     list[3] = NOTACHAR; | ||
|  |   else | ||
|  |     list[4] = NOTACHAR; | ||
|  |   return code; | ||
|  | 
 | ||
|  | #ifdef SUPPORT_UNICODE
 | ||
|  |   case OP_PROP: | ||
|  |   case OP_NOTPROP: | ||
|  |   if (code[0] != PT_CLIST) | ||
|  |     { | ||
|  |     list[2] = code[0]; | ||
|  |     list[3] = code[1]; | ||
|  |     return code + 2; | ||
|  |     } | ||
|  | 
 | ||
|  |   /* Convert only if we have enough space. */ | ||
|  | 
 | ||
|  |   clist_src = PRIV(ucd_caseless_sets) + code[1]; | ||
|  |   clist_dest = list + 2; | ||
|  |   code += 2; | ||
|  | 
 | ||
|  |   do { | ||
|  |      if (clist_dest >= list + 8) | ||
|  |        { | ||
|  |        /* Early return if there is not enough space. This should never
 | ||
|  |        happen, since all clists are shorter than 5 character now. */ | ||
|  |        list[2] = code[0]; | ||
|  |        list[3] = code[1]; | ||
|  |        return code; | ||
|  |        } | ||
|  |      *clist_dest++ = *clist_src; | ||
|  |      } | ||
|  |   while(*clist_src++ != NOTACHAR); | ||
|  | 
 | ||
|  |   /* All characters are stored. The terminating NOTACHAR is copied from the
 | ||
|  |   clist itself. */ | ||
|  | 
 | ||
|  |   list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT; | ||
|  |   return code; | ||
|  | #endif
 | ||
|  | 
 | ||
|  |   case OP_NCLASS: | ||
|  |   case OP_CLASS: | ||
|  | #ifdef SUPPORT_WIDE_CHARS
 | ||
|  |   case OP_XCLASS: | ||
|  |   if (c == OP_XCLASS) | ||
|  |     end = code + GET(code, 0) - 1; | ||
|  |   else | ||
|  | #endif
 | ||
|  |     end = code + 32 / sizeof(PCRE2_UCHAR); | ||
|  | 
 | ||
|  |   switch(*end) | ||
|  |     { | ||
|  |     case OP_CRSTAR: | ||
|  |     case OP_CRMINSTAR: | ||
|  |     case OP_CRQUERY: | ||
|  |     case OP_CRMINQUERY: | ||
|  |     case OP_CRPOSSTAR: | ||
|  |     case OP_CRPOSQUERY: | ||
|  |     list[1] = TRUE; | ||
|  |     end++; | ||
|  |     break; | ||
|  | 
 | ||
|  |     case OP_CRPLUS: | ||
|  |     case OP_CRMINPLUS: | ||
|  |     case OP_CRPOSPLUS: | ||
|  |     end++; | ||
|  |     break; | ||
|  | 
 | ||
|  |     case OP_CRRANGE: | ||
|  |     case OP_CRMINRANGE: | ||
|  |     case OP_CRPOSRANGE: | ||
|  |     list[1] = (GET2(end, 1) == 0); | ||
|  |     end += 1 + 2 * IMM2_SIZE; | ||
|  |     break; | ||
|  |     } | ||
|  |   list[2] = (uint32_t)(end - code); | ||
|  |   return end; | ||
|  |   } | ||
|  | 
 | ||
|  | return NULL;    /* Opcode not accepted */ | ||
|  | } | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | /*************************************************
 | ||
|  | *    Scan further character sets for match       * | ||
|  | *************************************************/ | ||
|  | 
 | ||
|  | /* Checks whether the base and the current opcode have a common character, in
 | ||
|  | which case the base cannot be possessified. | ||
|  | 
 | ||
|  | Arguments: | ||
|  |   code        points to the byte code | ||
|  |   utf         TRUE in UTF mode | ||
|  |   ucp         TRUE in UCP mode | ||
|  |   cb          compile data block | ||
|  |   base_list   the data list of the base opcode | ||
|  |   base_end    the end of the base opcode | ||
|  |   rec_limit   points to recursion depth counter | ||
|  | 
 | ||
|  | Returns:      TRUE if the auto-possessification is possible | ||
|  | */ | ||
|  | 
 | ||
|  | static BOOL | ||
|  | compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb, | ||
|  |   const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit) | ||
|  | { | ||
|  | PCRE2_UCHAR c; | ||
|  | uint32_t list[8]; | ||
|  | const uint32_t *chr_ptr; | ||
|  | const uint32_t *ochr_ptr; | ||
|  | const uint32_t *list_ptr; | ||
|  | PCRE2_SPTR next_code; | ||
|  | #ifdef SUPPORT_WIDE_CHARS
 | ||
|  | PCRE2_SPTR xclass_flags; | ||
|  | #endif
 | ||
|  | const uint8_t *class_bitset; | ||
|  | const uint8_t *set1, *set2, *set_end; | ||
|  | uint32_t chr; | ||
|  | BOOL accepted, invert_bits; | ||
|  | BOOL entered_a_group = FALSE; | ||
|  | 
 | ||
|  | if (--(*rec_limit) <= 0) return FALSE;  /* Recursion has gone too deep */ | ||
|  | 
 | ||
|  | /* Note: the base_list[1] contains whether the current opcode has a greedy
 | ||
|  | (represented by a non-zero value) quantifier. This is a different from | ||
|  | other character type lists, which store here that the character iterator | ||
|  | matches to an empty string (also represented by a non-zero value). */ | ||
|  | 
 | ||
|  | for(;;) | ||
|  |   { | ||
|  |   /* All operations move the code pointer forward.
 | ||
|  |   Therefore infinite recursions are not possible. */ | ||
|  | 
 | ||
|  |   c = *code; | ||
|  | 
 | ||
|  |   /* Skip over callouts */ | ||
|  | 
 | ||
|  |   if (c == OP_CALLOUT) | ||
|  |     { | ||
|  |     code += PRIV(OP_lengths)[c]; | ||
|  |     continue; | ||
|  |     } | ||
|  | 
 | ||
|  |   if (c == OP_CALLOUT_STR) | ||
|  |     { | ||
|  |     code += GET(code, 1 + 2*LINK_SIZE); | ||
|  |     continue; | ||
|  |     } | ||
|  | 
 | ||
|  |   /* At the end of a branch, skip to the end of the group. */ | ||
|  | 
 | ||
|  |   if (c == OP_ALT) | ||
|  |     { | ||
|  |     do code += GET(code, 1); while (*code == OP_ALT); | ||
|  |     c = *code; | ||
|  |     } | ||
|  | 
 | ||
|  |   /* Inspect the next opcode. */ | ||
|  | 
 | ||
|  |   switch(c) | ||
|  |     { | ||
|  |     /* We can always possessify a greedy iterator at the end of the pattern,
 | ||
|  |     which is reached after skipping over the final OP_KET. A non-greedy | ||
|  |     iterator must never be possessified. */ | ||
|  | 
 | ||
|  |     case OP_END: | ||
|  |     return base_list[1] != 0; | ||
|  | 
 | ||
|  |     /* When an iterator is at the end of certain kinds of group we can inspect
 | ||
|  |     what follows the group by skipping over the closing ket. Note that this | ||
|  |     does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given | ||
|  |     iteration is variable (could be another iteration or could be the next | ||
|  |     item). As these two opcodes are not listed in the next switch, they will | ||
|  |     end up as the next code to inspect, and return FALSE by virtue of being | ||
|  |     unsupported. */ | ||
|  | 
 | ||
|  |     case OP_KET: | ||
|  |     case OP_KETRPOS: | ||
|  |     /* The non-greedy case cannot be converted to a possessive form. */ | ||
|  | 
 | ||
|  |     if (base_list[1] == 0) return FALSE; | ||
|  | 
 | ||
|  |     /* If the bracket is capturing it might be referenced by an OP_RECURSE
 | ||
|  |     so its last iterator can never be possessified if the pattern contains | ||
|  |     recursions. (This could be improved by keeping a list of group numbers that | ||
|  |     are called by recursion.) */ | ||
|  | 
 | ||
|  |     switch(*(code - GET(code, 1))) | ||
|  |       { | ||
|  |       case OP_CBRA: | ||
|  |       case OP_SCBRA: | ||
|  |       case OP_CBRAPOS: | ||
|  |       case OP_SCBRAPOS: | ||
|  |       if (cb->had_recurse) return FALSE; | ||
|  |       break; | ||
|  | 
 | ||
|  |       /* A script run might have to backtrack if the iterated item can match
 | ||
|  |       characters from more than one script. So give up unless repeating an | ||
|  |       explicit character. */ | ||
|  | 
 | ||
|  |       case OP_SCRIPT_RUN: | ||
|  |       if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI) | ||
|  |         return FALSE; | ||
|  |       break; | ||
|  | 
 | ||
|  |       /* Atomic sub-patterns and assertions can always auto-possessify their
 | ||
|  |       last iterator. However, if the group was entered as a result of checking | ||
|  |       a previous iterator, this is not possible. */ | ||
|  | 
 | ||
|  |       case OP_ASSERT: | ||
|  |       case OP_ASSERT_NOT: | ||
|  |       case OP_ASSERTBACK: | ||
|  |       case OP_ASSERTBACK_NOT: | ||
|  |       case OP_ONCE: | ||
|  |       return !entered_a_group; | ||
|  | 
 | ||
|  |       /* Non-atomic assertions - don't possessify last iterator. This needs
 | ||
|  |       more thought. */ | ||
|  | 
 | ||
|  |       case OP_ASSERT_NA: | ||
|  |       case OP_ASSERTBACK_NA: | ||
|  |       return FALSE; | ||
|  |       } | ||
|  | 
 | ||
|  |     /* Skip over the bracket and inspect what comes next. */ | ||
|  | 
 | ||
|  |     code += PRIV(OP_lengths)[c]; | ||
|  |     continue; | ||
|  | 
 | ||
|  |     /* Handle cases where the next item is a group. */ | ||
|  | 
 | ||
|  |     case OP_ONCE: | ||
|  |     case OP_BRA: | ||
|  |     case OP_CBRA: | ||
|  |     next_code = code + GET(code, 1); | ||
|  |     code += PRIV(OP_lengths)[c]; | ||
|  | 
 | ||
|  |     /* Check each branch. We have to recurse a level for all but the last
 | ||
|  |     branch. */ | ||
|  | 
 | ||
|  |     while (*next_code == OP_ALT) | ||
|  |       { | ||
|  |       if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit)) | ||
|  |         return FALSE; | ||
|  |       code = next_code + 1 + LINK_SIZE; | ||
|  |       next_code += GET(next_code, 1); | ||
|  |       } | ||
|  | 
 | ||
|  |     entered_a_group = TRUE; | ||
|  |     continue; | ||
|  | 
 | ||
|  |     case OP_BRAZERO: | ||
|  |     case OP_BRAMINZERO: | ||
|  | 
 | ||
|  |     next_code = code + 1; | ||
|  |     if (*next_code != OP_BRA && *next_code != OP_CBRA && | ||
|  |         *next_code != OP_ONCE) return FALSE; | ||
|  | 
 | ||
|  |     do next_code += GET(next_code, 1); while (*next_code == OP_ALT); | ||
|  | 
 | ||
|  |     /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */ | ||
|  | 
 | ||
|  |     next_code += 1 + LINK_SIZE; | ||
|  |     if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end, | ||
|  |          rec_limit)) | ||
|  |       return FALSE; | ||
|  | 
 | ||
|  |     code += PRIV(OP_lengths)[c]; | ||
|  |     continue; | ||
|  | 
 | ||
|  |     /* The next opcode does not need special handling; fall through and use it
 | ||
|  |     to see if the base can be possessified. */ | ||
|  | 
 | ||
|  |     default: | ||
|  |     break; | ||
|  |     } | ||
|  | 
 | ||
|  |   /* We now have the next appropriate opcode to compare with the base. Check
 | ||
|  |   for a supported opcode, and load its properties. */ | ||
|  | 
 | ||
|  |   code = get_chr_property_list(code, utf, ucp, cb->fcc, list); | ||
|  |   if (code == NULL) return FALSE;    /* Unsupported */ | ||
|  | 
 | ||
|  |   /* If either opcode is a small character list, set pointers for comparing
 | ||
|  |   characters from that list with another list, or with a property. */ | ||
|  | 
 | ||
|  |   if (base_list[0] == OP_CHAR) | ||
|  |     { | ||
|  |     chr_ptr = base_list + 2; | ||
|  |     list_ptr = list; | ||
|  |     } | ||
|  |   else if (list[0] == OP_CHAR) | ||
|  |     { | ||
|  |     chr_ptr = list + 2; | ||
|  |     list_ptr = base_list; | ||
|  |     } | ||
|  | 
 | ||
|  |   /* Character bitsets can also be compared to certain opcodes. */ | ||
|  | 
 | ||
|  |   else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS | ||
|  | #if PCRE2_CODE_UNIT_WIDTH == 8
 | ||
|  |       /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */ | ||
|  |       || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS)) | ||
|  | #endif
 | ||
|  |       ) | ||
|  |     { | ||
|  | #if PCRE2_CODE_UNIT_WIDTH == 8
 | ||
|  |     if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS)) | ||
|  | #else
 | ||
|  |     if (base_list[0] == OP_CLASS) | ||
|  | #endif
 | ||
|  |       { | ||
|  |       set1 = (uint8_t *)(base_end - base_list[2]); | ||
|  |       list_ptr = list; | ||
|  |       } | ||
|  |     else | ||
|  |       { | ||
|  |       set1 = (uint8_t *)(code - list[2]); | ||
|  |       list_ptr = base_list; | ||
|  |       } | ||
|  | 
 | ||
|  |     invert_bits = FALSE; | ||
|  |     switch(list_ptr[0]) | ||
|  |       { | ||
|  |       case OP_CLASS: | ||
|  |       case OP_NCLASS: | ||
|  |       set2 = (uint8_t *) | ||
|  |         ((list_ptr == list ? code : base_end) - list_ptr[2]); | ||
|  |       break; | ||
|  | 
 | ||
|  | #ifdef SUPPORT_WIDE_CHARS
 | ||
|  |       case OP_XCLASS: | ||
|  |       xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE; | ||
|  |       if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE; | ||
|  |       if ((*xclass_flags & XCL_MAP) == 0) | ||
|  |         { | ||
|  |         /* No bits are set for characters < 256. */ | ||
|  |         if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0; | ||
|  |         /* Might be an empty repeat. */ | ||
|  |         continue; | ||
|  |         } | ||
|  |       set2 = (uint8_t *)(xclass_flags + 1); | ||
|  |       break; | ||
|  | #endif
 | ||
|  | 
 | ||
|  |       case OP_NOT_DIGIT: | ||
|  |       invert_bits = TRUE; | ||
|  |       /* Fall through */ | ||
|  |       case OP_DIGIT: | ||
|  |       set2 = (uint8_t *)(cb->cbits + cbit_digit); | ||
|  |       break; | ||
|  | 
 | ||
|  |       case OP_NOT_WHITESPACE: | ||
|  |       invert_bits = TRUE; | ||
|  |       /* Fall through */ | ||
|  |       case OP_WHITESPACE: | ||
|  |       set2 = (uint8_t *)(cb->cbits + cbit_space); | ||
|  |       break; | ||
|  | 
 | ||
|  |       case OP_NOT_WORDCHAR: | ||
|  |       invert_bits = TRUE; | ||
|  |       /* Fall through */ | ||
|  |       case OP_WORDCHAR: | ||
|  |       set2 = (uint8_t *)(cb->cbits + cbit_word); | ||
|  |       break; | ||
|  | 
 | ||
|  |       default: | ||
|  |       return FALSE; | ||
|  |       } | ||
|  | 
 | ||
|  |     /* Because the bit sets are unaligned bytes, we need to perform byte
 | ||
|  |     comparison here. */ | ||
|  | 
 | ||
|  |     set_end = set1 + 32; | ||
|  |     if (invert_bits) | ||
|  |       { | ||
|  |       do | ||
|  |         { | ||
|  |         if ((*set1++ & ~(*set2++)) != 0) return FALSE; | ||
|  |         } | ||
|  |       while (set1 < set_end); | ||
|  |       } | ||
|  |     else | ||
|  |       { | ||
|  |       do | ||
|  |         { | ||
|  |         if ((*set1++ & *set2++) != 0) return FALSE; | ||
|  |         } | ||
|  |       while (set1 < set_end); | ||
|  |       } | ||
|  | 
 | ||
|  |     if (list[1] == 0) return TRUE; | ||
|  |     /* Might be an empty repeat. */ | ||
|  |     continue; | ||
|  |     } | ||
|  | 
 | ||
|  |   /* Some property combinations also acceptable. Unicode property opcodes are
 | ||
|  |   processed specially; the rest can be handled with a lookup table. */ | ||
|  | 
 | ||
|  |   else | ||
|  |     { | ||
|  |     uint32_t leftop, rightop; | ||
|  | 
 | ||
|  |     leftop = base_list[0]; | ||
|  |     rightop = list[0]; | ||
|  | 
 | ||
|  | #ifdef SUPPORT_UNICODE
 | ||
|  |     accepted = FALSE; /* Always set in non-unicode case. */ | ||
|  |     if (leftop == OP_PROP || leftop == OP_NOTPROP) | ||
|  |       { | ||
|  |       if (rightop == OP_EOD) | ||
|  |         accepted = TRUE; | ||
|  |       else if (rightop == OP_PROP || rightop == OP_NOTPROP) | ||
|  |         { | ||
|  |         int n; | ||
|  |         const uint8_t *p; | ||
|  |         BOOL same = leftop == rightop; | ||
|  |         BOOL lisprop = leftop == OP_PROP; | ||
|  |         BOOL risprop = rightop == OP_PROP; | ||
|  |         BOOL bothprop = lisprop && risprop; | ||
|  | 
 | ||
|  |         /* There's a table that specifies how each combination is to be
 | ||
|  |         processed: | ||
|  |           0   Always return FALSE (never auto-possessify) | ||
|  |           1   Character groups are distinct (possessify if both are OP_PROP) | ||
|  |           2   Check character categories in the same group (general or particular) | ||
|  |           3   Return TRUE if the two opcodes are not the same | ||
|  |           ... see comments below | ||
|  |         */ | ||
|  | 
 | ||
|  |         n = propposstab[base_list[2]][list[2]]; | ||
|  |         switch(n) | ||
|  |           { | ||
|  |           case 0: break; | ||
|  |           case 1: accepted = bothprop; break; | ||
|  |           case 2: accepted = (base_list[3] == list[3]) != same; break; | ||
|  |           case 3: accepted = !same; break; | ||
|  | 
 | ||
|  |           case 4:  /* Left general category, right particular category */ | ||
|  |           accepted = risprop && catposstab[base_list[3]][list[3]] == same; | ||
|  |           break; | ||
|  | 
 | ||
|  |           case 5:  /* Right general category, left particular category */ | ||
|  |           accepted = lisprop && catposstab[list[3]][base_list[3]] == same; | ||
|  |           break; | ||
|  | 
 | ||
|  |           /* This code is logically tricky. Think hard before fiddling with it.
 | ||
|  |           The posspropstab table has four entries per row. Each row relates to | ||
|  |           one of PCRE's special properties such as ALNUM or SPACE or WORD. | ||
|  |           Only WORD actually needs all four entries, but using repeats for the | ||
|  |           others means they can all use the same code below. | ||
|  | 
 | ||
|  |           The first two entries in each row are Unicode general categories, and | ||
|  |           apply always, because all the characters they include are part of the | ||
|  |           PCRE character set. The third and fourth entries are a general and a | ||
|  |           particular category, respectively, that include one or more relevant | ||
|  |           characters. One or the other is used, depending on whether the check | ||
|  |           is for a general or a particular category. However, in both cases the | ||
|  |           category contains more characters than the specials that are defined | ||
|  |           for the property being tested against. Therefore, it cannot be used | ||
|  |           in a NOTPROP case. | ||
|  | 
 | ||
|  |           Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po. | ||
|  |           Underscore is covered by ucp_P or ucp_Po. */ | ||
|  | 
 | ||
|  |           case 6:  /* Left alphanum vs right general category */ | ||
|  |           case 7:  /* Left space vs right general category */ | ||
|  |           case 8:  /* Left word vs right general category */ | ||
|  |           p = posspropstab[n-6]; | ||
|  |           accepted = risprop && lisprop == | ||
|  |             (list[3] != p[0] && | ||
|  |              list[3] != p[1] && | ||
|  |             (list[3] != p[2] || !lisprop)); | ||
|  |           break; | ||
|  | 
 | ||
|  |           case 9:   /* Right alphanum vs left general category */ | ||
|  |           case 10:  /* Right space vs left general category */ | ||
|  |           case 11:  /* Right word vs left general category */ | ||
|  |           p = posspropstab[n-9]; | ||
|  |           accepted = lisprop && risprop == | ||
|  |             (base_list[3] != p[0] && | ||
|  |              base_list[3] != p[1] && | ||
|  |             (base_list[3] != p[2] || !risprop)); | ||
|  |           break; | ||
|  | 
 | ||
|  |           case 12:  /* Left alphanum vs right particular category */ | ||
|  |           case 13:  /* Left space vs right particular category */ | ||
|  |           case 14:  /* Left word vs right particular category */ | ||
|  |           p = posspropstab[n-12]; | ||
|  |           accepted = risprop && lisprop == | ||
|  |             (catposstab[p[0]][list[3]] && | ||
|  |              catposstab[p[1]][list[3]] && | ||
|  |             (list[3] != p[3] || !lisprop)); | ||
|  |           break; | ||
|  | 
 | ||
|  |           case 15:  /* Right alphanum vs left particular category */ | ||
|  |           case 16:  /* Right space vs left particular category */ | ||
|  |           case 17:  /* Right word vs left particular category */ | ||
|  |           p = posspropstab[n-15]; | ||
|  |           accepted = lisprop && risprop == | ||
|  |             (catposstab[p[0]][base_list[3]] && | ||
|  |              catposstab[p[1]][base_list[3]] && | ||
|  |             (base_list[3] != p[3] || !risprop)); | ||
|  |           break; | ||
|  |           } | ||
|  |         } | ||
|  |       } | ||
|  | 
 | ||
|  |     else | ||
|  | #endif  /* SUPPORT_UNICODE */
 | ||
|  | 
 | ||
|  |     accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP && | ||
|  |            rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP && | ||
|  |            autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP]; | ||
|  | 
 | ||
|  |     if (!accepted) return FALSE; | ||
|  | 
 | ||
|  |     if (list[1] == 0) return TRUE; | ||
|  |     /* Might be an empty repeat. */ | ||
|  |     continue; | ||
|  |     } | ||
|  | 
 | ||
|  |   /* Control reaches here only if one of the items is a small character list.
 | ||
|  |   All characters are checked against the other side. */ | ||
|  | 
 | ||
|  |   do | ||
|  |     { | ||
|  |     chr = *chr_ptr; | ||
|  | 
 | ||
|  |     switch(list_ptr[0]) | ||
|  |       { | ||
|  |       case OP_CHAR: | ||
|  |       ochr_ptr = list_ptr + 2; | ||
|  |       do | ||
|  |         { | ||
|  |         if (chr == *ochr_ptr) return FALSE; | ||
|  |         ochr_ptr++; | ||
|  |         } | ||
|  |       while(*ochr_ptr != NOTACHAR); | ||
|  |       break; | ||
|  | 
 | ||
|  |       case OP_NOT: | ||
|  |       ochr_ptr = list_ptr + 2; | ||
|  |       do | ||
|  |         { | ||
|  |         if (chr == *ochr_ptr) | ||
|  |           break; | ||
|  |         ochr_ptr++; | ||
|  |         } | ||
|  |       while(*ochr_ptr != NOTACHAR); | ||
|  |       if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */ | ||
|  |       break; | ||
|  | 
 | ||
|  |       /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not*
 | ||
|  |       set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ | ||
|  | 
 | ||
|  |       case OP_DIGIT: | ||
|  |       if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE; | ||
|  |       break; | ||
|  | 
 | ||
|  |       case OP_NOT_DIGIT: | ||
|  |       if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE; | ||
|  |       break; | ||
|  | 
 | ||
|  |       case OP_WHITESPACE: | ||
|  |       if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE; | ||
|  |       break; | ||
|  | 
 | ||
|  |       case OP_NOT_WHITESPACE: | ||
|  |       if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE; | ||
|  |       break; | ||
|  | 
 | ||
|  |       case OP_WORDCHAR: | ||
|  |       if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE; | ||
|  |       break; | ||
|  | 
 | ||
|  |       case OP_NOT_WORDCHAR: | ||
|  |       if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE; | ||
|  |       break; | ||
|  | 
 | ||
|  |       case OP_HSPACE: | ||
|  |       switch(chr) | ||
|  |         { | ||
|  |         HSPACE_CASES: return FALSE; | ||
|  |         default: break; | ||
|  |         } | ||
|  |       break; | ||
|  | 
 | ||
|  |       case OP_NOT_HSPACE: | ||
|  |       switch(chr) | ||
|  |         { | ||
|  |         HSPACE_CASES: break; | ||
|  |         default: return FALSE; | ||
|  |         } | ||
|  |       break; | ||
|  | 
 | ||
|  |       case OP_ANYNL: | ||
|  |       case OP_VSPACE: | ||
|  |       switch(chr) | ||
|  |         { | ||
|  |         VSPACE_CASES: return FALSE; | ||
|  |         default: break; | ||
|  |         } | ||
|  |       break; | ||
|  | 
 | ||
|  |       case OP_NOT_VSPACE: | ||
|  |       switch(chr) | ||
|  |         { | ||
|  |         VSPACE_CASES: break; | ||
|  |         default: return FALSE; | ||
|  |         } | ||
|  |       break; | ||
|  | 
 | ||
|  |       case OP_DOLL: | ||
|  |       case OP_EODN: | ||
|  |       switch (chr) | ||
|  |         { | ||
|  |         case CHAR_CR: | ||
|  |         case CHAR_LF: | ||
|  |         case CHAR_VT: | ||
|  |         case CHAR_FF: | ||
|  |         case CHAR_NEL: | ||
|  | #ifndef EBCDIC
 | ||
|  |         case 0x2028: | ||
|  |         case 0x2029: | ||
|  | #endif  /* Not EBCDIC */
 | ||
|  |         return FALSE; | ||
|  |         } | ||
|  |       break; | ||
|  | 
 | ||
|  |       case OP_EOD:    /* Can always possessify before \z */ | ||
|  |       break; | ||
|  | 
 | ||
|  | #ifdef SUPPORT_UNICODE
 | ||
|  |       case OP_PROP: | ||
|  |       case OP_NOTPROP: | ||
|  |       if (!check_char_prop(chr, list_ptr[2], list_ptr[3], | ||
|  |             list_ptr[0] == OP_NOTPROP)) | ||
|  |         return FALSE; | ||
|  |       break; | ||
|  | #endif
 | ||
|  | 
 | ||
|  |       case OP_NCLASS: | ||
|  |       if (chr > 255) return FALSE; | ||
|  |       /* Fall through */ | ||
|  | 
 | ||
|  |       case OP_CLASS: | ||
|  |       if (chr > 255) break; | ||
|  |       class_bitset = (uint8_t *) | ||
|  |         ((list_ptr == list ? code : base_end) - list_ptr[2]); | ||
|  |       if ((class_bitset[chr >> 3] & (1u << (chr & 7))) != 0) return FALSE; | ||
|  |       break; | ||
|  | 
 | ||
|  | #ifdef SUPPORT_WIDE_CHARS
 | ||
|  |       case OP_XCLASS: | ||
|  |       if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) - | ||
|  |           list_ptr[2] + LINK_SIZE, utf)) return FALSE; | ||
|  |       break; | ||
|  | #endif
 | ||
|  | 
 | ||
|  |       default: | ||
|  |       return FALSE; | ||
|  |       } | ||
|  | 
 | ||
|  |     chr_ptr++; | ||
|  |     } | ||
|  |   while(*chr_ptr != NOTACHAR); | ||
|  | 
 | ||
|  |   /* At least one character must be matched from this opcode. */ | ||
|  | 
 | ||
|  |   if (list[1] == 0) return TRUE; | ||
|  |   } | ||
|  | 
 | ||
|  | /* Control never reaches here. There used to be a fail-save return FALSE; here,
 | ||
|  | but some compilers complain about an unreachable statement. */ | ||
|  | } | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
/*************************************************
*    Scan compiled regex for auto-possession     *
*************************************************/

/* Replaces single character iterations with their possessive alternatives
if appropriate. This function modifies the compiled opcode! Hitting a
non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a
bad UTF string was compiled with PCRE2_NO_UTF_CHECK. The rec_limit catches
overly complicated or large patterns. In these cases, the check just stops,
leaving the remainder of the pattern unpossessified.

Arguments:
  code        points to start of the byte code
  cb          compile data block

Returns:      0 for success
              -1 if a non-existent opcode is encountered
*/
|  | 
 | ||
|  | int | ||
|  | PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb) | ||
|  | { | ||
|  | PCRE2_UCHAR c; | ||
|  | PCRE2_SPTR end; | ||
|  | PCRE2_UCHAR *repeat_opcode; | ||
|  | uint32_t list[8]; | ||
|  | int rec_limit = 1000;  /* Was 10,000 but clang+ASAN uses a lot of stack. */ | ||
|  | BOOL utf = (cb->external_options & PCRE2_UTF) != 0; | ||
|  | BOOL ucp = (cb->external_options & PCRE2_UCP) != 0; | ||
|  | 
 | ||
|  | for (;;) | ||
|  |   { | ||
|  |   c = *code; | ||
|  | 
 | ||
|  |   if (c >= OP_TABLE_LENGTH) return -1;   /* Something gone wrong */ | ||
|  | 
 | ||
|  |   if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) | ||
|  |     { | ||
|  |     c -= get_repeat_base(c) - OP_STAR; | ||
|  |     end = (c <= OP_MINUPTO) ? | ||
|  |       get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL; | ||
|  |     list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; | ||
|  | 
 | ||
|  |     if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end, | ||
|  |         &rec_limit)) | ||
|  |       { | ||
|  |       switch(c) | ||
|  |         { | ||
|  |         case OP_STAR: | ||
|  |         *code += OP_POSSTAR - OP_STAR; | ||
|  |         break; | ||
|  | 
 | ||
|  |         case OP_MINSTAR: | ||
|  |         *code += OP_POSSTAR - OP_MINSTAR; | ||
|  |         break; | ||
|  | 
 | ||
|  |         case OP_PLUS: | ||
|  |         *code += OP_POSPLUS - OP_PLUS; | ||
|  |         break; | ||
|  | 
 | ||
|  |         case OP_MINPLUS: | ||
|  |         *code += OP_POSPLUS - OP_MINPLUS; | ||
|  |         break; | ||
|  | 
 | ||
|  |         case OP_QUERY: | ||
|  |         *code += OP_POSQUERY - OP_QUERY; | ||
|  |         break; | ||
|  | 
 | ||
|  |         case OP_MINQUERY: | ||
|  |         *code += OP_POSQUERY - OP_MINQUERY; | ||
|  |         break; | ||
|  | 
 | ||
|  |         case OP_UPTO: | ||
|  |         *code += OP_POSUPTO - OP_UPTO; | ||
|  |         break; | ||
|  | 
 | ||
|  |         case OP_MINUPTO: | ||
|  |         *code += OP_POSUPTO - OP_MINUPTO; | ||
|  |         break; | ||
|  |         } | ||
|  |       } | ||
|  |     c = *code; | ||
|  |     } | ||
|  |   else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS) | ||
|  |     { | ||
|  | #ifdef SUPPORT_WIDE_CHARS
 | ||
|  |     if (c == OP_XCLASS) | ||
|  |       repeat_opcode = code + GET(code, 1); | ||
|  |     else | ||
|  | #endif
 | ||
|  |       repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR)); | ||
|  | 
 | ||
|  |     c = *repeat_opcode; | ||
|  |     if (c >= OP_CRSTAR && c <= OP_CRMINRANGE) | ||
|  |       { | ||
|  |       /* The return from get_chr_property_list() will never be NULL when
 | ||
|  |       *code (aka c) is one of the three class opcodes. However, gcc with | ||
|  |       -fanalyzer notes that a NULL return is possible, and grumbles. Hence we | ||
|  |       put in a check. */ | ||
|  | 
 | ||
|  |       end = get_chr_property_list(code, utf, ucp, cb->fcc, list); | ||
|  |       list[1] = (c & 1) == 0; | ||
|  | 
 | ||
|  |       if (end != NULL && | ||
|  |           compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit)) | ||
|  |         { | ||
|  |         switch (c) | ||
|  |           { | ||
|  |           case OP_CRSTAR: | ||
|  |           case OP_CRMINSTAR: | ||
|  |           *repeat_opcode = OP_CRPOSSTAR; | ||
|  |           break; | ||
|  | 
 | ||
|  |           case OP_CRPLUS: | ||
|  |           case OP_CRMINPLUS: | ||
|  |           *repeat_opcode = OP_CRPOSPLUS; | ||
|  |           break; | ||
|  | 
 | ||
|  |           case OP_CRQUERY: | ||
|  |           case OP_CRMINQUERY: | ||
|  |           *repeat_opcode = OP_CRPOSQUERY; | ||
|  |           break; | ||
|  | 
 | ||
|  |           case OP_CRRANGE: | ||
|  |           case OP_CRMINRANGE: | ||
|  |           *repeat_opcode = OP_CRPOSRANGE; | ||
|  |           break; | ||
|  |           } | ||
|  |         } | ||
|  |       } | ||
|  |     c = *code; | ||
|  |     } | ||
|  | 
 | ||
|  |   switch(c) | ||
|  |     { | ||
|  |     case OP_END: | ||
|  |     return 0; | ||
|  | 
 | ||
|  |     case OP_TYPESTAR: | ||
|  |     case OP_TYPEMINSTAR: | ||
|  |     case OP_TYPEPLUS: | ||
|  |     case OP_TYPEMINPLUS: | ||
|  |     case OP_TYPEQUERY: | ||
|  |     case OP_TYPEMINQUERY: | ||
|  |     case OP_TYPEPOSSTAR: | ||
|  |     case OP_TYPEPOSPLUS: | ||
|  |     case OP_TYPEPOSQUERY: | ||
|  |     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; | ||
|  |     break; | ||
|  | 
 | ||
|  |     case OP_TYPEUPTO: | ||
|  |     case OP_TYPEMINUPTO: | ||
|  |     case OP_TYPEEXACT: | ||
|  |     case OP_TYPEPOSUPTO: | ||
|  |     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) | ||
|  |       code += 2; | ||
|  |     break; | ||
|  | 
 | ||
|  |     case OP_CALLOUT_STR: | ||
|  |     code += GET(code, 1 + 2*LINK_SIZE); | ||
|  |     break; | ||
|  | 
 | ||
|  | #ifdef SUPPORT_WIDE_CHARS
 | ||
|  |     case OP_XCLASS: | ||
|  |     code += GET(code, 1); | ||
|  |     break; | ||
|  | #endif
 | ||
|  | 
 | ||
|  |     case OP_MARK: | ||
|  |     case OP_COMMIT_ARG: | ||
|  |     case OP_PRUNE_ARG: | ||
|  |     case OP_SKIP_ARG: | ||
|  |     case OP_THEN_ARG: | ||
|  |     code += code[1]; | ||
|  |     break; | ||
|  |     } | ||
|  | 
 | ||
|  |   /* Add in the fixed length from the table */ | ||
|  | 
 | ||
|  |   code += PRIV(OP_lengths)[c]; | ||
|  | 
 | ||
|  |   /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be
 | ||
|  |   followed by a multi-byte character. The length in the table is a minimum, so | ||
|  |   we have to arrange to skip the extra code units. */ | ||
|  | 
 | ||
|  | #ifdef MAYBE_UTF_MULTI
 | ||
|  |   if (utf) switch(c) | ||
|  |     { | ||
|  |     case OP_CHAR: | ||
|  |     case OP_CHARI: | ||
|  |     case OP_NOT: | ||
|  |     case OP_NOTI: | ||
|  |     case OP_STAR: | ||
|  |     case OP_MINSTAR: | ||
|  |     case OP_PLUS: | ||
|  |     case OP_MINPLUS: | ||
|  |     case OP_QUERY: | ||
|  |     case OP_MINQUERY: | ||
|  |     case OP_UPTO: | ||
|  |     case OP_MINUPTO: | ||
|  |     case OP_EXACT: | ||
|  |     case OP_POSSTAR: | ||
|  |     case OP_POSPLUS: | ||
|  |     case OP_POSQUERY: | ||
|  |     case OP_POSUPTO: | ||
|  |     case OP_STARI: | ||
|  |     case OP_MINSTARI: | ||
|  |     case OP_PLUSI: | ||
|  |     case OP_MINPLUSI: | ||
|  |     case OP_QUERYI: | ||
|  |     case OP_MINQUERYI: | ||
|  |     case OP_UPTOI: | ||
|  |     case OP_MINUPTOI: | ||
|  |     case OP_EXACTI: | ||
|  |     case OP_POSSTARI: | ||
|  |     case OP_POSPLUSI: | ||
|  |     case OP_POSQUERYI: | ||
|  |     case OP_POSUPTOI: | ||
|  |     case OP_NOTSTAR: | ||
|  |     case OP_NOTMINSTAR: | ||
|  |     case OP_NOTPLUS: | ||
|  |     case OP_NOTMINPLUS: | ||
|  |     case OP_NOTQUERY: | ||
|  |     case OP_NOTMINQUERY: | ||
|  |     case OP_NOTUPTO: | ||
|  |     case OP_NOTMINUPTO: | ||
|  |     case OP_NOTEXACT: | ||
|  |     case OP_NOTPOSSTAR: | ||
|  |     case OP_NOTPOSPLUS: | ||
|  |     case OP_NOTPOSQUERY: | ||
|  |     case OP_NOTPOSUPTO: | ||
|  |     case OP_NOTSTARI: | ||
|  |     case OP_NOTMINSTARI: | ||
|  |     case OP_NOTPLUSI: | ||
|  |     case OP_NOTMINPLUSI: | ||
|  |     case OP_NOTQUERYI: | ||
|  |     case OP_NOTMINQUERYI: | ||
|  |     case OP_NOTUPTOI: | ||
|  |     case OP_NOTMINUPTOI: | ||
|  |     case OP_NOTEXACTI: | ||
|  |     case OP_NOTPOSSTARI: | ||
|  |     case OP_NOTPOSPLUSI: | ||
|  |     case OP_NOTPOSQUERYI: | ||
|  |     case OP_NOTPOSUPTOI: | ||
|  |     if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); | ||
|  |     break; | ||
|  |     } | ||
|  | #else
 | ||
|  |   (void)(utf);  /* Keep compiler happy by referencing function argument */ | ||
|  | #endif  /* SUPPORT_WIDE_CHARS */
 | ||
|  |   } | ||
|  | } | ||
|  | 
 | ||
|  | /* End of pcre2_auto_possess.c */ |