forked from LeenkxTeam/LNXSDK
		
	
		
			
	
	
		
			290 lines
		
	
	
		
			9.0 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
		
		
			
		
	
	
			290 lines
		
	
	
		
			9.0 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
|  | /*************************************************
 | ||
|  | *      Perl-Compatible Regular Expressions       * | ||
|  | *************************************************/ | ||
|  | 
 | ||
|  | /* PCRE is a library of functions to support regular expressions whose syntax
 | ||
|  | and semantics are as close as possible to those of the Perl 5 language. | ||
|  | 
 | ||
|  |                        Written by Philip Hazel | ||
|  |      Original API code Copyright (c) 1997-2012 University of Cambridge | ||
|  |           New API code Copyright (c) 2016-2022 University of Cambridge | ||
|  | 
 | ||
|  | ----------------------------------------------------------------------------- | ||
|  | Redistribution and use in source and binary forms, with or without | ||
|  | modification, are permitted provided that the following conditions are met: | ||
|  | 
 | ||
|  |     * Redistributions of source code must retain the above copyright notice, | ||
|  |       this list of conditions and the following disclaimer. | ||
|  | 
 | ||
|  |     * Redistributions in binary form must reproduce the above copyright | ||
|  |       notice, this list of conditions and the following disclaimer in the | ||
|  |       documentation and/or other materials provided with the distribution. | ||
|  | 
 | ||
|  |     * Neither the name of the University of Cambridge nor the names of its | ||
|  |       contributors may be used to endorse or promote products derived from | ||
|  |       this software without specific prior written permission. | ||
|  | 
 | ||
|  | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
|  | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
|  | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
|  | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | ||
|  | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
|  | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | ||
|  | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
|  | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | ||
|  | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
|  | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | ||
|  | POSSIBILITY OF SUCH DAMAGE. | ||
|  | ----------------------------------------------------------------------------- | ||
|  | */ | ||
|  | 
 | ||
|  | /* This module contains an internal function that is used to match an extended
 | ||
|  | class. It is used by pcre2_auto_possessify() and by both pcre2_match() and | ||
|  | pcre2_dfa_match(). */ | ||
|  | 
 | ||
|  | 
 | ||
|  | #ifdef HAVE_CONFIG_H
 | ||
|  | #include "config.h"
 | ||
|  | #endif
 | ||
|  | 
 | ||
|  | 
 | ||
|  | #include "pcre2_internal.h"
 | ||
|  | 
 | ||
|  | /*************************************************
 | ||
|  | *       Match character against an XCLASS        * | ||
|  | *************************************************/ | ||
|  | 
 | ||
|  | /* This function is called to match a character against an extended class that
 | ||
|  | might contain codepoints above 255 and/or Unicode properties. | ||
|  | 
 | ||
|  | Arguments: | ||
|  |   c           the character | ||
|  |   data        points to the flag code unit of the XCLASS data | ||
|  |   utf         TRUE if in UTF mode | ||
|  | 
 | ||
|  | Returns:      TRUE if character matches, else FALSE | ||
|  | */ | ||
|  | 
 | ||
|  | BOOL | ||
|  | PRIV(xclass)(uint32_t c, PCRE2_SPTR data, BOOL utf) | ||
|  | { | ||
|  | PCRE2_UCHAR t; | ||
|  | BOOL negated = (*data & XCL_NOT) != 0; | ||
|  | 
 | ||
|  | #if PCRE2_CODE_UNIT_WIDTH == 8
 | ||
|  | /* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */ | ||
|  | utf = TRUE; | ||
|  | #endif
 | ||
|  | 
 | ||
|  | /* Code points < 256 are matched against a bitmap, if one is present. If not,
 | ||
|  | we still carry on, because there may be ranges that start below 256 in the | ||
|  | additional data. */ | ||
|  | 
 | ||
|  | if (c < 256) | ||
|  |   { | ||
|  |   if ((*data & XCL_HASPROP) == 0) | ||
|  |     { | ||
|  |     if ((*data & XCL_MAP) == 0) return negated; | ||
|  |     return (((uint8_t *)(data + 1))[c/8] & (1u << (c&7))) != 0; | ||
|  |     } | ||
|  |   if ((*data & XCL_MAP) != 0 && | ||
|  |     (((uint8_t *)(data + 1))[c/8] & (1u << (c&7))) != 0) | ||
|  |     return !negated; /* char found */ | ||
|  |   } | ||
|  | 
 | ||
|  | /* First skip the bit map if present. Then match against the list of Unicode
 | ||
|  | properties or large chars or ranges that end with a large char. We won't ever | ||
|  | encounter XCL_PROP or XCL_NOTPROP when UTF support is not compiled. */ | ||
|  | 
 | ||
|  | if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(PCRE2_UCHAR); | ||
|  | 
 | ||
|  | while ((t = *data++) != XCL_END) | ||
|  |   { | ||
|  |   uint32_t x, y; | ||
|  |   if (t == XCL_SINGLE) | ||
|  |     { | ||
|  | #ifdef SUPPORT_UNICODE
 | ||
|  |     if (utf) | ||
|  |       { | ||
|  |       GETCHARINC(x, data); /* macro generates multiple statements */ | ||
|  |       } | ||
|  |     else | ||
|  | #endif
 | ||
|  |     x = *data++; | ||
|  |     if (c == x) return !negated; | ||
|  |     } | ||
|  |   else if (t == XCL_RANGE) | ||
|  |     { | ||
|  | #ifdef SUPPORT_UNICODE
 | ||
|  |     if (utf) | ||
|  |       { | ||
|  |       GETCHARINC(x, data); /* macro generates multiple statements */ | ||
|  |       GETCHARINC(y, data); /* macro generates multiple statements */ | ||
|  |       } | ||
|  |     else | ||
|  | #endif
 | ||
|  |       { | ||
|  |       x = *data++; | ||
|  |       y = *data++; | ||
|  |       } | ||
|  |     if (c >= x && c <= y) return !negated; | ||
|  |     } | ||
|  | 
 | ||
|  | #ifdef SUPPORT_UNICODE
 | ||
|  |   else  /* XCL_PROP & XCL_NOTPROP */ | ||
|  |     { | ||
|  |     const ucd_record *prop = GET_UCD(c); | ||
|  |     BOOL isprop = t == XCL_PROP; | ||
|  |     BOOL ok; | ||
|  | 
 | ||
|  |     switch(*data) | ||
|  |       { | ||
|  |       case PT_ANY: | ||
|  |       if (isprop) return !negated; | ||
|  |       break; | ||
|  | 
 | ||
|  |       case PT_LAMP: | ||
|  |       if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || | ||
|  |            prop->chartype == ucp_Lt) == isprop) return !negated; | ||
|  |       break; | ||
|  | 
 | ||
|  |       case PT_GC: | ||
|  |       if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop) | ||
|  |         return !negated; | ||
|  |       break; | ||
|  | 
 | ||
|  |       case PT_PC: | ||
|  |       if ((data[1] == prop->chartype) == isprop) return !negated; | ||
|  |       break; | ||
|  | 
 | ||
|  |       case PT_SC: | ||
|  |       if ((data[1] == prop->script) == isprop) return !negated; | ||
|  |       break; | ||
|  | 
 | ||
|  |       case PT_SCX: | ||
|  |       ok = (data[1] == prop->script || | ||
|  |             MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), data[1]) != 0); | ||
|  |       if (ok == isprop) return !negated; | ||
|  |       break; | ||
|  | 
 | ||
|  |       case PT_ALNUM: | ||
|  |       if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || | ||
|  |            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop) | ||
|  |         return !negated; | ||
|  |       break; | ||
|  | 
 | ||
|  |       /* Perl space used to exclude VT, but from Perl 5.18 it is included,
 | ||
|  |       which means that Perl space and POSIX space are now identical. PCRE | ||
|  |       was changed at release 8.34. */ | ||
|  | 
 | ||
|  |       case PT_SPACE:    /* Perl space */ | ||
|  |       case PT_PXSPACE:  /* POSIX space */ | ||
|  |       switch(c) | ||
|  |         { | ||
|  |         HSPACE_CASES: | ||
|  |         VSPACE_CASES: | ||
|  |         if (isprop) return !negated; | ||
|  |         break; | ||
|  | 
 | ||
|  |         default: | ||
|  |         if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop) | ||
|  |           return !negated; | ||
|  |         break; | ||
|  |         } | ||
|  |       break; | ||
|  | 
 | ||
|  |       case PT_WORD: | ||
|  |       if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || | ||
|  |            PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE) | ||
|  |              == isprop) | ||
|  |         return !negated; | ||
|  |       break; | ||
|  | 
 | ||
|  |       case PT_UCNC: | ||
|  |       if (c < 0xa0) | ||
|  |         { | ||
|  |         if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || | ||
|  |              c == CHAR_GRAVE_ACCENT) == isprop) | ||
|  |           return !negated; | ||
|  |         } | ||
|  |       else | ||
|  |         { | ||
|  |         if ((c < 0xd800 || c > 0xdfff) == isprop) | ||
|  |           return !negated; | ||
|  |         } | ||
|  |       break; | ||
|  | 
 | ||
|  |       case PT_BIDICL: | ||
|  |       if ((UCD_BIDICLASS_PROP(prop) == data[1]) == isprop) | ||
|  |         return !negated; | ||
|  |       break; | ||
|  | 
 | ||
|  |       case PT_BOOL: | ||
|  |       ok = MAPBIT(PRIV(ucd_boolprop_sets) + | ||
|  |         UCD_BPROPS_PROP(prop), data[1]) != 0; | ||
|  |       if (ok == isprop) return !negated; | ||
|  |       break; | ||
|  | 
 | ||
|  |       /* The following three properties can occur only in an XCLASS, as there
 | ||
|  |       is no \p or \P coding for them. */ | ||
|  | 
 | ||
|  |       /* Graphic character. Implement this as not Z (space or separator) and
 | ||
|  |       not C (other), except for Cf (format) with a few exceptions. This seems | ||
|  |       to be what Perl does. The exceptional characters are: | ||
|  | 
 | ||
|  |       U+061C           Arabic Letter Mark | ||
|  |       U+180E           Mongolian Vowel Separator | ||
|  |       U+2066 - U+2069  Various "isolate"s | ||
|  |       */ | ||
|  | 
 | ||
|  |       case PT_PXGRAPH: | ||
|  |       if ((PRIV(ucp_gentype)[prop->chartype] != ucp_Z && | ||
|  |             (PRIV(ucp_gentype)[prop->chartype] != ucp_C || | ||
|  |               (prop->chartype == ucp_Cf && | ||
|  |                 c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069)) | ||
|  |          )) == isprop) | ||
|  |         return !negated; | ||
|  |       break; | ||
|  | 
 | ||
|  |       /* Printable character: same as graphic, with the addition of Zs, i.e.
 | ||
|  |       not Zl and not Zp, and U+180E. */ | ||
|  | 
 | ||
|  |       case PT_PXPRINT: | ||
|  |       if ((prop->chartype != ucp_Zl && | ||
|  |            prop->chartype != ucp_Zp && | ||
|  |             (PRIV(ucp_gentype)[prop->chartype] != ucp_C || | ||
|  |               (prop->chartype == ucp_Cf && | ||
|  |                 c != 0x061c && (c < 0x2066 || c > 0x2069)) | ||
|  |          )) == isprop) | ||
|  |         return !negated; | ||
|  |       break; | ||
|  | 
 | ||
|  |       /* Punctuation: all Unicode punctuation, plus ASCII characters that
 | ||
|  |       Unicode treats as symbols rather than punctuation, for Perl | ||
|  |       compatibility (these are $+<=>^`|~). */ | ||
|  | 
 | ||
|  |       case PT_PXPUNCT: | ||
|  |       if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P || | ||
|  |             (c < 128 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop) | ||
|  |         return !negated; | ||
|  |       break; | ||
|  | 
 | ||
|  |       /* This should never occur, but compilers may mutter if there is no
 | ||
|  |       default. */ | ||
|  | 
 | ||
|  |       default: | ||
|  |       return FALSE; | ||
|  |       } | ||
|  | 
 | ||
|  |     data += 2; | ||
|  |     } | ||
|  | #else
 | ||
|  |   (void)utf;  /* Avoid compiler warning */ | ||
|  | #endif  /* SUPPORT_UNICODE */
 | ||
|  |   } | ||
|  | 
 | ||
|  | return negated;   /* char did not match */ | ||
|  | } | ||
|  | 
 | ||
|  | /* End of pcre2_xclass.c */ |