345 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
		
		
			
		
	
	
			345 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
|  | /*************************************************
 | ||
|  | *      Perl-Compatible Regular Expressions       * | ||
|  | *************************************************/ | ||
|  | 
 | ||
|  | /* PCRE is a library of functions to support regular expressions whose syntax
 | ||
|  | and semantics are as close as possible to those of the Perl 5 language. | ||
|  | 
 | ||
|  |                        Written by Philip Hazel | ||
|  |      Original API code Copyright (c) 1997-2012 University of Cambridge | ||
|  |           New API code Copyright (c) 2016-2021 University of Cambridge | ||
|  | 
 | ||
|  | ----------------------------------------------------------------------------- | ||
|  | Redistribution and use in source and binary forms, with or without | ||
|  | modification, are permitted provided that the following conditions are met: | ||
|  | 
 | ||
|  |     * Redistributions of source code must retain the above copyright notice, | ||
|  |       this list of conditions and the following disclaimer. | ||
|  | 
 | ||
|  |     * Redistributions in binary form must reproduce the above copyright | ||
|  |       notice, this list of conditions and the following disclaimer in the | ||
|  |       documentation and/or other materials provided with the distribution. | ||
|  | 
 | ||
|  |     * Neither the name of the University of Cambridge nor the names of its | ||
|  |       contributors may be used to endorse or promote products derived from | ||
|  |       this software without specific prior written permission. | ||
|  | 
 | ||
|  | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
|  | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
|  | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
|  | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | ||
|  | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
|  | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | ||
|  | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
|  | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | ||
|  | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
|  | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | ||
|  | POSSIBILITY OF SUCH DAMAGE. | ||
|  | ----------------------------------------------------------------------------- | ||
|  | */ | ||
|  | 
 | ||
|  | /* This module contains the function for checking a script run. */ | ||
|  | 
 | ||
|  | #ifdef HAVE_CONFIG_H
 | ||
|  | #include "config.h"
 | ||
|  | #endif
 | ||
|  | 
 | ||
|  | #include "pcre2_internal.h"
 | ||
|  | 
 | ||
|  | 
 | ||
|  | /*************************************************
 | ||
|  | *                Check script run                * | ||
|  | *************************************************/ | ||
|  | 
 | ||
|  | /* A script run is conceptually a sequence of characters all in the same
 | ||
|  | Unicode script. However, it isn't quite that simple. There are special rules | ||
|  | for scripts that are commonly used together, and also special rules for digits. | ||
|  | This function implements the appropriate checks, which is possible only when | ||
|  | PCRE2 is compiled with Unicode support. The function returns TRUE if there is | ||
|  | no Unicode support; however, it should never be called in that circumstance | ||
|  | because an error is given by pcre2_compile() if a script run is called for in a | ||
|  | version of PCRE2 compiled without Unicode support. | ||
|  | 
 | ||
|  | Arguments: | ||
|  |   ptr       points to the first character | ||
|  |   endptr    points just after the last character | ||
|  |   utf       TRUE if in UTF mode | ||
|  | 
 | ||
|  | Returns:    TRUE if this is a valid script run | ||
|  | */ | ||
|  | 
 | ||
|  | /* These are states in the checking process. */ | ||
|  | 
 | ||
enum {
  SCRIPT_UNSET,          /* No requirement has been established yet */
  SCRIPT_MAP,            /* A bitmap holds the set of acceptable scripts */
  SCRIPT_HANPENDING,     /* Only Han characters have been seen so far */
  SCRIPT_HANHIRAKATA,    /* Expect Han, Hiragana, or Katakana */
  SCRIPT_HANBOPOMOFO,    /* Expect Han or Bopomofo */
  SCRIPT_HANHANGUL       /* Expect Han or Hangul */
};
|  | 
 | ||
/* Sizes, in 32-bit words, of the two kinds of script bitmap used below.
UCD_MAPSIZE is the size of the maps copied from ucd_script_sets, which only
have bits for scripts numbered below ucp_Unknown. FULL_MAPSIZE is the size of
a map that has a bit for every script. */

#define UCD_MAPSIZE   (ucp_Unknown / 32 + 1)
#define FULL_MAPSIZE  (ucp_Script_Count / 32 + 1)
 | ||
|  | 
 | ||
|  | BOOL | ||
|  | PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf) | ||
|  | { | ||
|  | #ifdef SUPPORT_UNICODE
 | ||
|  | uint32_t require_state = SCRIPT_UNSET; | ||
|  | uint32_t require_map[FULL_MAPSIZE]; | ||
|  | uint32_t map[FULL_MAPSIZE]; | ||
|  | uint32_t require_digitset = 0; | ||
|  | uint32_t c; | ||
|  | 
 | ||
|  | #if PCRE2_CODE_UNIT_WIDTH == 32
 | ||
|  | (void)utf;    /* Avoid compiler warning */ | ||
|  | #endif
 | ||
|  | 
 | ||
|  | /* Any string containing fewer than 2 characters is a valid script run. */ | ||
|  | 
 | ||
|  | if (ptr >= endptr) return TRUE; | ||
|  | GETCHARINCTEST(c, ptr); | ||
|  | if (ptr >= endptr) return TRUE; | ||
|  | 
 | ||
|  | /* Initialize the require map. This is a full-size bitmap that has a bit for
 | ||
|  | every script, as opposed to the maps in ucd_script_sets, which only have bits | ||
|  | for scripts less than ucp_Unknown - those that appear in script extension | ||
|  | lists. */ | ||
|  | 
 | ||
|  | for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0; | ||
|  | 
 | ||
|  | /* Scan strings of two or more characters, checking the Unicode characteristics
 | ||
|  | of each code point. There is special code for scripts that can be combined with | ||
|  | characters from the Han Chinese script. This may be used in conjunction with | ||
|  | four other scripts in these combinations: | ||
|  | 
 | ||
|  | . Han with Hiragana and Katakana is allowed (for Japanese). | ||
|  | . Han with Bopomofo is allowed (for Taiwanese Mandarin). | ||
|  | . Han with Hangul is allowed (for Korean). | ||
|  | 
 | ||
|  | If the first significant character's script is one of the four, the required | ||
|  | script type is immediately known. However, if the first significant | ||
|  | character's script is Han, we have to keep checking for a non-Han character. | ||
|  | Hence the SCRIPT_HANPENDING state. */ | ||
|  | 
 | ||
|  | for (;;) | ||
|  |   { | ||
|  |   const ucd_record *ucd = GET_UCD(c); | ||
|  |   uint32_t script = ucd->script; | ||
|  | 
 | ||
|  |   /* If the script is Unknown, the string is not a valid script run. Such
 | ||
|  |   characters can only form script runs of length one (see test above). */ | ||
|  | 
 | ||
|  |   if (script == ucp_Unknown) return FALSE; | ||
|  | 
 | ||
|  |   /* A character without any script extensions whose script is Inherited or
 | ||
|  |   Common is always accepted with any script. If there are extensions, the | ||
|  |   following processing happens for all scripts. */ | ||
|  | 
 | ||
|  |   if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common)) | ||
|  |     { | ||
|  |     BOOL OK; | ||
|  | 
 | ||
|  |     /* Set up a full-sized map for this character that can include bits for all
 | ||
|  |     scripts. Copy the scriptx map for this character (which covers those | ||
|  |     scripts that appear in script extension lists), set the remaining values to | ||
|  |     zero, and then, except for Common or Inherited, add this script's bit to | ||
|  |     the map. */ | ||
|  | 
 | ||
|  |     memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t)); | ||
|  |     memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t)); | ||
|  |     if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script); | ||
|  | 
 | ||
|  |     /* Handle the different checking states */ | ||
|  | 
 | ||
|  |     switch(require_state) | ||
|  |       { | ||
|  |       /* First significant character - it might follow Common or Inherited
 | ||
|  |       characters that do not have any script extensions. */ | ||
|  | 
 | ||
|  |       case SCRIPT_UNSET: | ||
|  |       switch(script) | ||
|  |         { | ||
|  |         case ucp_Han: | ||
|  |         require_state = SCRIPT_HANPENDING; | ||
|  |         break; | ||
|  | 
 | ||
|  |         case ucp_Hiragana: | ||
|  |         case ucp_Katakana: | ||
|  |         require_state = SCRIPT_HANHIRAKATA; | ||
|  |         break; | ||
|  | 
 | ||
|  |         case ucp_Bopomofo: | ||
|  |         require_state = SCRIPT_HANBOPOMOFO; | ||
|  |         break; | ||
|  | 
 | ||
|  |         case ucp_Hangul: | ||
|  |         require_state = SCRIPT_HANHANGUL; | ||
|  |         break; | ||
|  | 
 | ||
|  |         default: | ||
|  |         memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t)); | ||
|  |         require_state = SCRIPT_MAP; | ||
|  |         break; | ||
|  |         } | ||
|  |       break; | ||
|  | 
 | ||
|  |       /* The first significant character was Han. An inspection of the Unicode
 | ||
|  |       11.0.0 files shows that there are the following types of Script Extension | ||
|  |       list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul | ||
|  |       scripts: | ||
|  | 
 | ||
|  |       . Bopomofo + Han | ||
|  |       . Han + Hiragana + Katakana | ||
|  |       . Hiragana + Katakana | ||
|  |       . Bopopmofo + Hangul + Han + Hiragana + Katakana | ||
|  | 
 | ||
|  |       The following code tries to make sense of this. */ | ||
|  | 
 | ||
|  | #define FOUND_BOPOMOFO 1
 | ||
|  | #define FOUND_HIRAGANA 2
 | ||
|  | #define FOUND_KATAKANA 4
 | ||
|  | #define FOUND_HANGUL   8
 | ||
|  | 
 | ||
|  |       case SCRIPT_HANPENDING: | ||
|  |       if (script != ucp_Han)   /* Another Han does nothing */ | ||
|  |         { | ||
|  |         uint32_t chspecial = 0; | ||
|  | 
 | ||
|  |         if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO; | ||
|  |         if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA; | ||
|  |         if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA; | ||
|  |         if (MAPBIT(map, ucp_Hangul) != 0)   chspecial |= FOUND_HANGUL; | ||
|  | 
 | ||
|  |         if (chspecial == 0) return FALSE;   /* Not allowed with Han */ | ||
|  | 
 | ||
|  |         if (chspecial == FOUND_BOPOMOFO) | ||
|  |           require_state = SCRIPT_HANBOPOMOFO; | ||
|  |         else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA)) | ||
|  |           require_state = SCRIPT_HANHIRAKATA; | ||
|  | 
 | ||
|  |         /* Otherwise this character must be allowed with all of them, so remain
 | ||
|  |         in the pending state. */ | ||
|  |         } | ||
|  |       break; | ||
|  | 
 | ||
|  |       /* Previously encountered one of the "with Han" scripts. Check that
 | ||
|  |       this character is appropriate. */ | ||
|  | 
 | ||
|  |       case SCRIPT_HANHIRAKATA: | ||
|  |       if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) + | ||
|  |           MAPBIT(map, ucp_Katakana) == 0) return FALSE; | ||
|  |       break; | ||
|  | 
 | ||
|  |       case SCRIPT_HANBOPOMOFO: | ||
|  |       if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE; | ||
|  |       break; | ||
|  | 
 | ||
|  |       case SCRIPT_HANHANGUL: | ||
|  |       if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE; | ||
|  |       break; | ||
|  | 
 | ||
|  |       /* Previously encountered one or more characters that are allowed with a
 | ||
|  |       list of scripts. */ | ||
|  | 
 | ||
|  |       case SCRIPT_MAP: | ||
|  |       OK = FALSE; | ||
|  | 
 | ||
|  |       for (int i = 0; i < FULL_MAPSIZE; i++) | ||
|  |         { | ||
|  |         if ((require_map[i] & map[i]) != 0) | ||
|  |           { | ||
|  |           OK = TRUE; | ||
|  |           break; | ||
|  |           } | ||
|  |         } | ||
|  | 
 | ||
|  |       if (!OK) return FALSE; | ||
|  | 
 | ||
|  |       /* The rest of the string must be in this script, but we have to
 | ||
|  |       allow for the Han complications. */ | ||
|  | 
 | ||
|  |       switch(script) | ||
|  |         { | ||
|  |         case ucp_Han: | ||
|  |         require_state = SCRIPT_HANPENDING; | ||
|  |         break; | ||
|  | 
 | ||
|  |         case ucp_Hiragana: | ||
|  |         case ucp_Katakana: | ||
|  |         require_state = SCRIPT_HANHIRAKATA; | ||
|  |         break; | ||
|  | 
 | ||
|  |         case ucp_Bopomofo: | ||
|  |         require_state = SCRIPT_HANBOPOMOFO; | ||
|  |         break; | ||
|  | 
 | ||
|  |         case ucp_Hangul: | ||
|  |         require_state = SCRIPT_HANHANGUL; | ||
|  |         break; | ||
|  | 
 | ||
|  |         /* Compute the intersection of the required list of scripts and the
 | ||
|  |         allowed scripts for this character. */ | ||
|  | 
 | ||
|  |         default: | ||
|  |         for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i]; | ||
|  |         break; | ||
|  |         } | ||
|  | 
 | ||
|  |       break; | ||
|  |       } | ||
|  |     }   /* End checking character's script and extensions. */ | ||
|  | 
 | ||
|  |   /* The character is in an acceptable script. We must now ensure that all
 | ||
|  |   decimal digits in the string come from the same set. Some scripts (e.g. | ||
|  |   Common, Arabic) have more than one set of decimal digits. This code does | ||
|  |   not allow mixing sets, even within the same script. The vector called | ||
|  |   PRIV(ucd_digit_sets)[] contains, in its first element, the number of | ||
|  |   following elements, and then, in ascending order, the code points of the | ||
|  |   '9' characters in every set of 10 digits. Each set is identified by the | ||
|  |   offset in the vector of its '9' character. An initial check of the first | ||
|  |   value picks up ASCII digits quickly. Otherwise, a binary chop is used. */ | ||
|  | 
 | ||
|  |   if (ucd->chartype == ucp_Nd) | ||
|  |     { | ||
|  |     uint32_t digitset; | ||
|  | 
 | ||
|  |     if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else | ||
|  |       { | ||
|  |       int mid; | ||
|  |       int bot = 1; | ||
|  |       int top = PRIV(ucd_digit_sets)[0]; | ||
|  |       for (;;) | ||
|  |         { | ||
|  |         if (top <= bot + 1)    /* <= rather than == is paranoia */ | ||
|  |           { | ||
|  |           digitset = top; | ||
|  |           break; | ||
|  |           } | ||
|  |         mid = (top + bot) / 2; | ||
|  |         if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid; | ||
|  |         } | ||
|  |       } | ||
|  | 
 | ||
|  |     /* A required value of 0 means "unset". */ | ||
|  | 
 | ||
|  |     if (require_digitset == 0) require_digitset = digitset; | ||
|  |       else if (digitset != require_digitset) return FALSE; | ||
|  |     }   /* End digit handling */ | ||
|  | 
 | ||
|  |   /* If we haven't yet got to the end, pick up the next character. */ | ||
|  | 
 | ||
|  |   if (ptr >= endptr) return TRUE; | ||
|  |   GETCHARINCTEST(c, ptr); | ||
|  |   }  /* End checking loop */ | ||
|  | 
 | ||
|  | #else   /* NOT SUPPORT_UNICODE */
 | ||
|  | (void)ptr; | ||
|  | (void)endptr; | ||
|  | (void)utf; | ||
|  | return TRUE; | ||
|  | #endif  /* SUPPORT_UNICODE */
 | ||
|  | } | ||
|  | 
 | ||
|  | /* End of pcre2_script_run.c */ |