forked from LeenkxTeam/LNXSDK
		
	
		
			
				
	
	
		
			345 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			345 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*************************************************
 | |
| *      Perl-Compatible Regular Expressions       *
 | |
| *************************************************/
 | |
| 
 | |
| /* PCRE is a library of functions to support regular expressions whose syntax
 | |
| and semantics are as close as possible to those of the Perl 5 language.
 | |
| 
 | |
|                        Written by Philip Hazel
 | |
|      Original API code Copyright (c) 1997-2012 University of Cambridge
 | |
|           New API code Copyright (c) 2016-2021 University of Cambridge
 | |
| 
 | |
| -----------------------------------------------------------------------------
 | |
| Redistribution and use in source and binary forms, with or without
 | |
| modification, are permitted provided that the following conditions are met:
 | |
| 
 | |
|     * Redistributions of source code must retain the above copyright notice,
 | |
|       this list of conditions and the following disclaimer.
 | |
| 
 | |
|     * Redistributions in binary form must reproduce the above copyright
 | |
|       notice, this list of conditions and the following disclaimer in the
 | |
|       documentation and/or other materials provided with the distribution.
 | |
| 
 | |
|     * Neither the name of the University of Cambridge nor the names of its
 | |
|       contributors may be used to endorse or promote products derived from
 | |
|       this software without specific prior written permission.
 | |
| 
 | |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | |
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | |
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | |
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 | |
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 | |
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 | |
| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 | |
| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 | |
| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 | |
| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 | |
| POSSIBILITY OF SUCH DAMAGE.
 | |
| -----------------------------------------------------------------------------
 | |
| */
 | |
| 
 | |
| /* This module contains the function for checking a script run. */
 | |
| 
 | |
| #ifdef HAVE_CONFIG_H
 | |
| #include "config.h"
 | |
| #endif
 | |
| 
 | |
| #include "pcre2_internal.h"
 | |
| 
 | |
| 
 | |
| /*************************************************
 | |
| *                Check script run                *
 | |
| *************************************************/
 | |
| 
 | |
| /* A script run is conceptually a sequence of characters all in the same
 | |
| Unicode script. However, it isn't quite that simple. There are special rules
 | |
| for scripts that are commonly used together, and also special rules for digits.
 | |
| This function implements the appropriate checks, which is possible only when
 | |
| PCRE2 is compiled with Unicode support. The function returns TRUE if there is
 | |
| no Unicode support; however, it should never be called in that circumstance
 | |
| because an error is given by pcre2_compile() if a script run is called for in a
 | |
| version of PCRE2 compiled without Unicode support.
 | |
| 
 | |
| Arguments:
 | |
|   ptr       point to the first character
 | |
|   endptr    point after the last character
 | |
|   utf       TRUE if in UTF mode
 | |
| 
 | |
| Returns:    TRUE if this is a valid script run
 | |
| */
 | |
| 
 | |
| /* These are states in the checking process. They are the values held in the
 | |
| require_state variable of PRIV(script_run)(), which starts as SCRIPT_UNSET. */
 | |
| 
 | |
| enum { SCRIPT_UNSET,          /* Requirement as yet unknown */
 | |
|        SCRIPT_MAP,            /* Bitmap contains acceptable scripts */
 | |
|        SCRIPT_HANPENDING,     /* Have had only Han characters */
 | |
|        SCRIPT_HANHIRAKATA,    /* Expect Han, Hiragana, or Katakana */
 | |
|        SCRIPT_HANBOPOMOFO,    /* Expect Han or Bopomofo */
 | |
|        SCRIPT_HANHANGUL       /* Expect Han or Hangul */
 | |
|        };
 | |
| 
 | |
| #define UCD_MAPSIZE (ucp_Unknown/32 + 1)         /* uint32_t words copied from a ucd_script_sets map (scripts below ucp_Unknown) */
 | |
| #define FULL_MAPSIZE (ucp_Script_Count/32 + 1)   /* uint32_t words in a full map with a bit for every script */
 | |
| 
 | |
| BOOL   /* TRUE if the characters from ptr up to endptr form a valid script run; always TRUE without SUPPORT_UNICODE */
 | |
| PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
 | |
| {
 | |
| #ifdef SUPPORT_UNICODE
 | |
| uint32_t require_state = SCRIPT_UNSET;   /* Current checking state (a SCRIPT_xxx value) */
 | |
| uint32_t require_map[FULL_MAPSIZE];      /* SCRIPT_MAP state only: scripts common to the significant characters so far */
 | |
| uint32_t map[FULL_MAPSIZE];              /* Scripts acceptable for the current character */
 | |
| uint32_t require_digitset = 0;           /* Digit set of the first decimal digit seen; 0 means none yet */
 | |
| uint32_t c;                              /* Current character */
 | |
| 
 | |
| #if PCRE2_CODE_UNIT_WIDTH == 32
 | |
| (void)utf;    /* Avoid compiler warning */
 | |
| #endif
 | |
| 
 | |
| /* Any string containing fewer than 2 characters is a valid script run. */
 | |
| 
 | |
| if (ptr >= endptr) return TRUE;   /* Empty string */
 | |
| GETCHARINCTEST(c, ptr);           /* Pick up the first character; ptr is moved past it */
 | |
| if (ptr >= endptr) return TRUE;   /* Exactly one character */
 | |
| 
 | |
| /* Initialize the require map. This is a full-size bitmap that has a bit for
 | |
| every script, as opposed to the maps in ucd_script_sets, which only have bits
 | |
| for scripts less than ucp_Unknown - those that appear in script extension
 | |
| lists. */
 | |
| 
 | |
| for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
 | |
| 
 | |
| /* Scan strings of two or more characters, checking the Unicode characteristics
 | |
| of each code point. There is special code for scripts that can be combined with
 | |
| characters from the Han Chinese script. This may be used in conjunction with
 | |
| four other scripts in these combinations:
 | |
| 
 | |
| . Han with Hiragana and Katakana is allowed (for Japanese).
 | |
| . Han with Bopomofo is allowed (for Taiwanese Mandarin).
 | |
| . Han with Hangul is allowed (for Korean).
 | |
| 
 | |
| If the first significant character's script is one of the four, the required
 | |
| script type is immediately known. However, if the first significant
 | |
| character's script is Han, we have to keep checking for a non-Han character.
 | |
| Hence the SCRIPT_HANPENDING state. */
 | |
| 
 | |
| for (;;)
 | |
|   {
 | |
|   const ucd_record *ucd = GET_UCD(c);
 | |
|   uint32_t script = ucd->script;
 | |
| 
 | |
|   /* If the script is Unknown, the string is not a valid script run. Such
 | |
|   characters can only form script runs of length one (see test above). */
 | |
| 
 | |
|   if (script == ucp_Unknown) return FALSE;
 | |
| 
 | |
|   /* A character without any script extensions whose script is Inherited or
 | |
|   Common is always accepted with any script. If there are extensions, the
 | |
|   following processing happens for all scripts. */
 | |
| 
 | |
|   if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common))
 | |
|     {
 | |
|     BOOL OK;
 | |
| 
 | |
|     /* Set up a full-sized map for this character that can include bits for all
 | |
|     scripts. Copy the scriptx map for this character (which covers those
 | |
|     scripts that appear in script extension lists), set the remaining values to
 | |
|     zero, and then, except for Common or Inherited, add this script's bit to
 | |
|     the map. */
 | |
| 
 | |
|     memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t));
 | |
|     memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
 | |
|     if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
 | |
| 
 | |
|     /* Handle the different checking states */
 | |
| 
 | |
|     switch(require_state)
 | |
|       {
 | |
|       /* First significant character - it might follow Common or Inherited
 | |
|       characters that do not have any script extensions. For any script other
 | |
|       than the five named below, this character's map becomes the initial
 | |
|       require map. */
 | |
| 
 | |
|       case SCRIPT_UNSET:
 | |
|       switch(script)
 | |
|         {
 | |
|         case ucp_Han:
 | |
|         require_state = SCRIPT_HANPENDING;
 | |
|         break;
 | |
| 
 | |
|         case ucp_Hiragana:
 | |
|         case ucp_Katakana:
 | |
|         require_state = SCRIPT_HANHIRAKATA;
 | |
|         break;
 | |
| 
 | |
|         case ucp_Bopomofo:
 | |
|         require_state = SCRIPT_HANBOPOMOFO;
 | |
|         break;
 | |
| 
 | |
|         case ucp_Hangul:
 | |
|         require_state = SCRIPT_HANHANGUL;
 | |
|         break;
 | |
| 
 | |
|         default:
 | |
|         memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
 | |
|         require_state = SCRIPT_MAP;
 | |
|         break;
 | |
|         }
 | |
|       break;
 | |
| 
 | |
|       /* The first significant character was Han. An inspection of the Unicode
 | |
|       11.0.0 files shows that there are the following types of Script Extension
 | |
|       list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
 | |
|       scripts:
 | |
| 
 | |
|       . Bopomofo + Han
 | |
|       . Han + Hiragana + Katakana
 | |
|       . Hiragana + Katakana
 | |
|       . Bopomofo + Hangul + Han + Hiragana + Katakana
 | |
| 
 | |
|       The following code tries to make sense of this. The FOUND_xxx values are
 | |
|       bit flags that are combined in chspecial to record which of the four
 | |
|       companion scripts are present in this character's map. */
 | |
| 
 | |
| #define FOUND_BOPOMOFO 1
 | |
| #define FOUND_HIRAGANA 2
 | |
| #define FOUND_KATAKANA 4
 | |
| #define FOUND_HANGUL   8
 | |
| 
 | |
|       case SCRIPT_HANPENDING:
 | |
|       if (script != ucp_Han)   /* Another Han does nothing */
 | |
|         {
 | |
|         uint32_t chspecial = 0;
 | |
| 
 | |
|         if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
 | |
|         if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
 | |
|         if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
 | |
|         if (MAPBIT(map, ucp_Hangul) != 0)   chspecial |= FOUND_HANGUL;
 | |
| 
 | |
|         if (chspecial == 0) return FALSE;   /* Not allowed with Han */
 | |
| 
 | |
|         if (chspecial == FOUND_BOPOMOFO)
 | |
|           require_state = SCRIPT_HANBOPOMOFO;
 | |
|         else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
 | |
|           require_state = SCRIPT_HANHIRAKATA;
 | |
| 
 | |
|         /* Otherwise this character must be allowed with all of them, so remain
 | |
|         in the pending state.
 | |
| 
 | |
|         NOTE(review): every other non-zero value of chspecial also reaches this
 | |
|         point, for example FOUND_HANGUL alone or FOUND_HIRAGANA alone, and
 | |
|         leaves the state as SCRIPT_HANPENDING rather than moving to
 | |
|         SCRIPT_HANHANGUL or SCRIPT_HANHIRAKATA. Confirm that this is intended. */
 | |
|         }
 | |
|       break;
 | |
| 
 | |
|       /* Previously encountered one of the "with Han" scripts. Check that
 | |
|       this character is appropriate. In each case the character is rejected
 | |
|       when the MAPBIT values for the listed scripts add up to zero, that is,
 | |
|       when its map has none of those scripts' bits set. */
 | |
| 
 | |
|       case SCRIPT_HANHIRAKATA:
 | |
|       if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
 | |
|           MAPBIT(map, ucp_Katakana) == 0) return FALSE;
 | |
|       break;
 | |
| 
 | |
|       case SCRIPT_HANBOPOMOFO:
 | |
|       if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
 | |
|       break;
 | |
| 
 | |
|       case SCRIPT_HANHANGUL:
 | |
|       if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
 | |
|       break;
 | |
| 
 | |
|       /* Previously encountered one or more characters that are allowed with a
 | |
|       list of scripts. This character is acceptable only if its map shares at
 | |
|       least one bit with the require map. */
 | |
| 
 | |
|       case SCRIPT_MAP:
 | |
|       OK = FALSE;
 | |
| 
 | |
|       for (int i = 0; i < FULL_MAPSIZE; i++)
 | |
|         {
 | |
|         if ((require_map[i] & map[i]) != 0)
 | |
|           {
 | |
|           OK = TRUE;
 | |
|           break;
 | |
|           }
 | |
|         }
 | |
| 
 | |
|       if (!OK) return FALSE;
 | |
| 
 | |
|       /* The rest of the string must be in this script, but we have to
 | |
|       allow for the Han complications. When the script is one of the five
 | |
|       named below, the state changes and the require map is not consulted
 | |
|       again, because it is read only in the SCRIPT_MAP state. */
 | |
| 
 | |
|       switch(script)
 | |
|         {
 | |
|         case ucp_Han:
 | |
|         require_state = SCRIPT_HANPENDING;
 | |
|         break;
 | |
| 
 | |
|         case ucp_Hiragana:
 | |
|         case ucp_Katakana:
 | |
|         require_state = SCRIPT_HANHIRAKATA;
 | |
|         break;
 | |
| 
 | |
|         case ucp_Bopomofo:
 | |
|         require_state = SCRIPT_HANBOPOMOFO;
 | |
|         break;
 | |
| 
 | |
|         case ucp_Hangul:
 | |
|         require_state = SCRIPT_HANHANGUL;
 | |
|         break;
 | |
| 
 | |
|         /* Compute the intersection of the required list of scripts and the
 | |
|         allowed scripts for this character. */
 | |
| 
 | |
|         default:
 | |
|         for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
 | |
|         break;
 | |
|         }
 | |
| 
 | |
|       break;
 | |
|       }
 | |
|     }   /* End checking character's script and extensions. */
 | |
| 
 | |
|   /* The character is in an acceptable script. We must now ensure that all
 | |
|   decimal digits in the string come from the same set. Some scripts (e.g.
 | |
|   Common, Arabic) have more than one set of decimal digits. This code does
 | |
|   not allow mixing sets, even within the same script. The vector called
 | |
|   PRIV(ucd_digit_sets)[] contains, in its first element, the number of
 | |
|   following elements, and then, in ascending order, the code points of the
 | |
|   '9' characters in every set of 10 digits. Each set is identified by the
 | |
|   offset in the vector of its '9' character. An initial check of the first
 | |
|   value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
 | |
| 
 | |
|   if (ucd->chartype == ucp_Nd)
 | |
|     {
 | |
|     uint32_t digitset;
 | |
| 
 | |
|     if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
 | |
|       {
 | |
|       int mid;
 | |
|       int bot = 1;                         /* c is above the '9' of set 1 (tested above) */
 | |
|       int top = PRIV(ucd_digit_sets)[0];   /* Offset of the last set in the vector */
 | |
|       for (;;)
 | |
|         {
 | |
|         if (top <= bot + 1)    /* <= rather than == is paranoia */
 | |
|           {
 | |
|           digitset = top;
 | |
|           break;
 | |
|           }
 | |
|         mid = (top + bot) / 2;
 | |
|         if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
 | |
|         }
 | |
|       }
 | |
| 
 | |
|     /* A required value of 0 means "unset". The first digit seen fixes the
 | |
|     required set; any later digit from a different set fails the run. */
 | |
| 
 | |
|     if (require_digitset == 0) require_digitset = digitset;
 | |
|       else if (digitset != require_digitset) return FALSE;
 | |
|     }   /* End digit handling */
 | |
| 
 | |
|   /* If we haven't yet got to the end, pick up the next character. Reaching
 | |
|   the end here means that every character passed, so the run is valid. */
 | |
| 
 | |
|   if (ptr >= endptr) return TRUE;
 | |
|   GETCHARINCTEST(c, ptr);
 | |
|   }  /* End checking loop */
 | |
| 
 | |
| #else   /* NOT SUPPORT_UNICODE */
 | |
| (void)ptr;      /* The arguments are unused in this build */
 | |
| (void)endptr;
 | |
| (void)utf;
 | |
| return TRUE;
 | |
| #endif  /* SUPPORT_UNICODE */
 | |
| }
 | |
| 
 | |
| /* End of pcre2_script_run.c */
 |