forked from LeenkxTeam/LNXSDK
Update Files
This commit is contained in:
355
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/GenerateCommon.py
vendored
Normal file
355
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/GenerateCommon.py
vendored
Normal file
@ -0,0 +1,355 @@
|
||||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
|
||||
# This file is a Python module containing common lists and functions for the
|
||||
# GenerateXXX scripts that create various.c and .h files from Unicode data
|
||||
# files. It was created as part of a re-organizaton of these scripts in
|
||||
# December 2021.
|
||||
|
||||
|
||||
import re
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DATA LISTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# BIDI classes in the DerivedBidiClass.txt file, with comments.
|
||||
|
||||
bidi_classes = [
|
||||
'AL', 'Arabic letter',
|
||||
'AN', 'Arabic number',
|
||||
'B', 'Paragraph separator',
|
||||
'BN', 'Boundary neutral',
|
||||
'CS', 'Common separator',
|
||||
'EN', 'European number',
|
||||
'ES', 'European separator',
|
||||
'ET', 'European terminator',
|
||||
'FSI', 'First strong isolate',
|
||||
'L', 'Left to right',
|
||||
'LRE', 'Left to right embedding',
|
||||
'LRI', 'Left to right isolate',
|
||||
'LRO', 'Left to right override',
|
||||
'NSM', 'Non-spacing mark',
|
||||
'ON', 'Other neutral',
|
||||
'PDF', 'Pop directional format',
|
||||
'PDI', 'Pop directional isolate',
|
||||
'R', 'Right to left',
|
||||
'RLE', 'Right to left embedding',
|
||||
'RLI', 'Right to left isolate',
|
||||
'RLO', 'Right to left override',
|
||||
'S', 'Segment separator',
|
||||
'WS', 'White space'
|
||||
]
|
||||
|
||||
# Particular category property names, with comments. NOTE: If ever this list
|
||||
# is changed, the table called "catposstab" in the pcre2_auto_possess.c file
|
||||
# must be edited to keep in step.
|
||||
|
||||
category_names = [
|
||||
'Cc', 'Control',
|
||||
'Cf', 'Format',
|
||||
'Cn', 'Unassigned',
|
||||
'Co', 'Private use',
|
||||
'Cs', 'Surrogate',
|
||||
'Ll', 'Lower case letter',
|
||||
'Lm', 'Modifier letter',
|
||||
'Lo', 'Other letter',
|
||||
'Lt', 'Title case letter',
|
||||
'Lu', 'Upper case letter',
|
||||
'Mc', 'Spacing mark',
|
||||
'Me', 'Enclosing mark',
|
||||
'Mn', 'Non-spacing mark',
|
||||
'Nd', 'Decimal number',
|
||||
'Nl', 'Letter number',
|
||||
'No', 'Other number',
|
||||
'Pc', 'Connector punctuation',
|
||||
'Pd', 'Dash punctuation',
|
||||
'Pe', 'Close punctuation',
|
||||
'Pf', 'Final punctuation',
|
||||
'Pi', 'Initial punctuation',
|
||||
'Po', 'Other punctuation',
|
||||
'Ps', 'Open punctuation',
|
||||
'Sc', 'Currency symbol',
|
||||
'Sk', 'Modifier symbol',
|
||||
'Sm', 'Mathematical symbol',
|
||||
'So', 'Other symbol',
|
||||
'Zl', 'Line separator',
|
||||
'Zp', 'Paragraph separator',
|
||||
'Zs', 'Space separator'
|
||||
]
|
||||
|
||||
# The Extended_Pictographic property is not found in the file where all the
|
||||
# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
|
||||
# file, but we list it here so that the name has the correct index value.
|
||||
|
||||
break_properties = [
|
||||
'CR', ' 0',
|
||||
'LF', ' 1',
|
||||
'Control', ' 2',
|
||||
'Extend', ' 3',
|
||||
'Prepend', ' 4',
|
||||
'SpacingMark', ' 5',
|
||||
'L', ' 6 Hangul syllable type L',
|
||||
'V', ' 7 Hangul syllable type V',
|
||||
'T', ' 8 Hangul syllable type T',
|
||||
'LV', ' 9 Hangul syllable type LV',
|
||||
'LVT', '10 Hangul syllable type LVT',
|
||||
'Regional_Indicator', '11',
|
||||
'Other', '12',
|
||||
'ZWJ', '13',
|
||||
'Extended_Pictographic', '14'
|
||||
]
|
||||
|
||||
# List of files from which the names of Boolean properties are obtained, along
|
||||
# with a list of regex patterns for properties to be ignored, and a list of
|
||||
# extra pattern names to add.
|
||||
|
||||
bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
|
||||
bool_propsignore = [r'^Other_', r'^Hyphen$']
|
||||
bool_propsextras = ['ASCII', 'Bidi_Mirrored']
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# GET BOOLEAN PROPERTY NAMES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Get a list of Boolean property names from a number of files.
|
||||
|
||||
def getbpropslist():
|
||||
bplist = []
|
||||
bplast = ""
|
||||
|
||||
for filename in bool_propsfiles:
|
||||
try:
|
||||
file = open('Unicode.tables/' + filename, 'r')
|
||||
except IOError:
|
||||
print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
|
||||
sys.exit(1)
|
||||
|
||||
for line in file:
|
||||
line = re.sub(r'#.*', '', line)
|
||||
data = list(map(str.strip, line.split(';')))
|
||||
if len(data) <= 1 or data[1] == bplast:
|
||||
continue
|
||||
bplast = data[1]
|
||||
for pat in bool_propsignore:
|
||||
if re.match(pat, bplast) != None:
|
||||
break
|
||||
else:
|
||||
bplist.append(bplast)
|
||||
|
||||
file.close()
|
||||
|
||||
bplist.extend(bool_propsextras)
|
||||
bplist.sort()
|
||||
return bplist
|
||||
|
||||
bool_properties = getbpropslist()
|
||||
bool_props_list_item_size = (len(bool_properties) + 31) // 32
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# COLLECTING PROPERTY NAMES AND ALIASES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
script_names = ['Unknown']
|
||||
abbreviations = {}
|
||||
|
||||
def collect_property_names():
|
||||
global script_names
|
||||
global abbreviations
|
||||
|
||||
names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')
|
||||
|
||||
last_script_name = ""
|
||||
with open("Unicode.tables/Scripts.txt") as f:
|
||||
for line in f:
|
||||
match_obj = names_re.match(line)
|
||||
|
||||
if match_obj == None or match_obj.group(1) == last_script_name:
|
||||
continue
|
||||
|
||||
last_script_name = match_obj.group(1)
|
||||
script_names.append(last_script_name)
|
||||
|
||||
# Sometimes there is comment in the line
|
||||
# so splitting around semicolon is not enough
|
||||
value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')
|
||||
|
||||
with open("Unicode.tables/PropertyValueAliases.txt") as f:
|
||||
for line in f:
|
||||
match_obj = value_alias_re.match(line)
|
||||
|
||||
if match_obj == None:
|
||||
continue
|
||||
|
||||
if match_obj.group(1) == "sc":
|
||||
if match_obj.group(2) == match_obj.group(3):
|
||||
abbreviations[match_obj.group(3)] = ()
|
||||
elif match_obj.group(4) == None:
|
||||
abbreviations[match_obj.group(3)] = (match_obj.group(2),)
|
||||
else:
|
||||
abbreviations[match_obj.group(3)] = (match_obj.group(2), match_obj.group(4))
|
||||
|
||||
# We can also collect Boolean property abbreviations into the same dictionary
|
||||
|
||||
bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')
|
||||
with open("Unicode.tables/PropertyAliases.txt") as f:
|
||||
for line in f:
|
||||
match_obj = bin_alias_re.match(line)
|
||||
if match_obj == None:
|
||||
continue
|
||||
|
||||
if match_obj.group(2) in bool_properties:
|
||||
if match_obj.group(3) == None:
|
||||
abbreviations[match_obj.group(2)] = (match_obj.group(1),)
|
||||
else:
|
||||
abbreviations[match_obj.group(2)] = (match_obj.group(1), match_obj.group(3))
|
||||
|
||||
collect_property_names()
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# REORDERING SCRIPT NAMES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
script_abbrevs = []
|
||||
|
||||
def reorder_scripts():
|
||||
global script_names
|
||||
global script_abbrevs
|
||||
global abbreviations
|
||||
|
||||
for name in script_names:
|
||||
abbrevs = abbreviations[name]
|
||||
script_abbrevs.append(name if len(abbrevs) == 0 else abbrevs[0])
|
||||
|
||||
extended_script_abbrevs = set()
|
||||
with open("Unicode.tables/ScriptExtensions.txt") as f:
|
||||
names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')
|
||||
|
||||
for line in f:
|
||||
match_obj = names_re.match(line)
|
||||
|
||||
if match_obj == None:
|
||||
continue
|
||||
|
||||
for name in match_obj.group(1).split(" "):
|
||||
extended_script_abbrevs.add(name)
|
||||
|
||||
new_script_names = []
|
||||
new_script_abbrevs = []
|
||||
|
||||
for idx, abbrev in enumerate(script_abbrevs):
|
||||
if abbrev in extended_script_abbrevs:
|
||||
new_script_names.append(script_names[idx])
|
||||
new_script_abbrevs.append(abbrev)
|
||||
|
||||
for idx, abbrev in enumerate(script_abbrevs):
|
||||
if abbrev not in extended_script_abbrevs:
|
||||
new_script_names.append(script_names[idx])
|
||||
new_script_abbrevs.append(abbrev)
|
||||
|
||||
script_names = new_script_names
|
||||
script_abbrevs = new_script_abbrevs
|
||||
|
||||
reorder_scripts()
|
||||
script_list_item_size = (script_names.index('Unknown') + 31) // 32
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DERIVED LISTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Create general character property names from the first letters of the
|
||||
# particular categories.
|
||||
|
||||
gcn_set = set(category_names[i][0] for i in range(0, len(category_names), 2))
|
||||
general_category_names = list(gcn_set)
|
||||
general_category_names.sort()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# FUNCTIONS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
import sys
|
||||
|
||||
# Open an output file, using the command's argument or a default. Write common
|
||||
# preliminary header information.
|
||||
|
||||
def open_output(default):
|
||||
if len(sys.argv) > 2:
|
||||
print('** Too many arguments: just give a file name')
|
||||
sys.exit(1)
|
||||
if len(sys.argv) == 2:
|
||||
output_name = sys.argv[1]
|
||||
else:
|
||||
output_name = default
|
||||
try:
|
||||
file = open(output_name, "w")
|
||||
except IOError:
|
||||
print ("** Couldn't open %s" % output_name)
|
||||
sys.exit(1)
|
||||
|
||||
script_name = sys.argv[0]
|
||||
i = script_name.rfind('/')
|
||||
if i >= 0:
|
||||
script_name = script_name[i+1:]
|
||||
|
||||
file.write("""\
|
||||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
|
||||
""")
|
||||
|
||||
file.write("Instead, modify the maint/%s script and run it to generate\n"
|
||||
"a new version of this code.\n\n" % script_name)
|
||||
|
||||
file.write("""\
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
\n""")
|
||||
return file
|
||||
|
||||
# End of UcpCommon.py
|
188
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/GenerateTest26.py
vendored
Normal file
188
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/GenerateTest26.py
vendored
Normal file
@ -0,0 +1,188 @@
|
||||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
#
|
||||
# This file auto-generates unicode property tests and their expected output.
|
||||
# It is recommended to re-run this generator after the unicode files are
|
||||
# updated. The names of the generated files are `testinput26` and `testoutput26`
|
||||
|
||||
import re
|
||||
import sys
|
||||
|
||||
from GenerateCommon import \
|
||||
script_names, \
|
||||
script_abbrevs
|
||||
|
||||
def write_both(text):
|
||||
input_file.write(text)
|
||||
output_file.write(text)
|
||||
|
||||
def to_string_char(ch_idx):
|
||||
if ch_idx < 128:
|
||||
if ch_idx < 16:
|
||||
return "\\x{0%x}" % ch_idx
|
||||
if ch_idx >= 32:
|
||||
return chr(ch_idx)
|
||||
return "\\x{%x}" % ch_idx
|
||||
|
||||
output_directory = ""
|
||||
|
||||
if len(sys.argv) > 2:
|
||||
print('** Too many arguments: just give a directory name')
|
||||
sys.exit(1)
|
||||
if len(sys.argv) == 2:
|
||||
output_directory = sys.argv[1]
|
||||
if not output_directory.endswith("/"):
|
||||
output_directory += "/"
|
||||
|
||||
try:
|
||||
input_file = open(output_directory + "testinput26", "w")
|
||||
output_file = open(output_directory + "testoutput26", "w")
|
||||
except IOError:
|
||||
print ("** Couldn't open output files")
|
||||
sys.exit(1)
|
||||
|
||||
write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# UNICODE SCRIPT EXTENSION TESTS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
write_both("# Unicode Script Extension tests.\n\n")
|
||||
|
||||
def gen_script_tests():
|
||||
script_data = [None] * len(script_names)
|
||||
char_data = [None] * 0x110000
|
||||
|
||||
property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
|
||||
prev_name = ""
|
||||
script_idx = -1
|
||||
|
||||
with open("Unicode.tables/Scripts.txt") as f:
|
||||
for line in f:
|
||||
match_obj = property_re.match(line)
|
||||
|
||||
if match_obj == None:
|
||||
continue
|
||||
|
||||
name = match_obj.group(3)
|
||||
if name != prev_name:
|
||||
script_idx = script_names.index(name)
|
||||
prev_name = name
|
||||
|
||||
low = int(match_obj.group(1), 16)
|
||||
high = low
|
||||
char_data[low] = name
|
||||
|
||||
if match_obj.group(2) != None:
|
||||
high = int(match_obj.group(2), 16)
|
||||
for idx in range(low + 1, high + 1):
|
||||
char_data[idx] = name
|
||||
|
||||
if script_data[script_idx] == None:
|
||||
script_data[script_idx] = [low, None, None, None, None]
|
||||
script_data[script_idx][1] = high
|
||||
|
||||
extended_script_indicies = {}
|
||||
|
||||
with open("Unicode.tables/ScriptExtensions.txt") as f:
|
||||
for line in f:
|
||||
match_obj = property_re.match(line)
|
||||
|
||||
if match_obj == None:
|
||||
continue
|
||||
|
||||
low = int(match_obj.group(1), 16)
|
||||
high = low
|
||||
if match_obj.group(2) != None:
|
||||
high = int(match_obj.group(2), 16)
|
||||
|
||||
for abbrev in match_obj.group(3).split(" "):
|
||||
if abbrev not in extended_script_indicies:
|
||||
idx = script_abbrevs.index(abbrev)
|
||||
extended_script_indicies[abbrev] = idx
|
||||
rec = script_data[idx]
|
||||
rec[2] = low
|
||||
rec[3] = high
|
||||
else:
|
||||
idx = extended_script_indicies[abbrev]
|
||||
rec = script_data[idx]
|
||||
if rec[2] > low:
|
||||
rec[2] = low
|
||||
if rec[3] < high:
|
||||
rec[3] = high
|
||||
|
||||
if rec[4] == None:
|
||||
name = script_names[idx]
|
||||
for idx in range(low, high + 1):
|
||||
if char_data[idx] != name:
|
||||
rec[4] = idx
|
||||
break
|
||||
|
||||
long_property_name = False
|
||||
|
||||
for idx, rec in enumerate(script_data):
|
||||
script_name = script_names[idx]
|
||||
|
||||
if script_name == "Unknown":
|
||||
continue
|
||||
|
||||
script_abbrev = script_abbrevs[idx]
|
||||
|
||||
write_both("# Base script check\n")
|
||||
write_both("/^\\p{sc=%s}/utf\n" % script_name)
|
||||
write_both(" %s\n" % to_string_char(rec[0]))
|
||||
output_file.write(" 0: %s\n" % to_string_char(rec[0]))
|
||||
write_both("\n")
|
||||
|
||||
write_both("/^\\p{Script=%s}/utf\n" % script_abbrev)
|
||||
write_both(" %s\n" % to_string_char(rec[1]))
|
||||
output_file.write(" 0: %s\n" % to_string_char(rec[1]))
|
||||
write_both("\n")
|
||||
|
||||
if rec[2] != None:
|
||||
property_name = "scx"
|
||||
if long_property_name:
|
||||
property_name = "Script_Extensions"
|
||||
|
||||
write_both("# Script extension check\n")
|
||||
write_both("/^\\p{%s}/utf\n" % script_name)
|
||||
write_both(" %s\n" % to_string_char(rec[2]))
|
||||
output_file.write(" 0: %s\n" % to_string_char(rec[2]))
|
||||
write_both("\n")
|
||||
|
||||
write_both("/^\\p{%s=%s}/utf\n" % (property_name, script_abbrev))
|
||||
write_both(" %s\n" % to_string_char(rec[3]))
|
||||
output_file.write(" 0: %s\n" % to_string_char(rec[3]))
|
||||
write_both("\n")
|
||||
|
||||
long_property_name = not long_property_name
|
||||
|
||||
if rec[4] != None:
|
||||
write_both("# Script extension only character\n")
|
||||
write_both("/^\\p{%s}/utf\n" % script_name)
|
||||
write_both(" %s\n" % to_string_char(rec[4]))
|
||||
output_file.write(" 0: %s\n" % to_string_char(rec[4]))
|
||||
write_both("\n")
|
||||
|
||||
write_both("/^\\p{sc=%s}/utf\n" % script_name)
|
||||
write_both(" %s\n" % to_string_char(rec[4]))
|
||||
output_file.write("No match\n")
|
||||
write_both("\n")
|
||||
else:
|
||||
print("External character has not found for %s" % script_name)
|
||||
|
||||
high = rec[1]
|
||||
if rec[3] != None and rec[3] > rec[1]:
|
||||
high = rec[3]
|
||||
write_both("# Character not in script\n")
|
||||
write_both("/^\\p{%s}/utf\n" % script_name)
|
||||
write_both(" %s\n" % to_string_char(high + 1))
|
||||
output_file.write("No match\n")
|
||||
write_both("\n")
|
||||
|
||||
|
||||
gen_script_tests()
|
||||
|
||||
write_both("# End of testinput26\n")
|
923
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/GenerateUcd.py
vendored
Normal file
923
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/GenerateUcd.py
vendored
Normal file
@ -0,0 +1,923 @@
|
||||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
#
|
||||
# This script generates the pcre2_ucd.c file from Unicode data files. This is
|
||||
# the compressed Unicode property data used by PCRE2. The script was created in
|
||||
# December 2021 as part of the Unicode data generation refactoring. It is
|
||||
# basically a re-working of the MultiStage2.py script that was submitted to the
|
||||
# PCRE project by Peter Kankowski in 2008 as part of a previous upgrading of
|
||||
# Unicode property support. A number of extensions have since been added. The
|
||||
# main difference in the 2021 upgrade (apart from comments and layout) is that
|
||||
# the data tables (e.g. list of script names) are now listed in or generated by
|
||||
# a separate Python module that is shared with the other Generate scripts.
|
||||
#
|
||||
# This script must be run in the "maint" directory. It requires the following
|
||||
# Unicode data tables: BidiMirrorring.txt, CaseFolding.txt,
|
||||
# DerivedBidiClass.txt, DerivedCoreProperties.txt, DerivedGeneralCategory.txt,
|
||||
# GraphemeBreakProperty.txt, PropList.txt, PropertyAliases.txt,
|
||||
# PropertyValueAliases.txt, ScriptExtensions.txt, Scripts.txt, and
|
||||
# emoji-data.txt. These must be in the Unicode.tables subdirectory.
|
||||
#
|
||||
# The emoji-data.txt file is found in the "emoji" subdirectory even though it
|
||||
# is technically part of a different (but coordinated) standard as shown
|
||||
# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"),
|
||||
# for example:
|
||||
#
|
||||
# http://unicode.org/Public/emoji/13.0/ReadMe.txt
|
||||
#
|
||||
# DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted"
|
||||
# subdirectory of the Unicode database (UCD) on the Unicode web site;
|
||||
# GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. The other files
|
||||
# are in the top-level UCD directory.
|
||||
#
|
||||
# -----------------------------------------------------------------------------
|
||||
# Minor modifications made to the original script:
|
||||
# Added #! line at start
|
||||
# Removed tabs
|
||||
# Made it work with Python 2.4 by rewriting two statements that needed 2.5
|
||||
# Consequent code tidy
|
||||
# Adjusted data file names to take from the Unicode.tables directory
|
||||
# Adjusted global table names by prefixing _pcre_.
|
||||
# Commented out stuff relating to the casefolding table, which isn't used;
|
||||
# removed completely in 2012.
|
||||
# Corrected size calculation
|
||||
# Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
|
||||
# Update for PCRE2: name changes, and SUPPORT_UCP is abolished.
|
||||
#
|
||||
# Major modifications made to the original script:
|
||||
# Added code to add a grapheme break property field to records.
|
||||
#
|
||||
# Added code to search for sets of more than two characters that must match
|
||||
# each other caselessly. A new table is output containing these sets, and
|
||||
# offsets into the table are added to the main output records. This new
|
||||
# code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer
|
||||
# used.
|
||||
#
|
||||
# Update for Python3:
|
||||
# . Processed with 2to3, but that didn't fix everything
|
||||
# . Changed string.strip to str.strip
|
||||
# . Added encoding='utf-8' to the open() call
|
||||
# . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is
|
||||
# required and the result of the division is a float
|
||||
#
|
||||
# Added code to scan the emoji-data.txt file to find the Extended Pictographic
|
||||
# property, which is used by PCRE2 as a grapheme breaking property. This was
|
||||
# done when updating to Unicode 11.0.0 (July 2018).
|
||||
#
|
||||
# Added code to add a Script Extensions field to records. This has increased
|
||||
# their size from 8 to 12 bytes, only 10 of which are currently used.
|
||||
#
|
||||
# Added code to add a bidi class field to records by scanning the
|
||||
# DerivedBidiClass.txt and PropList.txt files. This uses one of the two spare
|
||||
# bytes, so now 11 out of 12 are in use.
|
||||
#
|
||||
# 01-March-2010: Updated list of scripts for Unicode 5.2.0
|
||||
# 30-April-2011: Updated list of scripts for Unicode 6.0.0
|
||||
# July-2012: Updated list of scripts for Unicode 6.1.0
|
||||
# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
|
||||
# field in the record to hold the value. Luckily, the
|
||||
# structure had a hole in it, so the resulting table is
|
||||
# not much bigger than before.
|
||||
# 18-September-2012: Added code for multiple caseless sets. This uses the
|
||||
# final hole in the structure.
|
||||
# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
|
||||
# 13-May-2014: Updated for PCRE2
|
||||
# 03-June-2014: Updated for Python 3
|
||||
# 20-June-2014: Updated for Unicode 7.0.0
|
||||
# 12-August-2014: Updated to put Unicode version into the file
|
||||
# 19-June-2015: Updated for Unicode 8.0.0
|
||||
# 02-July-2017: Updated for Unicode 10.0.0
|
||||
# 03-July-2018: Updated for Unicode 11.0.0
|
||||
# 07-July-2018: Added code to scan emoji-data.txt for the Extended
|
||||
# Pictographic property.
|
||||
# 01-October-2018: Added the 'Unknown' script name
|
||||
# 03-October-2018: Added new field for Script Extensions
|
||||
# 27-July-2019: Updated for Unicode 12.1.0
|
||||
# 10-March-2020: Updated for Unicode 13.0.0
|
||||
# PCRE2-10.39: Updated for Unicode 14.0.0
|
||||
# 05-December-2021: Added code to scan DerivedBidiClass.txt for bidi class,
|
||||
# and also PropList.txt for the Bidi_Control property
|
||||
# 19-December-2021: Reworked script extensions lists to be bit maps instead
|
||||
# of zero-terminated lists of script numbers.
|
||||
# ----------------------------------------------------------------------------
|
||||
#
|
||||
# Changes to the refactored script:
|
||||
#
|
||||
# 26-December-2021: Refactoring completed
|
||||
# 10-January-2022: Addition of general Boolean property support
|
||||
# 12-January-2022: Merge scriptx and bidiclass fields
|
||||
# 14-January-2022: Enlarge Boolean property offset to 12 bits
|
||||
#
|
||||
# ----------------------------------------------------------------------------
|
||||
#
|
||||
#
|
||||
# The main tables generated by this script are used by macros defined in
|
||||
# pcre2_internal.h. They look up Unicode character properties using short
|
||||
# sequences of code that contains no branches, which makes for greater speed.
|
||||
#
|
||||
# Conceptually, there is a table of records (of type ucd_record), one for each
|
||||
# Unicode character. Each record contains the script number, script extension
|
||||
# value, character type, grapheme break type, offset to caseless matching set,
|
||||
# offset to the character's other case, the bidi class, and offset to bitmap of
|
||||
# Boolean properties.
|
||||
#
|
||||
# A real table covering all Unicode characters would be far too big. It can be
|
||||
# efficiently compressed by observing that many characters have the same
|
||||
# record, and many blocks of characters (taking 128 characters in a block) have
|
||||
# the same set of records as other blocks. This leads to a 2-stage lookup
|
||||
# process.
|
||||
#
|
||||
# This script constructs seven tables. The ucd_caseless_sets table contains
|
||||
# lists of characters that all match each other caselessly. Each list is
|
||||
# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
|
||||
# any valid character. The first list is empty; this is used for characters
|
||||
# that are not part of any list.
|
||||
#
|
||||
# The ucd_digit_sets table contains the code points of the '9' characters in
|
||||
# each set of 10 decimal digits in Unicode. This is used to ensure that digits
|
||||
# in script runs all come from the same set. The first element in the vector
|
||||
# contains the number of subsequent elements, which are in ascending order.
|
||||
#
|
||||
# Scripts are partitioned into two groups. Scripts that appear in at least one
|
||||
# character's script extension list come first, followed by "Unknown" and then
|
||||
# all the rest. This sorting is done automatically in the GenerateCommon.py
|
||||
# script. A script's number is its index in the script_names list.
|
||||
#
|
||||
# The ucd_script_sets table contains bitmaps that represent lists of scripts
|
||||
# for Script Extensions properties. Each bitmap consists of a fixed number of
|
||||
# unsigned 32-bit numbers, enough to allocate a bit for every script that is
|
||||
# used in any character's extension list, that is, enough for every script
|
||||
# whose number is less than ucp_Unknown. A character's script extension value
|
||||
# in its ucd record is an offset into the ucd_script_sets vector. The first
|
||||
# bitmap has no bits set; characters that have no script extensions have zero
|
||||
# as their script extensions value so that they use this map.
|
||||
#
|
||||
# The ucd_boolprop_sets table contains bitmaps that represent lists of Boolean
|
||||
# properties. Each bitmap consists of a fixed number of unsigned 32-bit
|
||||
# numbers, enough to allocate a bit for each supported Boolean property.
|
||||
#
|
||||
# The ucd_records table contains one instance of every unique character record
|
||||
# that is required. The ucd_stage1 table is indexed by a character's block
|
||||
# number, which is the character's code point divided by 128, since 128 is the
|
||||
# size of each block. The result of a lookup in ucd_stage1 a "virtual" block
|
||||
# number.
|
||||
#
|
||||
# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
|
||||
# the offset of a character within its own block, and the result is the index
|
||||
# number of the required record in the ucd_records vector.
|
||||
#
|
||||
# The following examples are correct for the Unicode 14.0.0 database. Future
|
||||
# updates may make change the actual lookup values.
|
||||
#
|
||||
# Example: lowercase "a" (U+0061) is in block 0
|
||||
# lookup 0 in stage1 table yields 0
|
||||
# lookup 97 (0x61) in the first table in stage2 yields 35
|
||||
# record 35 is { 0, 5, 12, 0, -32, 18432, 44 }
|
||||
# 0 = ucp_Latin => Latin script
|
||||
# 5 = ucp_Ll => Lower case letter
|
||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||
# 0 => Not part of a caseless set
|
||||
# -32 (-0x20) => Other case is U+0041
|
||||
# 18432 = 0x4800 => Combined Bidi class + script extension values
|
||||
# 44 => Offset to Boolean properties
|
||||
#
|
||||
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
|
||||
# script extension value, giving:
|
||||
#
|
||||
# 9 = ucp_bidiL => Bidi class left-to-right
|
||||
# 0 => No special script extension property
|
||||
#
|
||||
# Almost all lowercase latin characters resolve to the same record. One or two
|
||||
# are different because they are part of a multi-character caseless set (for
|
||||
# example, k, K and the Kelvin symbol are such a set).
|
||||
#
|
||||
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
|
||||
# lookup 96 in stage1 table yields 93
|
||||
# lookup 66 (0x42) in table 93 in stage2 yields 819
|
||||
# record 819 is { 20, 7, 12, 0, 0, 18432, 82 }
|
||||
# 20 = ucp_Hiragana => Hiragana script
|
||||
# 7 = ucp_Lo => Other letter
|
||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||
# 0 => Not part of a caseless set
|
||||
# 0 => No other case
|
||||
# 18432 = 0x4800 => Combined Bidi class + script extension values
|
||||
# 82 => Offset to Boolean properties
|
||||
#
|
||||
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
|
||||
# script extension value, giving:
|
||||
#
|
||||
# 9 = ucp_bidiL => Bidi class left-to-right
|
||||
# 0 => No special script extension property
|
||||
#
|
||||
# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
|
||||
# lookup 57 in stage1 table yields 55
|
||||
# lookup 80 (0x50) in table 55 in stage2 yields 621
|
||||
# record 621 is { 84, 12, 3, 0, 0, 26762, 96 }
|
||||
# 84 = ucp_Inherited => Script inherited from predecessor
|
||||
# 12 = ucp_Mn => Non-spacing mark
|
||||
# 3 = ucp_gbExtend => Grapheme break property "Extend"
|
||||
# 0 => Not part of a caseless set
|
||||
# 0 => No other case
|
||||
# 26762 = 0x688A => Combined Bidi class + script extension values
|
||||
# 96 => Offset to Boolean properties
|
||||
#
|
||||
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
|
||||
# script extension value, giving:
|
||||
#
|
||||
# 13 = ucp_bidiNSM => Bidi class non-spacing mark
|
||||
# 138 => Script Extension list offset = 138
|
||||
#
|
||||
# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
|
||||
# 18, and 47 set. This means that this character is expected to be used with
|
||||
# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
|
||||
#
|
||||
# Philip Hazel, last updated 14 January 2022.
|
||||
##############################################################################
|
||||
|
||||
|
||||
# Import standard modules
|
||||
|
||||
import re
|
||||
import string
|
||||
import sys
|
||||
|
||||
# Import common data lists and functions
|
||||
|
||||
from GenerateCommon import \
|
||||
bidi_classes, \
|
||||
bool_properties, \
|
||||
bool_propsfiles, \
|
||||
bool_props_list_item_size, \
|
||||
break_properties, \
|
||||
category_names, \
|
||||
general_category_names, \
|
||||
script_abbrevs, \
|
||||
script_list_item_size, \
|
||||
script_names, \
|
||||
open_output
|
||||
|
||||
# Some general parameters
|
||||
|
||||
MAX_UNICODE = 0x110000
|
||||
NOTACHAR = 0xffffffff
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DEFINE FUNCTIONS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Parse a line of Scripts.txt, GraphemeBreakProperty.txt, DerivedBidiClass.txt
|
||||
# or DerivedGeneralCategory.txt
|
||||
|
||||
def make_get_names(enum):
|
||||
return lambda chardata: enum.index(chardata[1])
|
||||
|
||||
|
||||
# Parse a line of CaseFolding.txt
|
||||
|
||||
def get_other_case(chardata):
|
||||
if chardata[1] == 'C' or chardata[1] == 'S':
|
||||
return int(chardata[2], 16) - int(chardata[0], 16)
|
||||
return 0
|
||||
|
||||
|
||||
# Parse a line of ScriptExtensions.txt
|
||||
|
||||
def get_script_extension(chardata):
|
||||
global last_script_extension
|
||||
|
||||
offset = len(script_lists) * script_list_item_size
|
||||
if last_script_extension == chardata[1]:
|
||||
return offset - script_list_item_size
|
||||
|
||||
last_script_extension = chardata[1]
|
||||
script_lists.append(tuple(script_abbrevs.index(abbrev) for abbrev in last_script_extension.split(' ')))
|
||||
return offset
|
||||
|
||||
|
||||
# Read a whole table in memory, setting/checking the Unicode version
|
||||
|
||||
def read_table(file_name, get_value, default_value):
|
||||
global unicode_version
|
||||
|
||||
f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
|
||||
file_base = f.group(1)
|
||||
version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"
|
||||
file = open(file_name, 'r', encoding='utf-8')
|
||||
f = re.match(version_pat, file.readline())
|
||||
version = f.group(1)
|
||||
if unicode_version == "":
|
||||
unicode_version = version
|
||||
elif unicode_version != version:
|
||||
print("WARNING: Unicode version differs in %s", file_name, file=sys.stderr)
|
||||
|
||||
table = [default_value] * MAX_UNICODE
|
||||
for line in file:
|
||||
line = re.sub(r'#.*', '', line)
|
||||
chardata = list(map(str.strip, line.split(';')))
|
||||
if len(chardata) <= 1:
|
||||
continue
|
||||
value = get_value(chardata)
|
||||
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
|
||||
char = int(m.group(1), 16)
|
||||
if m.group(3) is None:
|
||||
last = char
|
||||
else:
|
||||
last = int(m.group(3), 16)
|
||||
for i in range(char, last + 1):
|
||||
# It is important not to overwrite a previously set value because in the
|
||||
# CaseFolding file there are lines to be ignored (returning the default
|
||||
# value of 0) which often come after a line which has already set data.
|
||||
if table[i] == default_value:
|
||||
table[i] = value
|
||||
file.close()
|
||||
return table
|
||||
|
||||
|
||||
# Get the smallest possible C language type for the values in a table
|
||||
|
||||
def get_type_size(table):
|
||||
type_size = [("uint8_t", 1), ("uint16_t", 2), ("uint32_t", 4),
|
||||
("signed char", 1), ("int16_t", 2), ("int32_t", 4)]
|
||||
limits = [(0, 255), (0, 65535), (0, 4294967295), (-128, 127),
|
||||
(-32768, 32767), (-2147483648, 2147483647)]
|
||||
minval = min(table)
|
||||
maxval = max(table)
|
||||
for num, (minlimit, maxlimit) in enumerate(limits):
|
||||
if minlimit <= minval and maxval <= maxlimit:
|
||||
return type_size[num]
|
||||
raise OverflowError("Too large to fit into C types")
|
||||
|
||||
|
||||
# Get the total size of a list of tables
|
||||
|
||||
def get_tables_size(*tables):
|
||||
total_size = 0
|
||||
for table in tables:
|
||||
type, size = get_type_size(table)
|
||||
total_size += size * len(table)
|
||||
return total_size
|
||||
|
||||
|
||||
# Compress a table into the two stages
|
||||
|
||||
def compress_table(table, block_size):
|
||||
blocks = {} # Dictionary for finding identical blocks
|
||||
stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
|
||||
stage2 = [] # Stage 2 table contains the blocks with property values
|
||||
table = tuple(table)
|
||||
for i in range(0, len(table), block_size):
|
||||
block = table[i:i+block_size]
|
||||
start = blocks.get(block)
|
||||
if start is None:
|
||||
# Allocate a new block
|
||||
start = len(stage2) / block_size
|
||||
stage2 += block
|
||||
blocks[block] = start
|
||||
stage1.append(start)
|
||||
return stage1, stage2
|
||||
|
||||
|
||||
# Output a table
|
||||
|
||||
def write_table(table, table_name, block_size = None):
|
||||
type, size = get_type_size(table)
|
||||
ELEMS_PER_LINE = 16
|
||||
|
||||
s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
|
||||
if block_size:
|
||||
s += ", block = %d" % block_size
|
||||
f.write(s + " */\n")
|
||||
table = tuple(table)
|
||||
if block_size is None:
|
||||
fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */\n"
|
||||
mult = MAX_UNICODE / len(table)
|
||||
for i in range(0, len(table), ELEMS_PER_LINE):
|
||||
f.write(fmt % (table[i:i+ELEMS_PER_LINE] + (int(i * mult),)))
|
||||
else:
|
||||
if block_size > ELEMS_PER_LINE:
|
||||
el = ELEMS_PER_LINE
|
||||
else:
|
||||
el = block_size
|
||||
fmt = "%3d," * el + "\n"
|
||||
if block_size > ELEMS_PER_LINE:
|
||||
fmt = fmt * int(block_size / ELEMS_PER_LINE)
|
||||
for i in range(0, len(table), block_size):
|
||||
f.write(("\n/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size]))
|
||||
f.write("};\n\n")
|
||||
|
||||
|
||||
# Extract the unique combinations of properties into records
|
||||
|
||||
def combine_tables(*tables):
|
||||
records = {}
|
||||
index = []
|
||||
for t in zip(*tables):
|
||||
i = records.get(t)
|
||||
if i is None:
|
||||
i = records[t] = len(records)
|
||||
index.append(i)
|
||||
return index, records
|
||||
|
||||
|
||||
# Create a record struct
|
||||
|
||||
def get_record_size_struct(records):
|
||||
size = 0
|
||||
structure = 'typedef struct {\n'
|
||||
for i in range(len(records[0])):
|
||||
record_slice = [record[i] for record in records]
|
||||
slice_type, slice_size = get_type_size(record_slice)
|
||||
# add padding: round up to the nearest power of slice_size
|
||||
size = (size + slice_size - 1) & -slice_size
|
||||
size += slice_size
|
||||
structure += '%s property_%d;\n' % (slice_type, i)
|
||||
|
||||
# round up to the first item of the next structure in array
|
||||
record_slice = [record[0] for record in records]
|
||||
slice_type, slice_size = get_type_size(record_slice)
|
||||
size = (size + slice_size - 1) & -slice_size
|
||||
|
||||
structure += '} ucd_record;\n*/\n'
|
||||
return size, structure
|
||||
|
||||
|
||||
# Write records
|
||||
|
||||
def write_records(records, record_size):
|
||||
f.write('const ucd_record PRIV(ucd_records)[] = { ' + \
|
||||
'/* %d bytes, record size %d */\n' % (len(records) * record_size, record_size))
|
||||
records = list(zip(list(records.keys()), list(records.values())))
|
||||
records.sort(key = lambda x: x[1])
|
||||
for i, record in enumerate(records):
|
||||
f.write((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */\n') % (record[0] + (i,)))
|
||||
f.write('};\n\n')
|
||||
|
||||
|
||||
# Write a bit set
|
||||
|
||||
def write_bitsets(list, item_size):
|
||||
for d in list:
|
||||
bitwords = [0] * item_size
|
||||
for idx in d:
|
||||
bitwords[idx // 32] |= 1 << (idx & 31)
|
||||
s = " "
|
||||
for x in bitwords:
|
||||
f.write("%s" % s)
|
||||
s = ", "
|
||||
f.write("0x%08xu" % x)
|
||||
f.write(",\n")
|
||||
f.write("};\n\n")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# This bit of code must have been useful when the original script was being
|
||||
# developed. Retain it just in case it is ever needed again.
|
||||
|
||||
# def test_record_size():
|
||||
# tests = [ \
|
||||
# ( [(3,), (6,), (6,), (1,)], 1 ), \
|
||||
# ( [(300,), (600,), (600,), (100,)], 2 ), \
|
||||
# ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
|
||||
# ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
|
||||
# ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
|
||||
# ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
|
||||
# ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
|
||||
# ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
|
||||
# ]
|
||||
# for test in tests:
|
||||
# size, struct = get_record_size_struct(test[0])
|
||||
# assert(size == test[1])
|
||||
# test_record_size()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# MAIN CODE FOR CREATING TABLES
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
unicode_version = ""
|
||||
|
||||
# Some of the tables imported from GenerateCommon.py have alternate comment
|
||||
# strings for use by GenerateUcpHeader. The comments are not wanted here, so
|
||||
# remove them.
|
||||
|
||||
bidi_classes = bidi_classes[::2]
|
||||
break_properties = break_properties[::2]
|
||||
category_names = category_names[::2]
|
||||
|
||||
# Create the various tables from Unicode data files
|
||||
|
||||
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
|
||||
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
|
||||
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_properties), break_properties.index('Other'))
|
||||
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
|
||||
bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', make_get_names(bidi_classes), bidi_classes.index('L'))
|
||||
|
||||
# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
|
||||
# we need to find the Extended_Pictographic property for emoji characters. This
|
||||
# can be set as an additional grapheme break property, because the default for
|
||||
# all the emojis is "other". We scan the emoji-data.txt file and modify the
|
||||
# break-props table.
|
||||
|
||||
file = open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8')
|
||||
for line in file:
|
||||
line = re.sub(r'#.*', '', line)
|
||||
chardata = list(map(str.strip, line.split(';')))
|
||||
if len(chardata) <= 1:
|
||||
continue
|
||||
if chardata[1] != "Extended_Pictographic":
|
||||
continue
|
||||
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
|
||||
char = int(m.group(1), 16)
|
||||
if m.group(3) is None:
|
||||
last = char
|
||||
else:
|
||||
last = int(m.group(3), 16)
|
||||
for i in range(char, last + 1):
|
||||
if break_props[i] != break_properties.index('Other'):
|
||||
print("WARNING: Emoji 0x%x has break property %s, not 'Other'",
|
||||
i, break_properties[break_props[i]], file=sys.stderr)
|
||||
break_props[i] = break_properties.index('Extended_Pictographic')
|
||||
file.close()
|
||||
|
||||
# Handle script extensions. The get_script_extesion() function maintains a
|
||||
# list of unique bitmaps representing lists of scripts, returning the offset
|
||||
# in that list. Initialize the list with an empty set, which is used for
|
||||
# characters that have no script extensions.
|
||||
|
||||
script_lists = [[]]
|
||||
last_script_extension = ""
|
||||
scriptx_bidi_class = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, 0)
|
||||
|
||||
for idx in range(len(scriptx_bidi_class)):
|
||||
scriptx_bidi_class[idx] = scriptx_bidi_class[idx] | (bidi_class[idx] << 11)
|
||||
bidi_class = None
|
||||
|
||||
# Find the Boolean properties of each character. This next bit of magic creates
|
||||
# a list of empty lists. Using [[]] * MAX_UNICODE gives a list of references to
|
||||
# the *same* list, which is not what we want.
|
||||
|
||||
bprops = [[] for _ in range(MAX_UNICODE)]
|
||||
|
||||
# Collect the properties from the various files
|
||||
|
||||
for filename in bool_propsfiles:
|
||||
try:
|
||||
file = open('Unicode.tables/' + filename, 'r')
|
||||
except IOError:
|
||||
print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
|
||||
sys.exit(1)
|
||||
|
||||
for line in file:
|
||||
line = re.sub(r'#.*', '', line)
|
||||
data = list(map(str.strip, line.split(';')))
|
||||
if len(data) <= 1:
|
||||
continue
|
||||
|
||||
try:
|
||||
ix = bool_properties.index(data[1])
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', data[0])
|
||||
char = int(m.group(1), 16)
|
||||
if m.group(3) is None:
|
||||
last = char
|
||||
else:
|
||||
last = int(m.group(3), 16)
|
||||
|
||||
for i in range(char, last + 1):
|
||||
bprops[i].append(ix)
|
||||
|
||||
file.close()
|
||||
|
||||
# The ASCII property isn't listed in any files, but it is easy enough to add
|
||||
# it manually.
|
||||
|
||||
ix = bool_properties.index("ASCII")
|
||||
for i in range(128):
|
||||
bprops[i].append(ix)
|
||||
|
||||
# The Bidi_Mirrored property isn't listed in any property files. We have to
|
||||
# deduce it from the file that lists the mirrored characters.
|
||||
|
||||
ix = bool_properties.index("Bidi_Mirrored")
|
||||
|
||||
try:
|
||||
file = open('Unicode.tables/BidiMirroring.txt', 'r')
|
||||
except IOError:
|
||||
print(f"** Couldn't open {'Unicode.tables/BidiMirroring.txt'}\n")
|
||||
sys.exit(1)
|
||||
|
||||
for line in file:
|
||||
line = re.sub(r'#.*', '', line)
|
||||
data = list(map(str.strip, line.split(';')))
|
||||
if len(data) <= 1:
|
||||
continue
|
||||
c = int(data[0], 16)
|
||||
bprops[c].append(ix)
|
||||
|
||||
file.close()
|
||||
|
||||
# Scan each character's boolean property list and created a list of unique
|
||||
# lists, at the same time, setting the index in that list for each property in
|
||||
# the bool_props vector.
|
||||
|
||||
bool_props = [0] * MAX_UNICODE
|
||||
bool_props_lists = [[]]
|
||||
|
||||
for c in range(MAX_UNICODE):
|
||||
s = set(bprops[c])
|
||||
for i in range(len(bool_props_lists)):
|
||||
if s == set(bool_props_lists[i]):
|
||||
break;
|
||||
else:
|
||||
bool_props_lists.append(bprops[c])
|
||||
i += 1
|
||||
|
||||
bool_props[c] = i * bool_props_list_item_size
|
||||
|
||||
# This block of code was added by PH in September 2012. It scans the other_case
|
||||
# table to find sets of more than two characters that must all match each other
|
||||
# caselessly. Later in this script a table of these sets is written out.
|
||||
# However, we have to do this work here in order to compute the offsets in the
|
||||
# table that are inserted into the main table.
|
||||
|
||||
# The CaseFolding.txt file lists pairs, but the common logic for reading data
|
||||
# sets only one value, so first we go through the table and set "return"
|
||||
# offsets for those that are not already set.
|
||||
|
||||
for c in range(MAX_UNICODE):
|
||||
if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
|
||||
other_case[c + other_case[c]] = -other_case[c]
|
||||
|
||||
# Now scan again and create equivalence sets.
|
||||
|
||||
caseless_sets = []
|
||||
|
||||
for c in range(MAX_UNICODE):
|
||||
o = c + other_case[c]
|
||||
|
||||
# Trigger when this character's other case does not point back here. We
|
||||
# now have three characters that are case-equivalent.
|
||||
|
||||
if other_case[o] != -other_case[c]:
|
||||
t = o + other_case[o]
|
||||
|
||||
# Scan the existing sets to see if any of the three characters are already
|
||||
# part of a set. If so, unite the existing set with the new set.
|
||||
|
||||
appended = 0
|
||||
for s in caseless_sets:
|
||||
found = 0
|
||||
for x in s:
|
||||
if x == c or x == o or x == t:
|
||||
found = 1
|
||||
|
||||
# Add new characters to an existing set
|
||||
|
||||
if found:
|
||||
found = 0
|
||||
for y in [c, o, t]:
|
||||
for x in s:
|
||||
if x == y:
|
||||
found = 1
|
||||
if not found:
|
||||
s.append(y)
|
||||
appended = 1
|
||||
|
||||
# If we have not added to an existing set, create a new one.
|
||||
|
||||
if not appended:
|
||||
caseless_sets.append([c, o, t])
|
||||
|
||||
# End of loop looking for caseless sets.
|
||||
|
||||
# Now scan the sets and set appropriate offsets for the characters.
|
||||
|
||||
caseless_offsets = [0] * MAX_UNICODE
|
||||
|
||||
offset = 1;
|
||||
for s in caseless_sets:
|
||||
for x in s:
|
||||
caseless_offsets[x] = offset
|
||||
offset += len(s) + 1
|
||||
|
||||
# End of block of code for creating offsets for caseless matching sets.
|
||||
|
||||
|
||||
# Combine all the tables
|
||||
|
||||
table, records = combine_tables(script, category, break_props,
|
||||
caseless_offsets, other_case, scriptx_bidi_class, bool_props)
|
||||
|
||||
# Find the record size and create a string definition of the structure for
|
||||
# outputting as a comment.
|
||||
|
||||
record_size, record_struct = get_record_size_struct(list(records.keys()))
|
||||
|
||||
# Find the optimum block size for the two-stage table
|
||||
|
||||
min_size = sys.maxsize
|
||||
for block_size in [2 ** i for i in range(5,10)]:
|
||||
size = len(records) * record_size
|
||||
stage1, stage2 = compress_table(table, block_size)
|
||||
size += get_tables_size(stage1, stage2)
|
||||
#print "/* block size %5d => %5d bytes */" % (block_size, size)
|
||||
if size < min_size:
|
||||
min_size = size
|
||||
min_stage1, min_stage2 = stage1, stage2
|
||||
min_block_size = block_size
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# MAIN CODE FOR WRITING THE OUTPUT FILE
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Open the output file (no return on failure). This call also writes standard
|
||||
# header boilerplate.
|
||||
|
||||
f = open_output("pcre2_ucd.c")
|
||||
|
||||
# Output this file's heading text
|
||||
|
||||
f.write("""\
|
||||
/* This file contains tables of Unicode properties that are extracted from
|
||||
Unicode data files. See the comments at the start of maint/GenerateUcd.py for
|
||||
details.
|
||||
|
||||
As well as being part of the PCRE2 library, this file is #included by the
|
||||
pcre2test program, which redefines the PRIV macro to change table names from
|
||||
_pcre2_xxx to xxxx, thereby avoiding name clashes with the library. At present,
|
||||
just one of these tables is actually needed. When compiling the library, some
|
||||
headers are needed. */
|
||||
|
||||
#ifndef PCRE2_PCRE2TEST
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
#include "pcre2_internal.h"
|
||||
#endif /* PCRE2_PCRE2TEST */
|
||||
|
||||
/* The tables herein are needed only when UCP support is built, and in PCRE2
|
||||
that happens automatically with UTF support. This module should not be
|
||||
referenced otherwise, so it should not matter whether it is compiled or not.
|
||||
However a comment was received about space saving - maybe the guy linked all
|
||||
the modules rather than using a library - so we include a condition to cut out
|
||||
the tables when not needed. But don't leave a totally empty module because some
|
||||
compilers barf at that. Instead, just supply some small dummy tables. */
|
||||
|
||||
#ifndef SUPPORT_UNICODE
|
||||
const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0}};
|
||||
const uint16_t PRIV(ucd_stage1)[] = {0};
|
||||
const uint16_t PRIV(ucd_stage2)[] = {0};
|
||||
const uint32_t PRIV(ucd_caseless_sets)[] = {0};
|
||||
#else
|
||||
\n""")
|
||||
|
||||
# --- Output some variable heading stuff ---
|
||||
|
||||
f.write("/* Total size: %d bytes, block size: %d. */\n\n" % (min_size, min_block_size))
|
||||
f.write('const char *PRIV(unicode_version) = "{}";\n\n'.format(unicode_version))
|
||||
|
||||
f.write("""\
|
||||
/* When recompiling tables with a new Unicode version, please check the types
|
||||
in this structure definition with those in pcre2_internal.h (the actual field
|
||||
names will be different).
|
||||
\n""")
|
||||
|
||||
f.write(record_struct)
|
||||
|
||||
f.write("""
|
||||
/* If the 32-bit library is run in non-32-bit mode, character values greater
|
||||
than 0x10ffff may be encountered. For these we set up a special record. */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
const ucd_record PRIV(dummy_ucd_record)[] = {{
|
||||
ucp_Unknown, /* script */
|
||||
ucp_Cn, /* type unassigned */
|
||||
ucp_gbOther, /* grapheme break property */
|
||||
0, /* case set */
|
||||
0, /* other case */
|
||||
0 | (ucp_bidiL << UCD_BIDICLASS_SHIFT), /* script extension and bidi class */
|
||||
0, /* bool properties offset */
|
||||
}};
|
||||
#endif
|
||||
\n""")
|
||||
|
||||
# --- Output the table of caseless character sets ---
|
||||
|
||||
f.write("""\
|
||||
/* This table contains lists of characters that are caseless sets of
|
||||
more than one character. Each list is terminated by NOTACHAR. */
|
||||
|
||||
const uint32_t PRIV(ucd_caseless_sets)[] = {
|
||||
NOTACHAR,
|
||||
""")
|
||||
|
||||
for s in caseless_sets:
|
||||
s = sorted(s)
|
||||
for x in s:
|
||||
f.write(' 0x%04x,' % x)
|
||||
f.write(' NOTACHAR,\n')
|
||||
f.write('};\n\n')
|
||||
|
||||
# --- Other tables are not needed by pcre2test ---
|
||||
|
||||
f.write("""\
|
||||
/* When #included in pcre2test, we don't need the table of digit sets, nor the
|
||||
the large main UCD tables. */
|
||||
|
||||
#ifndef PCRE2_PCRE2TEST
|
||||
\n""")
|
||||
|
||||
# --- Read Scripts.txt again for the sets of 10 digits. ---
|
||||
|
||||
digitsets = []
|
||||
file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8')
|
||||
|
||||
for line in file:
|
||||
m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
|
||||
if m is None:
|
||||
continue
|
||||
first = int(m.group(1),16)
|
||||
last = int(m.group(2),16)
|
||||
if ((last - first + 1) % 10) != 0:
|
||||
f.write("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
|
||||
file=sys.stderr)
|
||||
while first < last:
|
||||
digitsets.append(first + 9)
|
||||
first += 10
|
||||
file.close()
|
||||
digitsets.sort()
|
||||
|
||||
f.write("""\
|
||||
/* This table lists the code points for the '9' characters in each set of
|
||||
decimal digits. It is used to ensure that all the digits in a script run come
|
||||
from the same set. */
|
||||
|
||||
const uint32_t PRIV(ucd_digit_sets)[] = {
|
||||
""")
|
||||
|
||||
f.write(" %d, /* Number of subsequent values */" % len(digitsets))
|
||||
count = 8
|
||||
for d in digitsets:
|
||||
if count == 8:
|
||||
f.write("\n ")
|
||||
count = 0
|
||||
f.write(" 0x%05x," % d)
|
||||
count += 1
|
||||
f.write("\n};\n\n")
|
||||
|
||||
f.write("""\
|
||||
/* This vector is a list of script bitsets for the Script Extension property.
|
||||
The number of 32-bit words in each bitset is #defined in pcre2_ucp.h as
|
||||
ucd_script_sets_item_size. */
|
||||
|
||||
const uint32_t PRIV(ucd_script_sets)[] = {
|
||||
""")
|
||||
write_bitsets(script_lists, script_list_item_size)
|
||||
|
||||
f.write("""\
|
||||
/* This vector is a list of bitsets for Boolean properties. The number of
|
||||
32_bit words in each bitset is #defined as ucd_boolprop_sets_item_size in
|
||||
pcre2_ucp.h. */
|
||||
|
||||
const uint32_t PRIV(ucd_boolprop_sets)[] = {
|
||||
""")
|
||||
write_bitsets(bool_props_lists, bool_props_list_item_size)
|
||||
|
||||
|
||||
# Output the main UCD tables.
|
||||
|
||||
f.write("""\
|
||||
/* These are the main two-stage UCD tables. The fields in each record are:
|
||||
script (8 bits), character type (8 bits), grapheme break property (8 bits),
|
||||
offset to multichar other cases or zero (8 bits), offset to other case or zero
|
||||
(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed
|
||||
into a 16-bit field, and offset in binary properties table (16 bits). */
|
||||
\n""")
|
||||
|
||||
write_records(records, record_size)
|
||||
write_table(min_stage1, 'PRIV(ucd_stage1)')
|
||||
write_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
|
||||
|
||||
f.write("#if UCD_BLOCK_SIZE != %d\n" % min_block_size)
|
||||
f.write("""\
|
||||
#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h
|
||||
#endif
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
#endif /* PCRE2_PCRE2TEST */
|
||||
|
||||
/* End of pcre2_ucd.c */
|
||||
""")
|
||||
|
||||
f.close
|
||||
|
||||
# End
|
98
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/GenerateUcpHeader.py
vendored
Normal file
98
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/GenerateUcpHeader.py
vendored
Normal file
@ -0,0 +1,98 @@
|
||||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
|
||||
# This script generates the pcre2_ucp.h file from Unicode data files. This
|
||||
# header uses enumerations to give names to Unicode property types and script
|
||||
# names.
|
||||
|
||||
# This script was created in December 2021 as part of the Unicode data
|
||||
# generation refactoring.
|
||||
|
||||
|
||||
# Import common data lists and functions
|
||||
|
||||
from GenerateCommon import \
|
||||
bidi_classes, \
|
||||
bool_properties, \
|
||||
bool_props_list_item_size, \
|
||||
break_properties, \
|
||||
category_names, \
|
||||
general_category_names, \
|
||||
script_list_item_size, \
|
||||
script_names, \
|
||||
open_output
|
||||
|
||||
# Open the output file (no return on failure). This call also writes standard
|
||||
# header boilerplate.
|
||||
|
||||
f = open_output("pcre2_ucp.h")
|
||||
|
||||
# Output this file's heading text
|
||||
|
||||
f.write("""\
|
||||
#ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD
|
||||
#define PCRE2_UCP_H_IDEMPOTENT_GUARD
|
||||
|
||||
/* This file contains definitions of the Unicode property values that are
|
||||
returned by the UCD access macros and used throughout PCRE2.
|
||||
|
||||
IMPORTANT: The specific values of the first two enums (general and particular
|
||||
character categories) are assumed by the table called catposstab in the file
|
||||
pcre2_auto_possess.c. They are unlikely to change, but should be checked after
|
||||
an update. */
|
||||
\n""")
|
||||
|
||||
f.write("/* These are the general character categories. */\n\nenum {\n")
|
||||
for i in general_category_names:
|
||||
f.write(" ucp_%s,\n" % i)
|
||||
f.write("};\n\n")
|
||||
|
||||
f.write("/* These are the particular character categories. */\n\nenum {\n")
|
||||
for i in range(0, len(category_names), 2):
|
||||
f.write(" ucp_%s, /* %s */\n" % (category_names[i], category_names[i+1]))
|
||||
f.write("};\n\n")
|
||||
|
||||
f.write("/* These are Boolean properties. */\n\nenum {\n")
|
||||
for i in bool_properties:
|
||||
f.write(" ucp_%s,\n" % i)
|
||||
|
||||
f.write(" /* This must be last */\n")
|
||||
f.write(" ucp_Bprop_Count\n};\n\n")
|
||||
|
||||
f.write("/* Size of entries in ucd_boolprop_sets[] */\n\n")
|
||||
f.write("#define ucd_boolprop_sets_item_size %d\n\n" % bool_props_list_item_size)
|
||||
|
||||
f.write("/* These are the bidi class values. */\n\nenum {\n")
|
||||
for i in range(0, len(bidi_classes), 2):
|
||||
sp = ' ' * (4 - len(bidi_classes[i]))
|
||||
f.write(" ucp_bidi%s,%s /* %s */\n" % (bidi_classes[i], sp, bidi_classes[i+1]))
|
||||
f.write("};\n\n")
|
||||
|
||||
f.write("/* These are grapheme break properties. The Extended Pictographic "
|
||||
"property\ncomes from the emoji-data.txt file. */\n\nenum {\n")
|
||||
for i in range(0, len(break_properties), 2):
|
||||
sp = ' ' * (21 - len(break_properties[i]))
|
||||
f.write(" ucp_gb%s,%s /* %s */\n" % (break_properties[i], sp, break_properties[i+1]))
|
||||
f.write("};\n\n")
|
||||
|
||||
f.write("/* These are the script identifications. */\n\nenum {\n /* Scripts which has characters in other scripts. */\n")
|
||||
for i in script_names:
|
||||
if i == "Unknown":
|
||||
f.write("\n /* Scripts which has no characters in other scripts. */\n")
|
||||
f.write(" ucp_%s,\n" % i)
|
||||
f.write("\n")
|
||||
|
||||
f.write(" /* This must be last */\n")
|
||||
f.write(" ucp_Script_Count\n};\n\n")
|
||||
|
||||
f.write("/* Size of entries in ucd_script_sets[] */\n\n")
|
||||
f.write("#define ucd_script_sets_item_size %d\n\n" % script_list_item_size)
|
||||
|
||||
f.write("#endif /* PCRE2_UCP_H_IDEMPOTENT_GUARD */\n\n")
|
||||
f.write("/* End of pcre2_ucp.h */\n")
|
||||
|
||||
f.close()
|
||||
|
||||
# End
|
203
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/GenerateUcpTables.py
vendored
Normal file
203
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/GenerateUcpTables.py
vendored
Normal file
@ -0,0 +1,203 @@
|
||||
#! /usr/bin/python
|
||||
|
||||
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||
# ------------------------------
|
||||
|
||||
# This script generates the pcre2_ucptables.c file, which contains tables for
|
||||
# recognizing Unicode property names. It is #included by pcre2_tables.c. In
|
||||
# order to reduce the number of relocations when loading the PCRE2 library, the
|
||||
# names are held as a single large string, with offsets in the table. This is
|
||||
# tedious to maintain by hand. Therefore, a script is used to generate the
|
||||
# table.
|
||||
|
||||
# This script was created in December 2021 based on the previous GenerateUtt
|
||||
# script, whose output had to be manually edited into pcre2_tables.c. Here is
|
||||
# the history of the original script:
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Modified by PH 17-March-2009 to generate the more verbose form that works
|
||||
# for UTF-support in EBCDIC as well as ASCII environments.
|
||||
# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
|
||||
# Modified by PH 04-May-2010 to add new "X.." special categories.
|
||||
# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
|
||||
# Modified by ChPe 30-September-2012 to add this note; no other changes were
|
||||
# necessary for Unicode 6.2.0 support.
|
||||
# Modfied by PH 26-February-2013 to add the Xuc special category.
|
||||
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
|
||||
# Script updated to Python 3 by running it through the 2to3 converter.
|
||||
# Added script names for Unicode 7.0.0, 20-June-2014.
|
||||
# Added script names for Unicode 8.0.0, 19-June-2015.
|
||||
# Added script names for Unicode 10.0.0, 02-July-2017.
|
||||
# Added script names for Unicode 11.0.0, 03-July-2018.
|
||||
# Added 'Unknown' script, 01-October-2018.
|
||||
# Added script names for Unicode 12.1.0, 27-July-2019.
|
||||
# Added script names for Unicode 13.0.0, 10-March-2020.
|
||||
# Added Script names for Unicode 14.0.0, PCRE2-10.39
|
||||
# Added support for bidi class and bidi control, 06-December-2021
|
||||
# This also involved lower casing strings and removing underscores, in
|
||||
# accordance with Unicode's "loose matching" rules, which Perl observes.
|
||||
# Changed default script type from PT_SC to PT_SCX, 18-December-2021
|
||||
# -----------------------------------------------------------------------------
|
||||
#
|
||||
# Note subsequent changes here:
|
||||
#
|
||||
# 27-December-2021: Added support for 4-letter script abbreviations.
|
||||
# 10-January-2022: Further updates for Boolean property support
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Import common data lists and functions
|
||||
|
||||
from GenerateCommon import \
|
||||
abbreviations, \
|
||||
bool_properties, \
|
||||
bidi_classes, \
|
||||
category_names, \
|
||||
general_category_names, \
|
||||
script_names, \
|
||||
open_output
|
||||
|
||||
# Open the output file (no return on failure). This call also writes standard
|
||||
# header boilerplate.
|
||||
|
||||
f = open_output("pcre2_ucptables.c")
|
||||
|
||||
# The list in bidi_classes contains just the Unicode classes such as AN, LRE,
|
||||
# etc., along with comments. We need to add "bidi" in front of each value, in
|
||||
# order to create names that don't clash with other types of property.
|
||||
|
||||
bidi_class_names = []
|
||||
for i in range(0, len(bidi_classes), 2):
|
||||
bidi_class_names.append("bidi" + bidi_classes[i])
|
||||
|
||||
# Remove the comments from other lists that contain them.
|
||||
|
||||
category_names = category_names[::2]
|
||||
|
||||
# Create standardized versions of the names by lowercasing and removing
|
||||
# underscores.
|
||||
|
||||
def stdname(x):
|
||||
return x.lower().replace('_', '')
|
||||
|
||||
def stdnames(x):
|
||||
y = [''] * len(x)
|
||||
for i in range(len(x)):
|
||||
y[i] = stdname(x[i])
|
||||
return y
|
||||
|
||||
std_category_names = stdnames(category_names)
|
||||
std_general_category_names = stdnames(general_category_names)
|
||||
std_bidi_class_names = stdnames(bidi_class_names)
|
||||
std_bool_properties = stdnames(bool_properties)
|
||||
|
||||
# Create the table, starting with the Unicode script, category and bidi class
|
||||
# names. We keep both the standardized name and the original, because the
|
||||
# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
|
||||
# still use the full original names.
|
||||
|
||||
utt_table = []
|
||||
|
||||
scx_end = script_names.index('Unknown')
|
||||
|
||||
for idx, name in enumerate(script_names):
|
||||
pt_type = 'PT_SCX' if idx < scx_end else 'PT_SC'
|
||||
utt_table.append((stdname(name), name, pt_type))
|
||||
for abbrev in abbreviations[name]:
|
||||
utt_table.append((stdname(abbrev), name, pt_type))
|
||||
|
||||
# Add the remaining property lists
|
||||
|
||||
utt_table += list(zip(std_category_names, category_names, ['PT_PC'] * len(category_names)))
|
||||
utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names)))
|
||||
utt_table += list(zip(std_bidi_class_names, bidi_class_names, ['PT_BIDICL'] * len(bidi_class_names)))
|
||||
|
||||
for name in bool_properties:
|
||||
utt_table.append((stdname(name), name, 'PT_BOOL'))
|
||||
if name in abbreviations:
|
||||
for abbrev in abbreviations[name]:
|
||||
utt_table.append((stdname(abbrev), name, 'PT_BOOL'))
|
||||
|
||||
# Now add specials and synonyms. Note both the standardized and capitalized
|
||||
# forms are needed.
|
||||
|
||||
utt_table.append(('any', 'Any', 'PT_ANY'))
|
||||
utt_table.append(('l&', 'L&', 'PT_LAMP'))
|
||||
utt_table.append(('lc', 'LC', 'PT_LAMP'))
|
||||
utt_table.append(('xan', 'Xan', 'PT_ALNUM'))
|
||||
utt_table.append(('xps', 'Xps', 'PT_PXSPACE'))
|
||||
utt_table.append(('xsp', 'Xsp', 'PT_SPACE'))
|
||||
utt_table.append(('xuc', 'Xuc', 'PT_UCNC'))
|
||||
utt_table.append(('xwd', 'Xwd', 'PT_WORD'))
|
||||
|
||||
# Remove duplicates from the table and then sort it.
|
||||
|
||||
utt_table = list(set(utt_table))
|
||||
utt_table.sort()
|
||||
|
||||
# Output file-specific heading
|
||||
|
||||
f.write("""\
|
||||
#ifdef SUPPORT_UNICODE
|
||||
|
||||
/* The PRIV(utt)[] table below translates Unicode property names into type and
|
||||
code values. It is searched by binary chop, so must be in collating sequence of
|
||||
name. Originally, the table contained pointers to the name strings in the first
|
||||
field of each entry. However, that leads to a large number of relocations when
|
||||
a shared library is dynamically loaded. A significant reduction is made by
|
||||
putting all the names into a single, large string and using offsets instead.
|
||||
All letters are lower cased, and underscores are removed, in accordance with
|
||||
the "loose matching" rules that Unicode advises and Perl uses. */
|
||||
\n""")
|
||||
|
||||
# We have to use STR_ macros to define the strings so that it all works in
|
||||
# UTF-8 mode on EBCDIC platforms.
|
||||
|
||||
for utt in utt_table:
|
||||
f.write('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')))
|
||||
for c in utt[0]:
|
||||
if c == '&':
|
||||
f.write(' STR_AMPERSAND')
|
||||
else:
|
||||
f.write(' STR_%s' % c);
|
||||
f.write(' "\\0"\n')
|
||||
|
||||
# Output the long string of concatenated names
|
||||
|
||||
f.write('\nconst char PRIV(utt_names)[] =\n');
|
||||
last = ''
|
||||
for utt in utt_table:
|
||||
if utt == utt_table[-1]:
|
||||
last = ';'
|
||||
f.write(' STRING_%s0%s\n' % (utt[0].replace('&', '_AMPERSAND'), last))
|
||||
|
||||
# Output the property type table
|
||||
|
||||
f.write('\nconst ucp_type_table PRIV(utt)[] = {\n')
|
||||
offset = 0
|
||||
last = ','
|
||||
for utt in utt_table:
|
||||
if utt[2] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
|
||||
'PT_SPACE', 'PT_UCNC', 'PT_WORD'):
|
||||
value = '0'
|
||||
else:
|
||||
value = 'ucp_' + utt[1]
|
||||
if utt == utt_table[-1]:
|
||||
last = ''
|
||||
f.write(' { %3d, %s, %s }%s\n' % (offset, utt[2], value, last))
|
||||
offset += len(utt[0]) + 1
|
||||
f.write('};\n\n')
|
||||
|
||||
# Ending text
|
||||
|
||||
f.write("""\
|
||||
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
|
||||
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
/* End of pcre2_ucptables.c */
|
||||
""")
|
||||
|
||||
f.close
|
||||
|
||||
# End
|
453
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/ManyConfigTests
vendored
Normal file
453
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/ManyConfigTests
vendored
Normal file
@ -0,0 +1,453 @@
|
||||
#! /bin/sh
|
||||
|
||||
# This is a script for the use of PCRE2 maintainers. It configures and rebuilds
|
||||
# PCRE2 with a variety of configuration options, and in each case runs the
|
||||
# tests to ensure that all goes well. Every possible combination would take far
|
||||
# too long, so we use a representative sample. This script should be run in the
|
||||
# PCRE2 source directory.
|
||||
|
||||
# While debugging, it is sometimes useful to be able to cut out some of the
|
||||
# tests, in order to run those that are giving errors. The following options
|
||||
# do this:
|
||||
#
|
||||
# -noasan skip the test that uses -fsanitize=address
|
||||
# -nousan skip the test that uses -fsanitize=undefined
|
||||
# -nodebug skip the test that uses --enable-debug
|
||||
# -nojit skip all JIT tests
|
||||
# -nojitmain skip non-valgrind JIT tests
|
||||
# -nojitvalgrind skip JIT tests with valgrind
|
||||
# -nomain skip all the main (non-JIT) set of tests
|
||||
# -nomainvalgrind skip the main (non-JIT) valgrind tests
|
||||
# -notmp skip the test in a temporary directory
|
||||
# -novalgrind skip all the valgrind tests
|
||||
|
||||
# Alternatively, if any of those names are given with '+' instead of '-no',
|
||||
# only those groups named with '+' are run (e.g. +jit). If -dummy is given,
|
||||
# no tests are actually run - this provides a means of testing the selectors.
|
||||
|
||||
# The -v option causes a call to 'pcre2test -C' to happen for each
|
||||
# configuration.
|
||||
|
||||
useasan=1
|
||||
useusan=1
|
||||
usedebug=1
|
||||
usejit=1
|
||||
usejitvalgrind=1
|
||||
usemain=1
|
||||
usemainvalgrind=1
|
||||
usetmp=1
|
||||
usevalgrind=1
|
||||
|
||||
dummy=0
|
||||
seenplus=0
|
||||
verbose=0
|
||||
|
||||
while [ $# -gt 0 ] ; do
|
||||
case $1 in
|
||||
+*) if [ $seenplus -eq 0 ]; then
|
||||
useasan=0
|
||||
useusan=0
|
||||
usedebug=0
|
||||
usejit=0
|
||||
usejitvalgrind=0
|
||||
usemain=0
|
||||
usemainvalgrind=0
|
||||
usetmp=0
|
||||
seenplus=1
|
||||
fi;;
|
||||
esac
|
||||
|
||||
case $1 in
|
||||
-dummy) dummy=1;;
|
||||
-v) verbose=1;;
|
||||
-noasan) useasan=0;;
|
||||
-nousan) useusan=0;;
|
||||
-nodebug) usedebug=0;;
|
||||
-nojit) usejit=0; usejitvalgrind=0;;
|
||||
-nojitmain) usejit=0;;
|
||||
-nojitvalgrind) usejitvalgrind=0;;
|
||||
-nomain) usemain=0; usemainvalgrind=0;;
|
||||
-nomainvalgrind) usemainvalgrind=0;;
|
||||
-notmp) usetmp=0;;
|
||||
-novalgrind) usevalgrind=0;;
|
||||
+asan) useasan=1;;
|
||||
+usan) useusan=1;;
|
||||
+debug) usedebug=1;;
|
||||
+jit) usejit=1; usejitvalgrind=1;;
|
||||
+jitmain) usejit=1;;
|
||||
+jitvalgrind) usejitvalgrind=1;;
|
||||
+main) usemain=1; usemainvalgrind=1;;
|
||||
+mainvalgrind) usemainvalgrind=1;;
|
||||
+tmp) usetmp=1;;
|
||||
+valgrind) usevalgrind=1; usejitvalgrind=1; usemainvalgrind=1;;
|
||||
*) echo "Unknown option '$1'"; exit 1;;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
if [ $usejitvalgrind -eq 0 -a $usemainvalgrind -eq 0 ] ; then
|
||||
usevalgrind=0
|
||||
fi
|
||||
|
||||
# This is in case the caller has set aliases (as I do - PH)
|
||||
|
||||
unset cp ls mv rm
|
||||
|
||||
# This is a temporary directory for testing out-of-line builds
|
||||
|
||||
tmp=/tmp/pcre2testing
|
||||
|
||||
# Don't bother with compiler optimization for most tests; it just slows down
|
||||
# compilation a lot (and running the tests themselves is quick). However, one
|
||||
# special test turns optimization on, because it can provoke some compiler
|
||||
# warnings.
|
||||
|
||||
CFLAGS="-g"
|
||||
OFLAGS="-O0"
|
||||
ISGCC=0
|
||||
|
||||
# If the compiler is gcc, add a lot of warning switches.
|
||||
|
||||
cc --version >/tmp/pcre2ccversion 2>/dev/null
|
||||
if [ $? -eq 0 ] && grep GCC /tmp/pcre2ccversion >/dev/null; then
|
||||
ISGCC=1
|
||||
CFLAGS="$CFLAGS -Wall"
|
||||
CFLAGS="$CFLAGS -Wno-overlength-strings"
|
||||
CFLAGS="$CFLAGS -Wpointer-arith"
|
||||
CFLAGS="$CFLAGS -Wwrite-strings"
|
||||
CFLAGS="$CFLAGS -Wundef -Wshadow"
|
||||
CFLAGS="$CFLAGS -Wmissing-field-initializers"
|
||||
CFLAGS="$CFLAGS -Wunused-parameter"
|
||||
CFLAGS="$CFLAGS -Wextra -Wformat"
|
||||
CFLAGS="$CFLAGS -Wbad-function-cast"
|
||||
CFLAGS="$CFLAGS -Wmissing-declarations"
|
||||
CFLAGS="$CFLAGS -Wnested-externs"
|
||||
CFLAGS="$CFLAGS -pedantic"
|
||||
CFLAGS="$CFLAGS -Wuninitialized"
|
||||
CFLAGS="$CFLAGS -Wmaybe-uninitialized"
|
||||
CFLAGS="$CFLAGS -Wmissing-prototypes"
|
||||
CFLAGS="$CFLAGS -Wstrict-prototypes"
|
||||
fi
|
||||
rm -f /tmp/pcre2ccversion
|
||||
|
||||
# This function runs a single test with the set of configuration options that
|
||||
# are in $opts. The source directory must be set in srcdir. The function must
|
||||
# be defined as "runtest()" not "function runtest()" in order to run on
|
||||
# Solaris.
|
||||
|
||||
runtest()
|
||||
{
|
||||
rm -f $srcdir/pcre2test $srcdir/pcre2grep $srcdir/pcre2_jit_test
|
||||
testcount=`expr $testcount + 1`
|
||||
|
||||
if [ "$opts" = "" ] ; then
|
||||
echo "[$testcount/$testtotal] Configuring with: default settings"
|
||||
else
|
||||
echo "[$testcount/$testtotal] Configuring with:"
|
||||
echo " $opts"
|
||||
fi
|
||||
|
||||
if [ $dummy -eq 1 ]; then return; fi
|
||||
|
||||
CFLAGS="$CFLAGS" \
|
||||
$srcdir/configure $opts >/dev/null 2>teststderrM
|
||||
if [ $? -ne 0 ]; then
|
||||
echo " "
|
||||
echo "******** Error while configuring ********"
|
||||
cat teststderrM
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# There is an infelicity in the Autotools world (as of October 2015) which
|
||||
# causes the message
|
||||
#
|
||||
# ar: `u' modifier ignored since `D' is the default (see `U')
|
||||
#
|
||||
# to be output while linking. This triggers an unwanted error report from this
|
||||
# script, because it expects no stderr output while making. To get round this
|
||||
# we filter the stderr output through sed, removing all occurrences of the
|
||||
# above lines. Just for paranoia, check that sed is available before doing
|
||||
# this.
|
||||
|
||||
echo "Making"
|
||||
make -j >/dev/null 2>teststderrM
|
||||
makeRC=$?
|
||||
if command -v sed >/dev/null 2>&1 ; then
|
||||
sed "/\`u' modifier ignored since \`D' is the default/ d" \
|
||||
teststderrM > teststderrMM
|
||||
mv -f teststderrMM teststderrM
|
||||
fi
|
||||
if [ $makeRC -ne 0 -o -s teststderrM ]; then
|
||||
echo " "
|
||||
echo "******** Errors or warnings while making ********"
|
||||
echo " "
|
||||
cat teststderrM
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ $verbose -eq 1 ]; then
|
||||
./pcre2test -C
|
||||
fi
|
||||
|
||||
./pcre2test -C jit >/dev/null
|
||||
jit=$?
|
||||
./pcre2test -C pcre2-8 >/dev/null
|
||||
pcre2_8=$?
|
||||
|
||||
echo "Running PCRE2 library tests $withvalgrind"
|
||||
$srcdir/RunTest $valgrind >teststdoutM 2>teststderrM
|
||||
|
||||
if [ $? -ne 0 -o -s teststderrM ]; then
|
||||
echo " "
|
||||
echo "**** Test failed ****"
|
||||
if [ -s teststderrM ] ; then
|
||||
cat teststderrM
|
||||
else
|
||||
cat teststdoutM
|
||||
fi
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ $pcre2_8 -gt 0 ]; then
|
||||
echo "Running pcre2grep tests $withvalgrind"
|
||||
$srcdir/RunGrepTest $valgrind >teststdoutM 2>teststderrM
|
||||
if [ $? -ne 0 -o -s teststderrM ]; then
|
||||
echo " "
|
||||
echo "**** Test failed ****"
|
||||
cat teststderrM
|
||||
cat teststdoutM
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
echo "Skipping pcre2grep tests: 8-bit library not compiled"
|
||||
fi
|
||||
|
||||
if [ "$jit" -gt 0 ]; then
|
||||
echo "Running JIT regression tests $withvalgrind"
|
||||
$jrvalgrind $srcdir/pcre2_jit_test >teststdoutM 2>teststderrM
|
||||
if [ $? -ne 0 -o -s teststderrM ]; then
|
||||
echo " "
|
||||
echo "**** Test failed ****"
|
||||
cat teststderrM
|
||||
cat teststdoutM
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
echo "Skipping JIT regression tests: JIT is not enabled"
|
||||
fi
|
||||
}
|
||||
|
||||
# Update the total count whenever a new test is added; it is used to show
|
||||
# progess as each test is run.
|
||||
|
||||
testtotal=`expr 17 \* $usemain + \
|
||||
1 \* $usemain \* $usedebug + \
|
||||
1 \* $usetmp + \
|
||||
1 \* $ISGCC \* $usemain + \
|
||||
1 \* $ISGCC \* $usemain \* $useasan + \
|
||||
1 \* $ISGCC \* $usemain \* $useusan + \
|
||||
13 \* $usejit + \
|
||||
2 \* $usemainvalgrind + \
|
||||
2 \* $usejitvalgrind`
|
||||
|
||||
testcount=0
|
||||
|
||||
if [ $testtotal -eq 0 ] ; then
|
||||
echo "** No tests selected"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
valgrind=
|
||||
jrvalgrind=
|
||||
withvalgrind=
|
||||
srcdir=.
|
||||
export srcdir
|
||||
|
||||
if [ $usejit -ne 0 ]; then
|
||||
enable_jit=--enable-jit
|
||||
else
|
||||
enable_jit=
|
||||
fi
|
||||
|
||||
# If gcc is in use, run a maximally configured test with -O2, because that can
|
||||
# throw up warnings that are not detected with -O0. Then run a second test with
|
||||
# -fsanitize=address, which also may throw up new warnings as well as checking
|
||||
# things at runtime. Finally, run another test using -fsanitize=undefined
|
||||
# -std-gnu99 to check for runtime actions that are not well defined. However,
|
||||
# we also use -fno-sanitize=shift to avoid warnings for shifts of negative
|
||||
# numbers, which occur in src/pcre2_jit_compile.c.
|
||||
|
||||
if [ $ISGCC -ne 0 -a $usemain -ne 0 ]; then
|
||||
echo "---------- Maximally configured test with -O2 ----------"
|
||||
SAVECFLAGS="$CFLAGS"
|
||||
CFLAGS="-O2 $CFLAGS"
|
||||
echo "CFLAGS=$CFLAGS"
|
||||
opts="--disable-shared $enable_jit --enable-pcre2-16 --enable-pcre2-32"
|
||||
runtest
|
||||
if [ $useasan -ne 0 ]; then
|
||||
echo "---------- Maximally configured test with -fsanitize=address ----------"
|
||||
# Following a kernel change, sanitize address doesn't work unless the extra
|
||||
# PIE options are also set.
|
||||
CFLAGS="$OFLAGS $SAVECFLAGS -no-pie -fno-PIE -fsanitize=address"
|
||||
echo "CFLAGS=$CFLAGS"
|
||||
opts="--disable-shared $enable_jit --enable-pcre2-16 --enable-pcre2-32"
|
||||
runtest
|
||||
fi
|
||||
# This also seems to be the case for sanitize undefined.
|
||||
if [ $useusan -ne 0 ]; then
|
||||
echo "------- Maximally configured test with -fsanitize=undefined -fno-sanitize=shift -fno-sanitize=alignment -std=gnu99 -------"
|
||||
CFLAGS="$OFLAGS $SAVECFLAGS -no-pie -fno-PIE -fsanitize=undefined -fno-sanitize=shift -fno-sanitize=alignment -std=gnu99"
|
||||
echo "CFLAGS=$CFLAGS"
|
||||
opts="--disable-shared $enable_jit --enable-pcre2-16 --enable-pcre2-32"
|
||||
runtest
|
||||
fi
|
||||
CFLAGS="$OFLAGS $SAVECFLAGS"
|
||||
fi
|
||||
|
||||
# This set of tests builds PCRE2 and runs the tests with a variety of configure
|
||||
# options, in the current (source) directory. The empty configuration builds
|
||||
# with all the default settings. As well as testing that these options work, we
|
||||
# use --disable-shared or --disable-static except for the default test (which
|
||||
# builds both) to save a bit of time by building only one version of the
|
||||
# library for the subsequent tests.
|
||||
|
||||
echo "---------- CFLAGS for the remaining tests ----------"
|
||||
echo "CFLAGS=$CFLAGS"
|
||||
|
||||
if [ $usemain -ne 0 ]; then
|
||||
if [ $usedebug -ne 0 ]; then
|
||||
echo "---------- Maximally configured test with --enable-debug ----------"
|
||||
opts="--disable-shared $enable_jit --enable-pcre2-16 --enable-pcre2-32 --enable-debug"
|
||||
runtest
|
||||
fi
|
||||
|
||||
echo "---------- Non-JIT tests in the current directory ----------"
|
||||
for opts in \
|
||||
"" \
|
||||
"--disable-static" \
|
||||
"--disable-shared" \
|
||||
"--disable-unicode --disable-shared --enable-never-backslash-C" \
|
||||
"--with-link-size=3 --disable-shared --disable-pcre2grep-callout" \
|
||||
"--disable-unicode --enable-rebuild-chartables --disable-shared" \
|
||||
"--disable-unicode --enable-newline-is-any --disable-shared" \
|
||||
"--disable-unicode --enable-newline-is-cr --disable-shared" \
|
||||
"--disable-unicode --enable-newline-is-crlf --disable-shared" \
|
||||
"--disable-unicode --enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared" \
|
||||
"--enable-newline-is-any --disable-static" \
|
||||
"--disable-unicode --enable-pcre2-16" \
|
||||
"--enable-pcre2-16 --disable-shared" \
|
||||
"--disable-unicode --enable-pcre2-32" \
|
||||
"--enable-pcre2-32 --disable-shared" \
|
||||
"--disable-unicode --enable-pcre2-32 --enable-pcre2-16 --disable-shared" \
|
||||
"--disable-unicode --enable-pcre2-32 --enable-pcre2-16 --disable-pcre2-8 --disable-shared"
|
||||
do
|
||||
runtest
|
||||
done
|
||||
fi
|
||||
|
||||
# Now run the JIT tests unless disabled
|
||||
|
||||
if [ $usejit -ne 0 ]; then
|
||||
echo "---------- JIT tests in the current directory ----------"
|
||||
for opts in \
|
||||
"--disable-unicode --enable-jit --disable-shared" \
|
||||
"--enable-jit --disable-shared" \
|
||||
"--enable-jit --with-link-size=3 --disable-shared" \
|
||||
"--enable-jit --enable-pcre2-16 --disable-shared" \
|
||||
"--disable-unicode --enable-jit --enable-pcre2-16 --disable-pcre2-8 --disable-shared" \
|
||||
"--enable-jit --enable-pcre2-16 --disable-pcre2-8 --disable-shared" \
|
||||
"--enable-jit --enable-pcre2-16 --with-link-size=3 --disable-shared" \
|
||||
"--enable-jit --enable-pcre2-16 --with-link-size=4 --disable-shared" \
|
||||
"--enable-jit --enable-pcre2-32 --disable-shared" \
|
||||
"--disable-unicode --enable-jit --enable-pcre2-32 --disable-pcre2-8 --disable-shared" \
|
||||
"--enable-jit --enable-pcre2-32 --disable-pcre2-8 --disable-shared" \
|
||||
"--enable-jit --enable-pcre2-32 --with-link-size=4 --disable-shared" \
|
||||
"--enable-jit --enable-pcre2-32 --enable-pcre2-16 --disable-pcre2-8 --enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared"
|
||||
do
|
||||
runtest
|
||||
done
|
||||
fi
|
||||
|
||||
# Now re-run some of the tests under valgrind.
|
||||
|
||||
if [ $usevalgrind -ne 0 ]; then
|
||||
echo "---------- Tests in the current directory using valgrind ----------"
|
||||
valgrind=valgrind
|
||||
withvalgrind="with valgrind"
|
||||
|
||||
if [ $usemainvalgrind -ne 0 ]; then
|
||||
for opts in \
|
||||
"--disable-shared" \
|
||||
"--with-link-size=3 --enable-pcre2-16 --enable-pcre2-32 --disable-shared"
|
||||
do
|
||||
opts="--enable-valgrind $opts"
|
||||
runtest
|
||||
done
|
||||
fi
|
||||
|
||||
if [ $usejitvalgrind -ne 0 ]; then
|
||||
jrvalgrind="valgrind --tool=memcheck -q --smc-check=all-non-file --suppressions=$srcdir/testdata/valgrind-jit.supp"
|
||||
for opts in \
|
||||
"--enable-jit --disable-shared" \
|
||||
"--enable-jit --enable-pcre2-16 --enable-pcre2-32"
|
||||
do
|
||||
opts="--enable-valgrind $opts"
|
||||
runtest
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
valgrind=
|
||||
jrvalgrind=
|
||||
withvalgrind=
|
||||
|
||||
# Clean up the distribution and then do at least one build and test in a
|
||||
# directory other than the source directory. It doesn't work unless the
|
||||
# source directory is cleaned up first.
|
||||
|
||||
if [ -f Makefile ]; then
|
||||
echo "Running 'make distclean'"
|
||||
make distclean >/dev/null 2>&1
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "** 'make distclean' failed"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "---------- End of tests in the source directory ----------"
|
||||
echo "Removing teststdoutM and teststderrM"
|
||||
rm -rf teststdoutM teststderrM
|
||||
|
||||
if [ $usetmp -ne 0 ]; then
|
||||
echo "---------- Tests in the $tmp directory ----------"
|
||||
srcdir=`pwd`
|
||||
export srcdir
|
||||
|
||||
if [ ! -e $tmp ]; then
|
||||
mkdir $tmp
|
||||
fi
|
||||
|
||||
if [ ! -d $tmp ]; then
|
||||
echo "** Failed to create $tmp or it is not a directory"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cd $tmp
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "** Failed to cd to $tmp"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
for opts in \
|
||||
"--disable-shared"
|
||||
do
|
||||
runtest
|
||||
done
|
||||
|
||||
echo "Removing $tmp"
|
||||
rm -rf $tmp
|
||||
fi
|
||||
|
||||
echo "---------- All done ----------"
|
||||
|
||||
# End
|
460
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/README
vendored
Normal file
460
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/README
vendored
Normal file
@ -0,0 +1,460 @@
|
||||
MAINTENANCE README FOR PCRE2
|
||||
============================
|
||||
|
||||
The files in the "maint" directory of the PCRE2 source contain data, scripts,
|
||||
and programs that are used for the maintenance of PCRE2, but which do not form
|
||||
part of the PCRE2 distribution tarballs. This document describes these files
|
||||
and also contains some notes for maintainers. Its contents are:
|
||||
|
||||
Files in the maint directory
|
||||
Updating to a new Unicode release
|
||||
Preparing for a PCRE2 release
|
||||
Making a PCRE2 release
|
||||
Long-term ideas (wish list)
|
||||
|
||||
|
||||
Files in the maint directory
|
||||
============================
|
||||
|
||||
GenerateCommon.py
|
||||
A Python module containing data and functions that are used by the other
|
||||
Generate scripts.
|
||||
|
||||
GenerateTest26.py
|
||||
A Python script that generates input and expected output test data for test
|
||||
26, which tests certain aspects of Unicode property support.
|
||||
|
||||
GenerateUcd.py
|
||||
A Python script that generates the file pcre2_ucd.c from GenerateCommon.py
|
||||
and Unicode data files, which are themselves downloaded from the Unicode web
|
||||
site. The generated file contains the tables for a 2-stage lookup of Unicode
|
||||
properties, along with some auxiliary tables. The script starts with a long
|
||||
comment that gives details of the tables it constructs.
|
||||
|
||||
GenerateUcpHeader.py
|
||||
A Python script that generates the file pcre2_ucp.h from GenerateCommon.py
|
||||
and Unicode data files. The generated file defines constants for various
|
||||
Unicode property values.
|
||||
|
||||
GenerateUcpTables.py
|
||||
A Python script that generates the file pcre2_ucptables.c from
|
||||
GenerateCommon.py and Unicode data files. The generated file contains tables
|
||||
for looking up Unicode property names.
|
||||
|
||||
ManyConfigTests
|
||||
A shell script that runs "configure, make, test" a number of times with
|
||||
different configuration settings.
|
||||
|
||||
pcre2_chartables.c.non-standard
|
||||
This is a set of character tables that came from a Windows system. It has
|
||||
characters greater than 128 that are set as spaces, amongst other things. I
|
||||
kept it so that it can be used for testing from time to time.
|
||||
|
||||
README
|
||||
This file.
|
||||
|
||||
Unicode.tables
|
||||
The files in this directory were downloaded from the Unicode web site. They
|
||||
contain information about Unicode characters and scripts, and are used by the
|
||||
Generate scripts. There is also UnicodeData.txt, which is no longer used by
|
||||
any script, because it is useful occasionally for manually looking up the
|
||||
details of certain characters. However, note that character names in this
|
||||
file such as "Arabic sign sanah" do NOT mean that the character is in a
|
||||
particular script (in this case, Arabic). Scripts.txt and
|
||||
ScriptExtensions.txt are where to look for script information.
|
||||
|
||||
ucptest.c
|
||||
A program for testing the Unicode property macros that do lookups in the
|
||||
pcre2_ucd.c data, mainly useful after rebuilding the Unicode property tables.
|
||||
Compile and run this in the "maint" directory (see comments at its head).
|
||||
This program can also be used to find characters with specific properties and
|
||||
to list which properties are supported.
|
||||
|
||||
ucptestdata
|
||||
A directory containing four files, testinput{1,2} and testoutput{1,2}, for
|
||||
use in conjunction with the ucptest program.
|
||||
|
||||
utf8.c
|
||||
A short, freestanding C program for converting a Unicode code point into a
|
||||
sequence of bytes in the UTF-8 encoding, and vice versa. If its argument is a
|
||||
hex number such as 0x1234, it outputs a list of the equivalent UTF-8 bytes.
|
||||
If its argument is a sequence of concatenated UTF-8 bytes (e.g. 12e188b4) it
|
||||
treats them as a UTF-8 string and outputs the equivalent code points in hex.
|
||||
See comments at its head for details.
|
||||
|
||||
|
||||
Updating to a new Unicode release
|
||||
=================================
|
||||
|
||||
When there is a new release of Unicode, the files in Unicode.tables must be
|
||||
refreshed from the web site. Once that is done, the four Python scripts that
|
||||
generate files from the Unicode data can be run from within the "maint"
|
||||
directory.
|
||||
|
||||
Note: Previously, it was necessary to update lists of scripts and their
|
||||
abbreviations by hand before running the Python scripts. This is no longer
|
||||
necessary because the scripts have been upgraded to extract this information
|
||||
themselves. Also, there used to be explicit lists of scripts in two of the man
|
||||
pages. This is no longer the case; the pcre2test program can now output a list
|
||||
of supported scripts.
|
||||
|
||||
You can give an output file name as an argument to the following scripts, but
|
||||
by default:
|
||||
|
||||
GenerateUcd.py creates pcre2_ucd.c )
|
||||
GenerateUcpHeader.py creates pcre2_ucp.h ) in the current directory
|
||||
GenerateUcpTables.py creates pcre2_ucptables.c )
|
||||
|
||||
These files can be compared against the existing versions in the src directory
|
||||
to check on any changes before replacing the old files, but you can also
|
||||
generate directly into the final location by running:
|
||||
|
||||
./GenerateUcd.py ../src/pcre2_ucd.c
|
||||
./GenerateUcpHeader.py ../src/pcre2_ucp.h
|
||||
./GenerateUcpTables.py ../src/pcre2_ucptables.c
|
||||
|
||||
Once the .c and .h files are in the ../src directory, the ucptest program can
|
||||
be compiled and used to check that the new tables work properly. The data files
|
||||
in ucptestdata are set up to check a number of test characters. See the
|
||||
comments at the start of ucptest.c. If there are new scripts, adding a few
|
||||
tests to the files in ucptestdata is a good idea.
|
||||
|
||||
Finally, you should run the GenerateTest26.py script to regenerate new versions
|
||||
of the input and expected output from a series of Unicode property tests that
|
||||
are automatically generated from the Unicode data files. By default, the files
|
||||
are written to testinput26 and testoutput26 in the current directory, but you
|
||||
can give an alternative directory name as an argument to the script. These
|
||||
files should eventually be installed in the main testdata directory.
|
||||
|
||||
|
||||
Preparing for a PCRE2 release
|
||||
=============================
|
||||
|
||||
This section contains a checklist of things that I do before building a new
|
||||
release.
|
||||
|
||||
. Ensure that the version number and version date are correct in configure.ac.
|
||||
|
||||
. Update the library version numbers in configure.ac according to the rules
|
||||
given below.
|
||||
|
||||
. If new build options or new source files have been added, ensure that they
|
||||
are added to the CMake files as well as to the autoconf files. The relevant
|
||||
files are CMakeLists.txt and config-cmake.h.in. After making a release, test
|
||||
it out with CMake if there have been changes here.
|
||||
|
||||
. Run ./autogen.sh to ensure everything is up-to-date.
|
||||
|
||||
. Compile and test with many different config options, and combinations of
|
||||
options. Also, test with valgrind by running "RunTest valgrind" and
|
||||
"RunGrepTest valgrind". The script maint/ManyConfigTests now encapsulates
|
||||
this testing. It runs tests with different configurations, and it also runs
|
||||
some of them with valgrind, all of which can take quite some time.
|
||||
|
||||
. Run tests in both 32-bit and 64-bit environments if possible. I can no longer
|
||||
run 32-bit tests.
|
||||
|
||||
. Run tests with two or more different compilers (e.g. clang and gcc), and
|
||||
make use of -fsanitize=address and friends where possible. For gcc,
|
||||
-fsanitize=undefined -std=gnu99 picks up undefined behaviour at runtime, but
|
||||
needs -fno-sanitize=shift to get rid of warnings for shifts of negative
|
||||
numbers in the JIT compiler. For clang, -fsanitize=address,undefined,integer
|
||||
can be used but -fno-sanitize=alignment,shift,unsigned-integer-overflow must
|
||||
be added when compiling with JIT. Another useful clang option is
|
||||
-fsanitize=signed-integer-overflow
|
||||
|
||||
. Do a test build using CMake. Remove src/config.h first, lest it override the
|
||||
version that CMake creates. Also do a CMake unity build to check that it
|
||||
still works: [c]cmake -DCMAKE_UNITY_BUILD=ON sets up a unity build.
|
||||
|
||||
. Run perltest.sh on the test data for tests 1 and 4. The output should match
|
||||
the PCRE2 test output, apart from the version identification at the start of
|
||||
each test. Sometimes there are other differences in test 4 if PCRE2 and Perl
|
||||
are using different Unicode releases. The other tests are not Perl-compatible
|
||||
(they use various PCRE2-specific features or options).
|
||||
|
||||
. It is possible to test with the emulated memmove() function by undefining
|
||||
HAVE_MEMMOVE and HAVE_BCOPY in config.h, though I do not do this often.
|
||||
|
||||
. Documentation: check AUTHORS, ChangeLog (check version and date), LICENCE,
|
||||
NEWS (check version and date), NON-AUTOTOOLS-BUILD, and README. Many of these
|
||||
won't need changing, but over the long term things do change.
|
||||
|
||||
. I used to test new releases myself on a number of different operating
|
||||
systems. For example, on Solaris it is helpful to test using Sun's cc
|
||||
compiler as a change from gcc. Adding -xarch=v9 to the cc options does a
|
||||
64-bit test, but it also needs -S 64 for pcre2test to increase the stack size
|
||||
for test 2. Since I retired I can no longer do much of this. There are
|
||||
automated tests under Ubuntu, Alpine, and Windows that are now set up as
|
||||
GitHub actions. Check that they are running clean.
|
||||
|
||||
. The buildbots at http://buildfarm.opencsw.org/ do some automated testing
|
||||
of PCRE2 and should also be checked before putting out a release.
|
||||
|
||||
|
||||
Updating version info for libtool
|
||||
=================================
|
||||
|
||||
This set of rules for updating library version information came from a web page
|
||||
whose URL I have forgotten. The version information consists of three parts:
|
||||
(current, revision, age).
|
||||
|
||||
1. Start with version information of 0:0:0 for each libtool library.
|
||||
|
||||
2. Update the version information only immediately before a public release of
|
||||
your software. More frequent updates are unnecessary, and only guarantee
|
||||
that the current interface number gets larger faster.
|
||||
|
||||
3. If the library source code has changed at all since the last update, then
|
||||
increment revision; c:r:a becomes c:r+1:a.
|
||||
|
||||
4. If any interfaces have been added, removed, or changed since the last
|
||||
update, increment current, and set revision to 0.
|
||||
|
||||
5. If any interfaces have been added since the last public release, then
|
||||
increment age.
|
||||
|
||||
6. If any interfaces have been removed or changed since the last public
|
||||
release, then set age to 0.
|
||||
|
||||
The following explanation may help in understanding the above rules a bit
|
||||
better. Consider that there are three possible kinds of reaction from users to
|
||||
changes in a shared library:
|
||||
|
||||
1. Programs using the previous version may use the new version as a drop-in
|
||||
replacement, and programs using the new version can also work with the
|
||||
previous one. In other words, no recompiling nor relinking is needed. In
|
||||
this case, increment revision only, don't touch current or age.
|
||||
|
||||
2. Programs using the previous version may use the new version as a drop-in
|
||||
replacement, but programs using the new version may use APIs not present in
|
||||
the previous one. In other words, a program linking against the new version
|
||||
may fail if linked against the old version at run time. In this case, set
|
||||
revision to 0, increment current and age.
|
||||
|
||||
3. Programs may need to be changed, recompiled, relinked in order to use the
|
||||
new version. Increment current, set revision and age to 0.
|
||||
|
||||
|
||||
Making a PCRE2 release
|
||||
======================
|
||||
|
||||
Run PrepareRelease and commit the files that it changes. The first thing this
|
||||
script does is to run CheckMan on the man pages; if it finds any markup errors,
|
||||
it reports them and then aborts. Otherwise it removes trailing spaces from
|
||||
sources and refreshes the HTML documentation. Update the GitHub repository with
|
||||
"git push".
|
||||
|
||||
Once PrepareRelease has run clean, run "make distcheck" to create the tarballs
|
||||
and the zipball. I then sign these files. Double-check with "git status" that
|
||||
the repository is fully up-to-date, then create a new tag and a release on
|
||||
GitHub. Upload the tarballs, zipball, and the signatures as "assets" of the
|
||||
GitHub release.
|
||||
|
||||
When the new release is out, don't forget to tell webmaster@pcre.org and the
|
||||
mailing list.
|
||||
|
||||
|
||||
Future ideas (wish list)
|
||||
========================
|
||||
|
||||
This section records a list of ideas so that they do not get forgotten. They
|
||||
vary enormously in their usefulness and potential for implementation. Some are
|
||||
very sensible; some are rather wacky. Some have been on this list for many
|
||||
years.
|
||||
|
||||
. Optimization
|
||||
|
||||
There are always ideas for new optimizations so as to speed up pattern
|
||||
matching. Most of them try to save work by recognizing a non-match without
|
||||
having to scan all the possibilities. These are some that I've recorded:
|
||||
|
||||
* /((A{0,5}){0,5}){0,5}(something complex)/ on a non-matching string is very
|
||||
slow, though Perl is fast. Can we speed up somehow? Convert to {0,125}?
|
||||
OTOH, this is pathological - the user could easily fix it.
|
||||
|
||||
* Turn ={4} into ==== ? (for speed). I once did an experiment, and it seems
|
||||
to have little effect, and maybe makes things worse.
|
||||
|
||||
* "Ends with literal string" - note that a single character doesn't gain much
|
||||
over the existing "required code unit" feature that just remembers one code
|
||||
unit.
|
||||
|
||||
* Remember an initial string rather than just 1 code unit.
|
||||
|
||||
* A required code unit from alternatives - not just the last unit, but an
|
||||
earlier one if common to all alternatives.
|
||||
|
||||
* Friedl contains other ideas.
|
||||
|
||||
* The code does not set initial code unit flags for Unicode property types
|
||||
such as \p; I don't know how much benefit there would be for, for example,
|
||||
setting the bits for 0-9 and all values >= xC0 (in 8-bit mode) when a
|
||||
pattern starts with \p{N}.
|
||||
|
||||
. If Perl gets to a consistent state over the settings of capturing sub-
|
||||
patterns inside repeats, see if we can match it. One example of the
|
||||
difference is the matching of /(main(O)?)+/ against mainOmain, where PCRE2
|
||||
leaves $2 set. In Perl, it's unset. Changing this in PCRE2 will be very hard
|
||||
because I think it needs much more state to be remembered.
|
||||
|
||||
. A feature to suspend a match via a callout was once requested.
|
||||
|
||||
. An option to convert results into character offsets and character lengths.
|
||||
|
||||
. A (non-Unix) user wanted pcregrep options to (a) list a file name just once,
|
||||
preceded by a blank line, instead of adding it to every matched line, and (b)
|
||||
support --outputfile=name.
|
||||
|
||||
. Define a union for the results from pcre2_pattern_info().
|
||||
|
||||
. Provide a "random access to the subject" facility so that the way in which it
|
||||
is stored is independent of PCRE2. For efficiency, it probably isn't possible
|
||||
to switch this dynamically. It would have to be specified when PCRE2 was
|
||||
compiled. PCRE2 would then call a function every time it wanted a character.
|
||||
|
||||
. pcre2grep: add -rs for a sorted recurse. Having to store file names and sort
|
||||
them will of course slow it down.
|
||||
|
||||
. Someone suggested --disable-callout to save code space when callouts are
|
||||
never wanted. This seems rather marginal.
|
||||
|
||||
. A user suggested a parameter to limit the length of string matched, for
|
||||
example if the parameter is N, the current match should fail if the matched
|
||||
substring exceeds N. This could apply to both match functions. The value
|
||||
could be a new field in the match context. Compare the offset_limit feature,
|
||||
which limits where a match must start.
|
||||
|
||||
. Write a function that generates random matching strings for a compiled
|
||||
pattern.
|
||||
|
||||
. Pcre2grep: an option to specify the output line separator, either as a string
|
||||
or select from a fixed list. This is not straightforward, because at the
|
||||
moment it outputs whatever is in the input file.
|
||||
|
||||
. Improve the code for duplicate checking in pcre2_dfa_match(). An incomplete,
|
||||
non-thread-safe patch showed that this can help performance for patterns
|
||||
where there are many alternatives. However, a simple thread-safe
|
||||
implementation that I tried made things worse in many simple cases, so this
|
||||
is not an obviously good thing.
|
||||
|
||||
. PCRE2 cannot at present distinguish between subpatterns with different names,
|
||||
but the same number (created by the use of ?|). In order to do so, a way of
|
||||
remembering *which* subpattern numbered n matched is needed. (*MARK) can
|
||||
perhaps be used as a way round this problem. However, note that Perl does not
|
||||
distinguish: like PCRE2, a name is just an alias for a number in Perl.
|
||||
|
||||
. Instead of having #ifdef HAVE_CONFIG_H in each module, put #include
|
||||
"something" and the the #ifdef appears only in one place, in "something".
|
||||
|
||||
. Implement something like (?(R2+)... to check outer recursions.
|
||||
|
||||
. If Perl ever supports the POSIX notation [[.something.]] PCRE2 should try
|
||||
to follow.
|
||||
|
||||
. A user wanted a way of ignoring all Unicode "mark" characters so that, for
|
||||
example "a" followed by an accent would, together, match "a". This can only
|
||||
be done clumsily at present by using a lookahead such as /(?=a)\X/, which
|
||||
works for "combining" characters.
|
||||
|
||||
. Perl supports [\N{x}-\N{y}] as a Unicode range, even in EBCDIC. PCRE2
|
||||
supports \N{U+dd..} everywhere, but not in EBCDIC.
|
||||
|
||||
. Unicode stuff from Perl:
|
||||
|
||||
\b{gcb} or \b{g} grapheme cluster boundary
|
||||
\b{sb} sentence boundary
|
||||
\b{wb} word boundary
|
||||
|
||||
See Unicode TR 29. The last two are very much aimed at natural language.
|
||||
|
||||
. Allow a callout to specify a number of characters to skip. This can be done
|
||||
compatibly via an extra callout field.
|
||||
|
||||
. Allow callouts to return *PRUNE, *COMMIT, *THEN, *SKIP, with and without
|
||||
continuing (that is, with and without an implied *FAIL). A new option,
|
||||
PCRE2_CALLOUT_EXTENDED say, would be needed. This is unlikely ever to be
|
||||
implemented by JIT, so this could be an option for pcre2_match().
|
||||
|
||||
. A limit on substitutions: a user suggested somehow finding a way of making
|
||||
match_limit apply to the whole operation instead of each match separately.
|
||||
|
||||
. Some #defines could be replaced with enums to improve robustness.
|
||||
|
||||
. There was a request for an option for pcre2_match() to return the longest
|
||||
match. This would mean searching for all possible matches, of course.
|
||||
|
||||
. Perl's /a modifier sets Unicode, but restricts \d etc to ASCII characters,
|
||||
which is the PCRE2 default for PCRE2_UTF (use PCRE2_UCP to change). However,
|
||||
Perl also has /aa, which in addition, disables ASCII/non-ASCII caseless
|
||||
matching. Perhaps we need a new option PCRE2_CASELESS_RESTRICT_ASCII. In
|
||||
practice, this just means not using the ucd_caseless_sets[] table.
|
||||
|
||||
. There is more that could be done to the oss-fuzz setup (needs some research).
|
||||
A seed corpus could be built. I noted something about $LIB_FUZZING_ENGINE.
|
||||
The test function could make use of get_substrings() to cover more code.
|
||||
|
||||
. A neater way of handling recursion file names in pcre2grep, e.g. a single
|
||||
buffer that can grow. See also GitHub issue #2 (recursion looping via
|
||||
symlinks).
|
||||
|
||||
. A user suggested that before/after parameters in pcre2grep could have
|
||||
negative values, to list lines near to the matched line, but not necessarily
|
||||
the line itself. For example, --before-context=-1 would list the line *after*
|
||||
each matched line, without showing the matched line. The problem here is what
|
||||
to do with matches that are close together. Maybe a simpler way would be a
|
||||
flag to disable showing matched lines, only valid with either -A or -B?
|
||||
|
||||
. There was a suggestiong for a pcre2grep colour default, or possibly a more
|
||||
general PCRE2GREP_OPT, but only for some options - not file names or patterns.
|
||||
|
||||
. Breaking loops that match an empty string: perhaps find a way of continuing
|
||||
if *something* has changed, but this might mean remembering additional data.
|
||||
"Something" could be a capture value, but then a list of previous values
|
||||
would be needed to avoid a cycle of changes.
|
||||
|
||||
. If a function could be written to find 3-character (or other length) fixed
|
||||
strings, at least one of which must be present for a match, efficient
|
||||
pre-searching of large datasets could be implemented.
|
||||
|
||||
. If pcre2grep had --first-line (match only in the first line) it could be
|
||||
efficiently used to find files "starting with xxx". What about --last-line?
|
||||
There was also the suggestion of an option for pcre2grep to scan only the
|
||||
start of a file. I am not keen - this is the job of "head".
|
||||
|
||||
. A user requested a means of determining whether a failed match was failed by
|
||||
the start-of-match optimizations, or by running the match engine. Easy enough
|
||||
to define a bit in the match data, but all three matchers would need work.
|
||||
|
||||
. Would inlining "simple" recursions provide a useful performance boost for the
|
||||
interpreters? JIT already does some of this, but it may not be worth it for
|
||||
the interpreters.
|
||||
|
||||
. Redesign handling of class/nclass/xclass because the compile code logic is
|
||||
currently very contorted and obscure. Also there was a request for a way of
|
||||
re-defining \w (and therefore \W, \b, and \B). An in-pattern sequence such as
|
||||
(?w=[...]) was suggested. Easiest way would be simply to inline the class,
|
||||
with lookarounds for \b and \B. Ideally the setting should last till the end
|
||||
of the group, which means remembering all previous settings; maybe a fixed
|
||||
amount of stack would do - how deep would anyone want to nest these things?
|
||||
See GitHub issue #13 for a compendium of character class issues, including
|
||||
(?[...]) extended classes.
|
||||
|
||||
. A user suggested something like --with-build-info to set a build information
|
||||
string that could be retrieved by pcre2_config(). However, there's no
|
||||
facility for a length limit in pcre2_config(), and what would be the
|
||||
encoding?
|
||||
|
||||
. Quantified groups with a fixed count currently operate by replicating the
|
||||
group in the compiled bytecode. This may not really matter in these days of
|
||||
gigabyte memory, but perhaps another implementation might be considered.
|
||||
Needs coordination between the interpreters and JIT.
|
||||
|
||||
. There are regular requests for variable-length lookbehinds.
|
||||
|
||||
. See also any suggestions in the GitHub issues.
|
||||
|
||||
Philip Hazel
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
Last updated: 25 April 2022
|
@ -0,0 +1,633 @@
|
||||
# BidiMirroring-14.0.0.txt
|
||||
# Date: 2021-08-08, 22:55:00 GMT [KW, RP]
|
||||
# © 2021 Unicode®, Inc.
|
||||
# For terms of use, see https://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
# Unicode Character Database
|
||||
# For documentation, see https://www.unicode.org/reports/tr44/
|
||||
#
|
||||
# Bidi_Mirroring_Glyph Property
|
||||
#
|
||||
# This file is an informative contributory data file in the
|
||||
# Unicode Character Database.
|
||||
#
|
||||
# This data file lists characters that have the Bidi_Mirrored=Yes property
|
||||
# value, for which there is another Unicode character that typically has a glyph
|
||||
# that is the mirror image of the original character's glyph.
|
||||
#
|
||||
# The repertoire covered by the file is Unicode 14.0.0.
|
||||
#
|
||||
# The file contains a list of lines with mappings from one code point
|
||||
# to another one for character-based mirroring.
|
||||
# Note that for "real" mirroring, a rendering engine needs to select
|
||||
# appropriate alternative glyphs, and that many Unicode characters do not
|
||||
# have a mirror-image Unicode character.
|
||||
#
|
||||
# Each mapping line contains two fields, separated by a semicolon (';').
|
||||
# Each of the two fields contains a code point represented as a
|
||||
# variable-length hexadecimal value with 4 to 6 digits.
|
||||
# A comment indicates where the characters are "BEST FIT" mirroring.
|
||||
#
|
||||
# Code points for which Bidi_Mirrored=Yes, but for which no appropriate
|
||||
# characters exist with mirrored glyphs, are
|
||||
# listed as comments at the end of the file.
|
||||
#
|
||||
# Formally, the default value of the Bidi_Mirroring_Glyph property
|
||||
# for each code point is <none>, unless a mapping to
|
||||
# some other character is specified in this data file. When a code
|
||||
# point has the default value for the Bidi_Mirroring_Glyph property,
|
||||
# that means that no other character exists whose glyph is suitable
|
||||
# for character-based mirroring.
|
||||
#
|
||||
# For information on bidi mirroring, see UAX #9: Unicode Bidirectional Algorithm,
|
||||
# at https://www.unicode.org/reports/tr9/
|
||||
#
|
||||
# This file was originally created by Markus Scherer.
|
||||
# Extended for Unicode 3.2, 4.0, 4.1, 5.0, 5.1, 5.2, and 6.0 by Ken Whistler,
|
||||
# and for subsequent versions by Ken Whistler, Laurentiu Iancu, and Roozbeh Pournader.
|
||||
#
|
||||
# Historical and Compatibility Information:
|
||||
#
|
||||
# The OpenType Mirroring Pairs List (OMPL) is frozen to match the
|
||||
# Unicode 5.1 version of the Bidi_Mirroring_Glyph property (2008).
|
||||
# See https://www.microsoft.com/typography/otspec/ompl.txt
|
||||
#
|
||||
# The Unicode 6.1 version of the Bidi_Mirroring_Glyph property (2011)
|
||||
# added one mirroring pair: 27CB <--> 27CD.
|
||||
#
|
||||
# The Unicode 11.0 version of the Bidi_Mirroring_Glyph property (2018)
|
||||
# underwent a substantial revision, to formally recognize all of the
|
||||
# exact mirroring pairs and "BEST FIT" mirroring pairs that had been
|
||||
# added after the freezing of the OMPL list. As a result, starting
|
||||
# with Unicode 11.0, the bmg mapping values more accurately reflect
|
||||
# the current status of glyphs for Bidi_Mirrored characters in
|
||||
# the Unicode Standard, but this listing now extends significantly
|
||||
# beyond the frozen OMPL list. Implementers should be aware of this
|
||||
# intentional distinction.
|
||||
#
|
||||
# ############################################################
|
||||
#
|
||||
# Property: Bidi_Mirroring_Glyph
|
||||
#
|
||||
# @missing: 0000..10FFFF; <none>
|
||||
|
||||
0028; 0029 # LEFT PARENTHESIS
|
||||
0029; 0028 # RIGHT PARENTHESIS
|
||||
003C; 003E # LESS-THAN SIGN
|
||||
003E; 003C # GREATER-THAN SIGN
|
||||
005B; 005D # LEFT SQUARE BRACKET
|
||||
005D; 005B # RIGHT SQUARE BRACKET
|
||||
007B; 007D # LEFT CURLY BRACKET
|
||||
007D; 007B # RIGHT CURLY BRACKET
|
||||
00AB; 00BB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
00BB; 00AB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
0F3A; 0F3B # TIBETAN MARK GUG RTAGS GYON
|
||||
0F3B; 0F3A # TIBETAN MARK GUG RTAGS GYAS
|
||||
0F3C; 0F3D # TIBETAN MARK ANG KHANG GYON
|
||||
0F3D; 0F3C # TIBETAN MARK ANG KHANG GYAS
|
||||
169B; 169C # OGHAM FEATHER MARK
|
||||
169C; 169B # OGHAM REVERSED FEATHER MARK
|
||||
2039; 203A # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||
203A; 2039 # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||
2045; 2046 # LEFT SQUARE BRACKET WITH QUILL
|
||||
2046; 2045 # RIGHT SQUARE BRACKET WITH QUILL
|
||||
207D; 207E # SUPERSCRIPT LEFT PARENTHESIS
|
||||
207E; 207D # SUPERSCRIPT RIGHT PARENTHESIS
|
||||
208D; 208E # SUBSCRIPT LEFT PARENTHESIS
|
||||
208E; 208D # SUBSCRIPT RIGHT PARENTHESIS
|
||||
2208; 220B # ELEMENT OF
|
||||
2209; 220C # [BEST FIT] NOT AN ELEMENT OF
|
||||
220A; 220D # SMALL ELEMENT OF
|
||||
220B; 2208 # CONTAINS AS MEMBER
|
||||
220C; 2209 # [BEST FIT] DOES NOT CONTAIN AS MEMBER
|
||||
220D; 220A # SMALL CONTAINS AS MEMBER
|
||||
2215; 29F5 # DIVISION SLASH
|
||||
221F; 2BFE # RIGHT ANGLE
|
||||
2220; 29A3 # ANGLE
|
||||
2221; 299B # MEASURED ANGLE
|
||||
2222; 29A0 # SPHERICAL ANGLE
|
||||
2224; 2AEE # DOES NOT DIVIDE
|
||||
223C; 223D # TILDE OPERATOR
|
||||
223D; 223C # REVERSED TILDE
|
||||
2243; 22CD # ASYMPTOTICALLY EQUAL TO
|
||||
2245; 224C # APPROXIMATELY EQUAL TO
|
||||
224C; 2245 # ALL EQUAL TO
|
||||
2252; 2253 # APPROXIMATELY EQUAL TO OR THE IMAGE OF
|
||||
2253; 2252 # IMAGE OF OR APPROXIMATELY EQUAL TO
|
||||
2254; 2255 # COLON EQUALS
|
||||
2255; 2254 # EQUALS COLON
|
||||
2264; 2265 # LESS-THAN OR EQUAL TO
|
||||
2265; 2264 # GREATER-THAN OR EQUAL TO
|
||||
2266; 2267 # LESS-THAN OVER EQUAL TO
|
||||
2267; 2266 # GREATER-THAN OVER EQUAL TO
|
||||
2268; 2269 # [BEST FIT] LESS-THAN BUT NOT EQUAL TO
|
||||
2269; 2268 # [BEST FIT] GREATER-THAN BUT NOT EQUAL TO
|
||||
226A; 226B # MUCH LESS-THAN
|
||||
226B; 226A # MUCH GREATER-THAN
|
||||
226E; 226F # [BEST FIT] NOT LESS-THAN
|
||||
226F; 226E # [BEST FIT] NOT GREATER-THAN
|
||||
2270; 2271 # [BEST FIT] NEITHER LESS-THAN NOR EQUAL TO
|
||||
2271; 2270 # [BEST FIT] NEITHER GREATER-THAN NOR EQUAL TO
|
||||
2272; 2273 # [BEST FIT] LESS-THAN OR EQUIVALENT TO
|
||||
2273; 2272 # [BEST FIT] GREATER-THAN OR EQUIVALENT TO
|
||||
2274; 2275 # [BEST FIT] NEITHER LESS-THAN NOR EQUIVALENT TO
|
||||
2275; 2274 # [BEST FIT] NEITHER GREATER-THAN NOR EQUIVALENT TO
|
||||
2276; 2277 # LESS-THAN OR GREATER-THAN
|
||||
2277; 2276 # GREATER-THAN OR LESS-THAN
|
||||
2278; 2279 # [BEST FIT] NEITHER LESS-THAN NOR GREATER-THAN
|
||||
2279; 2278 # [BEST FIT] NEITHER GREATER-THAN NOR LESS-THAN
|
||||
227A; 227B # PRECEDES
|
||||
227B; 227A # SUCCEEDS
|
||||
227C; 227D # PRECEDES OR EQUAL TO
|
||||
227D; 227C # SUCCEEDS OR EQUAL TO
|
||||
227E; 227F # [BEST FIT] PRECEDES OR EQUIVALENT TO
|
||||
227F; 227E # [BEST FIT] SUCCEEDS OR EQUIVALENT TO
|
||||
2280; 2281 # [BEST FIT] DOES NOT PRECEDE
|
||||
2281; 2280 # [BEST FIT] DOES NOT SUCCEED
|
||||
2282; 2283 # SUBSET OF
|
||||
2283; 2282 # SUPERSET OF
|
||||
2284; 2285 # [BEST FIT] NOT A SUBSET OF
|
||||
2285; 2284 # [BEST FIT] NOT A SUPERSET OF
|
||||
2286; 2287 # SUBSET OF OR EQUAL TO
|
||||
2287; 2286 # SUPERSET OF OR EQUAL TO
|
||||
2288; 2289 # [BEST FIT] NEITHER A SUBSET OF NOR EQUAL TO
|
||||
2289; 2288 # [BEST FIT] NEITHER A SUPERSET OF NOR EQUAL TO
|
||||
228A; 228B # [BEST FIT] SUBSET OF WITH NOT EQUAL TO
|
||||
228B; 228A # [BEST FIT] SUPERSET OF WITH NOT EQUAL TO
|
||||
228F; 2290 # SQUARE IMAGE OF
|
||||
2290; 228F # SQUARE ORIGINAL OF
|
||||
2291; 2292 # SQUARE IMAGE OF OR EQUAL TO
|
||||
2292; 2291 # SQUARE ORIGINAL OF OR EQUAL TO
|
||||
2298; 29B8 # CIRCLED DIVISION SLASH
|
||||
22A2; 22A3 # RIGHT TACK
|
||||
22A3; 22A2 # LEFT TACK
|
||||
22A6; 2ADE # ASSERTION
|
||||
22A8; 2AE4 # TRUE
|
||||
22A9; 2AE3 # FORCES
|
||||
22AB; 2AE5 # DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE
|
||||
22B0; 22B1 # PRECEDES UNDER RELATION
|
||||
22B1; 22B0 # SUCCEEDS UNDER RELATION
|
||||
22B2; 22B3 # NORMAL SUBGROUP OF
|
||||
22B3; 22B2 # CONTAINS AS NORMAL SUBGROUP
|
||||
22B4; 22B5 # NORMAL SUBGROUP OF OR EQUAL TO
|
||||
22B5; 22B4 # CONTAINS AS NORMAL SUBGROUP OR EQUAL TO
|
||||
22B6; 22B7 # ORIGINAL OF
|
||||
22B7; 22B6 # IMAGE OF
|
||||
22B8; 27DC # MULTIMAP
|
||||
22C9; 22CA # LEFT NORMAL FACTOR SEMIDIRECT PRODUCT
|
||||
22CA; 22C9 # RIGHT NORMAL FACTOR SEMIDIRECT PRODUCT
|
||||
22CB; 22CC # LEFT SEMIDIRECT PRODUCT
|
||||
22CC; 22CB # RIGHT SEMIDIRECT PRODUCT
|
||||
22CD; 2243 # REVERSED TILDE EQUALS
|
||||
22D0; 22D1 # DOUBLE SUBSET
|
||||
22D1; 22D0 # DOUBLE SUPERSET
|
||||
22D6; 22D7 # LESS-THAN WITH DOT
|
||||
22D7; 22D6 # GREATER-THAN WITH DOT
|
||||
22D8; 22D9 # VERY MUCH LESS-THAN
|
||||
22D9; 22D8 # VERY MUCH GREATER-THAN
|
||||
22DA; 22DB # LESS-THAN EQUAL TO OR GREATER-THAN
|
||||
22DB; 22DA # GREATER-THAN EQUAL TO OR LESS-THAN
|
||||
22DC; 22DD # EQUAL TO OR LESS-THAN
|
||||
22DD; 22DC # EQUAL TO OR GREATER-THAN
|
||||
22DE; 22DF # EQUAL TO OR PRECEDES
|
||||
22DF; 22DE # EQUAL TO OR SUCCEEDS
|
||||
22E0; 22E1 # [BEST FIT] DOES NOT PRECEDE OR EQUAL
|
||||
22E1; 22E0 # [BEST FIT] DOES NOT SUCCEED OR EQUAL
|
||||
22E2; 22E3 # [BEST FIT] NOT SQUARE IMAGE OF OR EQUAL TO
|
||||
22E3; 22E2 # [BEST FIT] NOT SQUARE ORIGINAL OF OR EQUAL TO
|
||||
22E4; 22E5 # [BEST FIT] SQUARE IMAGE OF OR NOT EQUAL TO
|
||||
22E5; 22E4 # [BEST FIT] SQUARE ORIGINAL OF OR NOT EQUAL TO
|
||||
22E6; 22E7 # [BEST FIT] LESS-THAN BUT NOT EQUIVALENT TO
|
||||
22E7; 22E6 # [BEST FIT] GREATER-THAN BUT NOT EQUIVALENT TO
|
||||
22E8; 22E9 # [BEST FIT] PRECEDES BUT NOT EQUIVALENT TO
|
||||
22E9; 22E8 # [BEST FIT] SUCCEEDS BUT NOT EQUIVALENT TO
|
||||
22EA; 22EB # [BEST FIT] NOT NORMAL SUBGROUP OF
|
||||
22EB; 22EA # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP
|
||||
22EC; 22ED # [BEST FIT] NOT NORMAL SUBGROUP OF OR EQUAL TO
|
||||
22ED; 22EC # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP OR EQUAL
|
||||
22F0; 22F1 # UP RIGHT DIAGONAL ELLIPSIS
|
||||
22F1; 22F0 # DOWN RIGHT DIAGONAL ELLIPSIS
|
||||
22F2; 22FA # ELEMENT OF WITH LONG HORIZONTAL STROKE
|
||||
22F3; 22FB # ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
|
||||
22F4; 22FC # SMALL ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
|
||||
22F6; 22FD # ELEMENT OF WITH OVERBAR
|
||||
22F7; 22FE # SMALL ELEMENT OF WITH OVERBAR
|
||||
22FA; 22F2 # CONTAINS WITH LONG HORIZONTAL STROKE
|
||||
22FB; 22F3 # CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
|
||||
22FC; 22F4 # SMALL CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
|
||||
22FD; 22F6 # CONTAINS WITH OVERBAR
|
||||
22FE; 22F7 # SMALL CONTAINS WITH OVERBAR
|
||||
2308; 2309 # LEFT CEILING
|
||||
2309; 2308 # RIGHT CEILING
|
||||
230A; 230B # LEFT FLOOR
|
||||
230B; 230A # RIGHT FLOOR
|
||||
2329; 232A # LEFT-POINTING ANGLE BRACKET
|
||||
232A; 2329 # RIGHT-POINTING ANGLE BRACKET
|
||||
2768; 2769 # MEDIUM LEFT PARENTHESIS ORNAMENT
|
||||
2769; 2768 # MEDIUM RIGHT PARENTHESIS ORNAMENT
|
||||
276A; 276B # MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
|
||||
276B; 276A # MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT
|
||||
276C; 276D # MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT
|
||||
276D; 276C # MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT
|
||||
276E; 276F # HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT
|
||||
276F; 276E # HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT
|
||||
2770; 2771 # HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT
|
||||
2771; 2770 # HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT
|
||||
2772; 2773 # LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT
|
||||
2773; 2772 # LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT
|
||||
2774; 2775 # MEDIUM LEFT CURLY BRACKET ORNAMENT
|
||||
2775; 2774 # MEDIUM RIGHT CURLY BRACKET ORNAMENT
|
||||
27C3; 27C4 # OPEN SUBSET
|
||||
27C4; 27C3 # OPEN SUPERSET
|
||||
27C5; 27C6 # LEFT S-SHAPED BAG DELIMITER
|
||||
27C6; 27C5 # RIGHT S-SHAPED BAG DELIMITER
|
||||
27C8; 27C9 # REVERSE SOLIDUS PRECEDING SUBSET
|
||||
27C9; 27C8 # SUPERSET PRECEDING SOLIDUS
|
||||
27CB; 27CD # MATHEMATICAL RISING DIAGONAL
|
||||
27CD; 27CB # MATHEMATICAL FALLING DIAGONAL
|
||||
27D5; 27D6 # LEFT OUTER JOIN
|
||||
27D6; 27D5 # RIGHT OUTER JOIN
|
||||
27DC; 22B8 # LEFT MULTIMAP
|
||||
27DD; 27DE # LONG RIGHT TACK
|
||||
27DE; 27DD # LONG LEFT TACK
|
||||
27E2; 27E3 # WHITE CONCAVE-SIDED DIAMOND WITH LEFTWARDS TICK
|
||||
27E3; 27E2 # WHITE CONCAVE-SIDED DIAMOND WITH RIGHTWARDS TICK
|
||||
27E4; 27E5 # WHITE SQUARE WITH LEFTWARDS TICK
|
||||
27E5; 27E4 # WHITE SQUARE WITH RIGHTWARDS TICK
|
||||
27E6; 27E7 # MATHEMATICAL LEFT WHITE SQUARE BRACKET
|
||||
27E7; 27E6 # MATHEMATICAL RIGHT WHITE SQUARE BRACKET
|
||||
27E8; 27E9 # MATHEMATICAL LEFT ANGLE BRACKET
|
||||
27E9; 27E8 # MATHEMATICAL RIGHT ANGLE BRACKET
|
||||
27EA; 27EB # MATHEMATICAL LEFT DOUBLE ANGLE BRACKET
|
||||
27EB; 27EA # MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET
|
||||
27EC; 27ED # MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET
|
||||
27ED; 27EC # MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET
|
||||
27EE; 27EF # MATHEMATICAL LEFT FLATTENED PARENTHESIS
|
||||
27EF; 27EE # MATHEMATICAL RIGHT FLATTENED PARENTHESIS
|
||||
2983; 2984 # LEFT WHITE CURLY BRACKET
|
||||
2984; 2983 # RIGHT WHITE CURLY BRACKET
|
||||
2985; 2986 # LEFT WHITE PARENTHESIS
|
||||
2986; 2985 # RIGHT WHITE PARENTHESIS
|
||||
2987; 2988 # Z NOTATION LEFT IMAGE BRACKET
|
||||
2988; 2987 # Z NOTATION RIGHT IMAGE BRACKET
|
||||
2989; 298A # Z NOTATION LEFT BINDING BRACKET
|
||||
298A; 2989 # Z NOTATION RIGHT BINDING BRACKET
|
||||
298B; 298C # LEFT SQUARE BRACKET WITH UNDERBAR
|
||||
298C; 298B # RIGHT SQUARE BRACKET WITH UNDERBAR
|
||||
298D; 2990 # LEFT SQUARE BRACKET WITH TICK IN TOP CORNER
|
||||
298E; 298F # RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
|
||||
298F; 298E # LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
|
||||
2990; 298D # RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER
|
||||
2991; 2992 # LEFT ANGLE BRACKET WITH DOT
|
||||
2992; 2991 # RIGHT ANGLE BRACKET WITH DOT
|
||||
2993; 2994 # LEFT ARC LESS-THAN BRACKET
|
||||
2994; 2993 # RIGHT ARC GREATER-THAN BRACKET
|
||||
2995; 2996 # DOUBLE LEFT ARC GREATER-THAN BRACKET
|
||||
2996; 2995 # DOUBLE RIGHT ARC LESS-THAN BRACKET
|
||||
2997; 2998 # LEFT BLACK TORTOISE SHELL BRACKET
|
||||
2998; 2997 # RIGHT BLACK TORTOISE SHELL BRACKET
|
||||
299B; 2221 # MEASURED ANGLE OPENING LEFT
|
||||
29A0; 2222 # SPHERICAL ANGLE OPENING LEFT
|
||||
29A3; 2220 # REVERSED ANGLE
|
||||
29A4; 29A5 # ANGLE WITH UNDERBAR
|
||||
29A5; 29A4 # REVERSED ANGLE WITH UNDERBAR
|
||||
29A8; 29A9 # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING UP AND RIGHT
|
||||
29A9; 29A8 # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING UP AND LEFT
|
||||
29AA; 29AB # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING DOWN AND RIGHT
|
||||
29AB; 29AA # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING DOWN AND LEFT
|
||||
29AC; 29AD # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING RIGHT AND UP
|
||||
29AD; 29AC # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING LEFT AND UP
|
||||
29AE; 29AF # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING RIGHT AND DOWN
|
||||
29AF; 29AE # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING LEFT AND DOWN
|
||||
29B8; 2298 # CIRCLED REVERSE SOLIDUS
|
||||
29C0; 29C1 # CIRCLED LESS-THAN
|
||||
29C1; 29C0 # CIRCLED GREATER-THAN
|
||||
29C4; 29C5 # SQUARED RISING DIAGONAL SLASH
|
||||
29C5; 29C4 # SQUARED FALLING DIAGONAL SLASH
|
||||
29CF; 29D0 # LEFT TRIANGLE BESIDE VERTICAL BAR
|
||||
29D0; 29CF # VERTICAL BAR BESIDE RIGHT TRIANGLE
|
||||
29D1; 29D2 # BOWTIE WITH LEFT HALF BLACK
|
||||
29D2; 29D1 # BOWTIE WITH RIGHT HALF BLACK
|
||||
29D4; 29D5 # TIMES WITH LEFT HALF BLACK
|
||||
29D5; 29D4 # TIMES WITH RIGHT HALF BLACK
|
||||
29D8; 29D9 # LEFT WIGGLY FENCE
|
||||
29D9; 29D8 # RIGHT WIGGLY FENCE
|
||||
29DA; 29DB # LEFT DOUBLE WIGGLY FENCE
|
||||
29DB; 29DA # RIGHT DOUBLE WIGGLY FENCE
|
||||
29E8; 29E9 # DOWN-POINTING TRIANGLE WITH LEFT HALF BLACK
|
||||
29E9; 29E8 # DOWN-POINTING TRIANGLE WITH RIGHT HALF BLACK
|
||||
29F5; 2215 # REVERSE SOLIDUS OPERATOR
|
||||
29F8; 29F9 # BIG SOLIDUS
|
||||
29F9; 29F8 # BIG REVERSE SOLIDUS
|
||||
29FC; 29FD # LEFT-POINTING CURVED ANGLE BRACKET
|
||||
29FD; 29FC # RIGHT-POINTING CURVED ANGLE BRACKET
|
||||
2A2B; 2A2C # MINUS SIGN WITH FALLING DOTS
|
||||
2A2C; 2A2B # MINUS SIGN WITH RISING DOTS
|
||||
2A2D; 2A2E # PLUS SIGN IN LEFT HALF CIRCLE
|
||||
2A2E; 2A2D # PLUS SIGN IN RIGHT HALF CIRCLE
|
||||
2A34; 2A35 # MULTIPLICATION SIGN IN LEFT HALF CIRCLE
|
||||
2A35; 2A34 # MULTIPLICATION SIGN IN RIGHT HALF CIRCLE
|
||||
2A3C; 2A3D # INTERIOR PRODUCT
|
||||
2A3D; 2A3C # RIGHTHAND INTERIOR PRODUCT
|
||||
2A64; 2A65 # Z NOTATION DOMAIN ANTIRESTRICTION
|
||||
2A65; 2A64 # Z NOTATION RANGE ANTIRESTRICTION
|
||||
2A79; 2A7A # LESS-THAN WITH CIRCLE INSIDE
|
||||
2A7A; 2A79 # GREATER-THAN WITH CIRCLE INSIDE
|
||||
2A7B; 2A7C # [BEST FIT] LESS-THAN WITH QUESTION MARK ABOVE
|
||||
2A7C; 2A7B # [BEST FIT] GREATER-THAN WITH QUESTION MARK ABOVE
|
||||
2A7D; 2A7E # LESS-THAN OR SLANTED EQUAL TO
|
||||
2A7E; 2A7D # GREATER-THAN OR SLANTED EQUAL TO
|
||||
2A7F; 2A80 # LESS-THAN OR SLANTED EQUAL TO WITH DOT INSIDE
|
||||
2A80; 2A7F # GREATER-THAN OR SLANTED EQUAL TO WITH DOT INSIDE
|
||||
2A81; 2A82 # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE
|
||||
2A82; 2A81 # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE
|
||||
2A83; 2A84 # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE RIGHT
|
||||
2A84; 2A83 # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE LEFT
|
||||
2A85; 2A86 # [BEST FIT] LESS-THAN OR APPROXIMATE
|
||||
2A86; 2A85 # [BEST FIT] GREATER-THAN OR APPROXIMATE
|
||||
2A87; 2A88 # [BEST FIT] LESS-THAN AND SINGLE-LINE NOT EQUAL TO
|
||||
2A88; 2A87 # [BEST FIT] GREATER-THAN AND SINGLE-LINE NOT EQUAL TO
|
||||
2A89; 2A8A # [BEST FIT] LESS-THAN AND NOT APPROXIMATE
|
||||
2A8A; 2A89 # [BEST FIT] GREATER-THAN AND NOT APPROXIMATE
|
||||
2A8B; 2A8C # LESS-THAN ABOVE DOUBLE-LINE EQUAL ABOVE GREATER-THAN
|
||||
2A8C; 2A8B # GREATER-THAN ABOVE DOUBLE-LINE EQUAL ABOVE LESS-THAN
|
||||
2A8D; 2A8E # [BEST FIT] LESS-THAN ABOVE SIMILAR OR EQUAL
|
||||
2A8E; 2A8D # [BEST FIT] GREATER-THAN ABOVE SIMILAR OR EQUAL
|
||||
2A8F; 2A90 # [BEST FIT] LESS-THAN ABOVE SIMILAR ABOVE GREATER-THAN
|
||||
2A90; 2A8F # [BEST FIT] GREATER-THAN ABOVE SIMILAR ABOVE LESS-THAN
|
||||
2A91; 2A92 # LESS-THAN ABOVE GREATER-THAN ABOVE DOUBLE-LINE EQUAL
|
||||
2A92; 2A91 # GREATER-THAN ABOVE LESS-THAN ABOVE DOUBLE-LINE EQUAL
|
||||
2A93; 2A94 # LESS-THAN ABOVE SLANTED EQUAL ABOVE GREATER-THAN ABOVE SLANTED EQUAL
|
||||
2A94; 2A93 # GREATER-THAN ABOVE SLANTED EQUAL ABOVE LESS-THAN ABOVE SLANTED EQUAL
|
||||
2A95; 2A96 # SLANTED EQUAL TO OR LESS-THAN
|
||||
2A96; 2A95 # SLANTED EQUAL TO OR GREATER-THAN
|
||||
2A97; 2A98 # SLANTED EQUAL TO OR LESS-THAN WITH DOT INSIDE
|
||||
2A98; 2A97 # SLANTED EQUAL TO OR GREATER-THAN WITH DOT INSIDE
|
||||
2A99; 2A9A # DOUBLE-LINE EQUAL TO OR LESS-THAN
|
||||
2A9A; 2A99 # DOUBLE-LINE EQUAL TO OR GREATER-THAN
|
||||
2A9B; 2A9C # DOUBLE-LINE SLANTED EQUAL TO OR LESS-THAN
|
||||
2A9C; 2A9B # DOUBLE-LINE SLANTED EQUAL TO OR GREATER-THAN
|
||||
2A9D; 2A9E # [BEST FIT] SIMILAR OR LESS-THAN
|
||||
2A9E; 2A9D # [BEST FIT] SIMILAR OR GREATER-THAN
|
||||
2A9F; 2AA0 # [BEST FIT] SIMILAR ABOVE LESS-THAN ABOVE EQUALS SIGN
|
||||
2AA0; 2A9F # [BEST FIT] SIMILAR ABOVE GREATER-THAN ABOVE EQUALS SIGN
|
||||
2AA1; 2AA2 # DOUBLE NESTED LESS-THAN
|
||||
2AA2; 2AA1 # DOUBLE NESTED GREATER-THAN
|
||||
2AA6; 2AA7 # LESS-THAN CLOSED BY CURVE
|
||||
2AA7; 2AA6 # GREATER-THAN CLOSED BY CURVE
|
||||
2AA8; 2AA9 # LESS-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL
|
||||
2AA9; 2AA8 # GREATER-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL
|
||||
2AAA; 2AAB # SMALLER THAN
|
||||
2AAB; 2AAA # LARGER THAN
|
||||
2AAC; 2AAD # SMALLER THAN OR EQUAL TO
|
||||
2AAD; 2AAC # LARGER THAN OR EQUAL TO
|
||||
2AAF; 2AB0 # PRECEDES ABOVE SINGLE-LINE EQUALS SIGN
|
||||
2AB0; 2AAF # SUCCEEDS ABOVE SINGLE-LINE EQUALS SIGN
|
||||
2AB1; 2AB2 # [BEST FIT] PRECEDES ABOVE SINGLE-LINE NOT EQUAL TO
|
||||
2AB2; 2AB1 # [BEST FIT] SUCCEEDS ABOVE SINGLE-LINE NOT EQUAL TO
|
||||
2AB3; 2AB4 # PRECEDES ABOVE EQUALS SIGN
|
||||
2AB4; 2AB3 # SUCCEEDS ABOVE EQUALS SIGN
|
||||
2AB5; 2AB6 # [BEST FIT] PRECEDES ABOVE NOT EQUAL TO
|
||||
2AB6; 2AB5 # [BEST FIT] SUCCEEDS ABOVE NOT EQUAL TO
|
||||
2AB7; 2AB8 # [BEST FIT] PRECEDES ABOVE ALMOST EQUAL TO
|
||||
2AB8; 2AB7 # [BEST FIT] SUCCEEDS ABOVE ALMOST EQUAL TO
|
||||
2AB9; 2ABA # [BEST FIT] PRECEDES ABOVE NOT ALMOST EQUAL TO
|
||||
2ABA; 2AB9 # [BEST FIT] SUCCEEDS ABOVE NOT ALMOST EQUAL TO
|
||||
2ABB; 2ABC # DOUBLE PRECEDES
|
||||
2ABC; 2ABB # DOUBLE SUCCEEDS
|
||||
2ABD; 2ABE # SUBSET WITH DOT
|
||||
2ABE; 2ABD # SUPERSET WITH DOT
|
||||
2ABF; 2AC0 # SUBSET WITH PLUS SIGN BELOW
|
||||
2AC0; 2ABF # SUPERSET WITH PLUS SIGN BELOW
|
||||
2AC1; 2AC2 # SUBSET WITH MULTIPLICATION SIGN BELOW
|
||||
2AC2; 2AC1 # SUPERSET WITH MULTIPLICATION SIGN BELOW
|
||||
2AC3; 2AC4 # SUBSET OF OR EQUAL TO WITH DOT ABOVE
|
||||
2AC4; 2AC3 # SUPERSET OF OR EQUAL TO WITH DOT ABOVE
|
||||
2AC5; 2AC6 # SUBSET OF ABOVE EQUALS SIGN
|
||||
2AC6; 2AC5 # SUPERSET OF ABOVE EQUALS SIGN
|
||||
2AC7; 2AC8 # [BEST FIT] SUBSET OF ABOVE TILDE OPERATOR
|
||||
2AC8; 2AC7 # [BEST FIT] SUPERSET OF ABOVE TILDE OPERATOR
|
||||
2AC9; 2ACA # [BEST FIT] SUBSET OF ABOVE ALMOST EQUAL TO
|
||||
2ACA; 2AC9 # [BEST FIT] SUPERSET OF ABOVE ALMOST EQUAL TO
|
||||
2ACB; 2ACC # [BEST FIT] SUBSET OF ABOVE NOT EQUAL TO
|
||||
2ACC; 2ACB # [BEST FIT] SUPERSET OF ABOVE NOT EQUAL TO
|
||||
2ACD; 2ACE # SQUARE LEFT OPEN BOX OPERATOR
|
||||
2ACE; 2ACD # SQUARE RIGHT OPEN BOX OPERATOR
|
||||
2ACF; 2AD0 # CLOSED SUBSET
|
||||
2AD0; 2ACF # CLOSED SUPERSET
|
||||
2AD1; 2AD2 # CLOSED SUBSET OR EQUAL TO
|
||||
2AD2; 2AD1 # CLOSED SUPERSET OR EQUAL TO
|
||||
2AD3; 2AD4 # SUBSET ABOVE SUPERSET
|
||||
2AD4; 2AD3 # SUPERSET ABOVE SUBSET
|
||||
2AD5; 2AD6 # SUBSET ABOVE SUBSET
|
||||
2AD6; 2AD5 # SUPERSET ABOVE SUPERSET
|
||||
2ADE; 22A6 # SHORT LEFT TACK
|
||||
2AE3; 22A9 # DOUBLE VERTICAL BAR LEFT TURNSTILE
|
||||
2AE4; 22A8 # VERTICAL BAR DOUBLE LEFT TURNSTILE
|
||||
2AE5; 22AB # DOUBLE VERTICAL BAR DOUBLE LEFT TURNSTILE
|
||||
2AEC; 2AED # DOUBLE STROKE NOT SIGN
|
||||
2AED; 2AEC # REVERSED DOUBLE STROKE NOT SIGN
|
||||
2AEE; 2224 # DOES NOT DIVIDE WITH REVERSED NEGATION SLASH
|
||||
2AF7; 2AF8 # TRIPLE NESTED LESS-THAN
|
||||
2AF8; 2AF7 # TRIPLE NESTED GREATER-THAN
|
||||
2AF9; 2AFA # DOUBLE-LINE SLANTED LESS-THAN OR EQUAL TO
|
||||
2AFA; 2AF9 # DOUBLE-LINE SLANTED GREATER-THAN OR EQUAL TO
|
||||
2BFE; 221F # REVERSED RIGHT ANGLE
|
||||
2E02; 2E03 # LEFT SUBSTITUTION BRACKET
|
||||
2E03; 2E02 # RIGHT SUBSTITUTION BRACKET
|
||||
2E04; 2E05 # LEFT DOTTED SUBSTITUTION BRACKET
|
||||
2E05; 2E04 # RIGHT DOTTED SUBSTITUTION BRACKET
|
||||
2E09; 2E0A # LEFT TRANSPOSITION BRACKET
|
||||
2E0A; 2E09 # RIGHT TRANSPOSITION BRACKET
|
||||
2E0C; 2E0D # LEFT RAISED OMISSION BRACKET
|
||||
2E0D; 2E0C # RIGHT RAISED OMISSION BRACKET
|
||||
2E1C; 2E1D # LEFT LOW PARAPHRASE BRACKET
|
||||
2E1D; 2E1C # RIGHT LOW PARAPHRASE BRACKET
|
||||
2E20; 2E21 # LEFT VERTICAL BAR WITH QUILL
|
||||
2E21; 2E20 # RIGHT VERTICAL BAR WITH QUILL
|
||||
2E22; 2E23 # TOP LEFT HALF BRACKET
|
||||
2E23; 2E22 # TOP RIGHT HALF BRACKET
|
||||
2E24; 2E25 # BOTTOM LEFT HALF BRACKET
|
||||
2E25; 2E24 # BOTTOM RIGHT HALF BRACKET
|
||||
2E26; 2E27 # LEFT SIDEWAYS U BRACKET
|
||||
2E27; 2E26 # RIGHT SIDEWAYS U BRACKET
|
||||
2E28; 2E29 # LEFT DOUBLE PARENTHESIS
|
||||
2E29; 2E28 # RIGHT DOUBLE PARENTHESIS
|
||||
2E55; 2E56 # LEFT SQUARE BRACKET WITH STROKE
|
||||
2E56; 2E55 # RIGHT SQUARE BRACKET WITH STROKE
|
||||
2E57; 2E58 # LEFT SQUARE BRACKET WITH DOUBLE STROKE
|
||||
2E58; 2E57 # RIGHT SQUARE BRACKET WITH DOUBLE STROKE
|
||||
2E59; 2E5A # TOP HALF LEFT PARENTHESIS
|
||||
2E5A; 2E59 # TOP HALF RIGHT PARENTHESIS
|
||||
2E5B; 2E5C # BOTTOM HALF LEFT PARENTHESIS
|
||||
2E5C; 2E5B # BOTTOM HALF RIGHT PARENTHESIS
|
||||
3008; 3009 # LEFT ANGLE BRACKET
|
||||
3009; 3008 # RIGHT ANGLE BRACKET
|
||||
300A; 300B # LEFT DOUBLE ANGLE BRACKET
|
||||
300B; 300A # RIGHT DOUBLE ANGLE BRACKET
|
||||
300C; 300D # [BEST FIT] LEFT CORNER BRACKET
|
||||
300D; 300C # [BEST FIT] RIGHT CORNER BRACKET
|
||||
300E; 300F # [BEST FIT] LEFT WHITE CORNER BRACKET
|
||||
300F; 300E # [BEST FIT] RIGHT WHITE CORNER BRACKET
|
||||
3010; 3011 # LEFT BLACK LENTICULAR BRACKET
|
||||
3011; 3010 # RIGHT BLACK LENTICULAR BRACKET
|
||||
3014; 3015 # LEFT TORTOISE SHELL BRACKET
|
||||
3015; 3014 # RIGHT TORTOISE SHELL BRACKET
|
||||
3016; 3017 # LEFT WHITE LENTICULAR BRACKET
|
||||
3017; 3016 # RIGHT WHITE LENTICULAR BRACKET
|
||||
3018; 3019 # LEFT WHITE TORTOISE SHELL BRACKET
|
||||
3019; 3018 # RIGHT WHITE TORTOISE SHELL BRACKET
|
||||
301A; 301B # LEFT WHITE SQUARE BRACKET
|
||||
301B; 301A # RIGHT WHITE SQUARE BRACKET
|
||||
FE59; FE5A # SMALL LEFT PARENTHESIS
|
||||
FE5A; FE59 # SMALL RIGHT PARENTHESIS
|
||||
FE5B; FE5C # SMALL LEFT CURLY BRACKET
|
||||
FE5C; FE5B # SMALL RIGHT CURLY BRACKET
|
||||
FE5D; FE5E # SMALL LEFT TORTOISE SHELL BRACKET
|
||||
FE5E; FE5D # SMALL RIGHT TORTOISE SHELL BRACKET
|
||||
FE64; FE65 # SMALL LESS-THAN SIGN
|
||||
FE65; FE64 # SMALL GREATER-THAN SIGN
|
||||
FF08; FF09 # FULLWIDTH LEFT PARENTHESIS
|
||||
FF09; FF08 # FULLWIDTH RIGHT PARENTHESIS
|
||||
FF1C; FF1E # FULLWIDTH LESS-THAN SIGN
|
||||
FF1E; FF1C # FULLWIDTH GREATER-THAN SIGN
|
||||
FF3B; FF3D # FULLWIDTH LEFT SQUARE BRACKET
|
||||
FF3D; FF3B # FULLWIDTH RIGHT SQUARE BRACKET
|
||||
FF5B; FF5D # FULLWIDTH LEFT CURLY BRACKET
|
||||
FF5D; FF5B # FULLWIDTH RIGHT CURLY BRACKET
|
||||
FF5F; FF60 # FULLWIDTH LEFT WHITE PARENTHESIS
|
||||
FF60; FF5F # FULLWIDTH RIGHT WHITE PARENTHESIS
|
||||
FF62; FF63 # [BEST FIT] HALFWIDTH LEFT CORNER BRACKET
|
||||
FF63; FF62 # [BEST FIT] HALFWIDTH RIGHT CORNER BRACKET
|
||||
|
||||
# The following characters have no appropriate mirroring character.
|
||||
# For these characters it is up to the rendering system
|
||||
# to provide mirrored glyphs.
|
||||
|
||||
# 2140; DOUBLE-STRUCK N-ARY SUMMATION
|
||||
# 2201; COMPLEMENT
|
||||
# 2202; PARTIAL DIFFERENTIAL
|
||||
# 2203; THERE EXISTS
|
||||
# 2204; THERE DOES NOT EXIST
|
||||
# 2211; N-ARY SUMMATION
|
||||
# 2216; SET MINUS
|
||||
# 221A; SQUARE ROOT
|
||||
# 221B; CUBE ROOT
|
||||
# 221C; FOURTH ROOT
|
||||
# 221D; PROPORTIONAL TO
|
||||
# 2226; NOT PARALLEL TO
|
||||
# 222B; INTEGRAL
|
||||
# 222C; DOUBLE INTEGRAL
|
||||
# 222D; TRIPLE INTEGRAL
|
||||
# 222E; CONTOUR INTEGRAL
|
||||
# 222F; SURFACE INTEGRAL
|
||||
# 2230; VOLUME INTEGRAL
|
||||
# 2231; CLOCKWISE INTEGRAL
|
||||
# 2232; CLOCKWISE CONTOUR INTEGRAL
|
||||
# 2233; ANTICLOCKWISE CONTOUR INTEGRAL
|
||||
# 2239; EXCESS
|
||||
# 223B; HOMOTHETIC
|
||||
# 223E; INVERTED LAZY S
|
||||
# 223F; SINE WAVE
|
||||
# 2240; WREATH PRODUCT
|
||||
# 2241; NOT TILDE
|
||||
# 2242; MINUS TILDE
|
||||
# 2244; NOT ASYMPTOTICALLY EQUAL TO
|
||||
# 2246; APPROXIMATELY BUT NOT ACTUALLY EQUAL TO
|
||||
# 2247; NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO
|
||||
# 2248; ALMOST EQUAL TO
|
||||
# 2249; NOT ALMOST EQUAL TO
|
||||
# 224A; ALMOST EQUAL OR EQUAL TO
|
||||
# 224B; TRIPLE TILDE
|
||||
# 225F; QUESTIONED EQUAL TO
|
||||
# 2260; NOT EQUAL TO
|
||||
# 2262; NOT IDENTICAL TO
|
||||
# 228C; MULTISET
|
||||
# 22A7; MODELS
|
||||
# 22AA; TRIPLE VERTICAL BAR RIGHT TURNSTILE
|
||||
# 22AC; DOES NOT PROVE
|
||||
# 22AD; NOT TRUE
|
||||
# 22AE; DOES NOT FORCE
|
||||
# 22AF; NEGATED DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE
|
||||
# 22BE; RIGHT ANGLE WITH ARC
|
||||
# 22BF; RIGHT TRIANGLE
|
||||
# 22F5; ELEMENT OF WITH DOT ABOVE
|
||||
# 22F8; ELEMENT OF WITH UNDERBAR
|
||||
# 22F9; ELEMENT OF WITH TWO HORIZONTAL STROKES
|
||||
# 22FF; Z NOTATION BAG MEMBERSHIP
|
||||
# 2320; TOP HALF INTEGRAL
|
||||
# 2321; BOTTOM HALF INTEGRAL
|
||||
# 27C0; THREE DIMENSIONAL ANGLE
|
||||
# 27CC; LONG DIVISION
|
||||
# 27D3; LOWER RIGHT CORNER WITH DOT
|
||||
# 27D4; UPPER LEFT CORNER WITH DOT
|
||||
# 299C; RIGHT ANGLE VARIANT WITH SQUARE
|
||||
# 299D; MEASURED RIGHT ANGLE WITH DOT
|
||||
# 299E; ANGLE WITH S INSIDE
|
||||
# 299F; ACUTE ANGLE
|
||||
# 29A2; TURNED ANGLE
|
||||
# 29A6; OBLIQUE ANGLE OPENING UP
|
||||
# 29A7; OBLIQUE ANGLE OPENING DOWN
|
||||
# 29C2; CIRCLE WITH SMALL CIRCLE TO THE RIGHT
|
||||
# 29C3; CIRCLE WITH TWO HORIZONTAL STROKES TO THE RIGHT
|
||||
# 29C9; TWO JOINED SQUARES
|
||||
# 29CE; RIGHT TRIANGLE ABOVE LEFT TRIANGLE
|
||||
# 29DC; INCOMPLETE INFINITY
|
||||
# 29E1; INCREASES AS
|
||||
# 29E3; EQUALS SIGN AND SLANTED PARALLEL
|
||||
# 29E4; EQUALS SIGN AND SLANTED PARALLEL WITH TILDE ABOVE
|
||||
# 29E5; IDENTICAL TO AND SLANTED PARALLEL
|
||||
# 29F4; RULE-DELAYED
|
||||
# 29F6; SOLIDUS WITH OVERBAR
|
||||
# 29F7; REVERSE SOLIDUS WITH HORIZONTAL STROKE
|
||||
# 2A0A; MODULO TWO SUM
|
||||
# 2A0B; SUMMATION WITH INTEGRAL
|
||||
# 2A0C; QUADRUPLE INTEGRAL OPERATOR
|
||||
# 2A0D; FINITE PART INTEGRAL
|
||||
# 2A0E; INTEGRAL WITH DOUBLE STROKE
|
||||
# 2A0F; INTEGRAL AVERAGE WITH SLASH
|
||||
# 2A10; CIRCULATION FUNCTION
|
||||
# 2A11; ANTICLOCKWISE INTEGRATION
|
||||
# 2A12; LINE INTEGRATION WITH RECTANGULAR PATH AROUND POLE
|
||||
# 2A13; LINE INTEGRATION WITH SEMICIRCULAR PATH AROUND POLE
|
||||
# 2A14; LINE INTEGRATION NOT INCLUDING THE POLE
|
||||
# 2A15; INTEGRAL AROUND A POINT OPERATOR
|
||||
# 2A16; QUATERNION INTEGRAL OPERATOR
|
||||
# 2A17; INTEGRAL WITH LEFTWARDS ARROW WITH HOOK
|
||||
# 2A18; INTEGRAL WITH TIMES SIGN
|
||||
# 2A19; INTEGRAL WITH INTERSECTION
|
||||
# 2A1A; INTEGRAL WITH UNION
|
||||
# 2A1B; INTEGRAL WITH OVERBAR
|
||||
# 2A1C; INTEGRAL WITH UNDERBAR
|
||||
# 2A1E; LARGE LEFT TRIANGLE OPERATOR
|
||||
# 2A1F; Z NOTATION SCHEMA COMPOSITION
|
||||
# 2A20; Z NOTATION SCHEMA PIPING
|
||||
# 2A21; Z NOTATION SCHEMA PROJECTION
|
||||
# 2A24; PLUS SIGN WITH TILDE ABOVE
|
||||
# 2A26; PLUS SIGN WITH TILDE BELOW
|
||||
# 2A29; MINUS SIGN WITH COMMA ABOVE
|
||||
# 2A3E; Z NOTATION RELATIONAL COMPOSITION
|
||||
# 2A57; SLOPING LARGE OR
|
||||
# 2A58; SLOPING LARGE AND
|
||||
# 2A6A; TILDE OPERATOR WITH DOT ABOVE
|
||||
# 2A6B; TILDE OPERATOR WITH RISING DOTS
|
||||
# 2A6C; SIMILAR MINUS SIMILAR
|
||||
# 2A6D; CONGRUENT WITH DOT ABOVE
|
||||
# 2A6F; ALMOST EQUAL TO WITH CIRCUMFLEX ACCENT
|
||||
# 2A70; APPROXIMATELY EQUAL OR EQUAL TO
|
||||
# 2A73; EQUALS SIGN ABOVE TILDE OPERATOR
|
||||
# 2A74; DOUBLE COLON EQUAL
|
||||
# 2AA3; DOUBLE NESTED LESS-THAN WITH UNDERBAR
|
||||
# 2ADC; FORKING
|
||||
# 2AE2; VERTICAL BAR TRIPLE RIGHT TURNSTILE
|
||||
# 2AE6; LONG DASH FROM LEFT MEMBER OF DOUBLE VERTICAL
|
||||
# 2AF3; PARALLEL WITH TILDE OPERATOR
|
||||
# 2AFB; TRIPLE SOLIDUS BINARY RELATION
|
||||
# 2AFD; DOUBLE SOLIDUS OPERATOR
|
||||
# 1D6DB; MATHEMATICAL BOLD PARTIAL DIFFERENTIAL
|
||||
# 1D715; MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL
|
||||
# 1D74F; MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL
|
||||
# 1D789; MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL
|
||||
# 1D7C3; MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL
|
||||
|
||||
# EOF
|
1624
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/Unicode.tables/CaseFolding.txt
vendored
Normal file
1624
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/Unicode.tables/CaseFolding.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
1743
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/Unicode.tables/PropList.txt
vendored
Normal file
1743
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/Unicode.tables/PropList.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,212 @@
|
||||
# PropertyAliases-14.0.0.txt
|
||||
# Date: 2021-03-08, 19:35:48 GMT
|
||||
# © 2021 Unicode®, Inc.
|
||||
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
# Unicode Character Database
|
||||
# For documentation, see http://www.unicode.org/reports/tr44/
|
||||
#
|
||||
# This file contains aliases for properties used in the UCD.
|
||||
# These names can be used for XML formats of UCD data, for regular-expression
|
||||
# property tests, and other programmatic textual descriptions of Unicode data.
|
||||
#
|
||||
# The names may be translated in appropriate environments, and additional
|
||||
# aliases may be useful.
|
||||
#
|
||||
# FORMAT
|
||||
#
|
||||
# Each line has two or more fields, separated by semicolons.
|
||||
#
|
||||
# First Field: The first field is the short name for the property.
|
||||
# It is typically an abbreviation, but in a number of cases it is simply
|
||||
# a duplicate of the "long name" in the second field.
|
||||
# For Unihan database tags, the short name is actually a longer string than
|
||||
# the tag specified in the second field.
|
||||
#
|
||||
# Second Field: The second field is the long name for the property,
|
||||
# typically the formal name used in documentation about the property.
|
||||
#
|
||||
# The above are the preferred aliases. Other aliases may be listed in additional fields.
|
||||
#
|
||||
# Loose matching should be applied to all property names and property values, with
|
||||
# the exception of String Property values. With loose matching of property names and
|
||||
# values, the case distinctions, whitespace, and '_' are ignored. For Numeric Property
|
||||
# values, numeric equivalencies are applied: thus "01.00" is equivalent to "1".
|
||||
#
|
||||
# NOTE: Property value names are NOT unique across properties. For example:
|
||||
#
|
||||
# AL means Arabic Letter for the Bidi_Class property, and
|
||||
# AL means Above_Left for the Combining_Class property, and
|
||||
# AL means Alphabetic for the Line_Break property.
|
||||
#
|
||||
# In addition, some property names may be the same as some property value names.
|
||||
# For example:
|
||||
#
|
||||
# sc means the Script property, and
|
||||
# Sc means the General_Category property value Currency_Symbol (Sc)
|
||||
#
|
||||
# The combination of property value and property name is, however, unique.
|
||||
#
|
||||
# For more information, see UAX #44, Unicode Character Database, and
|
||||
# UTS #18, Unicode Regular Expressions.
|
||||
# ================================================
|
||||
|
||||
|
||||
# ================================================
|
||||
# Numeric Properties
|
||||
# ================================================
|
||||
cjkAccountingNumeric ; kAccountingNumeric
|
||||
cjkOtherNumeric ; kOtherNumeric
|
||||
cjkPrimaryNumeric ; kPrimaryNumeric
|
||||
nv ; Numeric_Value
|
||||
|
||||
# ================================================
|
||||
# String Properties
|
||||
# ================================================
|
||||
cf ; Case_Folding
|
||||
cjkCompatibilityVariant ; kCompatibilityVariant
|
||||
dm ; Decomposition_Mapping
|
||||
FC_NFKC ; FC_NFKC_Closure
|
||||
lc ; Lowercase_Mapping
|
||||
NFKC_CF ; NFKC_Casefold
|
||||
scf ; Simple_Case_Folding ; sfc
|
||||
slc ; Simple_Lowercase_Mapping
|
||||
stc ; Simple_Titlecase_Mapping
|
||||
suc ; Simple_Uppercase_Mapping
|
||||
tc ; Titlecase_Mapping
|
||||
uc ; Uppercase_Mapping
|
||||
|
||||
# ================================================
|
||||
# Miscellaneous Properties
|
||||
# ================================================
|
||||
bmg ; Bidi_Mirroring_Glyph
|
||||
bpb ; Bidi_Paired_Bracket
|
||||
cjkIICore ; kIICore
|
||||
cjkIRG_GSource ; kIRG_GSource
|
||||
cjkIRG_HSource ; kIRG_HSource
|
||||
cjkIRG_JSource ; kIRG_JSource
|
||||
cjkIRG_KPSource ; kIRG_KPSource
|
||||
cjkIRG_KSource ; kIRG_KSource
|
||||
cjkIRG_MSource ; kIRG_MSource
|
||||
cjkIRG_SSource ; kIRG_SSource
|
||||
cjkIRG_TSource ; kIRG_TSource
|
||||
cjkIRG_UKSource ; kIRG_UKSource
|
||||
cjkIRG_USource ; kIRG_USource
|
||||
cjkIRG_VSource ; kIRG_VSource
|
||||
cjkRSUnicode ; kRSUnicode ; Unicode_Radical_Stroke; URS
|
||||
EqUIdeo ; Equivalent_Unified_Ideograph
|
||||
isc ; ISO_Comment
|
||||
JSN ; Jamo_Short_Name
|
||||
na ; Name
|
||||
na1 ; Unicode_1_Name
|
||||
Name_Alias ; Name_Alias
|
||||
scx ; Script_Extensions
|
||||
|
||||
# ================================================
|
||||
# Catalog Properties
|
||||
# ================================================
|
||||
age ; Age
|
||||
blk ; Block
|
||||
sc ; Script
|
||||
|
||||
# ================================================
|
||||
# Enumerated Properties
|
||||
# ================================================
|
||||
bc ; Bidi_Class
|
||||
bpt ; Bidi_Paired_Bracket_Type
|
||||
ccc ; Canonical_Combining_Class
|
||||
dt ; Decomposition_Type
|
||||
ea ; East_Asian_Width
|
||||
gc ; General_Category
|
||||
GCB ; Grapheme_Cluster_Break
|
||||
hst ; Hangul_Syllable_Type
|
||||
InPC ; Indic_Positional_Category
|
||||
InSC ; Indic_Syllabic_Category
|
||||
jg ; Joining_Group
|
||||
jt ; Joining_Type
|
||||
lb ; Line_Break
|
||||
NFC_QC ; NFC_Quick_Check
|
||||
NFD_QC ; NFD_Quick_Check
|
||||
NFKC_QC ; NFKC_Quick_Check
|
||||
NFKD_QC ; NFKD_Quick_Check
|
||||
nt ; Numeric_Type
|
||||
SB ; Sentence_Break
|
||||
vo ; Vertical_Orientation
|
||||
WB ; Word_Break
|
||||
|
||||
# ================================================
|
||||
# Binary Properties
|
||||
# ================================================
|
||||
AHex ; ASCII_Hex_Digit
|
||||
Alpha ; Alphabetic
|
||||
Bidi_C ; Bidi_Control
|
||||
Bidi_M ; Bidi_Mirrored
|
||||
Cased ; Cased
|
||||
CE ; Composition_Exclusion
|
||||
CI ; Case_Ignorable
|
||||
Comp_Ex ; Full_Composition_Exclusion
|
||||
CWCF ; Changes_When_Casefolded
|
||||
CWCM ; Changes_When_Casemapped
|
||||
CWKCF ; Changes_When_NFKC_Casefolded
|
||||
CWL ; Changes_When_Lowercased
|
||||
CWT ; Changes_When_Titlecased
|
||||
CWU ; Changes_When_Uppercased
|
||||
Dash ; Dash
|
||||
Dep ; Deprecated
|
||||
DI ; Default_Ignorable_Code_Point
|
||||
Dia ; Diacritic
|
||||
EBase ; Emoji_Modifier_Base
|
||||
EComp ; Emoji_Component
|
||||
EMod ; Emoji_Modifier
|
||||
Emoji ; Emoji
|
||||
EPres ; Emoji_Presentation
|
||||
Ext ; Extender
|
||||
ExtPict ; Extended_Pictographic
|
||||
Gr_Base ; Grapheme_Base
|
||||
Gr_Ext ; Grapheme_Extend
|
||||
Gr_Link ; Grapheme_Link
|
||||
Hex ; Hex_Digit
|
||||
Hyphen ; Hyphen
|
||||
IDC ; ID_Continue
|
||||
Ideo ; Ideographic
|
||||
IDS ; ID_Start
|
||||
IDSB ; IDS_Binary_Operator
|
||||
IDST ; IDS_Trinary_Operator
|
||||
Join_C ; Join_Control
|
||||
LOE ; Logical_Order_Exception
|
||||
Lower ; Lowercase
|
||||
Math ; Math
|
||||
NChar ; Noncharacter_Code_Point
|
||||
OAlpha ; Other_Alphabetic
|
||||
ODI ; Other_Default_Ignorable_Code_Point
|
||||
OGr_Ext ; Other_Grapheme_Extend
|
||||
OIDC ; Other_ID_Continue
|
||||
OIDS ; Other_ID_Start
|
||||
OLower ; Other_Lowercase
|
||||
OMath ; Other_Math
|
||||
OUpper ; Other_Uppercase
|
||||
Pat_Syn ; Pattern_Syntax
|
||||
Pat_WS ; Pattern_White_Space
|
||||
PCM ; Prepended_Concatenation_Mark
|
||||
QMark ; Quotation_Mark
|
||||
Radical ; Radical
|
||||
RI ; Regional_Indicator
|
||||
SD ; Soft_Dotted
|
||||
STerm ; Sentence_Terminal
|
||||
Term ; Terminal_Punctuation
|
||||
UIdeo ; Unified_Ideograph
|
||||
Upper ; Uppercase
|
||||
VS ; Variation_Selector
|
||||
WSpace ; White_Space ; space
|
||||
XIDC ; XID_Continue
|
||||
XIDS ; XID_Start
|
||||
XO_NFC ; Expands_On_NFC
|
||||
XO_NFD ; Expands_On_NFD
|
||||
XO_NFKC ; Expands_On_NFKC
|
||||
XO_NFKD ; Expands_On_NFKD
|
||||
|
||||
# ================================================
|
||||
# Total: 129
|
||||
|
||||
# EOF
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,628 @@
|
||||
# ScriptExtensions-14.0.0.txt
|
||||
# Date: 2021-06-04, 02:19:38 GMT
|
||||
# © 2021 Unicode®, Inc.
|
||||
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
# Unicode Character Database
|
||||
# For documentation, see http://www.unicode.org/reports/tr44/
|
||||
#
|
||||
# The Script_Extensions property indicates which characters are commonly used
|
||||
# with more than one script, but with a limited number of scripts.
|
||||
# For each code point, there is one or more property values. Each such value is a Script property value.
|
||||
# For more information, see:
|
||||
# UAX #24, Unicode Script Property: https://www.unicode.org/reports/tr24/
|
||||
# Especially the sections:
|
||||
# https://www.unicode.org/reports/tr24/#Assignment_Script_Values
|
||||
# https://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
|
||||
#
|
||||
# Each Script_Extensions value in this file consists of a set
|
||||
# of one or more abbreviated Script property values. The ordering of the
|
||||
# values in that set is not material, but for stability in presentation
|
||||
# it is given here as alphabetical.
|
||||
#
|
||||
# The Script_Extensions values are presented in sorted order in the file.
|
||||
# They are sorted first by the number of Script property values in their sets,
|
||||
# and then alphabetically by first differing Script property value.
|
||||
#
|
||||
# Following each distinct Script_Extensions value is the list of code
|
||||
# points associated with that value, listed in code point order.
|
||||
#
|
||||
# All code points not explicitly listed for Script_Extensions
|
||||
# have as their value the corresponding Script property value
|
||||
#
|
||||
# @missing: 0000..10FFFF; <script>
|
||||
|
||||
# ================================================
|
||||
|
||||
# Property: Script_Extensions
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Beng
|
||||
|
||||
1CF7 ; Beng # Mc VEDIC SIGN ATIKRAMA
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva
|
||||
|
||||
1CD1 ; Deva # Mn VEDIC TONE SHARA
|
||||
1CD4 ; Deva # Mn VEDIC SIGN YAJURVEDIC MIDLINE SVARITA
|
||||
1CDB ; Deva # Mn VEDIC TONE TRIPLE SVARITA
|
||||
1CDE..1CDF ; Deva # Mn [2] VEDIC TONE TWO DOTS BELOW..VEDIC TONE THREE DOTS BELOW
|
||||
1CE2..1CE8 ; Deva # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
|
||||
1CEB..1CEC ; Deva # Lo [2] VEDIC SIGN ANUSVARA VAMAGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL
|
||||
1CEE..1CF1 ; Deva # Lo [4] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ANUSVARA UBHAYATO MUKHA
|
||||
|
||||
# Total code points: 18
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Dupl
|
||||
|
||||
1BCA0..1BCA3 ; Dupl # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
|
||||
|
||||
# Total code points: 4
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Grek
|
||||
|
||||
0342 ; Grek # Mn COMBINING GREEK PERISPOMENI
|
||||
0345 ; Grek # Mn COMBINING GREEK YPOGEGRAMMENI
|
||||
1DC0..1DC1 ; Grek # Mn [2] COMBINING DOTTED GRAVE ACCENT..COMBINING DOTTED ACUTE ACCENT
|
||||
|
||||
# Total code points: 4
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Hani
|
||||
|
||||
3006 ; Hani # Lo IDEOGRAPHIC CLOSING MARK
|
||||
303E..303F ; Hani # So [2] IDEOGRAPHIC VARIATION INDICATOR..IDEOGRAPHIC HALF FILL SPACE
|
||||
3190..3191 ; Hani # So [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK
|
||||
3192..3195 ; Hani # No [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK
|
||||
3196..319F ; Hani # So [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK
|
||||
31C0..31E3 ; Hani # So [36] CJK STROKE T..CJK STROKE Q
|
||||
3220..3229 ; Hani # No [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN
|
||||
322A..3247 ; Hani # So [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO
|
||||
3280..3289 ; Hani # No [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN
|
||||
328A..32B0 ; Hani # So [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT
|
||||
32C0..32CB ; Hani # So [12] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DECEMBER
|
||||
32FF ; Hani # So SQUARE ERA NAME REIWA
|
||||
3358..3370 ; Hani # So [25] IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR ZERO..IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR TWENTY-FOUR
|
||||
337B..337F ; Hani # So [5] SQUARE ERA NAME HEISEI..SQUARE CORPORATION
|
||||
33E0..33FE ; Hani # So [31] IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY ONE..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
|
||||
1D360..1D371 ; Hani # No [18] COUNTING ROD UNIT DIGIT ONE..COUNTING ROD TENS DIGIT NINE
|
||||
1F250..1F251 ; Hani # So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
|
||||
|
||||
# Total code points: 238
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Latn
|
||||
|
||||
0363..036F ; Latn # Mn [13] COMBINING LATIN SMALL LETTER A..COMBINING LATIN SMALL LETTER X
|
||||
|
||||
# Total code points: 13
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Nand
|
||||
|
||||
1CFA ; Nand # Lo VEDIC SIGN DOUBLE ANUSVARA ANTARGOMUKHA
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Syrc
|
||||
|
||||
1DFA ; Syrc # Mn COMBINING DOT BELOW LEFT
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Copt
|
||||
|
||||
102E0 ; Arab Copt # Mn COPTIC EPACT THOUSANDS MARK
|
||||
102E1..102FB ; Arab Copt # No [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED
|
||||
|
||||
# Total code points: 28
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Rohg
|
||||
|
||||
06D4 ; Arab Rohg # Po ARABIC FULL STOP
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Nkoo
|
||||
|
||||
FD3E ; Arab Nkoo # Pe ORNATE LEFT PARENTHESIS
|
||||
FD3F ; Arab Nkoo # Ps ORNATE RIGHT PARENTHESIS
|
||||
|
||||
# Total code points: 2
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Syrc
|
||||
|
||||
064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
|
||||
0670 ; Arab Syrc # Mn ARABIC LETTER SUPERSCRIPT ALEF
|
||||
|
||||
# Total code points: 12
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Thaa
|
||||
|
||||
FDF2 ; Arab Thaa # Lo ARABIC LIGATURE ALLAH ISOLATED FORM
|
||||
FDFD ; Arab Thaa # So ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
|
||||
|
||||
# Total code points: 2
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Beng Deva
|
||||
|
||||
1CD5..1CD6 ; Beng Deva # Mn [2] VEDIC TONE YAJURVEDIC AGGRAVATED INDEPENDENT SVARITA..VEDIC TONE YAJURVEDIC INDEPENDENT SVARITA
|
||||
1CD8 ; Beng Deva # Mn VEDIC TONE CANDRA BELOW
|
||||
1CE1 ; Beng Deva # Mc VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA
|
||||
1CEA ; Beng Deva # Lo VEDIC SIGN ANUSVARA BAHIRGOMUKHA
|
||||
1CED ; Beng Deva # Mn VEDIC SIGN TIRYAK
|
||||
1CF5..1CF6 ; Beng Deva # Lo [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA
|
||||
A8F1 ; Beng Deva # Mn COMBINING DEVANAGARI SIGN AVAGRAHA
|
||||
|
||||
# Total code points: 9
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Bopo Hani
|
||||
|
||||
302A..302D ; Bopo Hani # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK
|
||||
|
||||
# Total code points: 4
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Bugi Java
|
||||
|
||||
A9CF ; Bugi Java # Lm JAVANESE PANGRANGKEP
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Cprt Linb
|
||||
|
||||
10102 ; Cprt Linb # Po AEGEAN CHECK MARK
|
||||
10137..1013F ; Cprt Linb # So [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT
|
||||
|
||||
# Total code points: 10
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Cyrl Glag
|
||||
|
||||
0484 ; Cyrl Glag # Mn COMBINING CYRILLIC PALATALIZATION
|
||||
0487 ; Cyrl Glag # Mn COMBINING CYRILLIC POKRYTIE
|
||||
2E43 ; Cyrl Glag # Po DASH WITH LEFT UPTURN
|
||||
A66F ; Cyrl Glag # Mn COMBINING CYRILLIC VZMET
|
||||
|
||||
# Total code points: 4
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Cyrl Latn
|
||||
|
||||
0485..0486 ; Cyrl Latn # Mn [2] COMBINING CYRILLIC DASIA PNEUMATA..COMBINING CYRILLIC PSILI PNEUMATA
|
||||
|
||||
# Total code points: 2
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Cyrl Perm
|
||||
|
||||
0483 ; Cyrl Perm # Mn COMBINING CYRILLIC TITLO
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Cyrl Syrc
|
||||
|
||||
1DF8 ; Cyrl Syrc # Mn COMBINING DOT ABOVE LEFT
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Gran
|
||||
|
||||
1CD3 ; Deva Gran # Po VEDIC SIGN NIHSHVASA
|
||||
1CF3 ; Deva Gran # Lo VEDIC SIGN ROTATED ARDHAVISARGA
|
||||
1CF8..1CF9 ; Deva Gran # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
|
||||
|
||||
# Total code points: 4
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Nand
|
||||
|
||||
1CE9 ; Deva Nand # Lo VEDIC SIGN ANUSVARA ANTARGOMUKHA
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Shrd
|
||||
|
||||
1CD7 ; Deva Shrd # Mn VEDIC TONE YAJURVEDIC KATHAKA INDEPENDENT SVARITA
|
||||
1CD9 ; Deva Shrd # Mn VEDIC TONE YAJURVEDIC KATHAKA INDEPENDENT SVARITA SCHROEDER
|
||||
1CDC..1CDD ; Deva Shrd # Mn [2] VEDIC TONE KATHAKA ANUDATTA..VEDIC TONE DOT BELOW
|
||||
1CE0 ; Deva Shrd # Mn VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA
|
||||
|
||||
# Total code points: 5
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Taml
|
||||
|
||||
A8F3 ; Deva Taml # Lo DEVANAGARI SIGN CANDRABINDU VIRAMA
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Geor Latn
|
||||
|
||||
10FB ; Geor Latn # Po GEORGIAN PARAGRAPH SEPARATOR
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Gran Taml
|
||||
|
||||
0BE6..0BEF ; Gran Taml # Nd [10] TAMIL DIGIT ZERO..TAMIL DIGIT NINE
|
||||
0BF0..0BF2 ; Gran Taml # No [3] TAMIL NUMBER TEN..TAMIL NUMBER ONE THOUSAND
|
||||
0BF3 ; Gran Taml # So TAMIL DAY SIGN
|
||||
11301 ; Gran Taml # Mn GRANTHA SIGN CANDRABINDU
|
||||
11303 ; Gran Taml # Mc GRANTHA SIGN VISARGA
|
||||
1133B..1133C ; Gran Taml # Mn [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA
|
||||
11FD0..11FD1 ; Gran Taml # No [2] TAMIL FRACTION ONE QUARTER..TAMIL FRACTION ONE HALF-1
|
||||
11FD3 ; Gran Taml # No TAMIL FRACTION THREE QUARTERS
|
||||
|
||||
# Total code points: 21
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Gujr Khoj
|
||||
|
||||
0AE6..0AEF ; Gujr Khoj # Nd [10] GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE
|
||||
|
||||
# Total code points: 10
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Guru Mult
|
||||
|
||||
0A66..0A6F ; Guru Mult # Nd [10] GURMUKHI DIGIT ZERO..GURMUKHI DIGIT NINE
|
||||
|
||||
# Total code points: 10
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Hani Latn
|
||||
|
||||
A700..A707 ; Hani Latn # Sk [8] MODIFIER LETTER CHINESE TONE YIN PING..MODIFIER LETTER CHINESE TONE YANG RU
|
||||
|
||||
# Total code points: 8
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Hira Kana
|
||||
|
||||
3031..3035 ; Hira Kana # Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF
|
||||
3099..309A ; Hira Kana # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
|
||||
309B..309C ; Hira Kana # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
|
||||
30A0 ; Hira Kana # Pd KATAKANA-HIRAGANA DOUBLE HYPHEN
|
||||
30FC ; Hira Kana # Lm KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
FF70 ; Hira Kana # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
FF9E..FF9F ; Hira Kana # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
|
||||
|
||||
# Total code points: 14
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Knda Nand
|
||||
|
||||
0CE6..0CEF ; Knda Nand # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
|
||||
|
||||
# Total code points: 10
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Latn Mong
|
||||
|
||||
202F ; Latn Mong # Zs NARROW NO-BREAK SPACE
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Mani Ougr
|
||||
|
||||
10AF2 ; Mani Ougr # Po MANICHAEAN PUNCTUATION DOUBLE DOT WITHIN DOT
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Mong Phag
|
||||
|
||||
1802..1803 ; Mong Phag # Po [2] MONGOLIAN COMMA..MONGOLIAN FULL STOP
|
||||
1805 ; Mong Phag # Po MONGOLIAN FOUR DOTS
|
||||
|
||||
# Total code points: 3
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Syrc Thaa
|
||||
|
||||
061C ; Arab Syrc Thaa # Cf ARABIC LETTER MARK
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Thaa Yezi
|
||||
|
||||
0660..0669 ; Arab Thaa Yezi # Nd [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE
|
||||
|
||||
# Total code points: 10
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Beng Cakm Sylo
|
||||
|
||||
09E6..09EF ; Beng Cakm Sylo # Nd [10] BENGALI DIGIT ZERO..BENGALI DIGIT NINE
|
||||
|
||||
# Total code points: 10
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Cakm Mymr Tale
|
||||
|
||||
1040..1049 ; Cakm Mymr Tale # Nd [10] MYANMAR DIGIT ZERO..MYANMAR DIGIT NINE
|
||||
|
||||
# Total code points: 10
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Cpmn Cprt Linb
|
||||
|
||||
10100..10101 ; Cpmn Cprt Linb # Po [2] AEGEAN WORD SEPARATOR LINE..AEGEAN WORD SEPARATOR DOT
|
||||
|
||||
# Total code points: 2
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Cprt Lina Linb
|
||||
|
||||
10107..10133 ; Cprt Lina Linb # No [45] AEGEAN NUMBER ONE..AEGEAN NUMBER NINETY THOUSAND
|
||||
|
||||
# Total code points: 45
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Gran Knda
|
||||
|
||||
1CF4 ; Deva Gran Knda # Mn VEDIC TONE CANDRA ABOVE
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Gran Latn
|
||||
|
||||
20F0 ; Deva Gran Latn # Mn COMBINING ASTERISK ABOVE
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Hani Hira Kana
|
||||
|
||||
303C ; Hani Hira Kana # Lo MASU MARK
|
||||
303D ; Hani Hira Kana # Po PART ALTERNATION MARK
|
||||
|
||||
# Total code points: 2
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Kali Latn Mymr
|
||||
|
||||
A92E ; Kali Latn Mymr # Po KAYAH LI SIGN CWI
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Beng Deva Gran Knda
|
||||
|
||||
1CD0 ; Beng Deva Gran Knda # Mn VEDIC TONE KARSHANA
|
||||
1CD2 ; Beng Deva Gran Knda # Mn VEDIC TONE PRENKHA
|
||||
|
||||
# Total code points: 2
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Buhd Hano Tagb Tglg
|
||||
|
||||
1735..1736 ; Buhd Hano Tagb Tglg # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION
|
||||
|
||||
# Total code points: 2
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Dogr Kthi Mahj
|
||||
|
||||
0966..096F ; Deva Dogr Kthi Mahj # Nd [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE
|
||||
|
||||
# Total code points: 10
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Bopo Hang Hani Hira Kana
|
||||
|
||||
3003 ; Bopo Hang Hani Hira Kana # Po DITTO MARK
|
||||
3013 ; Bopo Hang Hani Hira Kana # So GETA MARK
|
||||
301C ; Bopo Hang Hani Hira Kana # Pd WAVE DASH
|
||||
301D ; Bopo Hang Hani Hira Kana # Ps REVERSED DOUBLE PRIME QUOTATION MARK
|
||||
301E..301F ; Bopo Hang Hani Hira Kana # Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK
|
||||
3030 ; Bopo Hang Hani Hira Kana # Pd WAVY DASH
|
||||
3037 ; Bopo Hang Hani Hira Kana # So IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL
|
||||
FE45..FE46 ; Bopo Hang Hani Hira Kana # Po [2] SESAME DOT..WHITE SESAME DOT
|
||||
|
||||
# Total code points: 10
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Nkoo Rohg Syrc Thaa Yezi
|
||||
|
||||
060C ; Arab Nkoo Rohg Syrc Thaa Yezi # Po ARABIC COMMA
|
||||
061B ; Arab Nkoo Rohg Syrc Thaa Yezi # Po ARABIC SEMICOLON
|
||||
|
||||
# Total code points: 2
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Bopo Hang Hani Hira Kana Yiii
|
||||
|
||||
3001..3002 ; Bopo Hang Hani Hira Kana Yiii # Po [2] IDEOGRAPHIC COMMA..IDEOGRAPHIC FULL STOP
|
||||
3008 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT ANGLE BRACKET
|
||||
3009 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT ANGLE BRACKET
|
||||
300A ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT DOUBLE ANGLE BRACKET
|
||||
300B ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT DOUBLE ANGLE BRACKET
|
||||
300C ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT CORNER BRACKET
|
||||
300D ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT CORNER BRACKET
|
||||
300E ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE CORNER BRACKET
|
||||
300F ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE CORNER BRACKET
|
||||
3010 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT BLACK LENTICULAR BRACKET
|
||||
3011 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT BLACK LENTICULAR BRACKET
|
||||
3014 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT TORTOISE SHELL BRACKET
|
||||
3015 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT TORTOISE SHELL BRACKET
|
||||
3016 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE LENTICULAR BRACKET
|
||||
3017 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE LENTICULAR BRACKET
|
||||
3018 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE TORTOISE SHELL BRACKET
|
||||
3019 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE TORTOISE SHELL BRACKET
|
||||
301A ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE SQUARE BRACKET
|
||||
301B ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE SQUARE BRACKET
|
||||
30FB ; Bopo Hang Hani Hira Kana Yiii # Po KATAKANA MIDDLE DOT
|
||||
FF61 ; Bopo Hang Hani Hira Kana Yiii # Po HALFWIDTH IDEOGRAPHIC FULL STOP
|
||||
FF62 ; Bopo Hang Hani Hira Kana Yiii # Ps HALFWIDTH LEFT CORNER BRACKET
|
||||
FF63 ; Bopo Hang Hani Hira Kana Yiii # Pe HALFWIDTH RIGHT CORNER BRACKET
|
||||
FF64..FF65 ; Bopo Hang Hani Hira Kana Yiii # Po [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDLE DOT
|
||||
|
||||
# Total code points: 26
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Knda Mlym Orya Taml Telu
|
||||
|
||||
1CDA ; Deva Knda Mlym Orya Taml Telu # Mn VEDIC TONE DOUBLE SVARITA
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Adlm Arab Nkoo Rohg Syrc Thaa Yezi
|
||||
|
||||
061F ; Adlm Arab Nkoo Rohg Syrc Thaa Yezi # Po ARABIC QUESTION MARK
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Beng Deva Gran Knda Nand Orya Telu Tirh
|
||||
|
||||
1CF2 ; Beng Deva Gran Knda Nand Orya Telu Tirh # Lo VEDIC SIGN ARDHAVISARGA
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Adlm Arab Mand Mani Ougr Phlp Rohg Sogd Syrc
|
||||
|
||||
0640 ; Adlm Arab Mand Mani Ougr Phlp Rohg Sogd Syrc # Lm ARABIC TATWEEL
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh
|
||||
|
||||
A836..A837 ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # So [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK
|
||||
A838 ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # Sc NORTH INDIC RUPEE MARK
|
||||
A839 ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # So NORTH INDIC QUANTITY MARK
|
||||
|
||||
# Total code points: 4
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh
|
||||
|
||||
0952 ; Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh # Mn DEVANAGARI STRESS SIGN ANUDATTA
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh
|
||||
|
||||
0951 ; Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh # Mn DEVANAGARI STRESS SIGN UDATTA
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Nand Sind Takr Tirh
|
||||
|
||||
A833..A835 ; Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Nand Sind Takr Tirh # No [3] NORTH INDIC FRACTION ONE SIXTEENTH..NORTH INDIC FRACTION THREE SIXTEENTHS
|
||||
|
||||
# Total code points: 3
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Nand Sind Takr Tirh
|
||||
|
||||
A830..A832 ; Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Nand Sind Takr Tirh # No [3] NORTH INDIC FRACTION ONE QUARTER..NORTH INDIC FRACTION THREE QUARTERS
|
||||
|
||||
# Total code points: 3
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh
|
||||
|
||||
0964 ; Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh # Po DEVANAGARI DANDA
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Limb Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh
|
||||
|
||||
0965 ; Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Limb Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh # Po DEVANAGARI DOUBLE DANDA
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# EOF
|
2991
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/Unicode.tables/Scripts.txt
vendored
Normal file
2991
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/Unicode.tables/Scripts.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
34626
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/Unicode.tables/UnicodeData.txt
vendored
Normal file
34626
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/Unicode.tables/UnicodeData.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1297
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/Unicode.tables/emoji-data.txt
vendored
Normal file
1297
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/Unicode.tables/emoji-data.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,141 @@
|
||||
const unsigned char _pcre_default_tables[] = {
|
||||
0,1,2,3,4,5,6,7,
|
||||
8,9,10,11,12,13,14,15,
|
||||
16,17,18,19,20,21,22,23,
|
||||
24,25,26,27,28,29,30,31,
|
||||
32,33,34,35,36,37,38,39,
|
||||
40,41,42,43,44,45,46,47,
|
||||
48,49,50,51,52,53,54,55,
|
||||
56,57,58,59,60,61,62,63,
|
||||
64,97,98,99,100,101,102,103,
|
||||
104,105,106,107,108,109,110,111,
|
||||
112,113,114,115,116,117,118,119,
|
||||
120,121,122,91,92,93,94,95,
|
||||
96,97,98,99,100,101,102,103,
|
||||
104,105,106,107,108,109,110,111,
|
||||
112,113,114,115,116,117,118,119,
|
||||
120,121,122,123,124,125,126,127,
|
||||
128,129,130,131,132,133,134,135,
|
||||
136,137,138,139,140,141,142,143,
|
||||
144,145,146,147,148,149,150,151,
|
||||
152,153,154,155,156,157,158,159,
|
||||
160,161,162,163,164,165,166,167,
|
||||
168,169,170,171,172,173,174,175,
|
||||
176,177,178,179,180,181,182,183,
|
||||
184,185,186,187,188,189,190,191,
|
||||
224,225,226,227,228,229,230,231,
|
||||
232,233,234,235,236,237,238,239,
|
||||
240,241,242,243,244,245,246,215,
|
||||
248,249,250,251,252,253,254,223,
|
||||
224,225,226,227,228,229,230,231,
|
||||
232,233,234,235,236,237,238,239,
|
||||
240,241,242,243,244,245,246,247,
|
||||
248,249,250,251,252,253,254,255,
|
||||
0,1,2,3,4,5,6,7,
|
||||
8,9,10,11,12,13,14,15,
|
||||
16,17,18,19,20,21,22,23,
|
||||
24,25,26,27,28,29,30,31,
|
||||
32,33,34,35,36,37,38,39,
|
||||
40,41,42,43,44,45,46,47,
|
||||
48,49,50,51,52,53,54,55,
|
||||
56,57,58,59,60,61,62,63,
|
||||
64,97,98,99,100,101,102,103,
|
||||
104,105,106,107,108,109,110,111,
|
||||
112,113,114,115,116,117,118,119,
|
||||
120,121,122,91,92,93,94,95,
|
||||
96,65,66,67,68,69,70,71,
|
||||
72,73,74,75,76,77,78,79,
|
||||
80,81,82,83,84,85,86,87,
|
||||
88,89,90,123,124,125,126,127,
|
||||
128,129,130,131,132,133,134,135,
|
||||
136,137,138,139,140,141,142,143,
|
||||
144,145,146,147,148,149,150,151,
|
||||
152,153,154,155,156,157,158,159,
|
||||
160,161,162,163,164,165,166,167,
|
||||
168,169,170,171,172,173,174,175,
|
||||
176,177,178,179,180,181,182,183,
|
||||
184,185,186,187,188,189,190,191,
|
||||
224,225,226,227,228,229,230,231,
|
||||
232,233,234,235,236,237,238,239,
|
||||
240,241,242,243,244,245,246,215,
|
||||
248,249,250,251,252,253,254,223,
|
||||
192,193,194,195,196,197,198,199,
|
||||
200,201,202,203,204,205,206,207,
|
||||
208,209,210,211,212,213,214,247,
|
||||
216,217,218,219,220,221,222,255,
|
||||
0,62,0,0,1,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
32,0,0,0,1,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,255,3,
|
||||
126,0,0,0,126,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,255,3,
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,12,2,
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
254,255,255,7,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
255,255,127,127,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,254,255,255,7,
|
||||
0,0,0,0,0,4,32,4,
|
||||
0,0,0,128,255,255,127,255,
|
||||
0,0,0,0,0,0,255,3,
|
||||
254,255,255,135,254,255,255,7,
|
||||
0,0,0,0,0,4,44,6,
|
||||
255,255,127,255,255,255,127,255,
|
||||
0,0,0,0,254,255,255,255,
|
||||
255,255,255,255,255,255,255,127,
|
||||
0,0,0,0,254,255,255,255,
|
||||
255,255,255,255,255,255,255,255,
|
||||
0,2,0,0,255,255,255,255,
|
||||
255,255,255,255,255,255,255,127,
|
||||
0,0,0,0,255,255,255,255,
|
||||
255,255,255,255,255,255,255,255,
|
||||
0,0,0,0,254,255,0,252,
|
||||
1,0,0,248,1,0,0,120,
|
||||
0,0,0,0,254,255,255,255,
|
||||
0,0,128,0,0,0,128,0,
|
||||
255,255,255,255,0,0,0,0,
|
||||
0,0,0,0,0,0,0,128,
|
||||
255,255,255,255,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
|
||||
/* Fiddled by hand when the table bits changed. May be broken! */
|
||||
|
||||
128,0,0,0,0,0,0,0,
|
||||
0,1,1,1,1,1,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
1,0,0,0,128,0,0,0,
|
||||
128,128,128,128,0,0,128,0,
|
||||
24,24,24,24,24,24,24,24,
|
||||
24,24,0,0,0,0,0,128,
|
||||
0,18,18,18,18,18,18,18,
|
||||
18,18,18,18,18,18,18,18,
|
||||
18,18,18,18,18,18,18,18,
|
||||
18,18,18,128,128,0,128,16,
|
||||
0,18,18,18,18,18,18,18,
|
||||
18,18,18,18,18,18,18,18,
|
||||
18,18,18,18,18,18,18,18,
|
||||
18,18,18,128,128,0,0,0,
|
||||
0,0,0,0,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
1,0,0,0,0,0,0,0,
|
||||
0,0,18,0,0,0,0,0,
|
||||
0,0,24,24,0,18,0,0,
|
||||
0,24,18,0,0,0,0,0,
|
||||
18,18,18,18,18,18,18,18,
|
||||
18,18,18,18,18,18,18,18,
|
||||
18,18,18,18,18,18,18,0,
|
||||
18,18,18,18,18,18,18,18,
|
||||
18,18,18,18,18,18,18,18,
|
||||
18,18,18,18,18,18,18,18,
|
||||
18,18,18,18,18,18,18,0,
|
||||
18,18,18,18,18,18,18,18
|
||||
};
|
1086
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/ucptest.c
vendored
Normal file
1086
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/ucptest.c
vendored
Normal file
File diff suppressed because it is too large
Load Diff
50
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/ucptestdata/testinput1
vendored
Normal file
50
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/ucptestdata/testinput1
vendored
Normal file
@ -0,0 +1,50 @@
|
||||
findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
|
||||
findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
|
||||
findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
|
||||
findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
|
||||
findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f
|
||||
findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f
|
||||
findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f
|
||||
findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f
|
||||
|
||||
findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f
|
||||
findprop 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f
|
||||
findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af
|
||||
findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf
|
||||
findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf
|
||||
findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df
|
||||
findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef
|
||||
findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff
|
||||
|
||||
findprop 0100 0101 0102 0103 0104 0105 0106
|
||||
|
||||
findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7
|
||||
findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
|
||||
findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
|
||||
findprop 10000 10001 e01ef f0000 100000
|
||||
|
||||
findprop 1b00 12000 7c0 a840 10900
|
||||
findprop 1d79 a77d
|
||||
|
||||
findprop 0800 083e a4d0 a4f7 aa80 aadf
|
||||
findprop 10b00 10b35 13000 1342e 10840 10855
|
||||
|
||||
findprop 11100 1113c 11680 116c0
|
||||
|
||||
findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89
|
||||
|
||||
findprop 118a0 11ac7 16ad0
|
||||
|
||||
findprop 11700 14400 108e0 11280 1d800
|
||||
|
||||
findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
|
||||
|
||||
findprop a836 a833 1cf4 20f0 1cd0
|
||||
|
||||
findprop 32ff
|
||||
|
||||
findprop 1f16d
|
||||
|
||||
findprop U+10e93 U+10eaa
|
||||
|
||||
findprop 0602 202a 202b 202c 2068 2069 202d 202e 2067
|
19
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/ucptestdata/testinput2
vendored
Normal file
19
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/ucptestdata/testinput2
vendored
Normal file
@ -0,0 +1,19 @@
|
||||
find script Han
|
||||
find type Pe script Common scriptx Hangul
|
||||
find type Sk
|
||||
find type Pd
|
||||
find gbreak LVT
|
||||
find script Old_Uyghur
|
||||
find bidi PDF
|
||||
find bidi CS
|
||||
find bidi CS type Sm
|
||||
find bidi B
|
||||
find bidi FSI
|
||||
find bidi PDI
|
||||
find bidi RLI
|
||||
find bidi RLO
|
||||
find bidi S
|
||||
find bidi WS
|
||||
find script bopo
|
||||
find bool prependedconcatenationmark
|
||||
find bool pcm
|
409
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/ucptestdata/testoutput1
vendored
Normal file
409
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/ucptestdata/testoutput1
vendored
Normal file
@ -0,0 +1,409 @@
|
||||
findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
|
||||
U+0000 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0001 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0002 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0003 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0004 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0005 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0006 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0007 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0008 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0009 S Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000A B Control: Control, common, LF, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000B S Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000C WS Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000D B Control: Control, common, CR, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000E BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+000F BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
|
||||
U+0010 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0011 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0012 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0013 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0014 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0015 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0016 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0017 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0018 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0019 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+001A BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+001B BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+001C B Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+001D B Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+001E B Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+001F S Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
|
||||
U+0020 WS Separator: Space separator, common, Other, [ascii, emoji, emojicomponent, graphemebase, patternsyntax]
|
||||
U+0021 ON Punctuation: Other punctuation, common, Other, [ascii, caseignorable, graphemebase, patternsyntax, quotationmark]
|
||||
U+0022 ON Punctuation: Other punctuation, common, Other, [ascii, graphemebase, math, patternsyntax]
|
||||
U+0023 ET Punctuation: Other punctuation, common, Other, [ascii, dash, graphemebase, patternsyntax]
|
||||
U+0024 ET Symbol: Currency symbol, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
U+0025 ET Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
U+0026 ON Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
U+0027 ON Punctuation: Other punctuation, common, Other, [ascii, bidimirrored, graphemebase, math, patternsyntax]
|
||||
U+0028 ON Punctuation: Open punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+0029 ON Punctuation: Close punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+002A ON Punctuation: Other punctuation, common, Other, [ascii, dash, graphemebase, patternsyntax]
|
||||
U+002B ES Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
|
||||
U+002C CS Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+002D ES Punctuation: Dash punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
|
||||
U+002E CS Punctuation: Other punctuation, common, Other, [graphemebase, whitespace]
|
||||
U+002F CS Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
|
||||
U+0030 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0031 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0032 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0033 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0034 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0035 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0036 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0037 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0038 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+0039 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
|
||||
U+003A CS Punctuation: Other punctuation, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+003B ON Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+003C ON Symbol: Mathematical symbol, common, Other, [graphemebase, math, patternsyntax]
|
||||
U+003D ON Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
|
||||
U+003E ON Symbol: Mathematical symbol, common, Other, [graphemebase, math, patternsyntax]
|
||||
U+003F ON Punctuation: Other punctuation, common, Other, [ascii, caseignorable, graphemebase, patternsyntax, quotationmark]
|
||||
findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f
|
||||
U+0040 ON Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
U+0041 L Letter: Upper case letter, latin, Other, U+0061, [graphemebase]
|
||||
U+0042 L Letter: Upper case letter, latin, Other, U+0062, [graphemebase]
|
||||
U+0043 L Letter: Upper case letter, latin, Other, U+0063, [graphemebase]
|
||||
U+0044 L Letter: Upper case letter, latin, Other, U+0064, [graphemebase]
|
||||
U+0045 L Letter: Upper case letter, latin, Other, U+0065, [graphemebase]
|
||||
U+0046 L Letter: Upper case letter, latin, Other, U+0066, [graphemebase]
|
||||
U+0047 L Letter: Upper case letter, latin, Other, U+0067, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0048 L Letter: Upper case letter, latin, Other, U+0068, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0049 L Letter: Upper case letter, latin, Other, U+0069, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+004A L Letter: Upper case letter, latin, Other, U+006A, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+004B L Letter: Upper case letter, latin, Other, U+006B, U+212A, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+004C L Letter: Upper case letter, latin, Other, U+006C, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+004D L Letter: Upper case letter, latin, Other, U+006D, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+004E L Letter: Upper case letter, latin, Other, U+006E, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+004F L Letter: Upper case letter, latin, Other, U+006F, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f
|
||||
U+0050 L Letter: Upper case letter, latin, Other, U+0070, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0051 L Letter: Upper case letter, latin, Other, U+0071, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0052 L Letter: Upper case letter, latin, Other, U+0072, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0053 L Letter: Upper case letter, latin, Other, U+0073, U+017F, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0054 L Letter: Upper case letter, latin, Other, U+0074, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0055 L Letter: Upper case letter, latin, Other, U+0075, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0056 L Letter: Upper case letter, latin, Other, U+0076, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0057 L Letter: Upper case letter, latin, Other, U+0077, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0058 L Letter: Upper case letter, latin, Other, U+0078, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+0059 L Letter: Upper case letter, latin, Other, U+0079, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+005A L Letter: Upper case letter, latin, Other, U+007A, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
|
||||
U+005B ON Punctuation: Open punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+005C ON Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
U+005D ON Punctuation: Close punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+005E ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+005F ON Punctuation: Connector punctuation, common, Other, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, deprecated, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f
|
||||
U+0060 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, changeswhentitlecased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+0061 L Letter: Lower case letter, latin, Other, U+0041, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0062 L Letter: Lower case letter, latin, Other, U+0042, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0063 L Letter: Lower case letter, latin, Other, U+0043, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0064 L Letter: Lower case letter, latin, Other, U+0044, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0065 L Letter: Lower case letter, latin, Other, U+0045, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0066 L Letter: Lower case letter, latin, Other, U+0046, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0067 L Letter: Lower case letter, latin, Other, U+0047, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0068 L Letter: Lower case letter, latin, Other, U+0048, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0069 L Letter: Lower case letter, latin, Other, U+0049, [caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+006A L Letter: Lower case letter, latin, Other, U+004A, [caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+006B L Letter: Lower case letter, latin, Other, U+004B, U+212A, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+006C L Letter: Lower case letter, latin, Other, U+004C, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+006D L Letter: Lower case letter, latin, Other, U+004D, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+006E L Letter: Lower case letter, latin, Other, U+004E, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+006F L Letter: Lower case letter, latin, Other, U+004F, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f
|
||||
U+0070 L Letter: Lower case letter, latin, Other, U+0050, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0071 L Letter: Lower case letter, latin, Other, U+0051, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0072 L Letter: Lower case letter, latin, Other, U+0052, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0073 L Letter: Lower case letter, latin, Other, U+0053, U+017F, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0074 L Letter: Lower case letter, latin, Other, U+0054, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0075 L Letter: Lower case letter, latin, Other, U+0055, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0076 L Letter: Lower case letter, latin, Other, U+0056, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0077 L Letter: Lower case letter, latin, Other, U+0057, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0078 L Letter: Lower case letter, latin, Other, U+0058, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+0079 L Letter: Lower case letter, latin, Other, U+0059, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+007A L Letter: Lower case letter, latin, Other, U+005A, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+007B ON Punctuation: Open punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+007C ON Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
|
||||
U+007D ON Punctuation: Close punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+007E ON Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
|
||||
U+007F BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
|
||||
findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f
|
||||
U+0080 BN Control: Control, common, Control
|
||||
U+0081 BN Control: Control, common, Control
|
||||
U+0082 BN Control: Control, common, Control
|
||||
U+0083 BN Control: Control, common, Control
|
||||
U+0084 BN Control: Control, common, Control
|
||||
U+0085 B Control: Control, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
|
||||
U+0086 BN Control: Control, common, Control
|
||||
U+0087 BN Control: Control, common, Control
|
||||
U+0088 BN Control: Control, common, Control
|
||||
U+0089 BN Control: Control, common, Control
|
||||
U+008A BN Control: Control, common, Control
|
||||
U+008B BN Control: Control, common, Control
|
||||
U+008C BN Control: Control, common, Control
|
||||
U+008D BN Control: Control, common, Control
|
||||
U+008E BN Control: Control, common, Control
|
||||
U+008F BN Control: Control, common, Control
|
||||
findprop 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f
|
||||
U+0090 BN Control: Control, common, Control
|
||||
U+0091 BN Control: Control, common, Control
|
||||
U+0092 BN Control: Control, common, Control
|
||||
U+0093 BN Control: Control, common, Control
|
||||
U+0094 BN Control: Control, common, Control
|
||||
U+0095 BN Control: Control, common, Control
|
||||
U+0096 BN Control: Control, common, Control
|
||||
U+0097 BN Control: Control, common, Control
|
||||
U+0098 BN Control: Control, common, Control
|
||||
U+0099 BN Control: Control, common, Control
|
||||
U+009A BN Control: Control, common, Control
|
||||
U+009B BN Control: Control, common, Control
|
||||
U+009C BN Control: Control, common, Control
|
||||
U+009D BN Control: Control, common, Control
|
||||
U+009E BN Control: Control, common, Control
|
||||
U+009F BN Control: Control, common, Control
|
||||
findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af
|
||||
U+00A0 CS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
U+00A1 ON Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A2 ET Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A3 ET Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A4 ET Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A5 ET Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A6 ON Symbol: Other symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A7 ON Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00A8 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00A9 ON Symbol: Other symbol, common, Extended Pictographic, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00AA L Letter: Other letter, latin, Other, [caseignorable, graphemeextend]
|
||||
U+00AB ON Punctuation: Initial punctuation, common, Other, [graphemebase, sentenceterminal, terminalpunctuation]
|
||||
U+00AC ON Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00AD BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+00AE ON Symbol: Other symbol, common, Extended Pictographic, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00AF ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf
|
||||
U+00B0 ET Symbol: Other symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00B1 ET Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00B2 EN Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00B3 EN Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00B4 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00B5 L Letter: Lower case letter, common, Other, U+03BC, U+039C, [alphabetic, deprecated, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00B6 ON Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
U+00B7 ON Punctuation: Other punctuation, common, Other, [alphabetic, graphemebase, idcontinue, xidcontinue]
|
||||
U+00B8 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00B9 EN Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00BA L Letter: Other letter, latin, Other, [caseignorable, graphemeextend]
|
||||
U+00BB ON Punctuation: Final punctuation, common, Other, [graphemebase, sentenceterminal, terminalpunctuation]
|
||||
U+00BC ON Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00BD ON Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00BE ON Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+00BF ON Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
|
||||
findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf
|
||||
U+00C0 L Letter: Upper case letter, latin, Other, U+00E0, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C1 L Letter: Upper case letter, latin, Other, U+00E1, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C2 L Letter: Upper case letter, latin, Other, U+00E2, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C3 L Letter: Upper case letter, latin, Other, U+00E3, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C4 L Letter: Upper case letter, latin, Other, U+00E4, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C5 L Letter: Upper case letter, latin, Other, U+00E5, U+212B, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C6 L Letter: Upper case letter, latin, Other, U+00E6, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C7 L Letter: Upper case letter, latin, Other, U+00E7, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C8 L Letter: Upper case letter, latin, Other, U+00E8, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00C9 L Letter: Upper case letter, latin, Other, U+00E9, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00CA L Letter: Upper case letter, latin, Other, U+00EA, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00CB L Letter: Upper case letter, latin, Other, U+00EB, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00CC L Letter: Upper case letter, latin, Other, U+00EC, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00CD L Letter: Upper case letter, latin, Other, U+00ED, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00CE L Letter: Upper case letter, latin, Other, U+00EE, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00CF L Letter: Upper case letter, latin, Other, U+00EF, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df
|
||||
U+00D0 L Letter: Upper case letter, latin, Other, U+00F0, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D1 L Letter: Upper case letter, latin, Other, U+00F1, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D2 L Letter: Upper case letter, latin, Other, U+00F2, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D3 L Letter: Upper case letter, latin, Other, U+00F3, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D4 L Letter: Upper case letter, latin, Other, U+00F4, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D5 L Letter: Upper case letter, latin, Other, U+00F5, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D6 L Letter: Upper case letter, latin, Other, U+00F6, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D7 ON Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D8 L Letter: Upper case letter, latin, Other, U+00F8, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00D9 L Letter: Upper case letter, latin, Other, U+00F9, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00DA L Letter: Upper case letter, latin, Other, U+00FA, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00DB L Letter: Upper case letter, latin, Other, U+00FB, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00DC L Letter: Upper case letter, latin, Other, U+00FC, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00DD L Letter: Upper case letter, latin, Other, U+00FD, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00DE L Letter: Upper case letter, latin, Other, U+00FE, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00DF L Letter: Lower case letter, latin, Other, U+1E9E, [alphabetic, deprecated, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef
|
||||
U+00E0 L Letter: Lower case letter, latin, Other, U+00C0, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E1 L Letter: Lower case letter, latin, Other, U+00C1, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E2 L Letter: Lower case letter, latin, Other, U+00C2, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E3 L Letter: Lower case letter, latin, Other, U+00C3, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E4 L Letter: Lower case letter, latin, Other, U+00C4, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E5 L Letter: Lower case letter, latin, Other, U+00C5, U+212B, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E6 L Letter: Lower case letter, latin, Other, U+00C6, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E7 L Letter: Lower case letter, latin, Other, U+00C7, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E8 L Letter: Lower case letter, latin, Other, U+00C8, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00E9 L Letter: Lower case letter, latin, Other, U+00C9, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00EA L Letter: Lower case letter, latin, Other, U+00CA, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00EB L Letter: Lower case letter, latin, Other, U+00CB, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00EC L Letter: Lower case letter, latin, Other, U+00CC, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00ED L Letter: Lower case letter, latin, Other, U+00CD, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00EE L Letter: Lower case letter, latin, Other, U+00CE, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00EF L Letter: Lower case letter, latin, Other, U+00CF, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff
|
||||
U+00F0 L Letter: Lower case letter, latin, Other, U+00D0, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F1 L Letter: Lower case letter, latin, Other, U+00D1, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F2 L Letter: Lower case letter, latin, Other, U+00D2, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F3 L Letter: Lower case letter, latin, Other, U+00D3, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F4 L Letter: Lower case letter, latin, Other, U+00D4, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F5 L Letter: Lower case letter, latin, Other, U+00D5, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F6 L Letter: Lower case letter, latin, Other, U+00D6, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F7 ON Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+00F8 L Letter: Lower case letter, latin, Other, U+00D8, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00F9 L Letter: Lower case letter, latin, Other, U+00D9, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00FA L Letter: Lower case letter, latin, Other, U+00DA, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00FB L Letter: Lower case letter, latin, Other, U+00DB, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00FC L Letter: Lower case letter, latin, Other, U+00DC, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00FD L Letter: Lower case letter, latin, Other, U+00DD, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00FE L Letter: Lower case letter, latin, Other, U+00DE, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+00FF L Letter: Lower case letter, latin, Other, U+0178, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
|
||||
findprop 0100 0101 0102 0103 0104 0105 0106
|
||||
U+0100 L Letter: Upper case letter, latin, Other, U+0101, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+0101 L Letter: Lower case letter, latin, Other, U+0100, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+0102 L Letter: Upper case letter, latin, Other, U+0103, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+0103 L Letter: Lower case letter, latin, Other, U+0102, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+0104 L Letter: Upper case letter, latin, Other, U+0105, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+0105 L Letter: Lower case letter, latin, Other, U+0104, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+0106 L Letter: Upper case letter, latin, Other, U+0107, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
|
||||
findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7
|
||||
U+FFE0 ET Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFE1 ET Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFE2 ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
|
||||
U+FFE3 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+FFE4 ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFE5 ET Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFE6 ET Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFE7 L Control: Unassigned, unknown, Other
|
||||
findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
|
||||
U+FFE8 ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFE9 ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
|
||||
U+FFEA ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
|
||||
U+FFEB ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
|
||||
U+FFEC ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
|
||||
U+FFED ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFEE ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFEF L Control: Unassigned, unknown, Other
|
||||
findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
|
||||
U+FFF8 BN Control: Unassigned, unknown, Control, [dash, defaultignorablecodepoint, deprecated, extendedpictographic, joincontrol, lowercase, patternwhitespace, quotationmark, sentenceterminal, softdotted, xidcontinue, xidstart]
|
||||
U+FFF9 ON Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FFFA ON Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FFFB ON Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FFFC ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFFD ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FFFE BN Control: Unassigned, unknown, Other, [changeswhenuppercased, deprecated, emojicomponent, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FFFF BN Control: Unassigned, unknown, Other, [changeswhenuppercased, deprecated, emojicomponent, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
findprop 10000 10001 e01ef f0000 100000
|
||||
U+10000 L Letter: Other letter, linearb, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10001 L Letter: Other letter, linearb, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+E01EF NSM Mark: Non-spacing mark, inherited, Extend, []
|
||||
U+F0000 L Control: Private use, unknown, Other
|
||||
U+100000 L Control: Private use, unknown, Other
|
||||
|
||||
findprop 1b00 12000 7c0 a840 10900
|
||||
U+1B00 NSM Mark: Non-spacing mark, balinese, Extend, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, uppercase]
|
||||
U+12000 L Letter: Other letter, cuneiform, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+07C0 R Number: Decimal number, nko, Other, [graphemebase, patternsyntax, terminalpunctuation]
|
||||
U+A840 L Letter: Other letter, phagspa, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10900 R Letter: Other letter, phoenician, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
findprop 1d79 a77d
|
||||
U+1D79 L Letter: Lower case letter, latin, Other, U+A77D, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
|
||||
U+A77D L Letter: Upper case letter, latin, Other, U+1D79, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
|
||||
findprop 0800 083e a4d0 a4f7 aa80 aadf
|
||||
U+0800 R Letter: Other letter, samaritan, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+083E R Punctuation: Other punctuation, samaritan, Other, [bidimirrored, graphemebase, math, patternsyntax]
|
||||
U+A4D0 L Letter: Other letter, lisu, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+A4F7 L Letter: Other letter, lisu, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AA80 L Letter: Other letter, taiviet, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AADF L Punctuation: Other punctuation, taiviet, Other, [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
findprop 10b00 10b35 13000 1342e 10840 10855
|
||||
U+10B00 R Letter: Other letter, avestan, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10B35 R Letter: Other letter, avestan, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+13000 L Letter: Other letter, egyptianhieroglyphs, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+1342E L Letter: Other letter, egyptianhieroglyphs, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10840 R Letter: Other letter, imperialaramaic, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10855 R Letter: Other letter, imperialaramaic, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
|
||||
findprop 11100 1113c 11680 116c0
|
||||
U+11100 NSM Mark: Non-spacing mark, chakma, Extend, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, uppercase]
|
||||
U+1113C L Number: Decimal number, chakma, Other, [graphemebase, patternsyntax, terminalpunctuation]
|
||||
U+11680 L Letter: Other letter, takri, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+116C0 L Number: Decimal number, takri, Other, [graphemebase, patternsyntax, terminalpunctuation]
|
||||
|
||||
findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89
|
||||
U+000D B Control: Control, common, CR, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000A B Control: Control, common, LF, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000E BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0711 NSM Mark: Non-spacing mark, syriac, Extend, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, uppercase]
|
||||
U+1B04 L Mark: Spacing mark, balinese, SpacingMark, [dash, emoji, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+1111 L Letter: Other letter, hangul, Hangul syllable type L, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+1169 L Letter: Other letter, hangul, Hangul syllable type V, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+11FE L Letter: Other letter, hangul, Hangul syllable type T, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AE4C L Letter: Other letter, hangul, Hangul syllable type LV, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AD89 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
|
||||
findprop 118a0 11ac7 16ad0
|
||||
U+118A0 L Letter: Upper case letter, warangciti, Other, U+118C0, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+11AC7 L Letter: Other letter, paucinhau, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+16AD0 L Letter: Other letter, bassavah, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
|
||||
findprop 11700 14400 108e0 11280 1d800
|
||||
U+11700 L Letter: Other letter, ahom, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+14400 L Letter: Other letter, anatolianhieroglyphs, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+108E0 R Letter: Other letter, hatran, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+11280 L Letter: Other letter, multani, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+1D800 L Symbol: Other symbol, signwriting, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
|
||||
findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
|
||||
U+11800 L Letter: Other letter, dogra, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+1E903 R Letter: Upper case letter, adlam, Other, U+1E925, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+11DA9 L Number: Decimal number, gunjalagondi, Other, [graphemebase, patternsyntax, terminalpunctuation]
|
||||
U+10D27 NSM Mark: Non-spacing mark, hanifirohingya, Extend, [extendedpictographic, graphemebase, patternsyntax]
|
||||
U+11EE0 L Letter: Other letter, makasar, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+16E48 L Letter: Upper case letter, medefaidrin, Other, U+16E68, [alphabetic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+10F27 R Letter: Other letter, oldsogdian, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10F30 AL Letter: Other letter, sogdian, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
|
||||
findprop a836 a833 1cf4 20f0 1cd0
|
||||
U+A836 L Symbol: Other symbol, common, Other, [devanagari, gurmukhi, gujarati, kaithi, takri, khojki, mahajani, modi, khudawadi, tirhuta, dogra], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+A833 L Number: Other number, common, Other, [devanagari, gurmukhi, gujarati, kannada, kaithi, takri, khojki, mahajani, modi, khudawadi, tirhuta, dogra, nandinagari], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+1CF4 NSM Mark: Non-spacing mark, inherited, Extend, [devanagari, kannada, grantha], [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
|
||||
U+20F0 NSM Mark: Non-spacing mark, inherited, Extend, [latin, devanagari, grantha], [caseignorable, graphemebase, patternsyntax, quotationmark]
|
||||
U+1CD0 NSM Mark: Non-spacing mark, inherited, Extend, [devanagari, bengali, kannada, grantha], [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
|
||||
|
||||
findprop 32ff
|
||||
U+32FF L Symbol: Other symbol, common, Other, [han], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
|
||||
findprop 1f16d
|
||||
U+1F16D ON Symbol: Other symbol, common, Extended Pictographic, [ascii, sentenceterminal, unifiedideograph, whitespace, xidcontinue]
|
||||
|
||||
findprop U+10e93 U+10eaa
|
||||
U+10E93 R Letter: Other letter, yezidi, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10EAA R Control: Unassigned, unknown, Other
|
||||
|
||||
findprop 0602 202a 202b 202c 2068 2069 202d 202e 2067
|
||||
U+0602 AN Control: Format, arabic, Prepend, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, lowercase]
|
||||
U+202A LRE Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+202B RLE Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+202C PDF Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+2068 FSI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+2069 PDI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+202D LRO Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+202E RLO Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
U+2067 RLI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
298
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/ucptestdata/testoutput2
vendored
Normal file
298
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/ucptestdata/testoutput2
vendored
Normal file
@ -0,0 +1,298 @@
|
||||
find script Han
|
||||
U+2E80..U+2E99 ON Symbol: Other symbol, han, Other, [ascii, sentenceterminal, unifiedideograph, whitespace, xidstart]
|
||||
U+2E9B..U+2EF3 ON Symbol: Other symbol, han, Other, [ascii, sentenceterminal, unifiedideograph, whitespace, xidstart]
|
||||
U+2F00..U+2FD5 ON Symbol: Other symbol, han, Other, [ascii, sentenceterminal, unifiedideograph, whitespace, xidstart]
|
||||
U+3005 L Letter: Modifier letter, han, Other, [emoji, emojimodifierbase, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+3007 L Number: Letter number, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+3021..U+3029 L Number: Letter number, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+3038..U+303A L Number: Letter number, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+303B L Letter: Modifier letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
|
||||
U+3400..U+4DBF L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+4E00..U+9FFF L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+F900..U+FA0D L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA0E..U+FA0F L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA10 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA11 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA12 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA13..U+FA14 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA15..U+FA1E L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA1F L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA20 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA21 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA22 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA23..U+FA24 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA25..U+FA26 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA27..U+FA29 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FA2A..U+FA6D L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FA70..U+FAD9 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+16FE2 ON Punctuation: Other punctuation, han, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+16FE3 L Letter: Modifier letter, han, Other, [emoji, emojimodifierbase, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+16FF0..U+16FF1 L Mark: Spacing mark, han, SpacingMark, [caseignorable, graphemeextend, idcontinue, ideographic, xidcontinue]
|
||||
U+20000..U+2A6DF L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+2A700..U+2B738 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+2B740..U+2B81D L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+2B820..U+2CEA1 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+2CEB0..U+2EBE0 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+2F800..U+2FA1D L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
|
||||
U+30000..U+3134A L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
find type Pe script Common scriptx Hangul
|
||||
U+3009 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+300B ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+300D ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [graphemebase, sentenceterminal, terminalpunctuation]
|
||||
U+300F ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [graphemebase, sentenceterminal, terminalpunctuation]
|
||||
U+3011 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+3015 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+3017 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+3019 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+301B ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
|
||||
U+301E..U+301F ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han], [softdotted, terminalpunctuation, unifiedideograph, xidcontinue, xidstart]
|
||||
U+FF63 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [changeswhencasemapped, changeswhenlowercased, changeswhentitlecased, emojimodifier, emojimodifierbase]
|
||||
find type Sk
|
||||
U+005E ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+0060 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, changeswhentitlecased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
|
||||
U+00A8 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00AF ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00B4 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+00B8 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+02C2..U+02C5 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+02D2..U+02DF ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+02E5..U+02E9 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+02EA..U+02EB ON Symbol: Modifier symbol, bopomofo, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+02ED ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+02EF..U+02FF ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+0375 ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+0384 ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+0385 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+0888 AL Symbol: Modifier symbol, arabic, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, math, softdotted, xidcontinue, xidstart]
|
||||
U+1FBD ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+1FBF..U+1FC1 ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+1FCD..U+1FCF ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+1FDD..U+1FDF ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+1FED..U+1FEF ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+1FFD..U+1FFE ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+309B..U+309C ON Symbol: Modifier symbol, common, Other, [hiragana, katakana], [alphabetic, bidimirrored, caseignorable, cased, changeswhencasefolded, changeswhenlowercased, changeswhentitlecased, changeswhenuppercased, dash, defaultignorablecodepoint, deprecated, diacritic, emoji, emojicomponent, emojimodifier, emojimodifierbase, emojipresentation, extendedpictographic, extender, graphemebase, graphemeextend, graphemelink, hexdigit, idsbinaryoperator, idstrinaryoperator, idcontinue, idstart, ideographic, sentenceterminal, unifiedideograph, whitespace, xidcontinue]
|
||||
U+A700..U+A707 ON Symbol: Modifier symbol, common, Other, [latin, han], [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+A708..U+A716 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+A720..U+A721 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+A789..U+A78A L Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+AB5B L Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+AB6A..U+AB6B ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+FBB2..U+FBC2 AL Symbol: Modifier symbol, arabic, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, math, softdotted, xidcontinue, xidstart]
|
||||
U+FF3E ON Symbol: Modifier symbol, common, Other, [asciihexdigit, bidicontrol, bidimirrored, cased, changeswhencasefolded, sentenceterminal, unifiedideograph, whitespace, xidstart]
|
||||
U+FF40 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+FFE3 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+1F3FB..U+1F3FF ON Symbol: Modifier symbol, common, Extend, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, math, patternsyntax, radical, sentenceterminal, terminalpunctuation]
|
||||
find type Pd
|
||||
U+002D ES Punctuation: Dash punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
|
||||
U+058A ON Punctuation: Dash punctuation, armenian, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+05BE R Punctuation: Dash punctuation, hebrew, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+1400 ON Punctuation: Dash punctuation, canadianaboriginal, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+1806 ON Punctuation: Dash punctuation, mongolian, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+2010..U+2015 ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+2E17 ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+2E1A ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+2E3A..U+2E3B ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+2E40 ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+2E5D ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+301C ON Punctuation: Dash punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han], [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
|
||||
U+3030 ON Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, lowercase, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+30A0 ON Punctuation: Dash punctuation, common, Other, [hiragana, katakana], [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+FE31..U+FE32 ON Punctuation: Dash punctuation, common, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+FE58 ON Punctuation: Dash punctuation, common, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+FE63 ES Punctuation: Dash punctuation, common, Other, [caseignorable, sentenceterminal, unifiedideograph, xidcontinue]
|
||||
U+FF0D ES Punctuation: Dash punctuation, common, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
U+10EAD R Punctuation: Dash punctuation, yezidi, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
|
||||
find gbreak LVT
|
||||
U+AC01..U+AC1B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AC1D..U+AC37 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AC39..U+AC53 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AC55..U+AC6F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AC71..U+AC8B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AC8D..U+ACA7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ACA9..U+ACC3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ACC5..U+ACDF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ACE1..U+ACFB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ACFD..U+AD17 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AD19..U+AD33 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AD35..U+AD4F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AD51..U+AD6B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AD6D..U+AD87 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AD89..U+ADA3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ADA5..U+ADBF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ADC1..U+ADDB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ADDD..U+ADF7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+ADF9..U+AE13 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AE15..U+AE2F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AE31..U+AE4B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AE4D..U+AE67 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AE69..U+AE83 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AE85..U+AE9F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AEA1..U+AEBB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AEBD..U+AED7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AED9..U+AEF3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AEF5..U+AF0F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AF11..U+AF2B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AF2D..U+AF47 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AF49..U+AF63 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AF65..U+AF7F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AF81..U+AF9B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AF9D..U+AFB7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AFB9..U+AFD3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AFD5..U+AFEF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+AFF1..U+B00B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B00D..U+B027 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B029..U+B043 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B045..U+B05F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B061..U+B07B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B07D..U+B097 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B099..U+B0B3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B0B5..U+B0CF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B0D1..U+B0EB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B0ED..U+B107 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B109..U+B123 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B125..U+B13F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B141..U+B15B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B15D..U+B177 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B179..U+B193 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B195..U+B1AF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B1B1..U+B1CB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B1CD..U+B1E7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B1E9..U+B203 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B205..U+B21F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B221..U+B23B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B23D..U+B257 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B259..U+B273 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B275..U+B28F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B291..U+B2AB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B2AD..U+B2C7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B2C9..U+B2E3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B2E5..U+B2FF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B301..U+B31B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B31D..U+B337 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B339..U+B353 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B355..U+B36F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B371..U+B38B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B38D..U+B3A7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B3A9..U+B3C3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B3C5..U+B3DF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B3E1..U+B3FB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B3FD..U+B417 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B419..U+B433 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B435..U+B44F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B451..U+B46B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B46D..U+B487 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B489..U+B4A3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B4A5..U+B4BF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B4C1..U+B4DB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B4DD..U+B4F7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B4F9..U+B513 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B515..U+B52F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B531..U+B54B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B54D..U+B567 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B569..U+B583 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B585..U+B59F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B5A1..U+B5BB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B5BD..U+B5D7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B5D9..U+B5F3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B5F5..U+B60F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B611..U+B62B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B62D..U+B647 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B649..U+B663 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B665..U+B67F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B681..U+B69B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B69D..U+B6B7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B6B9..U+B6D3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+B6D5..U+B6EF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
...
|
||||
find script Old_Uyghur
|
||||
U+10F70..U+10F81 R Letter: Other letter, olduyghur, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+10F82..U+10F85 NSM Mark: Non-spacing mark, olduyghur, Extend, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
|
||||
U+10F86..U+10F89 R Punctuation: Other punctuation, olduyghur, Other, [bidimirrored, graphemebase, math, patternsyntax]
|
||||
find bidi PDF
|
||||
U+202C PDF Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
find bidi CS
|
||||
U+002C CS Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+002E CS Punctuation: Other punctuation, common, Other, [graphemebase, whitespace]
|
||||
U+002F CS Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
|
||||
U+003A CS Punctuation: Other punctuation, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
|
||||
U+00A0 CS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
U+060C CS Punctuation: Other punctuation, common, Other, [arabic, syriac, thaana, nko, hanifirohingya, yezidi], [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+202F CS Separator: Space separator, common, Other, [latin, mongolian], [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
U+2044 CS Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
U+FE50 CS Punctuation: Other punctuation, common, Other, [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+FE52 CS Punctuation: Other punctuation, common, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FE55 CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+FF0C CS Punctuation: Other punctuation, common, Other, [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+FF0E CS Punctuation: Other punctuation, common, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
|
||||
U+FF0F CS Punctuation: Other punctuation, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
|
||||
U+FF1A CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
find bidi CS type Sm
|
||||
U+2044 CS Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
|
||||
find bidi B
|
||||
U+000A B Control: Control, common, LF, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000D B Control: Control, common, CR, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+001C..U+001E B Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
U+0085 B Control: Control, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
|
||||
U+2029 B Separator: Paragraph separator, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
|
||||
find bidi FSI
|
||||
U+2068 FSI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
find bidi PDI
|
||||
U+2069 PDI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
find bidi RLI
|
||||
U+2067 RLI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
find bidi RLO
|
||||
U+202E RLO Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
|
||||
find bidi S
|
||||
U+0009 S Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+000B S Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+001F S Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
|
||||
find bidi WS
|
||||
U+000C WS Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
|
||||
U+0020 WS Separator: Space separator, common, Other, [ascii, emoji, emojicomponent, graphemebase, patternsyntax]
|
||||
U+1680 WS Separator: Space separator, ogham, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
U+2000..U+200A WS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
U+2028 WS Separator: Line separator, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
|
||||
U+205F WS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
U+3000 WS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
|
||||
find script bopo
|
||||
U+02EA..U+02EB ON Symbol: Modifier symbol, bopomofo, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
|
||||
U+3105..U+312F L Letter: Other letter, bopomofo, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
U+31A0..U+31BF L Letter: Other letter, bopomofo, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
|
||||
find bool prependedconcatenationmark
|
||||
U+00AD BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+180E BN Control: Format, mongolian, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+200B BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+2060 BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+2118 ON Symbol: Mathematical symbol, common, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+3030 ON Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, lowercase, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+AAC0 L Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+AAC2 L Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+FE0F NSM Mark: Non-spacing mark, inherited, Extend, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+FE55 CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+FEFF BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+FF1A CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+FF21..U+FF26 L Letter: Upper case letter, latin, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+10D22..U+10D23 AL Letter: Other letter, hanifirohingya, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+1135D L Letter: Other letter, grantha, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, graphemeextend, hexdigit, logicalorderexception, lowercase, math, noncharactercodepoint, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+1BCA0..U+1BCA3 BN Control: Format, common, Control, [duployan], [caseignorable, prependedconcatenationmark]
|
||||
U+1D173..U+1D17A BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+1F1E6..U+1F1FF L Symbol: Other symbol, common, Regional Indicator, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, math, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
find bool pcm
|
||||
U+00AD BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+180E BN Control: Format, mongolian, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+200B BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+2060 BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+2118 ON Symbol: Mathematical symbol, common, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+3030 ON Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, lowercase, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+AAC0 L Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+AAC2 L Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+FE0F NSM Mark: Non-spacing mark, inherited, Extend, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+FE55 CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+FEFF BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+FF1A CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+FF21..U+FF26 L Letter: Upper case letter, latin, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
|
||||
U+10D22..U+10D23 AL Letter: Other letter, hanifirohingya, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
|
||||
U+1135D L Letter: Other letter, grantha, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, graphemeextend, hexdigit, logicalorderexception, lowercase, math, noncharactercodepoint, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
||||
U+1BCA0..U+1BCA3 BN Control: Format, common, Control, [duployan], [caseignorable, prependedconcatenationmark]
|
||||
U+1D173..U+1D17A BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
|
||||
U+1F1E6..U+1F1FF L Symbol: Other symbol, common, Regional Indicator, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, math, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
|
347
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/utf8.c
vendored
Normal file
347
Kha/Backends/Kinc-hxcpp/khacpp/project/thirdparty/pcre2-10.42-16/maint/utf8.c
vendored
Normal file
@ -0,0 +1,347 @@
|
||||
/****************************************************
|
||||
* PCRE maintainers' helper program: UTF-8 converter *
|
||||
****************************************************/
|
||||
|
||||
/* This is a test program for converting character code points to UTF-8 and
|
||||
vice versa. Note that this program conforms to the original definition of
|
||||
UTF-8, which allows codepoints up to 7fffffff. The more recent definition
|
||||
limits the validity of Unicode UTF-8 codepoints to a maximum of 10ffffff, and
|
||||
forbids the "surrogate" code points. This program now gives warnings for these
|
||||
invalid code points.
|
||||
|
||||
The arguments are either single code point values written as U+hh.. or 0xhh..
|
||||
for conversion to UTF-8, or sequences of hex values, written without 0x and
|
||||
optionally including spaces (but such arguments must be quoted), for conversion
|
||||
from UTF-8 to codepoints. For example:
|
||||
|
||||
./utf8 0x1234
|
||||
U+00001234 => e1 88 b4
|
||||
|
||||
./utf8 "e1 88 b4"
|
||||
U+00001234 <= e1 88 b4
|
||||
|
||||
In the second case, a number of UTF-8 characters can be present in one
|
||||
argument. In other words, each such argument is interpreted (after ignoring
|
||||
spaces) as a string of UTF-8 bytes representing a string of characters:
|
||||
|
||||
./utf8 "65 e188b4 77"
|
||||
0x00000065 <= 65
|
||||
0x00001234 <= e1 88 b4
|
||||
0x00000077 <= 77
|
||||
|
||||
If the option -s is given, the sequence of UTF-bytes is written out between
|
||||
angle brackets at the end of the line. On a UTF-8 terminal, this will show the
|
||||
appropriate graphic for the code point.
|
||||
|
||||
Errors provoke error messages, but the program carries on with the next
|
||||
argument. The return code is always zero.
|
||||
|
||||
Philip Hazel
|
||||
Original creation data: unknown
|
||||
Code extended and tidied to avoid compiler warnings: 26 March 2020
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
|
||||
/* The valid ranges for UTF-8 characters are:
|
||||
|
||||
0000 0000 to 0000 007f 1 byte (ascii)
|
||||
0000 0080 to 0000 07ff 2 bytes
|
||||
0000 0800 to 0000 ffff 3 bytes
|
||||
0001 0000 to 001f ffff 4 bytes
|
||||
0020 0000 to 03ff ffff 5 bytes
|
||||
0400 0000 to 7fff ffff 6 bytes
|
||||
*/
|
||||
|
||||
|
||||
static const unsigned int utf8_table1[] = {
|
||||
0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
|
||||
|
||||
static const int utf8_table2[] = {
|
||||
0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
|
||||
|
||||
static const int utf8_table3[] = {
|
||||
0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Convert character value to UTF-8 *
|
||||
*************************************************/
|
||||
|
||||
/* This function takes an unsigned long integer value in the range 0 -
|
||||
0x7fffffff and encodes it as a UTF-8 character in 1 to 6 bytes.
|
||||
|
||||
Arguments:
|
||||
cvalue the character value
|
||||
buffer pointer to buffer for result - at least 6 bytes long
|
||||
|
||||
Returns: number of bytes placed in the buffer
|
||||
0 if input code point is too big
|
||||
*/
|
||||
|
||||
static size_t
|
||||
ord2utf8(unsigned long int cvalue, unsigned char *buffer)
|
||||
{
|
||||
size_t i, j;
|
||||
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
|
||||
if (cvalue <= utf8_table1[i]) break;
|
||||
if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
|
||||
buffer += i;
|
||||
for (j = i; j > 0; j--)
|
||||
{
|
||||
*buffer-- = 0x80 | (cvalue & 0x3f);
|
||||
cvalue >>= 6;
|
||||
}
|
||||
*buffer = utf8_table2[i] | cvalue;
|
||||
return i + 1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Convert UTF-8 string to value *
|
||||
*************************************************/
|
||||
|
||||
/* This function takes one or more bytes that represent a UTF-8 character from
|
||||
the start of a string of bytes. It returns the value of the character, or the
|
||||
offset of a malformation. For an overlong encoding that works but is not the
|
||||
correct (shortest) one, the error offset is just after the last byte.
|
||||
|
||||
Argument:
|
||||
buffer a pointer to the byte vector
|
||||
buffend a pointer to the end of the buffer
|
||||
vptr a pointer to a variable to receive the value
|
||||
lenptr a pointer to a variable to receive the offset when error detected
|
||||
|
||||
Returns: > 0 => the number of bytes consumed
|
||||
0 => invalid UTF-8: first byte missing 0x40 bit
|
||||
-1 => invalid UTF-8: first byte has too many high-order 1-bits
|
||||
-2 => incomplete sequence at end of string
|
||||
-3 => incomplete sequence within string
|
||||
-4 => overlong code sequence
|
||||
*/
|
||||
|
||||
static int
|
||||
utf82ord(unsigned char *buffer, unsigned char *buffend,
|
||||
long unsigned int *vptr, int *lenptr)
|
||||
{
|
||||
unsigned int c = *buffer++;
|
||||
unsigned int d = c;
|
||||
int i, j, s;
|
||||
|
||||
/* Check for an ASCII character, or find the number of additional bytes in a
|
||||
multibyte character. */
|
||||
|
||||
for (i = -1; i < 6; i++)
|
||||
{
|
||||
if ((d & 0x80) == 0) break;
|
||||
d <<= 1;
|
||||
}
|
||||
|
||||
switch (i)
|
||||
{
|
||||
case -1: /* ASCII character; first byte does not have 0x80 bit */
|
||||
*vptr = c;
|
||||
return 1;
|
||||
|
||||
case 0: /* First byte has 0x80 but is missing 0x40 bit */
|
||||
*lenptr = 0;
|
||||
return 0;
|
||||
|
||||
case 6:
|
||||
*lenptr = 0; /* Too many high bits */
|
||||
return -1;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
/* i now has a value in the range 1-5 */
|
||||
|
||||
s = 6*i;
|
||||
d = (c & utf8_table3[i]) << s;
|
||||
|
||||
for (j = 0; j < i; j++)
|
||||
{
|
||||
if (buffer >= buffend)
|
||||
{
|
||||
*lenptr = j + 1;
|
||||
return -2;
|
||||
}
|
||||
c = *buffer++;
|
||||
if ((c & 0xc0) != 0x80)
|
||||
{
|
||||
*lenptr = j + 1;
|
||||
return -3;
|
||||
}
|
||||
s -= 6;
|
||||
d |= (c & 0x3f) << s;
|
||||
}
|
||||
|
||||
/* Valid UTF-8 syntax */
|
||||
|
||||
*vptr = d;
|
||||
|
||||
/* Check that encoding was the correct one, not overlong */
|
||||
|
||||
for (j = 0; j < (int)(sizeof(utf8_table1)/sizeof(int)); j++)
|
||||
if (d <= utf8_table1[j]) break;
|
||||
if (j != i)
|
||||
{
|
||||
*lenptr = i + 1;
|
||||
return -4;
|
||||
}
|
||||
|
||||
/* Valid value */
|
||||
|
||||
return i + 1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Main Program *
|
||||
*************************************************/
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
int i = 1;
|
||||
int show = 0;
|
||||
unsigned char buffer[64];
|
||||
|
||||
if (argc > 1 && strcmp(argv[1], "-s") == 0)
|
||||
{
|
||||
show = 1;
|
||||
i = 2;
|
||||
}
|
||||
|
||||
for (; i < argc; i++)
|
||||
{
|
||||
char *x = argv[i];
|
||||
char *endptr;
|
||||
if (strncmp(x, "0x", 2) == 0 || strncmp(x, "U+", 2) == 0)
|
||||
{
|
||||
size_t rc, j;
|
||||
unsigned long int d = strtoul(x+2, &endptr, 16);
|
||||
if (*endptr != 0)
|
||||
{
|
||||
printf("** Invalid hex number %s\n", x);
|
||||
continue; /* With next argument */
|
||||
}
|
||||
rc = ord2utf8(d, buffer);
|
||||
printf("U+%08lx => ", d);
|
||||
if (rc == 0)
|
||||
printf("** Code point greater than 0x7fffffff cannot be encoded");
|
||||
else
|
||||
{
|
||||
for (j = 0; j < rc; j++) printf("%02x ", buffer[j]);
|
||||
if (show)
|
||||
{
|
||||
printf(">");
|
||||
for (j = 0; j < rc; j++) printf("%c", buffer[j]);
|
||||
printf("< ");
|
||||
}
|
||||
if (d >= 0xd800 && d <= 0xdfff)
|
||||
printf("** Invalid Unicode (surrogate)");
|
||||
else if (d > 0x10ffff)
|
||||
printf("** Invalid Unicode (greater than U+10ffff)");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
else
|
||||
{
|
||||
unsigned char *bptr;
|
||||
unsigned char *buffend;
|
||||
int len = 0;
|
||||
int y = 0;
|
||||
int z = 0;
|
||||
|
||||
for (;;)
|
||||
{
|
||||
while (*x == ' ') x++;
|
||||
if (*x == 0 && !z) break;
|
||||
if (!isxdigit(*x))
|
||||
{
|
||||
printf("** Malformed hex string: %s\n", argv[i]);
|
||||
len = -1;
|
||||
break;
|
||||
}
|
||||
y = y * 16 + tolower(*x) - ((isdigit(*x))? '0' : 'W');
|
||||
x++;
|
||||
if (z)
|
||||
{
|
||||
buffer[len++] = y;
|
||||
y = 0;
|
||||
}
|
||||
z ^= 1;
|
||||
}
|
||||
|
||||
if (len < 0) continue; /* With next argument after malformation */
|
||||
|
||||
bptr = buffer;
|
||||
buffend = buffer + len;
|
||||
|
||||
while (bptr < buffend)
|
||||
{
|
||||
unsigned long int d;
|
||||
int j;
|
||||
int offset;
|
||||
int rc = utf82ord(bptr, buffend, &d, &offset);
|
||||
|
||||
if (rc > 0)
|
||||
{
|
||||
printf("U+%08lx <= ", d);
|
||||
for (j = 0; j < rc; j++) printf("%02x ", bptr[j]);
|
||||
if (show)
|
||||
{
|
||||
printf(">");
|
||||
for (j = 0; j < rc; j++) printf("%c", bptr[j]);
|
||||
printf("<");
|
||||
}
|
||||
printf("\n");
|
||||
bptr += rc;
|
||||
}
|
||||
else if (rc == -4)
|
||||
{
|
||||
printf("U+%08lx <= ", d);
|
||||
for (j = 0; j < offset; j++) printf("%02x ", bptr[j]);
|
||||
printf("** Overlong UTF-8 sequence\n");
|
||||
bptr += offset;
|
||||
}
|
||||
else
|
||||
{
|
||||
switch (rc)
|
||||
{
|
||||
case 0: printf("** First byte missing 0x40 bit");
|
||||
break;
|
||||
|
||||
case -1: printf("** First byte has too many high-order bits");
|
||||
break;
|
||||
|
||||
case -2: printf("** Incomplete UTF-8 sequence at end of string");
|
||||
break;
|
||||
|
||||
case -3: printf("** Incomplete UTF-8 sequence");
|
||||
break;
|
||||
|
||||
default: printf("** Unexpected return %d from utf82ord()", rc);
|
||||
break;
|
||||
}
|
||||
printf(" at offset %d in string ", offset);
|
||||
while (bptr < buffend) printf("%02x ", *bptr++);
|
||||
printf("\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* End */
|
Reference in New Issue
Block a user