Update Files

This commit is contained in:
2025-01-22 16:18:30 +01:00
parent ed4603cf95
commit a36294b518
16718 changed files with 2960346 additions and 0 deletions

View File

@ -0,0 +1,355 @@
#! /usr/bin/python
# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------
# This file is a Python module containing common lists and functions for the
# GenerateXXX scripts that create various.c and .h files from Unicode data
# files. It was created as part of a re-organizaton of these scripts in
# December 2021.
import re
# ---------------------------------------------------------------------------
# DATA LISTS
# ---------------------------------------------------------------------------
# BIDI classes in the DerivedBidiClass.txt file, with comments.
bidi_classes = [
'AL', 'Arabic letter',
'AN', 'Arabic number',
'B', 'Paragraph separator',
'BN', 'Boundary neutral',
'CS', 'Common separator',
'EN', 'European number',
'ES', 'European separator',
'ET', 'European terminator',
'FSI', 'First strong isolate',
'L', 'Left to right',
'LRE', 'Left to right embedding',
'LRI', 'Left to right isolate',
'LRO', 'Left to right override',
'NSM', 'Non-spacing mark',
'ON', 'Other neutral',
'PDF', 'Pop directional format',
'PDI', 'Pop directional isolate',
'R', 'Right to left',
'RLE', 'Right to left embedding',
'RLI', 'Right to left isolate',
'RLO', 'Right to left override',
'S', 'Segment separator',
'WS', 'White space'
]
# Particular category property names, with comments. NOTE: If ever this list
# is changed, the table called "catposstab" in the pcre2_auto_possess.c file
# must be edited to keep in step.
category_names = [
'Cc', 'Control',
'Cf', 'Format',
'Cn', 'Unassigned',
'Co', 'Private use',
'Cs', 'Surrogate',
'Ll', 'Lower case letter',
'Lm', 'Modifier letter',
'Lo', 'Other letter',
'Lt', 'Title case letter',
'Lu', 'Upper case letter',
'Mc', 'Spacing mark',
'Me', 'Enclosing mark',
'Mn', 'Non-spacing mark',
'Nd', 'Decimal number',
'Nl', 'Letter number',
'No', 'Other number',
'Pc', 'Connector punctuation',
'Pd', 'Dash punctuation',
'Pe', 'Close punctuation',
'Pf', 'Final punctuation',
'Pi', 'Initial punctuation',
'Po', 'Other punctuation',
'Ps', 'Open punctuation',
'Sc', 'Currency symbol',
'Sk', 'Modifier symbol',
'Sm', 'Mathematical symbol',
'So', 'Other symbol',
'Zl', 'Line separator',
'Zp', 'Paragraph separator',
'Zs', 'Space separator'
]
# The Extended_Pictographic property is not found in the file where all the
# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
# file, but we list it here so that the name has the correct index value.
break_properties = [
'CR', ' 0',
'LF', ' 1',
'Control', ' 2',
'Extend', ' 3',
'Prepend', ' 4',
'SpacingMark', ' 5',
'L', ' 6 Hangul syllable type L',
'V', ' 7 Hangul syllable type V',
'T', ' 8 Hangul syllable type T',
'LV', ' 9 Hangul syllable type LV',
'LVT', '10 Hangul syllable type LVT',
'Regional_Indicator', '11',
'Other', '12',
'ZWJ', '13',
'Extended_Pictographic', '14'
]
# List of files from which the names of Boolean properties are obtained, along
# with a list of regex patterns for properties to be ignored, and a list of
# extra pattern names to add.
bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
bool_propsignore = [r'^Other_', r'^Hyphen$']
bool_propsextras = ['ASCII', 'Bidi_Mirrored']
# ---------------------------------------------------------------------------
# GET BOOLEAN PROPERTY NAMES
# ---------------------------------------------------------------------------
# Get a list of Boolean property names from a number of files.
def getbpropslist():
bplist = []
bplast = ""
for filename in bool_propsfiles:
try:
file = open('Unicode.tables/' + filename, 'r')
except IOError:
print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
sys.exit(1)
for line in file:
line = re.sub(r'#.*', '', line)
data = list(map(str.strip, line.split(';')))
if len(data) <= 1 or data[1] == bplast:
continue
bplast = data[1]
for pat in bool_propsignore:
if re.match(pat, bplast) != None:
break
else:
bplist.append(bplast)
file.close()
bplist.extend(bool_propsextras)
bplist.sort()
return bplist
bool_properties = getbpropslist()
bool_props_list_item_size = (len(bool_properties) + 31) // 32
# ---------------------------------------------------------------------------
# COLLECTING PROPERTY NAMES AND ALIASES
# ---------------------------------------------------------------------------
script_names = ['Unknown']
abbreviations = {}
def collect_property_names():
global script_names
global abbreviations
names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')
last_script_name = ""
with open("Unicode.tables/Scripts.txt") as f:
for line in f:
match_obj = names_re.match(line)
if match_obj == None or match_obj.group(1) == last_script_name:
continue
last_script_name = match_obj.group(1)
script_names.append(last_script_name)
# Sometimes there is comment in the line
# so splitting around semicolon is not enough
value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')
with open("Unicode.tables/PropertyValueAliases.txt") as f:
for line in f:
match_obj = value_alias_re.match(line)
if match_obj == None:
continue
if match_obj.group(1) == "sc":
if match_obj.group(2) == match_obj.group(3):
abbreviations[match_obj.group(3)] = ()
elif match_obj.group(4) == None:
abbreviations[match_obj.group(3)] = (match_obj.group(2),)
else:
abbreviations[match_obj.group(3)] = (match_obj.group(2), match_obj.group(4))
# We can also collect Boolean property abbreviations into the same dictionary
bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')
with open("Unicode.tables/PropertyAliases.txt") as f:
for line in f:
match_obj = bin_alias_re.match(line)
if match_obj == None:
continue
if match_obj.group(2) in bool_properties:
if match_obj.group(3) == None:
abbreviations[match_obj.group(2)] = (match_obj.group(1),)
else:
abbreviations[match_obj.group(2)] = (match_obj.group(1), match_obj.group(3))
collect_property_names()
# ---------------------------------------------------------------------------
# REORDERING SCRIPT NAMES
# ---------------------------------------------------------------------------
script_abbrevs = []
def reorder_scripts():
global script_names
global script_abbrevs
global abbreviations
for name in script_names:
abbrevs = abbreviations[name]
script_abbrevs.append(name if len(abbrevs) == 0 else abbrevs[0])
extended_script_abbrevs = set()
with open("Unicode.tables/ScriptExtensions.txt") as f:
names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')
for line in f:
match_obj = names_re.match(line)
if match_obj == None:
continue
for name in match_obj.group(1).split(" "):
extended_script_abbrevs.add(name)
new_script_names = []
new_script_abbrevs = []
for idx, abbrev in enumerate(script_abbrevs):
if abbrev in extended_script_abbrevs:
new_script_names.append(script_names[idx])
new_script_abbrevs.append(abbrev)
for idx, abbrev in enumerate(script_abbrevs):
if abbrev not in extended_script_abbrevs:
new_script_names.append(script_names[idx])
new_script_abbrevs.append(abbrev)
script_names = new_script_names
script_abbrevs = new_script_abbrevs
reorder_scripts()
script_list_item_size = (script_names.index('Unknown') + 31) // 32
# ---------------------------------------------------------------------------
# DERIVED LISTS
# ---------------------------------------------------------------------------
# Create general character property names from the first letters of the
# particular categories.
gcn_set = set(category_names[i][0] for i in range(0, len(category_names), 2))
general_category_names = list(gcn_set)
general_category_names.sort()
# ---------------------------------------------------------------------------
# FUNCTIONS
# ---------------------------------------------------------------------------
import sys
# Open an output file, using the command's argument or a default. Write common
# preliminary header information.
def open_output(default):
if len(sys.argv) > 2:
print('** Too many arguments: just give a file name')
sys.exit(1)
if len(sys.argv) == 2:
output_name = sys.argv[1]
else:
output_name = default
try:
file = open(output_name, "w")
except IOError:
print ("** Couldn't open %s" % output_name)
sys.exit(1)
script_name = sys.argv[0]
i = script_name.rfind('/')
if i >= 0:
script_name = script_name[i+1:]
file.write("""\
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2022 University of Cambridge
This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
""")
file.write("Instead, modify the maint/%s script and run it to generate\n"
"a new version of this code.\n\n" % script_name)
file.write("""\
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
\n""")
return file
# End of UcpCommon.py

View File

@ -0,0 +1,188 @@
#! /usr/bin/python
# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------
#
# This file auto-generates unicode property tests and their expected output.
# It is recommended to re-run this generator after the unicode files are
# updated. The names of the generated files are `testinput26` and `testoutput26`
import re
import sys
from GenerateCommon import \
script_names, \
script_abbrevs
def write_both(text):
input_file.write(text)
output_file.write(text)
def to_string_char(ch_idx):
if ch_idx < 128:
if ch_idx < 16:
return "\\x{0%x}" % ch_idx
if ch_idx >= 32:
return chr(ch_idx)
return "\\x{%x}" % ch_idx
output_directory = ""
if len(sys.argv) > 2:
print('** Too many arguments: just give a directory name')
sys.exit(1)
if len(sys.argv) == 2:
output_directory = sys.argv[1]
if not output_directory.endswith("/"):
output_directory += "/"
try:
input_file = open(output_directory + "testinput26", "w")
output_file = open(output_directory + "testoutput26", "w")
except IOError:
print ("** Couldn't open output files")
sys.exit(1)
write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")
# ---------------------------------------------------------------------------
# UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------
write_both("# Unicode Script Extension tests.\n\n")
def gen_script_tests():
script_data = [None] * len(script_names)
char_data = [None] * 0x110000
property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
prev_name = ""
script_idx = -1
with open("Unicode.tables/Scripts.txt") as f:
for line in f:
match_obj = property_re.match(line)
if match_obj == None:
continue
name = match_obj.group(3)
if name != prev_name:
script_idx = script_names.index(name)
prev_name = name
low = int(match_obj.group(1), 16)
high = low
char_data[low] = name
if match_obj.group(2) != None:
high = int(match_obj.group(2), 16)
for idx in range(low + 1, high + 1):
char_data[idx] = name
if script_data[script_idx] == None:
script_data[script_idx] = [low, None, None, None, None]
script_data[script_idx][1] = high
extended_script_indicies = {}
with open("Unicode.tables/ScriptExtensions.txt") as f:
for line in f:
match_obj = property_re.match(line)
if match_obj == None:
continue
low = int(match_obj.group(1), 16)
high = low
if match_obj.group(2) != None:
high = int(match_obj.group(2), 16)
for abbrev in match_obj.group(3).split(" "):
if abbrev not in extended_script_indicies:
idx = script_abbrevs.index(abbrev)
extended_script_indicies[abbrev] = idx
rec = script_data[idx]
rec[2] = low
rec[3] = high
else:
idx = extended_script_indicies[abbrev]
rec = script_data[idx]
if rec[2] > low:
rec[2] = low
if rec[3] < high:
rec[3] = high
if rec[4] == None:
name = script_names[idx]
for idx in range(low, high + 1):
if char_data[idx] != name:
rec[4] = idx
break
long_property_name = False
for idx, rec in enumerate(script_data):
script_name = script_names[idx]
if script_name == "Unknown":
continue
script_abbrev = script_abbrevs[idx]
write_both("# Base script check\n")
write_both("/^\\p{sc=%s}/utf\n" % script_name)
write_both(" %s\n" % to_string_char(rec[0]))
output_file.write(" 0: %s\n" % to_string_char(rec[0]))
write_both("\n")
write_both("/^\\p{Script=%s}/utf\n" % script_abbrev)
write_both(" %s\n" % to_string_char(rec[1]))
output_file.write(" 0: %s\n" % to_string_char(rec[1]))
write_both("\n")
if rec[2] != None:
property_name = "scx"
if long_property_name:
property_name = "Script_Extensions"
write_both("# Script extension check\n")
write_both("/^\\p{%s}/utf\n" % script_name)
write_both(" %s\n" % to_string_char(rec[2]))
output_file.write(" 0: %s\n" % to_string_char(rec[2]))
write_both("\n")
write_both("/^\\p{%s=%s}/utf\n" % (property_name, script_abbrev))
write_both(" %s\n" % to_string_char(rec[3]))
output_file.write(" 0: %s\n" % to_string_char(rec[3]))
write_both("\n")
long_property_name = not long_property_name
if rec[4] != None:
write_both("# Script extension only character\n")
write_both("/^\\p{%s}/utf\n" % script_name)
write_both(" %s\n" % to_string_char(rec[4]))
output_file.write(" 0: %s\n" % to_string_char(rec[4]))
write_both("\n")
write_both("/^\\p{sc=%s}/utf\n" % script_name)
write_both(" %s\n" % to_string_char(rec[4]))
output_file.write("No match\n")
write_both("\n")
else:
print("External character has not found for %s" % script_name)
high = rec[1]
if rec[3] != None and rec[3] > rec[1]:
high = rec[3]
write_both("# Character not in script\n")
write_both("/^\\p{%s}/utf\n" % script_name)
write_both(" %s\n" % to_string_char(high + 1))
output_file.write("No match\n")
write_both("\n")
gen_script_tests()
write_both("# End of testinput26\n")

View File

@ -0,0 +1,923 @@
#! /usr/bin/python
# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------
#
# This script generates the pcre2_ucd.c file from Unicode data files. This is
# the compressed Unicode property data used by PCRE2. The script was created in
# December 2021 as part of the Unicode data generation refactoring. It is
# basically a re-working of the MultiStage2.py script that was submitted to the
# PCRE project by Peter Kankowski in 2008 as part of a previous upgrading of
# Unicode property support. A number of extensions have since been added. The
# main difference in the 2021 upgrade (apart from comments and layout) is that
# the data tables (e.g. list of script names) are now listed in or generated by
# a separate Python module that is shared with the other Generate scripts.
#
# This script must be run in the "maint" directory. It requires the following
# Unicode data tables: BidiMirrorring.txt, CaseFolding.txt,
# DerivedBidiClass.txt, DerivedCoreProperties.txt, DerivedGeneralCategory.txt,
# GraphemeBreakProperty.txt, PropList.txt, PropertyAliases.txt,
# PropertyValueAliases.txt, ScriptExtensions.txt, Scripts.txt, and
# emoji-data.txt. These must be in the Unicode.tables subdirectory.
#
# The emoji-data.txt file is found in the "emoji" subdirectory even though it
# is technically part of a different (but coordinated) standard as shown
# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"),
# for example:
#
# http://unicode.org/Public/emoji/13.0/ReadMe.txt
#
# DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted"
# subdirectory of the Unicode database (UCD) on the Unicode web site;
# GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. The other files
# are in the top-level UCD directory.
#
# -----------------------------------------------------------------------------
# Minor modifications made to the original script:
# Added #! line at start
# Removed tabs
# Made it work with Python 2.4 by rewriting two statements that needed 2.5
# Consequent code tidy
# Adjusted data file names to take from the Unicode.tables directory
# Adjusted global table names by prefixing _pcre_.
# Commented out stuff relating to the casefolding table, which isn't used;
# removed completely in 2012.
# Corrected size calculation
# Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
# Update for PCRE2: name changes, and SUPPORT_UCP is abolished.
#
# Major modifications made to the original script:
# Added code to add a grapheme break property field to records.
#
# Added code to search for sets of more than two characters that must match
# each other caselessly. A new table is output containing these sets, and
# offsets into the table are added to the main output records. This new
# code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer
# used.
#
# Update for Python3:
# . Processed with 2to3, but that didn't fix everything
# . Changed string.strip to str.strip
# . Added encoding='utf-8' to the open() call
# . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is
# required and the result of the division is a float
#
# Added code to scan the emoji-data.txt file to find the Extended Pictographic
# property, which is used by PCRE2 as a grapheme breaking property. This was
# done when updating to Unicode 11.0.0 (July 2018).
#
# Added code to add a Script Extensions field to records. This has increased
# their size from 8 to 12 bytes, only 10 of which are currently used.
#
# Added code to add a bidi class field to records by scanning the
# DerivedBidiClass.txt and PropList.txt files. This uses one of the two spare
# bytes, so now 11 out of 12 are in use.
#
# 01-March-2010: Updated list of scripts for Unicode 5.2.0
# 30-April-2011: Updated list of scripts for Unicode 6.0.0
# July-2012: Updated list of scripts for Unicode 6.1.0
# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
# field in the record to hold the value. Luckily, the
# structure had a hole in it, so the resulting table is
# not much bigger than before.
# 18-September-2012: Added code for multiple caseless sets. This uses the
# final hole in the structure.
# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
# 13-May-2014: Updated for PCRE2
# 03-June-2014: Updated for Python 3
# 20-June-2014: Updated for Unicode 7.0.0
# 12-August-2014: Updated to put Unicode version into the file
# 19-June-2015: Updated for Unicode 8.0.0
# 02-July-2017: Updated for Unicode 10.0.0
# 03-July-2018: Updated for Unicode 11.0.0
# 07-July-2018: Added code to scan emoji-data.txt for the Extended
# Pictographic property.
# 01-October-2018: Added the 'Unknown' script name
# 03-October-2018: Added new field for Script Extensions
# 27-July-2019: Updated for Unicode 12.1.0
# 10-March-2020: Updated for Unicode 13.0.0
# PCRE2-10.39: Updated for Unicode 14.0.0
# 05-December-2021: Added code to scan DerivedBidiClass.txt for bidi class,
# and also PropList.txt for the Bidi_Control property
# 19-December-2021: Reworked script extensions lists to be bit maps instead
# of zero-terminated lists of script numbers.
# ----------------------------------------------------------------------------
#
# Changes to the refactored script:
#
# 26-December-2021: Refactoring completed
# 10-January-2022: Addition of general Boolean property support
# 12-January-2022: Merge scriptx and bidiclass fields
# 14-January-2022: Enlarge Boolean property offset to 12 bits
#
# ----------------------------------------------------------------------------
#
#
# The main tables generated by this script are used by macros defined in
# pcre2_internal.h. They look up Unicode character properties using short
# sequences of code that contains no branches, which makes for greater speed.
#
# Conceptually, there is a table of records (of type ucd_record), one for each
# Unicode character. Each record contains the script number, script extension
# value, character type, grapheme break type, offset to caseless matching set,
# offset to the character's other case, the bidi class, and offset to bitmap of
# Boolean properties.
#
# A real table covering all Unicode characters would be far too big. It can be
# efficiently compressed by observing that many characters have the same
# record, and many blocks of characters (taking 128 characters in a block) have
# the same set of records as other blocks. This leads to a 2-stage lookup
# process.
#
# This script constructs seven tables. The ucd_caseless_sets table contains
# lists of characters that all match each other caselessly. Each list is
# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
# any valid character. The first list is empty; this is used for characters
# that are not part of any list.
#
# The ucd_digit_sets table contains the code points of the '9' characters in
# each set of 10 decimal digits in Unicode. This is used to ensure that digits
# in script runs all come from the same set. The first element in the vector
# contains the number of subsequent elements, which are in ascending order.
#
# Scripts are partitioned into two groups. Scripts that appear in at least one
# character's script extension list come first, followed by "Unknown" and then
# all the rest. This sorting is done automatically in the GenerateCommon.py
# script. A script's number is its index in the script_names list.
#
# The ucd_script_sets table contains bitmaps that represent lists of scripts
# for Script Extensions properties. Each bitmap consists of a fixed number of
# unsigned 32-bit numbers, enough to allocate a bit for every script that is
# used in any character's extension list, that is, enough for every script
# whose number is less than ucp_Unknown. A character's script extension value
# in its ucd record is an offset into the ucd_script_sets vector. The first
# bitmap has no bits set; characters that have no script extensions have zero
# as their script extensions value so that they use this map.
#
# The ucd_boolprop_sets table contains bitmaps that represent lists of Boolean
# properties. Each bitmap consists of a fixed number of unsigned 32-bit
# numbers, enough to allocate a bit for each supported Boolean property.
#
# The ucd_records table contains one instance of every unique character record
# that is required. The ucd_stage1 table is indexed by a character's block
# number, which is the character's code point divided by 128, since 128 is the
# size of each block. The result of a lookup in ucd_stage1 a "virtual" block
# number.
#
# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
# the offset of a character within its own block, and the result is the index
# number of the required record in the ucd_records vector.
#
# The following examples are correct for the Unicode 14.0.0 database. Future
# updates may make change the actual lookup values.
#
# Example: lowercase "a" (U+0061) is in block 0
# lookup 0 in stage1 table yields 0
# lookup 97 (0x61) in the first table in stage2 yields 35
# record 35 is { 0, 5, 12, 0, -32, 18432, 44 }
# 0 = ucp_Latin => Latin script
# 5 = ucp_Ll => Lower case letter
# 12 = ucp_gbOther => Grapheme break property "Other"
# 0 => Not part of a caseless set
# -32 (-0x20) => Other case is U+0041
# 18432 = 0x4800 => Combined Bidi class + script extension values
# 44 => Offset to Boolean properties
#
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
# script extension value, giving:
#
# 9 = ucp_bidiL => Bidi class left-to-right
# 0 => No special script extension property
#
# Almost all lowercase latin characters resolve to the same record. One or two
# are different because they are part of a multi-character caseless set (for
# example, k, K and the Kelvin symbol are such a set).
#
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
# lookup 96 in stage1 table yields 93
# lookup 66 (0x42) in table 93 in stage2 yields 819
# record 819 is { 20, 7, 12, 0, 0, 18432, 82 }
# 20 = ucp_Hiragana => Hiragana script
# 7 = ucp_Lo => Other letter
# 12 = ucp_gbOther => Grapheme break property "Other"
# 0 => Not part of a caseless set
# 0 => No other case
# 18432 = 0x4800 => Combined Bidi class + script extension values
# 82 => Offset to Boolean properties
#
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
# script extension value, giving:
#
# 9 = ucp_bidiL => Bidi class left-to-right
# 0 => No special script extension property
#
# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
# lookup 57 in stage1 table yields 55
# lookup 80 (0x50) in table 55 in stage2 yields 621
# record 621 is { 84, 12, 3, 0, 0, 26762, 96 }
# 84 = ucp_Inherited => Script inherited from predecessor
# 12 = ucp_Mn => Non-spacing mark
# 3 = ucp_gbExtend => Grapheme break property "Extend"
# 0 => Not part of a caseless set
# 0 => No other case
# 26762 = 0x688A => Combined Bidi class + script extension values
# 96 => Offset to Boolean properties
#
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
# script extension value, giving:
#
# 13 = ucp_bidiNSM => Bidi class non-spacing mark
# 138 => Script Extension list offset = 138
#
# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
# 18, and 47 set. This means that this character is expected to be used with
# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
#
# Philip Hazel, last updated 14 January 2022.
##############################################################################
# Import standard modules
import re
import string
import sys
# Import common data lists and functions
from GenerateCommon import \
bidi_classes, \
bool_properties, \
bool_propsfiles, \
bool_props_list_item_size, \
break_properties, \
category_names, \
general_category_names, \
script_abbrevs, \
script_list_item_size, \
script_names, \
open_output
# Some general parameters
MAX_UNICODE = 0x110000
NOTACHAR = 0xffffffff
# ---------------------------------------------------------------------------
# DEFINE FUNCTIONS
# ---------------------------------------------------------------------------
# Parse a line of Scripts.txt, GraphemeBreakProperty.txt, DerivedBidiClass.txt
# or DerivedGeneralCategory.txt
def make_get_names(enum):
return lambda chardata: enum.index(chardata[1])
# Parse a line of CaseFolding.txt
def get_other_case(chardata):
if chardata[1] == 'C' or chardata[1] == 'S':
return int(chardata[2], 16) - int(chardata[0], 16)
return 0
# Parse a line of ScriptExtensions.txt
def get_script_extension(chardata):
global last_script_extension
offset = len(script_lists) * script_list_item_size
if last_script_extension == chardata[1]:
return offset - script_list_item_size
last_script_extension = chardata[1]
script_lists.append(tuple(script_abbrevs.index(abbrev) for abbrev in last_script_extension.split(' ')))
return offset
# Read a whole table in memory, setting/checking the Unicode version
def read_table(file_name, get_value, default_value):
global unicode_version
f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
file_base = f.group(1)
version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"
file = open(file_name, 'r', encoding='utf-8')
f = re.match(version_pat, file.readline())
version = f.group(1)
if unicode_version == "":
unicode_version = version
elif unicode_version != version:
print("WARNING: Unicode version differs in %s", file_name, file=sys.stderr)
table = [default_value] * MAX_UNICODE
for line in file:
line = re.sub(r'#.*', '', line)
chardata = list(map(str.strip, line.split(';')))
if len(chardata) <= 1:
continue
value = get_value(chardata)
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
char = int(m.group(1), 16)
if m.group(3) is None:
last = char
else:
last = int(m.group(3), 16)
for i in range(char, last + 1):
# It is important not to overwrite a previously set value because in the
# CaseFolding file there are lines to be ignored (returning the default
# value of 0) which often come after a line which has already set data.
if table[i] == default_value:
table[i] = value
file.close()
return table
# Get the smallest possible C language type for the values in a table
def get_type_size(table):
type_size = [("uint8_t", 1), ("uint16_t", 2), ("uint32_t", 4),
("signed char", 1), ("int16_t", 2), ("int32_t", 4)]
limits = [(0, 255), (0, 65535), (0, 4294967295), (-128, 127),
(-32768, 32767), (-2147483648, 2147483647)]
minval = min(table)
maxval = max(table)
for num, (minlimit, maxlimit) in enumerate(limits):
if minlimit <= minval and maxval <= maxlimit:
return type_size[num]
raise OverflowError("Too large to fit into C types")
# Get the total size of a list of tables
def get_tables_size(*tables):
total_size = 0
for table in tables:
type, size = get_type_size(table)
total_size += size * len(table)
return total_size
# Compress a table into the two stages
def compress_table(table, block_size):
blocks = {} # Dictionary for finding identical blocks
stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
stage2 = [] # Stage 2 table contains the blocks with property values
table = tuple(table)
for i in range(0, len(table), block_size):
block = table[i:i+block_size]
start = blocks.get(block)
if start is None:
# Allocate a new block
start = len(stage2) / block_size
stage2 += block
blocks[block] = start
stage1.append(start)
return stage1, stage2
# Output a table
def write_table(table, table_name, block_size = None):
type, size = get_type_size(table)
ELEMS_PER_LINE = 16
s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
if block_size:
s += ", block = %d" % block_size
f.write(s + " */\n")
table = tuple(table)
if block_size is None:
fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */\n"
mult = MAX_UNICODE / len(table)
for i in range(0, len(table), ELEMS_PER_LINE):
f.write(fmt % (table[i:i+ELEMS_PER_LINE] + (int(i * mult),)))
else:
if block_size > ELEMS_PER_LINE:
el = ELEMS_PER_LINE
else:
el = block_size
fmt = "%3d," * el + "\n"
if block_size > ELEMS_PER_LINE:
fmt = fmt * int(block_size / ELEMS_PER_LINE)
for i in range(0, len(table), block_size):
f.write(("\n/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size]))
f.write("};\n\n")
# Extract the unique combinations of properties into records
def combine_tables(*tables):
records = {}
index = []
for t in zip(*tables):
i = records.get(t)
if i is None:
i = records[t] = len(records)
index.append(i)
return index, records
# Create a record struct
def get_record_size_struct(records):
size = 0
structure = 'typedef struct {\n'
for i in range(len(records[0])):
record_slice = [record[i] for record in records]
slice_type, slice_size = get_type_size(record_slice)
# add padding: round up to the nearest power of slice_size
size = (size + slice_size - 1) & -slice_size
size += slice_size
structure += '%s property_%d;\n' % (slice_type, i)
# round up to the first item of the next structure in array
record_slice = [record[0] for record in records]
slice_type, slice_size = get_type_size(record_slice)
size = (size + slice_size - 1) & -slice_size
structure += '} ucd_record;\n*/\n'
return size, structure
# Write records
def write_records(records, record_size):
f.write('const ucd_record PRIV(ucd_records)[] = { ' + \
'/* %d bytes, record size %d */\n' % (len(records) * record_size, record_size))
records = list(zip(list(records.keys()), list(records.values())))
records.sort(key = lambda x: x[1])
for i, record in enumerate(records):
f.write((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */\n') % (record[0] + (i,)))
f.write('};\n\n')
# Write a bit set
def write_bitsets(list, item_size):
for d in list:
bitwords = [0] * item_size
for idx in d:
bitwords[idx // 32] |= 1 << (idx & 31)
s = " "
for x in bitwords:
f.write("%s" % s)
s = ", "
f.write("0x%08xu" % x)
f.write(",\n")
f.write("};\n\n")
# ---------------------------------------------------------------------------
# This bit of code must have been useful when the original script was being
# developed. Retain it just in case it is ever needed again.
# def test_record_size():
# tests = [ \
# ( [(3,), (6,), (6,), (1,)], 1 ), \
# ( [(300,), (600,), (600,), (100,)], 2 ), \
# ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
# ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
# ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
# ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
# ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
# ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
# ]
# for test in tests:
# size, struct = get_record_size_struct(test[0])
# assert(size == test[1])
# test_record_size()
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# MAIN CODE FOR CREATING TABLES
# ---------------------------------------------------------------------------
unicode_version = ""
# Some of the tables imported from GenerateCommon.py have alternate comment
# strings for use by GenerateUcpHeader. The comments are not wanted here, so
# remove them.
bidi_classes = bidi_classes[::2]
break_properties = break_properties[::2]
category_names = category_names[::2]
# Create the various tables from Unicode data files
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_properties), break_properties.index('Other'))
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', make_get_names(bidi_classes), bidi_classes.index('L'))
# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
# we need to find the Extended_Pictographic property for emoji characters. This
# can be set as an additional grapheme break property, because the default for
# all the emojis is "other". We scan the emoji-data.txt file and modify the
# break-props table.
file = open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8')
for line in file:
line = re.sub(r'#.*', '', line)
chardata = list(map(str.strip, line.split(';')))
if len(chardata) <= 1:
continue
if chardata[1] != "Extended_Pictographic":
continue
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
char = int(m.group(1), 16)
if m.group(3) is None:
last = char
else:
last = int(m.group(3), 16)
for i in range(char, last + 1):
if break_props[i] != break_properties.index('Other'):
print("WARNING: Emoji 0x%x has break property %s, not 'Other'",
i, break_properties[break_props[i]], file=sys.stderr)
break_props[i] = break_properties.index('Extended_Pictographic')
file.close()
# Handle script extensions. The get_script_extesion() function maintains a
# list of unique bitmaps representing lists of scripts, returning the offset
# in that list. Initialize the list with an empty set, which is used for
# characters that have no script extensions.
script_lists = [[]]
last_script_extension = ""
scriptx_bidi_class = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, 0)
for idx in range(len(scriptx_bidi_class)):
scriptx_bidi_class[idx] = scriptx_bidi_class[idx] | (bidi_class[idx] << 11)
bidi_class = None
# Find the Boolean properties of each character. This next bit of magic creates
# a list of empty lists. Using [[]] * MAX_UNICODE gives a list of references to
# the *same* list, which is not what we want.
bprops = [[] for _ in range(MAX_UNICODE)]
# Collect the properties from the various files
for filename in bool_propsfiles:
try:
file = open('Unicode.tables/' + filename, 'r')
except IOError:
print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
sys.exit(1)
for line in file:
line = re.sub(r'#.*', '', line)
data = list(map(str.strip, line.split(';')))
if len(data) <= 1:
continue
try:
ix = bool_properties.index(data[1])
except ValueError:
continue
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', data[0])
char = int(m.group(1), 16)
if m.group(3) is None:
last = char
else:
last = int(m.group(3), 16)
for i in range(char, last + 1):
bprops[i].append(ix)
file.close()
# The ASCII property isn't listed in any files, but it is easy enough to add
# it manually.
ix = bool_properties.index("ASCII")
for i in range(128):
bprops[i].append(ix)
# The Bidi_Mirrored property isn't listed in any property files. We have to
# deduce it from the file that lists the mirrored characters.
ix = bool_properties.index("Bidi_Mirrored")
try:
file = open('Unicode.tables/BidiMirroring.txt', 'r')
except IOError:
print(f"** Couldn't open {'Unicode.tables/BidiMirroring.txt'}\n")
sys.exit(1)
for line in file:
line = re.sub(r'#.*', '', line)
data = list(map(str.strip, line.split(';')))
if len(data) <= 1:
continue
c = int(data[0], 16)
bprops[c].append(ix)
file.close()
# Scan each character's boolean property list and created a list of unique
# lists, at the same time, setting the index in that list for each property in
# the bool_props vector.
bool_props = [0] * MAX_UNICODE
bool_props_lists = [[]]
for c in range(MAX_UNICODE):
s = set(bprops[c])
for i in range(len(bool_props_lists)):
if s == set(bool_props_lists[i]):
break;
else:
bool_props_lists.append(bprops[c])
i += 1
bool_props[c] = i * bool_props_list_item_size
# This block of code was added by PH in September 2012. It scans the other_case
# table to find sets of more than two characters that must all match each other
# caselessly. Later in this script a table of these sets is written out.
# However, we have to do this work here in order to compute the offsets in the
# table that are inserted into the main table.
# The CaseFolding.txt file lists pairs, but the common logic for reading data
# sets only one value, so first we go through the table and set "return"
# offsets for those that are not already set.
for c in range(MAX_UNICODE):
if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
other_case[c + other_case[c]] = -other_case[c]
# Now scan again and create equivalence sets.
caseless_sets = []
for c in range(MAX_UNICODE):
o = c + other_case[c]
# Trigger when this character's other case does not point back here. We
# now have three characters that are case-equivalent.
if other_case[o] != -other_case[c]:
t = o + other_case[o]
# Scan the existing sets to see if any of the three characters are already
# part of a set. If so, unite the existing set with the new set.
appended = 0
for s in caseless_sets:
found = 0
for x in s:
if x == c or x == o or x == t:
found = 1
# Add new characters to an existing set
if found:
found = 0
for y in [c, o, t]:
for x in s:
if x == y:
found = 1
if not found:
s.append(y)
appended = 1
# If we have not added to an existing set, create a new one.
if not appended:
caseless_sets.append([c, o, t])
# End of loop looking for caseless sets.
# Now scan the sets and set appropriate offsets for the characters.
caseless_offsets = [0] * MAX_UNICODE
offset = 1;
for s in caseless_sets:
for x in s:
caseless_offsets[x] = offset
offset += len(s) + 1
# End of block of code for creating offsets for caseless matching sets.
# Combine all the tables
table, records = combine_tables(script, category, break_props,
caseless_offsets, other_case, scriptx_bidi_class, bool_props)
# Find the record size and create a string definition of the structure for
# outputting as a comment.
record_size, record_struct = get_record_size_struct(list(records.keys()))
# Find the optimum block size for the two-stage table
min_size = sys.maxsize
for block_size in [2 ** i for i in range(5,10)]:
size = len(records) * record_size
stage1, stage2 = compress_table(table, block_size)
size += get_tables_size(stage1, stage2)
#print "/* block size %5d => %5d bytes */" % (block_size, size)
if size < min_size:
min_size = size
min_stage1, min_stage2 = stage1, stage2
min_block_size = block_size
# ---------------------------------------------------------------------------
# MAIN CODE FOR WRITING THE OUTPUT FILE
# ---------------------------------------------------------------------------
# Open the output file (no return on failure). This call also writes standard
# header boilerplate.
f = open_output("pcre2_ucd.c")
# Output this file's heading text
f.write("""\
/* This file contains tables of Unicode properties that are extracted from
Unicode data files. See the comments at the start of maint/GenerateUcd.py for
details.
As well as being part of the PCRE2 library, this file is #included by the
pcre2test program, which redefines the PRIV macro to change table names from
_pcre2_xxx to xxxx, thereby avoiding name clashes with the library. At present,
just one of these tables is actually needed. When compiling the library, some
headers are needed. */
#ifndef PCRE2_PCRE2TEST
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "pcre2_internal.h"
#endif /* PCRE2_PCRE2TEST */
/* The tables herein are needed only when UCP support is built, and in PCRE2
that happens automatically with UTF support. This module should not be
referenced otherwise, so it should not matter whether it is compiled or not.
However a comment was received about space saving - maybe the guy linked all
the modules rather than using a library - so we include a condition to cut out
the tables when not needed. But don't leave a totally empty module because some
compilers barf at that. Instead, just supply some small dummy tables. */
#ifndef SUPPORT_UNICODE
const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0}};
const uint16_t PRIV(ucd_stage1)[] = {0};
const uint16_t PRIV(ucd_stage2)[] = {0};
const uint32_t PRIV(ucd_caseless_sets)[] = {0};
#else
\n""")
# --- Output some variable heading stuff ---
f.write("/* Total size: %d bytes, block size: %d. */\n\n" % (min_size, min_block_size))
f.write('const char *PRIV(unicode_version) = "{}";\n\n'.format(unicode_version))
f.write("""\
/* When recompiling tables with a new Unicode version, please check the types
in this structure definition with those in pcre2_internal.h (the actual field
names will be different).
\n""")
f.write(record_struct)
f.write("""
/* If the 32-bit library is run in non-32-bit mode, character values greater
than 0x10ffff may be encountered. For these we set up a special record. */
#if PCRE2_CODE_UNIT_WIDTH == 32
const ucd_record PRIV(dummy_ucd_record)[] = {{
ucp_Unknown, /* script */
ucp_Cn, /* type unassigned */
ucp_gbOther, /* grapheme break property */
0, /* case set */
0, /* other case */
0 | (ucp_bidiL << UCD_BIDICLASS_SHIFT), /* script extension and bidi class */
0, /* bool properties offset */
}};
#endif
\n""")
# --- Output the table of caseless character sets ---
f.write("""\
/* This table contains lists of characters that are caseless sets of
more than one character. Each list is terminated by NOTACHAR. */
const uint32_t PRIV(ucd_caseless_sets)[] = {
NOTACHAR,
""")
for s in caseless_sets:
s = sorted(s)
for x in s:
f.write(' 0x%04x,' % x)
f.write(' NOTACHAR,\n')
f.write('};\n\n')
# --- Other tables are not needed by pcre2test ---
f.write("""\
/* When #included in pcre2test, we don't need the table of digit sets, nor the
the large main UCD tables. */
#ifndef PCRE2_PCRE2TEST
\n""")
# --- Read Scripts.txt again for the sets of 10 digits. ---
digitsets = []
file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8')
for line in file:
m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
if m is None:
continue
first = int(m.group(1),16)
last = int(m.group(2),16)
if ((last - first + 1) % 10) != 0:
f.write("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
file=sys.stderr)
while first < last:
digitsets.append(first + 9)
first += 10
file.close()
digitsets.sort()
f.write("""\
/* This table lists the code points for the '9' characters in each set of
decimal digits. It is used to ensure that all the digits in a script run come
from the same set. */
const uint32_t PRIV(ucd_digit_sets)[] = {
""")
f.write(" %d, /* Number of subsequent values */" % len(digitsets))
count = 8
for d in digitsets:
if count == 8:
f.write("\n ")
count = 0
f.write(" 0x%05x," % d)
count += 1
f.write("\n};\n\n")
f.write("""\
/* This vector is a list of script bitsets for the Script Extension property.
The number of 32-bit words in each bitset is #defined in pcre2_ucp.h as
ucd_script_sets_item_size. */
const uint32_t PRIV(ucd_script_sets)[] = {
""")
write_bitsets(script_lists, script_list_item_size)
f.write("""\
/* This vector is a list of bitsets for Boolean properties. The number of
32_bit words in each bitset is #defined as ucd_boolprop_sets_item_size in
pcre2_ucp.h. */
const uint32_t PRIV(ucd_boolprop_sets)[] = {
""")
write_bitsets(bool_props_lists, bool_props_list_item_size)
# Output the main UCD tables.
f.write("""\
/* These are the main two-stage UCD tables. The fields in each record are:
script (8 bits), character type (8 bits), grapheme break property (8 bits),
offset to multichar other cases or zero (8 bits), offset to other case or zero
(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed
into a 16-bit field, and offset in binary properties table (16 bits). */
\n""")
write_records(records, record_size)
write_table(min_stage1, 'PRIV(ucd_stage1)')
write_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
f.write("#if UCD_BLOCK_SIZE != %d\n" % min_block_size)
f.write("""\
#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h
#endif
#endif /* SUPPORT_UNICODE */
#endif /* PCRE2_PCRE2TEST */
/* End of pcre2_ucd.c */
""")
f.close
# End

View File

@ -0,0 +1,98 @@
#! /usr/bin/python
# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------
# This script generates the pcre2_ucp.h file from Unicode data files. This
# header uses enumerations to give names to Unicode property types and script
# names.
# This script was created in December 2021 as part of the Unicode data
# generation refactoring.
# Import common data lists and functions
from GenerateCommon import \
bidi_classes, \
bool_properties, \
bool_props_list_item_size, \
break_properties, \
category_names, \
general_category_names, \
script_list_item_size, \
script_names, \
open_output
# Open the output file (no return on failure). This call also writes standard
# header boilerplate.
f = open_output("pcre2_ucp.h")
# Output this file's heading text
f.write("""\
#ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD
#define PCRE2_UCP_H_IDEMPOTENT_GUARD
/* This file contains definitions of the Unicode property values that are
returned by the UCD access macros and used throughout PCRE2.
IMPORTANT: The specific values of the first two enums (general and particular
character categories) are assumed by the table called catposstab in the file
pcre2_auto_possess.c. They are unlikely to change, but should be checked after
an update. */
\n""")
f.write("/* These are the general character categories. */\n\nenum {\n")
for i in general_category_names:
f.write(" ucp_%s,\n" % i)
f.write("};\n\n")
f.write("/* These are the particular character categories. */\n\nenum {\n")
for i in range(0, len(category_names), 2):
f.write(" ucp_%s, /* %s */\n" % (category_names[i], category_names[i+1]))
f.write("};\n\n")
f.write("/* These are Boolean properties. */\n\nenum {\n")
for i in bool_properties:
f.write(" ucp_%s,\n" % i)
f.write(" /* This must be last */\n")
f.write(" ucp_Bprop_Count\n};\n\n")
f.write("/* Size of entries in ucd_boolprop_sets[] */\n\n")
f.write("#define ucd_boolprop_sets_item_size %d\n\n" % bool_props_list_item_size)
f.write("/* These are the bidi class values. */\n\nenum {\n")
for i in range(0, len(bidi_classes), 2):
sp = ' ' * (4 - len(bidi_classes[i]))
f.write(" ucp_bidi%s,%s /* %s */\n" % (bidi_classes[i], sp, bidi_classes[i+1]))
f.write("};\n\n")
f.write("/* These are grapheme break properties. The Extended Pictographic "
"property\ncomes from the emoji-data.txt file. */\n\nenum {\n")
for i in range(0, len(break_properties), 2):
sp = ' ' * (21 - len(break_properties[i]))
f.write(" ucp_gb%s,%s /* %s */\n" % (break_properties[i], sp, break_properties[i+1]))
f.write("};\n\n")
f.write("/* These are the script identifications. */\n\nenum {\n /* Scripts which has characters in other scripts. */\n")
for i in script_names:
if i == "Unknown":
f.write("\n /* Scripts which has no characters in other scripts. */\n")
f.write(" ucp_%s,\n" % i)
f.write("\n")
f.write(" /* This must be last */\n")
f.write(" ucp_Script_Count\n};\n\n")
f.write("/* Size of entries in ucd_script_sets[] */\n\n")
f.write("#define ucd_script_sets_item_size %d\n\n" % script_list_item_size)
f.write("#endif /* PCRE2_UCP_H_IDEMPOTENT_GUARD */\n\n")
f.write("/* End of pcre2_ucp.h */\n")
f.close()
# End

View File

@ -0,0 +1,203 @@
#! /usr/bin/python
# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------
# This script generates the pcre2_ucptables.c file, which contains tables for
# recognizing Unicode property names. It is #included by pcre2_tables.c. In
# order to reduce the number of relocations when loading the PCRE2 library, the
# names are held as a single large string, with offsets in the table. This is
# tedious to maintain by hand. Therefore, a script is used to generate the
# table.
# This script was created in December 2021 based on the previous GenerateUtt
# script, whose output had to be manually edited into pcre2_tables.c. Here is
# the history of the original script:
# -----------------------------------------------------------------------------
# Modified by PH 17-March-2009 to generate the more verbose form that works
# for UTF-support in EBCDIC as well as ASCII environments.
# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
# Modified by PH 04-May-2010 to add new "X.." special categories.
# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
# Modified by ChPe 30-September-2012 to add this note; no other changes were
# necessary for Unicode 6.2.0 support.
# Modfied by PH 26-February-2013 to add the Xuc special category.
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
# Script updated to Python 3 by running it through the 2to3 converter.
# Added script names for Unicode 7.0.0, 20-June-2014.
# Added script names for Unicode 8.0.0, 19-June-2015.
# Added script names for Unicode 10.0.0, 02-July-2017.
# Added script names for Unicode 11.0.0, 03-July-2018.
# Added 'Unknown' script, 01-October-2018.
# Added script names for Unicode 12.1.0, 27-July-2019.
# Added script names for Unicode 13.0.0, 10-March-2020.
# Added Script names for Unicode 14.0.0, PCRE2-10.39
# Added support for bidi class and bidi control, 06-December-2021
# This also involved lower casing strings and removing underscores, in
# accordance with Unicode's "loose matching" rules, which Perl observes.
# Changed default script type from PT_SC to PT_SCX, 18-December-2021
# -----------------------------------------------------------------------------
#
# Note subsequent changes here:
#
# 27-December-2021: Added support for 4-letter script abbreviations.
# 10-January-2022: Further updates for Boolean property support
# -----------------------------------------------------------------------------
# Import common data lists and functions
from GenerateCommon import \
abbreviations, \
bool_properties, \
bidi_classes, \
category_names, \
general_category_names, \
script_names, \
open_output
# Open the output file (no return on failure). This call also writes standard
# header boilerplate.
f = open_output("pcre2_ucptables.c")
# The list in bidi_classes contains just the Unicode classes such as AN, LRE,
# etc., along with comments. We need to add "bidi" in front of each value, in
# order to create names that don't clash with other types of property.
bidi_class_names = []
for i in range(0, len(bidi_classes), 2):
bidi_class_names.append("bidi" + bidi_classes[i])
# Remove the comments from other lists that contain them.
category_names = category_names[::2]
# Create standardized versions of the names by lowercasing and removing
# underscores.
def stdname(x):
return x.lower().replace('_', '')
def stdnames(x):
y = [''] * len(x)
for i in range(len(x)):
y[i] = stdname(x[i])
return y
std_category_names = stdnames(category_names)
std_general_category_names = stdnames(general_category_names)
std_bidi_class_names = stdnames(bidi_class_names)
std_bool_properties = stdnames(bool_properties)
# Create the table, starting with the Unicode script, category and bidi class
# names. We keep both the standardized name and the original, because the
# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
# still use the full original names.
utt_table = []
scx_end = script_names.index('Unknown')
for idx, name in enumerate(script_names):
pt_type = 'PT_SCX' if idx < scx_end else 'PT_SC'
utt_table.append((stdname(name), name, pt_type))
for abbrev in abbreviations[name]:
utt_table.append((stdname(abbrev), name, pt_type))
# Add the remaining property lists
utt_table += list(zip(std_category_names, category_names, ['PT_PC'] * len(category_names)))
utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names)))
utt_table += list(zip(std_bidi_class_names, bidi_class_names, ['PT_BIDICL'] * len(bidi_class_names)))
for name in bool_properties:
utt_table.append((stdname(name), name, 'PT_BOOL'))
if name in abbreviations:
for abbrev in abbreviations[name]:
utt_table.append((stdname(abbrev), name, 'PT_BOOL'))
# Now add specials and synonyms. Note both the standardized and capitalized
# forms are needed.
utt_table.append(('any', 'Any', 'PT_ANY'))
utt_table.append(('l&', 'L&', 'PT_LAMP'))
utt_table.append(('lc', 'LC', 'PT_LAMP'))
utt_table.append(('xan', 'Xan', 'PT_ALNUM'))
utt_table.append(('xps', 'Xps', 'PT_PXSPACE'))
utt_table.append(('xsp', 'Xsp', 'PT_SPACE'))
utt_table.append(('xuc', 'Xuc', 'PT_UCNC'))
utt_table.append(('xwd', 'Xwd', 'PT_WORD'))
# Remove duplicates from the table and then sort it.
utt_table = list(set(utt_table))
utt_table.sort()
# Output file-specific heading
f.write("""\
#ifdef SUPPORT_UNICODE
/* The PRIV(utt)[] table below translates Unicode property names into type and
code values. It is searched by binary chop, so must be in collating sequence of
name. Originally, the table contained pointers to the name strings in the first
field of each entry. However, that leads to a large number of relocations when
a shared library is dynamically loaded. A significant reduction is made by
putting all the names into a single, large string and using offsets instead.
All letters are lower cased, and underscores are removed, in accordance with
the "loose matching" rules that Unicode advises and Perl uses. */
\n""")
# We have to use STR_ macros to define the strings so that it all works in
# UTF-8 mode on EBCDIC platforms.
for utt in utt_table:
f.write('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')))
for c in utt[0]:
if c == '&':
f.write(' STR_AMPERSAND')
else:
f.write(' STR_%s' % c);
f.write(' "\\0"\n')
# Output the long string of concatenated names
f.write('\nconst char PRIV(utt_names)[] =\n');
last = ''
for utt in utt_table:
if utt == utt_table[-1]:
last = ';'
f.write(' STRING_%s0%s\n' % (utt[0].replace('&', '_AMPERSAND'), last))
# Output the property type table
f.write('\nconst ucp_type_table PRIV(utt)[] = {\n')
offset = 0
last = ','
for utt in utt_table:
if utt[2] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
'PT_SPACE', 'PT_UCNC', 'PT_WORD'):
value = '0'
else:
value = 'ucp_' + utt[1]
if utt == utt_table[-1]:
last = ''
f.write(' { %3d, %s, %s }%s\n' % (offset, utt[2], value, last))
offset += len(utt[0]) + 1
f.write('};\n\n')
# Ending text
f.write("""\
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
#endif /* SUPPORT_UNICODE */
/* End of pcre2_ucptables.c */
""")
f.close
# End

View File

@ -0,0 +1,453 @@
#! /bin/sh
# This is a script for the use of PCRE2 maintainers. It configures and rebuilds
# PCRE2 with a variety of configuration options, and in each case runs the
# tests to ensure that all goes well. Every possible combination would take far
# too long, so we use a representative sample. This script should be run in the
# PCRE2 source directory.
# While debugging, it is sometimes useful to be able to cut out some of the
# tests, in order to run those that are giving errors. The following options
# do this:
#
# -noasan skip the test that uses -fsanitize=address
# -nousan skip the test that uses -fsanitize=undefined
# -nodebug skip the test that uses --enable-debug
# -nojit skip all JIT tests
# -nojitmain skip non-valgrind JIT tests
# -nojitvalgrind skip JIT tests with valgrind
# -nomain skip all the main (non-JIT) set of tests
# -nomainvalgrind skip the main (non-JIT) valgrind tests
# -notmp skip the test in a temporary directory
# -novalgrind skip all the valgrind tests
# Alternatively, if any of those names are given with '+' instead of '-no',
# only those groups named with '+' are run (e.g. +jit). If -dummy is given,
# no tests are actually run - this provides a means of testing the selectors.
# The -v option causes a call to 'pcre2test -C' to happen for each
# configuration.
useasan=1
useusan=1
usedebug=1
usejit=1
usejitvalgrind=1
usemain=1
usemainvalgrind=1
usetmp=1
usevalgrind=1
dummy=0
seenplus=0
verbose=0
while [ $# -gt 0 ] ; do
case $1 in
+*) if [ $seenplus -eq 0 ]; then
useasan=0
useusan=0
usedebug=0
usejit=0
usejitvalgrind=0
usemain=0
usemainvalgrind=0
usetmp=0
seenplus=1
fi;;
esac
case $1 in
-dummy) dummy=1;;
-v) verbose=1;;
-noasan) useasan=0;;
-nousan) useusan=0;;
-nodebug) usedebug=0;;
-nojit) usejit=0; usejitvalgrind=0;;
-nojitmain) usejit=0;;
-nojitvalgrind) usejitvalgrind=0;;
-nomain) usemain=0; usemainvalgrind=0;;
-nomainvalgrind) usemainvalgrind=0;;
-notmp) usetmp=0;;
-novalgrind) usevalgrind=0;;
+asan) useasan=1;;
+usan) useusan=1;;
+debug) usedebug=1;;
+jit) usejit=1; usejitvalgrind=1;;
+jitmain) usejit=1;;
+jitvalgrind) usejitvalgrind=1;;
+main) usemain=1; usemainvalgrind=1;;
+mainvalgrind) usemainvalgrind=1;;
+tmp) usetmp=1;;
+valgrind) usevalgrind=1; usejitvalgrind=1; usemainvalgrind=1;;
*) echo "Unknown option '$1'"; exit 1;;
esac
shift
done
if [ $usejitvalgrind -eq 0 -a $usemainvalgrind -eq 0 ] ; then
usevalgrind=0
fi
# This is in case the caller has set aliases (as I do - PH)
unset cp ls mv rm
# This is a temporary directory for testing out-of-line builds
tmp=/tmp/pcre2testing
# Don't bother with compiler optimization for most tests; it just slows down
# compilation a lot (and running the tests themselves is quick). However, one
# special test turns optimization on, because it can provoke some compiler
# warnings.
CFLAGS="-g"
OFLAGS="-O0"
ISGCC=0
# If the compiler is gcc, add a lot of warning switches.
cc --version >/tmp/pcre2ccversion 2>/dev/null
if [ $? -eq 0 ] && grep GCC /tmp/pcre2ccversion >/dev/null; then
ISGCC=1
CFLAGS="$CFLAGS -Wall"
CFLAGS="$CFLAGS -Wno-overlength-strings"
CFLAGS="$CFLAGS -Wpointer-arith"
CFLAGS="$CFLAGS -Wwrite-strings"
CFLAGS="$CFLAGS -Wundef -Wshadow"
CFLAGS="$CFLAGS -Wmissing-field-initializers"
CFLAGS="$CFLAGS -Wunused-parameter"
CFLAGS="$CFLAGS -Wextra -Wformat"
CFLAGS="$CFLAGS -Wbad-function-cast"
CFLAGS="$CFLAGS -Wmissing-declarations"
CFLAGS="$CFLAGS -Wnested-externs"
CFLAGS="$CFLAGS -pedantic"
CFLAGS="$CFLAGS -Wuninitialized"
CFLAGS="$CFLAGS -Wmaybe-uninitialized"
CFLAGS="$CFLAGS -Wmissing-prototypes"
CFLAGS="$CFLAGS -Wstrict-prototypes"
fi
rm -f /tmp/pcre2ccversion
# This function runs a single test with the set of configuration options that
# are in $opts. The source directory must be set in srcdir. The function must
# be defined as "runtest()" not "function runtest()" in order to run on
# Solaris.
runtest()
{
rm -f $srcdir/pcre2test $srcdir/pcre2grep $srcdir/pcre2_jit_test
testcount=`expr $testcount + 1`
if [ "$opts" = "" ] ; then
echo "[$testcount/$testtotal] Configuring with: default settings"
else
echo "[$testcount/$testtotal] Configuring with:"
echo " $opts"
fi
if [ $dummy -eq 1 ]; then return; fi
CFLAGS="$CFLAGS" \
$srcdir/configure $opts >/dev/null 2>teststderrM
if [ $? -ne 0 ]; then
echo " "
echo "******** Error while configuring ********"
cat teststderrM
exit 1
fi
# There is an infelicity in the Autotools world (as of October 2015) which
# causes the message
#
# ar: `u' modifier ignored since `D' is the default (see `U')
#
# to be output while linking. This triggers an unwanted error report from this
# script, because it expects no stderr output while making. To get round this
# we filter the stderr output through sed, removing all occurrences of the
# above lines. Just for paranoia, check that sed is available before doing
# this.
echo "Making"
make -j >/dev/null 2>teststderrM
makeRC=$?
if command -v sed >/dev/null 2>&1 ; then
sed "/\`u' modifier ignored since \`D' is the default/ d" \
teststderrM > teststderrMM
mv -f teststderrMM teststderrM
fi
if [ $makeRC -ne 0 -o -s teststderrM ]; then
echo " "
echo "******** Errors or warnings while making ********"
echo " "
cat teststderrM
exit 1
fi
if [ $verbose -eq 1 ]; then
./pcre2test -C
fi
./pcre2test -C jit >/dev/null
jit=$?
./pcre2test -C pcre2-8 >/dev/null
pcre2_8=$?
echo "Running PCRE2 library tests $withvalgrind"
$srcdir/RunTest $valgrind >teststdoutM 2>teststderrM
if [ $? -ne 0 -o -s teststderrM ]; then
echo " "
echo "**** Test failed ****"
if [ -s teststderrM ] ; then
cat teststderrM
else
cat teststdoutM
fi
exit 1
fi
if [ $pcre2_8 -gt 0 ]; then
echo "Running pcre2grep tests $withvalgrind"
$srcdir/RunGrepTest $valgrind >teststdoutM 2>teststderrM
if [ $? -ne 0 -o -s teststderrM ]; then
echo " "
echo "**** Test failed ****"
cat teststderrM
cat teststdoutM
exit 1
fi
else
echo "Skipping pcre2grep tests: 8-bit library not compiled"
fi
if [ "$jit" -gt 0 ]; then
echo "Running JIT regression tests $withvalgrind"
$jrvalgrind $srcdir/pcre2_jit_test >teststdoutM 2>teststderrM
if [ $? -ne 0 -o -s teststderrM ]; then
echo " "
echo "**** Test failed ****"
cat teststderrM
cat teststdoutM
exit 1
fi
else
echo "Skipping JIT regression tests: JIT is not enabled"
fi
}
# Update the total count whenever a new test is added; it is used to show
# progess as each test is run.
testtotal=`expr 17 \* $usemain + \
1 \* $usemain \* $usedebug + \
1 \* $usetmp + \
1 \* $ISGCC \* $usemain + \
1 \* $ISGCC \* $usemain \* $useasan + \
1 \* $ISGCC \* $usemain \* $useusan + \
13 \* $usejit + \
2 \* $usemainvalgrind + \
2 \* $usejitvalgrind`
testcount=0
if [ $testtotal -eq 0 ] ; then
echo "** No tests selected"
exit 1
fi
valgrind=
jrvalgrind=
withvalgrind=
srcdir=.
export srcdir
if [ $usejit -ne 0 ]; then
enable_jit=--enable-jit
else
enable_jit=
fi
# If gcc is in use, run a maximally configured test with -O2, because that can
# throw up warnings that are not detected with -O0. Then run a second test with
# -fsanitize=address, which also may throw up new warnings as well as checking
# things at runtime. Finally, run another test using -fsanitize=undefined
# -std-gnu99 to check for runtime actions that are not well defined. However,
# we also use -fno-sanitize=shift to avoid warnings for shifts of negative
# numbers, which occur in src/pcre2_jit_compile.c.
if [ $ISGCC -ne 0 -a $usemain -ne 0 ]; then
echo "---------- Maximally configured test with -O2 ----------"
SAVECFLAGS="$CFLAGS"
CFLAGS="-O2 $CFLAGS"
echo "CFLAGS=$CFLAGS"
opts="--disable-shared $enable_jit --enable-pcre2-16 --enable-pcre2-32"
runtest
if [ $useasan -ne 0 ]; then
echo "---------- Maximally configured test with -fsanitize=address ----------"
# Following a kernel change, sanitize address doesn't work unless the extra
# PIE options are also set.
CFLAGS="$OFLAGS $SAVECFLAGS -no-pie -fno-PIE -fsanitize=address"
echo "CFLAGS=$CFLAGS"
opts="--disable-shared $enable_jit --enable-pcre2-16 --enable-pcre2-32"
runtest
fi
# This also seems to be the case for sanitize undefined.
if [ $useusan -ne 0 ]; then
echo "------- Maximally configured test with -fsanitize=undefined -fno-sanitize=shift -fno-sanitize=alignment -std=gnu99 -------"
CFLAGS="$OFLAGS $SAVECFLAGS -no-pie -fno-PIE -fsanitize=undefined -fno-sanitize=shift -fno-sanitize=alignment -std=gnu99"
echo "CFLAGS=$CFLAGS"
opts="--disable-shared $enable_jit --enable-pcre2-16 --enable-pcre2-32"
runtest
fi
CFLAGS="$OFLAGS $SAVECFLAGS"
fi
# This set of tests builds PCRE2 and runs the tests with a variety of configure
# options, in the current (source) directory. The empty configuration builds
# with all the default settings. As well as testing that these options work, we
# use --disable-shared or --disable-static except for the default test (which
# builds both) to save a bit of time by building only one version of the
# library for the subsequent tests.
echo "---------- CFLAGS for the remaining tests ----------"
echo "CFLAGS=$CFLAGS"
if [ $usemain -ne 0 ]; then
if [ $usedebug -ne 0 ]; then
echo "---------- Maximally configured test with --enable-debug ----------"
opts="--disable-shared $enable_jit --enable-pcre2-16 --enable-pcre2-32 --enable-debug"
runtest
fi
echo "---------- Non-JIT tests in the current directory ----------"
for opts in \
"" \
"--disable-static" \
"--disable-shared" \
"--disable-unicode --disable-shared --enable-never-backslash-C" \
"--with-link-size=3 --disable-shared --disable-pcre2grep-callout" \
"--disable-unicode --enable-rebuild-chartables --disable-shared" \
"--disable-unicode --enable-newline-is-any --disable-shared" \
"--disable-unicode --enable-newline-is-cr --disable-shared" \
"--disable-unicode --enable-newline-is-crlf --disable-shared" \
"--disable-unicode --enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared" \
"--enable-newline-is-any --disable-static" \
"--disable-unicode --enable-pcre2-16" \
"--enable-pcre2-16 --disable-shared" \
"--disable-unicode --enable-pcre2-32" \
"--enable-pcre2-32 --disable-shared" \
"--disable-unicode --enable-pcre2-32 --enable-pcre2-16 --disable-shared" \
"--disable-unicode --enable-pcre2-32 --enable-pcre2-16 --disable-pcre2-8 --disable-shared"
do
runtest
done
fi
# Now run the JIT tests unless disabled
if [ $usejit -ne 0 ]; then
echo "---------- JIT tests in the current directory ----------"
for opts in \
"--disable-unicode --enable-jit --disable-shared" \
"--enable-jit --disable-shared" \
"--enable-jit --with-link-size=3 --disable-shared" \
"--enable-jit --enable-pcre2-16 --disable-shared" \
"--disable-unicode --enable-jit --enable-pcre2-16 --disable-pcre2-8 --disable-shared" \
"--enable-jit --enable-pcre2-16 --disable-pcre2-8 --disable-shared" \
"--enable-jit --enable-pcre2-16 --with-link-size=3 --disable-shared" \
"--enable-jit --enable-pcre2-16 --with-link-size=4 --disable-shared" \
"--enable-jit --enable-pcre2-32 --disable-shared" \
"--disable-unicode --enable-jit --enable-pcre2-32 --disable-pcre2-8 --disable-shared" \
"--enable-jit --enable-pcre2-32 --disable-pcre2-8 --disable-shared" \
"--enable-jit --enable-pcre2-32 --with-link-size=4 --disable-shared" \
"--enable-jit --enable-pcre2-32 --enable-pcre2-16 --disable-pcre2-8 --enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared"
do
runtest
done
fi
# Now re-run some of the tests under valgrind.
if [ $usevalgrind -ne 0 ]; then
echo "---------- Tests in the current directory using valgrind ----------"
valgrind=valgrind
withvalgrind="with valgrind"
if [ $usemainvalgrind -ne 0 ]; then
for opts in \
"--disable-shared" \
"--with-link-size=3 --enable-pcre2-16 --enable-pcre2-32 --disable-shared"
do
opts="--enable-valgrind $opts"
runtest
done
fi
if [ $usejitvalgrind -ne 0 ]; then
jrvalgrind="valgrind --tool=memcheck -q --smc-check=all-non-file --suppressions=$srcdir/testdata/valgrind-jit.supp"
for opts in \
"--enable-jit --disable-shared" \
"--enable-jit --enable-pcre2-16 --enable-pcre2-32"
do
opts="--enable-valgrind $opts"
runtest
done
fi
fi
valgrind=
jrvalgrind=
withvalgrind=
# Clean up the distribution and then do at least one build and test in a
# directory other than the source directory. It doesn't work unless the
# source directory is cleaned up first.
if [ -f Makefile ]; then
echo "Running 'make distclean'"
make distclean >/dev/null 2>&1
if [ $? -ne 0 ]; then
echo "** 'make distclean' failed"
exit 1
fi
fi
echo "---------- End of tests in the source directory ----------"
echo "Removing teststdoutM and teststderrM"
rm -rf teststdoutM teststderrM
if [ $usetmp -ne 0 ]; then
echo "---------- Tests in the $tmp directory ----------"
srcdir=`pwd`
export srcdir
if [ ! -e $tmp ]; then
mkdir $tmp
fi
if [ ! -d $tmp ]; then
echo "** Failed to create $tmp or it is not a directory"
exit 1
fi
cd $tmp
if [ $? -ne 0 ]; then
echo "** Failed to cd to $tmp"
exit 1
fi
for opts in \
"--disable-shared"
do
runtest
done
echo "Removing $tmp"
rm -rf $tmp
fi
echo "---------- All done ----------"
# End

View File

@ -0,0 +1,460 @@
MAINTENANCE README FOR PCRE2
============================
The files in the "maint" directory of the PCRE2 source contain data, scripts,
and programs that are used for the maintenance of PCRE2, but which do not form
part of the PCRE2 distribution tarballs. This document describes these files
and also contains some notes for maintainers. Its contents are:
Files in the maint directory
Updating to a new Unicode release
Preparing for a PCRE2 release
Making a PCRE2 release
Long-term ideas (wish list)
Files in the maint directory
============================
GenerateCommon.py
A Python module containing data and functions that are used by the other
Generate scripts.
GenerateTest26.py
A Python script that generates input and expected output test data for test
26, which tests certain aspects of Unicode property support.
GenerateUcd.py
A Python script that generates the file pcre2_ucd.c from GenerateCommon.py
and Unicode data files, which are themselves downloaded from the Unicode web
site. The generated file contains the tables for a 2-stage lookup of Unicode
properties, along with some auxiliary tables. The script starts with a long
comment that gives details of the tables it constructs.
GenerateUcpHeader.py
A Python script that generates the file pcre2_ucp.h from GenerateCommon.py
and Unicode data files. The generated file defines constants for various
Unicode property values.
GenerateUcpTables.py
A Python script that generates the file pcre2_ucptables.c from
GenerateCommon.py and Unicode data files. The generated file contains tables
for looking up Unicode property names.
ManyConfigTests
A shell script that runs "configure, make, test" a number of times with
different configuration settings.
pcre2_chartables.c.non-standard
This is a set of character tables that came from a Windows system. It has
characters greater than 128 that are set as spaces, amongst other things. I
kept it so that it can be used for testing from time to time.
README
This file.
Unicode.tables
The files in this directory were downloaded from the Unicode web site. They
contain information about Unicode characters and scripts, and are used by the
Generate scripts. There is also UnicodeData.txt, which is no longer used by
any script, because it is useful occasionally for manually looking up the
details of certain characters. However, note that character names in this
file such as "Arabic sign sanah" do NOT mean that the character is in a
particular script (in this case, Arabic). Scripts.txt and
ScriptExtensions.txt are where to look for script information.
ucptest.c
A program for testing the Unicode property macros that do lookups in the
pcre2_ucd.c data, mainly useful after rebuilding the Unicode property tables.
Compile and run this in the "maint" directory (see comments at its head).
This program can also be used to find characters with specific properties and
to list which properties are supported.
ucptestdata
A directory containing four files, testinput{1,2} and testoutput{1,2}, for
use in conjunction with the ucptest program.
utf8.c
A short, freestanding C program for converting a Unicode code point into a
sequence of bytes in the UTF-8 encoding, and vice versa. If its argument is a
hex number such as 0x1234, it outputs a list of the equivalent UTF-8 bytes.
If its argument is a sequence of concatenated UTF-8 bytes (e.g. 12e188b4) it
treats them as a UTF-8 string and outputs the equivalent code points in hex.
See comments at its head for details.
Updating to a new Unicode release
=================================
When there is a new release of Unicode, the files in Unicode.tables must be
refreshed from the web site. Once that is done, the four Python scripts that
generate files from the Unicode data can be run from within the "maint"
directory.
Note: Previously, it was necessary to update lists of scripts and their
abbreviations by hand before running the Python scripts. This is no longer
necessary because the scripts have been upgraded to extract this information
themselves. Also, there used to be explicit lists of scripts in two of the man
pages. This is no longer the case; the pcre2test program can now output a list
of supported scripts.
You can give an output file name as an argument to the following scripts, but
by default:
GenerateUcd.py creates pcre2_ucd.c )
GenerateUcpHeader.py creates pcre2_ucp.h ) in the current directory
GenerateUcpTables.py creates pcre2_ucptables.c )
These files can be compared against the existing versions in the src directory
to check on any changes before replacing the old files, but you can also
generate directly into the final location by running:
./GenerateUcd.py ../src/pcre2_ucd.c
./GenerateUcpHeader.py ../src/pcre2_ucp.h
./GenerateUcpTables.py ../src/pcre2_ucptables.c
Once the .c and .h files are in the ../src directory, the ucptest program can
be compiled and used to check that the new tables work properly. The data files
in ucptestdata are set up to check a number of test characters. See the
comments at the start of ucptest.c. If there are new scripts, adding a few
tests to the files in ucptestdata is a good idea.
Finally, you should run the GenerateTest26.py script to regenerate new versions
of the input and expected output from a series of Unicode property tests that
are automatically generated from the Unicode data files. By default, the files
are written to testinput26 and testoutput26 in the current directory, but you
can give an alternative directory name as an argument to the script. These
files should eventually be installed in the main testdata directory.
Preparing for a PCRE2 release
=============================
This section contains a checklist of things that I do before building a new
release.
. Ensure that the version number and version date are correct in configure.ac.
. Update the library version numbers in configure.ac according to the rules
given below.
. If new build options or new source files have been added, ensure that they
are added to the CMake files as well as to the autoconf files. The relevant
files are CMakeLists.txt and config-cmake.h.in. After making a release, test
it out with CMake if there have been changes here.
. Run ./autogen.sh to ensure everything is up-to-date.
. Compile and test with many different config options, and combinations of
options. Also, test with valgrind by running "RunTest valgrind" and
"RunGrepTest valgrind". The script maint/ManyConfigTests now encapsulates
this testing. It runs tests with different configurations, and it also runs
some of them with valgrind, all of which can take quite some time.
. Run tests in both 32-bit and 64-bit environments if possible. I can no longer
run 32-bit tests.
. Run tests with two or more different compilers (e.g. clang and gcc), and
make use of -fsanitize=address and friends where possible. For gcc,
-fsanitize=undefined -std=gnu99 picks up undefined behaviour at runtime, but
needs -fno-sanitize=shift to get rid of warnings for shifts of negative
numbers in the JIT compiler. For clang, -fsanitize=address,undefined,integer
can be used but -fno-sanitize=alignment,shift,unsigned-integer-overflow must
be added when compiling with JIT. Another useful clang option is
-fsanitize=signed-integer-overflow
. Do a test build using CMake. Remove src/config.h first, lest it override the
version that CMake creates. Also do a CMake unity build to check that it
still works: [c]cmake -DCMAKE_UNITY_BUILD=ON sets up a unity build.
. Run perltest.sh on the test data for tests 1 and 4. The output should match
the PCRE2 test output, apart from the version identification at the start of
each test. Sometimes there are other differences in test 4 if PCRE2 and Perl
are using different Unicode releases. The other tests are not Perl-compatible
(they use various PCRE2-specific features or options).
. It is possible to test with the emulated memmove() function by undefining
HAVE_MEMMOVE and HAVE_BCOPY in config.h, though I do not do this often.
. Documentation: check AUTHORS, ChangeLog (check version and date), LICENCE,
NEWS (check version and date), NON-AUTOTOOLS-BUILD, and README. Many of these
won't need changing, but over the long term things do change.
. I used to test new releases myself on a number of different operating
systems. For example, on Solaris it is helpful to test using Sun's cc
compiler as a change from gcc. Adding -xarch=v9 to the cc options does a
64-bit test, but it also needs -S 64 for pcre2test to increase the stack size
for test 2. Since I retired I can no longer do much of this. There are
automated tests under Ubuntu, Alpine, and Windows that are now set up as
GitHub actions. Check that they are running clean.
. The buildbots at http://buildfarm.opencsw.org/ do some automated testing
of PCRE2 and should also be checked before putting out a release.
Updating version info for libtool
=================================
This set of rules for updating library version information came from a web page
whose URL I have forgotten. The version information consists of three parts:
(current, revision, age).
1. Start with version information of 0:0:0 for each libtool library.
2. Update the version information only immediately before a public release of
your software. More frequent updates are unnecessary, and only guarantee
that the current interface number gets larger faster.
3. If the library source code has changed at all since the last update, then
increment revision; c:r:a becomes c:r+1:a.
4. If any interfaces have been added, removed, or changed since the last
update, increment current, and set revision to 0.
5. If any interfaces have been added since the last public release, then
increment age.
6. If any interfaces have been removed or changed since the last public
release, then set age to 0.
The following explanation may help in understanding the above rules a bit
better. Consider that there are three possible kinds of reaction from users to
changes in a shared library:
1. Programs using the previous version may use the new version as a drop-in
replacement, and programs using the new version can also work with the
previous one. In other words, no recompiling nor relinking is needed. In
this case, increment revision only, don't touch current or age.
2. Programs using the previous version may use the new version as a drop-in
replacement, but programs using the new version may use APIs not present in
the previous one. In other words, a program linking against the new version
may fail if linked against the old version at run time. In this case, set
revision to 0, increment current and age.
3. Programs may need to be changed, recompiled, relinked in order to use the
new version. Increment current, set revision and age to 0.
Making a PCRE2 release
======================
Run PrepareRelease and commit the files that it changes. The first thing this
script does is to run CheckMan on the man pages; if it finds any markup errors,
it reports them and then aborts. Otherwise it removes trailing spaces from
sources and refreshes the HTML documentation. Update the GitHub repository with
"git push".
Once PrepareRelease has run clean, run "make distcheck" to create the tarballs
and the zipball. I then sign these files. Double-check with "git status" that
the repository is fully up-to-date, then create a new tag and a release on
GitHub. Upload the tarballs, zipball, and the signatures as "assets" of the
GitHub release.
When the new release is out, don't forget to tell webmaster@pcre.org and the
mailing list.
Future ideas (wish list)
========================
This section records a list of ideas so that they do not get forgotten. They
vary enormously in their usefulness and potential for implementation. Some are
very sensible; some are rather wacky. Some have been on this list for many
years.
. Optimization
There are always ideas for new optimizations so as to speed up pattern
matching. Most of them try to save work by recognizing a non-match without
having to scan all the possibilities. These are some that I've recorded:
* /((A{0,5}){0,5}){0,5}(something complex)/ on a non-matching string is very
slow, though Perl is fast. Can we speed up somehow? Convert to {0,125}?
OTOH, this is pathological - the user could easily fix it.
* Turn ={4} into ==== ? (for speed). I once did an experiment, and it seems
to have little effect, and maybe makes things worse.
* "Ends with literal string" - note that a single character doesn't gain much
over the existing "required code unit" feature that just remembers one code
unit.
* Remember an initial string rather than just 1 code unit.
* A required code unit from alternatives - not just the last unit, but an
earlier one if common to all alternatives.
* Friedl contains other ideas.
* The code does not set initial code unit flags for Unicode property types
such as \p; I don't know how much benefit there would be for, for example,
setting the bits for 0-9 and all values >= xC0 (in 8-bit mode) when a
pattern starts with \p{N}.
. If Perl gets to a consistent state over the settings of capturing sub-
patterns inside repeats, see if we can match it. One example of the
difference is the matching of /(main(O)?)+/ against mainOmain, where PCRE2
leaves $2 set. In Perl, it's unset. Changing this in PCRE2 will be very hard
because I think it needs much more state to be remembered.
. A feature to suspend a match via a callout was once requested.
. An option to convert results into character offsets and character lengths.
. A (non-Unix) user wanted pcregrep options to (a) list a file name just once,
preceded by a blank line, instead of adding it to every matched line, and (b)
support --outputfile=name.
. Define a union for the results from pcre2_pattern_info().
. Provide a "random access to the subject" facility so that the way in which it
is stored is independent of PCRE2. For efficiency, it probably isn't possible
to switch this dynamically. It would have to be specified when PCRE2 was
compiled. PCRE2 would then call a function every time it wanted a character.
. pcre2grep: add -rs for a sorted recurse. Having to store file names and sort
them will of course slow it down.
. Someone suggested --disable-callout to save code space when callouts are
never wanted. This seems rather marginal.
. A user suggested a parameter to limit the length of string matched, for
example if the parameter is N, the current match should fail if the matched
substring exceeds N. This could apply to both match functions. The value
could be a new field in the match context. Compare the offset_limit feature,
which limits where a match must start.
. Write a function that generates random matching strings for a compiled
pattern.
. Pcre2grep: an option to specify the output line separator, either as a string
or select from a fixed list. This is not straightforward, because at the
moment it outputs whatever is in the input file.
. Improve the code for duplicate checking in pcre2_dfa_match(). An incomplete,
non-thread-safe patch showed that this can help performance for patterns
where there are many alternatives. However, a simple thread-safe
implementation that I tried made things worse in many simple cases, so this
is not an obviously good thing.
. PCRE2 cannot at present distinguish between subpatterns with different names,
but the same number (created by the use of ?|). In order to do so, a way of
remembering *which* subpattern numbered n matched is needed. (*MARK) can
perhaps be used as a way round this problem. However, note that Perl does not
distinguish: like PCRE2, a name is just an alias for a number in Perl.
. Instead of having #ifdef HAVE_CONFIG_H in each module, put #include
"something" and the the #ifdef appears only in one place, in "something".
. Implement something like (?(R2+)... to check outer recursions.
. If Perl ever supports the POSIX notation [[.something.]] PCRE2 should try
to follow.
. A user wanted a way of ignoring all Unicode "mark" characters so that, for
example "a" followed by an accent would, together, match "a". This can only
be done clumsily at present by using a lookahead such as /(?=a)\X/, which
works for "combining" characters.
. Perl supports [\N{x}-\N{y}] as a Unicode range, even in EBCDIC. PCRE2
supports \N{U+dd..} everywhere, but not in EBCDIC.
. Unicode stuff from Perl:
\b{gcb} or \b{g} grapheme cluster boundary
\b{sb} sentence boundary
\b{wb} word boundary
See Unicode TR 29. The last two are very much aimed at natural language.
. Allow a callout to specify a number of characters to skip. This can be done
compatibly via an extra callout field.
. Allow callouts to return *PRUNE, *COMMIT, *THEN, *SKIP, with and without
continuing (that is, with and without an implied *FAIL). A new option,
PCRE2_CALLOUT_EXTENDED say, would be needed. This is unlikely ever to be
implemented by JIT, so this could be an option for pcre2_match().
. A limit on substitutions: a user suggested somehow finding a way of making
match_limit apply to the whole operation instead of each match separately.
. Some #defines could be replaced with enums to improve robustness.
. There was a request for an option for pcre2_match() to return the longest
match. This would mean searching for all possible matches, of course.
. Perl's /a modifier sets Unicode, but restricts \d etc to ASCII characters,
which is the PCRE2 default for PCRE2_UTF (use PCRE2_UCP to change). However,
Perl also has /aa, which in addition, disables ASCII/non-ASCII caseless
matching. Perhaps we need a new option PCRE2_CASELESS_RESTRICT_ASCII. In
practice, this just means not using the ucd_caseless_sets[] table.
. There is more that could be done to the oss-fuzz setup (needs some research).
A seed corpus could be built. I noted something about $LIB_FUZZING_ENGINE.
The test function could make use of get_substrings() to cover more code.
. A neater way of handling recursion file names in pcre2grep, e.g. a single
buffer that can grow. See also GitHub issue #2 (recursion looping via
symlinks).
. A user suggested that before/after parameters in pcre2grep could have
negative values, to list lines near to the matched line, but not necessarily
the line itself. For example, --before-context=-1 would list the line *after*
each matched line, without showing the matched line. The problem here is what
to do with matches that are close together. Maybe a simpler way would be a
flag to disable showing matched lines, only valid with either -A or -B?
. There was a suggestiong for a pcre2grep colour default, or possibly a more
general PCRE2GREP_OPT, but only for some options - not file names or patterns.
. Breaking loops that match an empty string: perhaps find a way of continuing
if *something* has changed, but this might mean remembering additional data.
"Something" could be a capture value, but then a list of previous values
would be needed to avoid a cycle of changes.
. If a function could be written to find 3-character (or other length) fixed
strings, at least one of which must be present for a match, efficient
pre-searching of large datasets could be implemented.
. If pcre2grep had --first-line (match only in the first line) it could be
efficiently used to find files "starting with xxx". What about --last-line?
There was also the suggestion of an option for pcre2grep to scan only the
start of a file. I am not keen - this is the job of "head".
. A user requested a means of determining whether a failed match was failed by
the start-of-match optimizations, or by running the match engine. Easy enough
to define a bit in the match data, but all three matchers would need work.
. Would inlining "simple" recursions provide a useful performance boost for the
interpreters? JIT already does some of this, but it may not be worth it for
the interpreters.
. Redesign handling of class/nclass/xclass because the compile code logic is
currently very contorted and obscure. Also there was a request for a way of
re-defining \w (and therefore \W, \b, and \B). An in-pattern sequence such as
(?w=[...]) was suggested. Easiest way would be simply to inline the class,
with lookarounds for \b and \B. Ideally the setting should last till the end
of the group, which means remembering all previous settings; maybe a fixed
amount of stack would do - how deep would anyone want to nest these things?
See GitHub issue #13 for a compendium of character class issues, including
(?[...]) extended classes.
. A user suggested something like --with-build-info to set a build information
string that could be retrieved by pcre2_config(). However, there's no
facility for a length limit in pcre2_config(), and what would be the
encoding?
. Quantified groups with a fixed count currently operate by replicating the
group in the compiled bytecode. This may not really matter in these days of
gigabyte memory, but perhaps another implementation might be considered.
Needs coordination between the interpreters and JIT.
. There are regular requests for variable-length lookbehinds.
. See also any suggestions in the GitHub issues.
Philip Hazel
Email local part: Philip.Hazel
Email domain: gmail.com
Last updated: 25 April 2022

View File

@ -0,0 +1,633 @@
# BidiMirroring-14.0.0.txt
# Date: 2021-08-08, 22:55:00 GMT [KW, RP]
# © 2021 Unicode®, Inc.
# For terms of use, see https://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# For documentation, see https://www.unicode.org/reports/tr44/
#
# Bidi_Mirroring_Glyph Property
#
# This file is an informative contributory data file in the
# Unicode Character Database.
#
# This data file lists characters that have the Bidi_Mirrored=Yes property
# value, for which there is another Unicode character that typically has a glyph
# that is the mirror image of the original character's glyph.
#
# The repertoire covered by the file is Unicode 14.0.0.
#
# The file contains a list of lines with mappings from one code point
# to another one for character-based mirroring.
# Note that for "real" mirroring, a rendering engine needs to select
# appropriate alternative glyphs, and that many Unicode characters do not
# have a mirror-image Unicode character.
#
# Each mapping line contains two fields, separated by a semicolon (';').
# Each of the two fields contains a code point represented as a
# variable-length hexadecimal value with 4 to 6 digits.
# A comment indicates where the characters are "BEST FIT" mirroring.
#
# Code points for which Bidi_Mirrored=Yes, but for which no appropriate
# characters exist with mirrored glyphs, are
# listed as comments at the end of the file.
#
# Formally, the default value of the Bidi_Mirroring_Glyph property
# for each code point is <none>, unless a mapping to
# some other character is specified in this data file. When a code
# point has the default value for the Bidi_Mirroring_Glyph property,
# that means that no other character exists whose glyph is suitable
# for character-based mirroring.
#
# For information on bidi mirroring, see UAX #9: Unicode Bidirectional Algorithm,
# at https://www.unicode.org/reports/tr9/
#
# This file was originally created by Markus Scherer.
# Extended for Unicode 3.2, 4.0, 4.1, 5.0, 5.1, 5.2, and 6.0 by Ken Whistler,
# and for subsequent versions by Ken Whistler, Laurentiu Iancu, and Roozbeh Pournader.
#
# Historical and Compatibility Information:
#
# The OpenType Mirroring Pairs List (OMPL) is frozen to match the
# Unicode 5.1 version of the Bidi_Mirroring_Glyph property (2008).
# See https://www.microsoft.com/typography/otspec/ompl.txt
#
# The Unicode 6.1 version of the Bidi_Mirroring_Glyph property (2011)
# added one mirroring pair: 27CB <--> 27CD.
#
# The Unicode 11.0 version of the Bidi_Mirroring_Glyph property (2018)
# underwent a substantial revision, to formally recognize all of the
# exact mirroring pairs and "BEST FIT" mirroring pairs that had been
# added after the freezing of the OMPL list. As a result, starting
# with Unicode 11.0, the bmg mapping values more accurately reflect
# the current status of glyphs for Bidi_Mirrored characters in
# the Unicode Standard, but this listing now extends significantly
# beyond the frozen OMPL list. Implementers should be aware of this
# intentional distinction.
#
# ############################################################
#
# Property: Bidi_Mirroring_Glyph
#
# @missing: 0000..10FFFF; <none>
0028; 0029 # LEFT PARENTHESIS
0029; 0028 # RIGHT PARENTHESIS
003C; 003E # LESS-THAN SIGN
003E; 003C # GREATER-THAN SIGN
005B; 005D # LEFT SQUARE BRACKET
005D; 005B # RIGHT SQUARE BRACKET
007B; 007D # LEFT CURLY BRACKET
007D; 007B # RIGHT CURLY BRACKET
00AB; 00BB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
00BB; 00AB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
0F3A; 0F3B # TIBETAN MARK GUG RTAGS GYON
0F3B; 0F3A # TIBETAN MARK GUG RTAGS GYAS
0F3C; 0F3D # TIBETAN MARK ANG KHANG GYON
0F3D; 0F3C # TIBETAN MARK ANG KHANG GYAS
169B; 169C # OGHAM FEATHER MARK
169C; 169B # OGHAM REVERSED FEATHER MARK
2039; 203A # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
203A; 2039 # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
2045; 2046 # LEFT SQUARE BRACKET WITH QUILL
2046; 2045 # RIGHT SQUARE BRACKET WITH QUILL
207D; 207E # SUPERSCRIPT LEFT PARENTHESIS
207E; 207D # SUPERSCRIPT RIGHT PARENTHESIS
208D; 208E # SUBSCRIPT LEFT PARENTHESIS
208E; 208D # SUBSCRIPT RIGHT PARENTHESIS
2208; 220B # ELEMENT OF
2209; 220C # [BEST FIT] NOT AN ELEMENT OF
220A; 220D # SMALL ELEMENT OF
220B; 2208 # CONTAINS AS MEMBER
220C; 2209 # [BEST FIT] DOES NOT CONTAIN AS MEMBER
220D; 220A # SMALL CONTAINS AS MEMBER
2215; 29F5 # DIVISION SLASH
221F; 2BFE # RIGHT ANGLE
2220; 29A3 # ANGLE
2221; 299B # MEASURED ANGLE
2222; 29A0 # SPHERICAL ANGLE
2224; 2AEE # DOES NOT DIVIDE
223C; 223D # TILDE OPERATOR
223D; 223C # REVERSED TILDE
2243; 22CD # ASYMPTOTICALLY EQUAL TO
2245; 224C # APPROXIMATELY EQUAL TO
224C; 2245 # ALL EQUAL TO
2252; 2253 # APPROXIMATELY EQUAL TO OR THE IMAGE OF
2253; 2252 # IMAGE OF OR APPROXIMATELY EQUAL TO
2254; 2255 # COLON EQUALS
2255; 2254 # EQUALS COLON
2264; 2265 # LESS-THAN OR EQUAL TO
2265; 2264 # GREATER-THAN OR EQUAL TO
2266; 2267 # LESS-THAN OVER EQUAL TO
2267; 2266 # GREATER-THAN OVER EQUAL TO
2268; 2269 # [BEST FIT] LESS-THAN BUT NOT EQUAL TO
2269; 2268 # [BEST FIT] GREATER-THAN BUT NOT EQUAL TO
226A; 226B # MUCH LESS-THAN
226B; 226A # MUCH GREATER-THAN
226E; 226F # [BEST FIT] NOT LESS-THAN
226F; 226E # [BEST FIT] NOT GREATER-THAN
2270; 2271 # [BEST FIT] NEITHER LESS-THAN NOR EQUAL TO
2271; 2270 # [BEST FIT] NEITHER GREATER-THAN NOR EQUAL TO
2272; 2273 # [BEST FIT] LESS-THAN OR EQUIVALENT TO
2273; 2272 # [BEST FIT] GREATER-THAN OR EQUIVALENT TO
2274; 2275 # [BEST FIT] NEITHER LESS-THAN NOR EQUIVALENT TO
2275; 2274 # [BEST FIT] NEITHER GREATER-THAN NOR EQUIVALENT TO
2276; 2277 # LESS-THAN OR GREATER-THAN
2277; 2276 # GREATER-THAN OR LESS-THAN
2278; 2279 # [BEST FIT] NEITHER LESS-THAN NOR GREATER-THAN
2279; 2278 # [BEST FIT] NEITHER GREATER-THAN NOR LESS-THAN
227A; 227B # PRECEDES
227B; 227A # SUCCEEDS
227C; 227D # PRECEDES OR EQUAL TO
227D; 227C # SUCCEEDS OR EQUAL TO
227E; 227F # [BEST FIT] PRECEDES OR EQUIVALENT TO
227F; 227E # [BEST FIT] SUCCEEDS OR EQUIVALENT TO
2280; 2281 # [BEST FIT] DOES NOT PRECEDE
2281; 2280 # [BEST FIT] DOES NOT SUCCEED
2282; 2283 # SUBSET OF
2283; 2282 # SUPERSET OF
2284; 2285 # [BEST FIT] NOT A SUBSET OF
2285; 2284 # [BEST FIT] NOT A SUPERSET OF
2286; 2287 # SUBSET OF OR EQUAL TO
2287; 2286 # SUPERSET OF OR EQUAL TO
2288; 2289 # [BEST FIT] NEITHER A SUBSET OF NOR EQUAL TO
2289; 2288 # [BEST FIT] NEITHER A SUPERSET OF NOR EQUAL TO
228A; 228B # [BEST FIT] SUBSET OF WITH NOT EQUAL TO
228B; 228A # [BEST FIT] SUPERSET OF WITH NOT EQUAL TO
228F; 2290 # SQUARE IMAGE OF
2290; 228F # SQUARE ORIGINAL OF
2291; 2292 # SQUARE IMAGE OF OR EQUAL TO
2292; 2291 # SQUARE ORIGINAL OF OR EQUAL TO
2298; 29B8 # CIRCLED DIVISION SLASH
22A2; 22A3 # RIGHT TACK
22A3; 22A2 # LEFT TACK
22A6; 2ADE # ASSERTION
22A8; 2AE4 # TRUE
22A9; 2AE3 # FORCES
22AB; 2AE5 # DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE
22B0; 22B1 # PRECEDES UNDER RELATION
22B1; 22B0 # SUCCEEDS UNDER RELATION
22B2; 22B3 # NORMAL SUBGROUP OF
22B3; 22B2 # CONTAINS AS NORMAL SUBGROUP
22B4; 22B5 # NORMAL SUBGROUP OF OR EQUAL TO
22B5; 22B4 # CONTAINS AS NORMAL SUBGROUP OR EQUAL TO
22B6; 22B7 # ORIGINAL OF
22B7; 22B6 # IMAGE OF
22B8; 27DC # MULTIMAP
22C9; 22CA # LEFT NORMAL FACTOR SEMIDIRECT PRODUCT
22CA; 22C9 # RIGHT NORMAL FACTOR SEMIDIRECT PRODUCT
22CB; 22CC # LEFT SEMIDIRECT PRODUCT
22CC; 22CB # RIGHT SEMIDIRECT PRODUCT
22CD; 2243 # REVERSED TILDE EQUALS
22D0; 22D1 # DOUBLE SUBSET
22D1; 22D0 # DOUBLE SUPERSET
22D6; 22D7 # LESS-THAN WITH DOT
22D7; 22D6 # GREATER-THAN WITH DOT
22D8; 22D9 # VERY MUCH LESS-THAN
22D9; 22D8 # VERY MUCH GREATER-THAN
22DA; 22DB # LESS-THAN EQUAL TO OR GREATER-THAN
22DB; 22DA # GREATER-THAN EQUAL TO OR LESS-THAN
22DC; 22DD # EQUAL TO OR LESS-THAN
22DD; 22DC # EQUAL TO OR GREATER-THAN
22DE; 22DF # EQUAL TO OR PRECEDES
22DF; 22DE # EQUAL TO OR SUCCEEDS
22E0; 22E1 # [BEST FIT] DOES NOT PRECEDE OR EQUAL
22E1; 22E0 # [BEST FIT] DOES NOT SUCCEED OR EQUAL
22E2; 22E3 # [BEST FIT] NOT SQUARE IMAGE OF OR EQUAL TO
22E3; 22E2 # [BEST FIT] NOT SQUARE ORIGINAL OF OR EQUAL TO
22E4; 22E5 # [BEST FIT] SQUARE IMAGE OF OR NOT EQUAL TO
22E5; 22E4 # [BEST FIT] SQUARE ORIGINAL OF OR NOT EQUAL TO
22E6; 22E7 # [BEST FIT] LESS-THAN BUT NOT EQUIVALENT TO
22E7; 22E6 # [BEST FIT] GREATER-THAN BUT NOT EQUIVALENT TO
22E8; 22E9 # [BEST FIT] PRECEDES BUT NOT EQUIVALENT TO
22E9; 22E8 # [BEST FIT] SUCCEEDS BUT NOT EQUIVALENT TO
22EA; 22EB # [BEST FIT] NOT NORMAL SUBGROUP OF
22EB; 22EA # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP
22EC; 22ED # [BEST FIT] NOT NORMAL SUBGROUP OF OR EQUAL TO
22ED; 22EC # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP OR EQUAL
22F0; 22F1 # UP RIGHT DIAGONAL ELLIPSIS
22F1; 22F0 # DOWN RIGHT DIAGONAL ELLIPSIS
22F2; 22FA # ELEMENT OF WITH LONG HORIZONTAL STROKE
22F3; 22FB # ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
22F4; 22FC # SMALL ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
22F6; 22FD # ELEMENT OF WITH OVERBAR
22F7; 22FE # SMALL ELEMENT OF WITH OVERBAR
22FA; 22F2 # CONTAINS WITH LONG HORIZONTAL STROKE
22FB; 22F3 # CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
22FC; 22F4 # SMALL CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE
22FD; 22F6 # CONTAINS WITH OVERBAR
22FE; 22F7 # SMALL CONTAINS WITH OVERBAR
2308; 2309 # LEFT CEILING
2309; 2308 # RIGHT CEILING
230A; 230B # LEFT FLOOR
230B; 230A # RIGHT FLOOR
2329; 232A # LEFT-POINTING ANGLE BRACKET
232A; 2329 # RIGHT-POINTING ANGLE BRACKET
2768; 2769 # MEDIUM LEFT PARENTHESIS ORNAMENT
2769; 2768 # MEDIUM RIGHT PARENTHESIS ORNAMENT
276A; 276B # MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
276B; 276A # MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT
276C; 276D # MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT
276D; 276C # MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT
276E; 276F # HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT
276F; 276E # HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT
2770; 2771 # HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT
2771; 2770 # HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT
2772; 2773 # LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT
2773; 2772 # LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT
2774; 2775 # MEDIUM LEFT CURLY BRACKET ORNAMENT
2775; 2774 # MEDIUM RIGHT CURLY BRACKET ORNAMENT
27C3; 27C4 # OPEN SUBSET
27C4; 27C3 # OPEN SUPERSET
27C5; 27C6 # LEFT S-SHAPED BAG DELIMITER
27C6; 27C5 # RIGHT S-SHAPED BAG DELIMITER
27C8; 27C9 # REVERSE SOLIDUS PRECEDING SUBSET
27C9; 27C8 # SUPERSET PRECEDING SOLIDUS
27CB; 27CD # MATHEMATICAL RISING DIAGONAL
27CD; 27CB # MATHEMATICAL FALLING DIAGONAL
27D5; 27D6 # LEFT OUTER JOIN
27D6; 27D5 # RIGHT OUTER JOIN
27DC; 22B8 # LEFT MULTIMAP
27DD; 27DE # LONG RIGHT TACK
27DE; 27DD # LONG LEFT TACK
27E2; 27E3 # WHITE CONCAVE-SIDED DIAMOND WITH LEFTWARDS TICK
27E3; 27E2 # WHITE CONCAVE-SIDED DIAMOND WITH RIGHTWARDS TICK
27E4; 27E5 # WHITE SQUARE WITH LEFTWARDS TICK
27E5; 27E4 # WHITE SQUARE WITH RIGHTWARDS TICK
27E6; 27E7 # MATHEMATICAL LEFT WHITE SQUARE BRACKET
27E7; 27E6 # MATHEMATICAL RIGHT WHITE SQUARE BRACKET
27E8; 27E9 # MATHEMATICAL LEFT ANGLE BRACKET
27E9; 27E8 # MATHEMATICAL RIGHT ANGLE BRACKET
27EA; 27EB # MATHEMATICAL LEFT DOUBLE ANGLE BRACKET
27EB; 27EA # MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET
27EC; 27ED # MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET
27ED; 27EC # MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET
27EE; 27EF # MATHEMATICAL LEFT FLATTENED PARENTHESIS
27EF; 27EE # MATHEMATICAL RIGHT FLATTENED PARENTHESIS
2983; 2984 # LEFT WHITE CURLY BRACKET
2984; 2983 # RIGHT WHITE CURLY BRACKET
2985; 2986 # LEFT WHITE PARENTHESIS
2986; 2985 # RIGHT WHITE PARENTHESIS
2987; 2988 # Z NOTATION LEFT IMAGE BRACKET
2988; 2987 # Z NOTATION RIGHT IMAGE BRACKET
2989; 298A # Z NOTATION LEFT BINDING BRACKET
298A; 2989 # Z NOTATION RIGHT BINDING BRACKET
298B; 298C # LEFT SQUARE BRACKET WITH UNDERBAR
298C; 298B # RIGHT SQUARE BRACKET WITH UNDERBAR
298D; 2990 # LEFT SQUARE BRACKET WITH TICK IN TOP CORNER
298E; 298F # RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
298F; 298E # LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
2990; 298D # RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER
2991; 2992 # LEFT ANGLE BRACKET WITH DOT
2992; 2991 # RIGHT ANGLE BRACKET WITH DOT
2993; 2994 # LEFT ARC LESS-THAN BRACKET
2994; 2993 # RIGHT ARC GREATER-THAN BRACKET
2995; 2996 # DOUBLE LEFT ARC GREATER-THAN BRACKET
2996; 2995 # DOUBLE RIGHT ARC LESS-THAN BRACKET
2997; 2998 # LEFT BLACK TORTOISE SHELL BRACKET
2998; 2997 # RIGHT BLACK TORTOISE SHELL BRACKET
299B; 2221 # MEASURED ANGLE OPENING LEFT
29A0; 2222 # SPHERICAL ANGLE OPENING LEFT
29A3; 2220 # REVERSED ANGLE
29A4; 29A5 # ANGLE WITH UNDERBAR
29A5; 29A4 # REVERSED ANGLE WITH UNDERBAR
29A8; 29A9 # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING UP AND RIGHT
29A9; 29A8 # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING UP AND LEFT
29AA; 29AB # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING DOWN AND RIGHT
29AB; 29AA # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING DOWN AND LEFT
29AC; 29AD # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING RIGHT AND UP
29AD; 29AC # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING LEFT AND UP
29AE; 29AF # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING RIGHT AND DOWN
29AF; 29AE # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING LEFT AND DOWN
29B8; 2298 # CIRCLED REVERSE SOLIDUS
29C0; 29C1 # CIRCLED LESS-THAN
29C1; 29C0 # CIRCLED GREATER-THAN
29C4; 29C5 # SQUARED RISING DIAGONAL SLASH
29C5; 29C4 # SQUARED FALLING DIAGONAL SLASH
29CF; 29D0 # LEFT TRIANGLE BESIDE VERTICAL BAR
29D0; 29CF # VERTICAL BAR BESIDE RIGHT TRIANGLE
29D1; 29D2 # BOWTIE WITH LEFT HALF BLACK
29D2; 29D1 # BOWTIE WITH RIGHT HALF BLACK
29D4; 29D5 # TIMES WITH LEFT HALF BLACK
29D5; 29D4 # TIMES WITH RIGHT HALF BLACK
29D8; 29D9 # LEFT WIGGLY FENCE
29D9; 29D8 # RIGHT WIGGLY FENCE
29DA; 29DB # LEFT DOUBLE WIGGLY FENCE
29DB; 29DA # RIGHT DOUBLE WIGGLY FENCE
29E8; 29E9 # DOWN-POINTING TRIANGLE WITH LEFT HALF BLACK
29E9; 29E8 # DOWN-POINTING TRIANGLE WITH RIGHT HALF BLACK
29F5; 2215 # REVERSE SOLIDUS OPERATOR
29F8; 29F9 # BIG SOLIDUS
29F9; 29F8 # BIG REVERSE SOLIDUS
29FC; 29FD # LEFT-POINTING CURVED ANGLE BRACKET
29FD; 29FC # RIGHT-POINTING CURVED ANGLE BRACKET
2A2B; 2A2C # MINUS SIGN WITH FALLING DOTS
2A2C; 2A2B # MINUS SIGN WITH RISING DOTS
2A2D; 2A2E # PLUS SIGN IN LEFT HALF CIRCLE
2A2E; 2A2D # PLUS SIGN IN RIGHT HALF CIRCLE
2A34; 2A35 # MULTIPLICATION SIGN IN LEFT HALF CIRCLE
2A35; 2A34 # MULTIPLICATION SIGN IN RIGHT HALF CIRCLE
2A3C; 2A3D # INTERIOR PRODUCT
2A3D; 2A3C # RIGHTHAND INTERIOR PRODUCT
2A64; 2A65 # Z NOTATION DOMAIN ANTIRESTRICTION
2A65; 2A64 # Z NOTATION RANGE ANTIRESTRICTION
2A79; 2A7A # LESS-THAN WITH CIRCLE INSIDE
2A7A; 2A79 # GREATER-THAN WITH CIRCLE INSIDE
2A7B; 2A7C # [BEST FIT] LESS-THAN WITH QUESTION MARK ABOVE
2A7C; 2A7B # [BEST FIT] GREATER-THAN WITH QUESTION MARK ABOVE
2A7D; 2A7E # LESS-THAN OR SLANTED EQUAL TO
2A7E; 2A7D # GREATER-THAN OR SLANTED EQUAL TO
2A7F; 2A80 # LESS-THAN OR SLANTED EQUAL TO WITH DOT INSIDE
2A80; 2A7F # GREATER-THAN OR SLANTED EQUAL TO WITH DOT INSIDE
2A81; 2A82 # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE
2A82; 2A81 # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE
2A83; 2A84 # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE RIGHT
2A84; 2A83 # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE LEFT
2A85; 2A86 # [BEST FIT] LESS-THAN OR APPROXIMATE
2A86; 2A85 # [BEST FIT] GREATER-THAN OR APPROXIMATE
2A87; 2A88 # [BEST FIT] LESS-THAN AND SINGLE-LINE NOT EQUAL TO
2A88; 2A87 # [BEST FIT] GREATER-THAN AND SINGLE-LINE NOT EQUAL TO
2A89; 2A8A # [BEST FIT] LESS-THAN AND NOT APPROXIMATE
2A8A; 2A89 # [BEST FIT] GREATER-THAN AND NOT APPROXIMATE
2A8B; 2A8C # LESS-THAN ABOVE DOUBLE-LINE EQUAL ABOVE GREATER-THAN
2A8C; 2A8B # GREATER-THAN ABOVE DOUBLE-LINE EQUAL ABOVE LESS-THAN
2A8D; 2A8E # [BEST FIT] LESS-THAN ABOVE SIMILAR OR EQUAL
2A8E; 2A8D # [BEST FIT] GREATER-THAN ABOVE SIMILAR OR EQUAL
2A8F; 2A90 # [BEST FIT] LESS-THAN ABOVE SIMILAR ABOVE GREATER-THAN
2A90; 2A8F # [BEST FIT] GREATER-THAN ABOVE SIMILAR ABOVE LESS-THAN
2A91; 2A92 # LESS-THAN ABOVE GREATER-THAN ABOVE DOUBLE-LINE EQUAL
2A92; 2A91 # GREATER-THAN ABOVE LESS-THAN ABOVE DOUBLE-LINE EQUAL
2A93; 2A94 # LESS-THAN ABOVE SLANTED EQUAL ABOVE GREATER-THAN ABOVE SLANTED EQUAL
2A94; 2A93 # GREATER-THAN ABOVE SLANTED EQUAL ABOVE LESS-THAN ABOVE SLANTED EQUAL
2A95; 2A96 # SLANTED EQUAL TO OR LESS-THAN
2A96; 2A95 # SLANTED EQUAL TO OR GREATER-THAN
2A97; 2A98 # SLANTED EQUAL TO OR LESS-THAN WITH DOT INSIDE
2A98; 2A97 # SLANTED EQUAL TO OR GREATER-THAN WITH DOT INSIDE
2A99; 2A9A # DOUBLE-LINE EQUAL TO OR LESS-THAN
2A9A; 2A99 # DOUBLE-LINE EQUAL TO OR GREATER-THAN
2A9B; 2A9C # DOUBLE-LINE SLANTED EQUAL TO OR LESS-THAN
2A9C; 2A9B # DOUBLE-LINE SLANTED EQUAL TO OR GREATER-THAN
2A9D; 2A9E # [BEST FIT] SIMILAR OR LESS-THAN
2A9E; 2A9D # [BEST FIT] SIMILAR OR GREATER-THAN
2A9F; 2AA0 # [BEST FIT] SIMILAR ABOVE LESS-THAN ABOVE EQUALS SIGN
2AA0; 2A9F # [BEST FIT] SIMILAR ABOVE GREATER-THAN ABOVE EQUALS SIGN
2AA1; 2AA2 # DOUBLE NESTED LESS-THAN
2AA2; 2AA1 # DOUBLE NESTED GREATER-THAN
2AA6; 2AA7 # LESS-THAN CLOSED BY CURVE
2AA7; 2AA6 # GREATER-THAN CLOSED BY CURVE
2AA8; 2AA9 # LESS-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL
2AA9; 2AA8 # GREATER-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL
2AAA; 2AAB # SMALLER THAN
2AAB; 2AAA # LARGER THAN
2AAC; 2AAD # SMALLER THAN OR EQUAL TO
2AAD; 2AAC # LARGER THAN OR EQUAL TO
2AAF; 2AB0 # PRECEDES ABOVE SINGLE-LINE EQUALS SIGN
2AB0; 2AAF # SUCCEEDS ABOVE SINGLE-LINE EQUALS SIGN
2AB1; 2AB2 # [BEST FIT] PRECEDES ABOVE SINGLE-LINE NOT EQUAL TO
2AB2; 2AB1 # [BEST FIT] SUCCEEDS ABOVE SINGLE-LINE NOT EQUAL TO
2AB3; 2AB4 # PRECEDES ABOVE EQUALS SIGN
2AB4; 2AB3 # SUCCEEDS ABOVE EQUALS SIGN
2AB5; 2AB6 # [BEST FIT] PRECEDES ABOVE NOT EQUAL TO
2AB6; 2AB5 # [BEST FIT] SUCCEEDS ABOVE NOT EQUAL TO
2AB7; 2AB8 # [BEST FIT] PRECEDES ABOVE ALMOST EQUAL TO
2AB8; 2AB7 # [BEST FIT] SUCCEEDS ABOVE ALMOST EQUAL TO
2AB9; 2ABA # [BEST FIT] PRECEDES ABOVE NOT ALMOST EQUAL TO
2ABA; 2AB9 # [BEST FIT] SUCCEEDS ABOVE NOT ALMOST EQUAL TO
2ABB; 2ABC # DOUBLE PRECEDES
2ABC; 2ABB # DOUBLE SUCCEEDS
2ABD; 2ABE # SUBSET WITH DOT
2ABE; 2ABD # SUPERSET WITH DOT
2ABF; 2AC0 # SUBSET WITH PLUS SIGN BELOW
2AC0; 2ABF # SUPERSET WITH PLUS SIGN BELOW
2AC1; 2AC2 # SUBSET WITH MULTIPLICATION SIGN BELOW
2AC2; 2AC1 # SUPERSET WITH MULTIPLICATION SIGN BELOW
2AC3; 2AC4 # SUBSET OF OR EQUAL TO WITH DOT ABOVE
2AC4; 2AC3 # SUPERSET OF OR EQUAL TO WITH DOT ABOVE
2AC5; 2AC6 # SUBSET OF ABOVE EQUALS SIGN
2AC6; 2AC5 # SUPERSET OF ABOVE EQUALS SIGN
2AC7; 2AC8 # [BEST FIT] SUBSET OF ABOVE TILDE OPERATOR
2AC8; 2AC7 # [BEST FIT] SUPERSET OF ABOVE TILDE OPERATOR
2AC9; 2ACA # [BEST FIT] SUBSET OF ABOVE ALMOST EQUAL TO
2ACA; 2AC9 # [BEST FIT] SUPERSET OF ABOVE ALMOST EQUAL TO
2ACB; 2ACC # [BEST FIT] SUBSET OF ABOVE NOT EQUAL TO
2ACC; 2ACB # [BEST FIT] SUPERSET OF ABOVE NOT EQUAL TO
2ACD; 2ACE # SQUARE LEFT OPEN BOX OPERATOR
2ACE; 2ACD # SQUARE RIGHT OPEN BOX OPERATOR
2ACF; 2AD0 # CLOSED SUBSET
2AD0; 2ACF # CLOSED SUPERSET
2AD1; 2AD2 # CLOSED SUBSET OR EQUAL TO
2AD2; 2AD1 # CLOSED SUPERSET OR EQUAL TO
2AD3; 2AD4 # SUBSET ABOVE SUPERSET
2AD4; 2AD3 # SUPERSET ABOVE SUBSET
2AD5; 2AD6 # SUBSET ABOVE SUBSET
2AD6; 2AD5 # SUPERSET ABOVE SUPERSET
2ADE; 22A6 # SHORT LEFT TACK
2AE3; 22A9 # DOUBLE VERTICAL BAR LEFT TURNSTILE
2AE4; 22A8 # VERTICAL BAR DOUBLE LEFT TURNSTILE
2AE5; 22AB # DOUBLE VERTICAL BAR DOUBLE LEFT TURNSTILE
2AEC; 2AED # DOUBLE STROKE NOT SIGN
2AED; 2AEC # REVERSED DOUBLE STROKE NOT SIGN
2AEE; 2224 # DOES NOT DIVIDE WITH REVERSED NEGATION SLASH
2AF7; 2AF8 # TRIPLE NESTED LESS-THAN
2AF8; 2AF7 # TRIPLE NESTED GREATER-THAN
2AF9; 2AFA # DOUBLE-LINE SLANTED LESS-THAN OR EQUAL TO
2AFA; 2AF9 # DOUBLE-LINE SLANTED GREATER-THAN OR EQUAL TO
2BFE; 221F # REVERSED RIGHT ANGLE
2E02; 2E03 # LEFT SUBSTITUTION BRACKET
2E03; 2E02 # RIGHT SUBSTITUTION BRACKET
2E04; 2E05 # LEFT DOTTED SUBSTITUTION BRACKET
2E05; 2E04 # RIGHT DOTTED SUBSTITUTION BRACKET
2E09; 2E0A # LEFT TRANSPOSITION BRACKET
2E0A; 2E09 # RIGHT TRANSPOSITION BRACKET
2E0C; 2E0D # LEFT RAISED OMISSION BRACKET
2E0D; 2E0C # RIGHT RAISED OMISSION BRACKET
2E1C; 2E1D # LEFT LOW PARAPHRASE BRACKET
2E1D; 2E1C # RIGHT LOW PARAPHRASE BRACKET
2E20; 2E21 # LEFT VERTICAL BAR WITH QUILL
2E21; 2E20 # RIGHT VERTICAL BAR WITH QUILL
2E22; 2E23 # TOP LEFT HALF BRACKET
2E23; 2E22 # TOP RIGHT HALF BRACKET
2E24; 2E25 # BOTTOM LEFT HALF BRACKET
2E25; 2E24 # BOTTOM RIGHT HALF BRACKET
2E26; 2E27 # LEFT SIDEWAYS U BRACKET
2E27; 2E26 # RIGHT SIDEWAYS U BRACKET
2E28; 2E29 # LEFT DOUBLE PARENTHESIS
2E29; 2E28 # RIGHT DOUBLE PARENTHESIS
2E55; 2E56 # LEFT SQUARE BRACKET WITH STROKE
2E56; 2E55 # RIGHT SQUARE BRACKET WITH STROKE
2E57; 2E58 # LEFT SQUARE BRACKET WITH DOUBLE STROKE
2E58; 2E57 # RIGHT SQUARE BRACKET WITH DOUBLE STROKE
2E59; 2E5A # TOP HALF LEFT PARENTHESIS
2E5A; 2E59 # TOP HALF RIGHT PARENTHESIS
2E5B; 2E5C # BOTTOM HALF LEFT PARENTHESIS
2E5C; 2E5B # BOTTOM HALF RIGHT PARENTHESIS
3008; 3009 # LEFT ANGLE BRACKET
3009; 3008 # RIGHT ANGLE BRACKET
300A; 300B # LEFT DOUBLE ANGLE BRACKET
300B; 300A # RIGHT DOUBLE ANGLE BRACKET
300C; 300D # [BEST FIT] LEFT CORNER BRACKET
300D; 300C # [BEST FIT] RIGHT CORNER BRACKET
300E; 300F # [BEST FIT] LEFT WHITE CORNER BRACKET
300F; 300E # [BEST FIT] RIGHT WHITE CORNER BRACKET
3010; 3011 # LEFT BLACK LENTICULAR BRACKET
3011; 3010 # RIGHT BLACK LENTICULAR BRACKET
3014; 3015 # LEFT TORTOISE SHELL BRACKET
3015; 3014 # RIGHT TORTOISE SHELL BRACKET
3016; 3017 # LEFT WHITE LENTICULAR BRACKET
3017; 3016 # RIGHT WHITE LENTICULAR BRACKET
3018; 3019 # LEFT WHITE TORTOISE SHELL BRACKET
3019; 3018 # RIGHT WHITE TORTOISE SHELL BRACKET
301A; 301B # LEFT WHITE SQUARE BRACKET
301B; 301A # RIGHT WHITE SQUARE BRACKET
FE59; FE5A # SMALL LEFT PARENTHESIS
FE5A; FE59 # SMALL RIGHT PARENTHESIS
FE5B; FE5C # SMALL LEFT CURLY BRACKET
FE5C; FE5B # SMALL RIGHT CURLY BRACKET
FE5D; FE5E # SMALL LEFT TORTOISE SHELL BRACKET
FE5E; FE5D # SMALL RIGHT TORTOISE SHELL BRACKET
FE64; FE65 # SMALL LESS-THAN SIGN
FE65; FE64 # SMALL GREATER-THAN SIGN
FF08; FF09 # FULLWIDTH LEFT PARENTHESIS
FF09; FF08 # FULLWIDTH RIGHT PARENTHESIS
FF1C; FF1E # FULLWIDTH LESS-THAN SIGN
FF1E; FF1C # FULLWIDTH GREATER-THAN SIGN
FF3B; FF3D # FULLWIDTH LEFT SQUARE BRACKET
FF3D; FF3B # FULLWIDTH RIGHT SQUARE BRACKET
FF5B; FF5D # FULLWIDTH LEFT CURLY BRACKET
FF5D; FF5B # FULLWIDTH RIGHT CURLY BRACKET
FF5F; FF60 # FULLWIDTH LEFT WHITE PARENTHESIS
FF60; FF5F # FULLWIDTH RIGHT WHITE PARENTHESIS
FF62; FF63 # [BEST FIT] HALFWIDTH LEFT CORNER BRACKET
FF63; FF62 # [BEST FIT] HALFWIDTH RIGHT CORNER BRACKET
# The following characters have no appropriate mirroring character.
# For these characters it is up to the rendering system
# to provide mirrored glyphs.
# 2140; DOUBLE-STRUCK N-ARY SUMMATION
# 2201; COMPLEMENT
# 2202; PARTIAL DIFFERENTIAL
# 2203; THERE EXISTS
# 2204; THERE DOES NOT EXIST
# 2211; N-ARY SUMMATION
# 2216; SET MINUS
# 221A; SQUARE ROOT
# 221B; CUBE ROOT
# 221C; FOURTH ROOT
# 221D; PROPORTIONAL TO
# 2226; NOT PARALLEL TO
# 222B; INTEGRAL
# 222C; DOUBLE INTEGRAL
# 222D; TRIPLE INTEGRAL
# 222E; CONTOUR INTEGRAL
# 222F; SURFACE INTEGRAL
# 2230; VOLUME INTEGRAL
# 2231; CLOCKWISE INTEGRAL
# 2232; CLOCKWISE CONTOUR INTEGRAL
# 2233; ANTICLOCKWISE CONTOUR INTEGRAL
# 2239; EXCESS
# 223B; HOMOTHETIC
# 223E; INVERTED LAZY S
# 223F; SINE WAVE
# 2240; WREATH PRODUCT
# 2241; NOT TILDE
# 2242; MINUS TILDE
# 2244; NOT ASYMPTOTICALLY EQUAL TO
# 2246; APPROXIMATELY BUT NOT ACTUALLY EQUAL TO
# 2247; NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO
# 2248; ALMOST EQUAL TO
# 2249; NOT ALMOST EQUAL TO
# 224A; ALMOST EQUAL OR EQUAL TO
# 224B; TRIPLE TILDE
# 225F; QUESTIONED EQUAL TO
# 2260; NOT EQUAL TO
# 2262; NOT IDENTICAL TO
# 228C; MULTISET
# 22A7; MODELS
# 22AA; TRIPLE VERTICAL BAR RIGHT TURNSTILE
# 22AC; DOES NOT PROVE
# 22AD; NOT TRUE
# 22AE; DOES NOT FORCE
# 22AF; NEGATED DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE
# 22BE; RIGHT ANGLE WITH ARC
# 22BF; RIGHT TRIANGLE
# 22F5; ELEMENT OF WITH DOT ABOVE
# 22F8; ELEMENT OF WITH UNDERBAR
# 22F9; ELEMENT OF WITH TWO HORIZONTAL STROKES
# 22FF; Z NOTATION BAG MEMBERSHIP
# 2320; TOP HALF INTEGRAL
# 2321; BOTTOM HALF INTEGRAL
# 27C0; THREE DIMENSIONAL ANGLE
# 27CC; LONG DIVISION
# 27D3; LOWER RIGHT CORNER WITH DOT
# 27D4; UPPER LEFT CORNER WITH DOT
# 299C; RIGHT ANGLE VARIANT WITH SQUARE
# 299D; MEASURED RIGHT ANGLE WITH DOT
# 299E; ANGLE WITH S INSIDE
# 299F; ACUTE ANGLE
# 29A2; TURNED ANGLE
# 29A6; OBLIQUE ANGLE OPENING UP
# 29A7; OBLIQUE ANGLE OPENING DOWN
# 29C2; CIRCLE WITH SMALL CIRCLE TO THE RIGHT
# 29C3; CIRCLE WITH TWO HORIZONTAL STROKES TO THE RIGHT
# 29C9; TWO JOINED SQUARES
# 29CE; RIGHT TRIANGLE ABOVE LEFT TRIANGLE
# 29DC; INCOMPLETE INFINITY
# 29E1; INCREASES AS
# 29E3; EQUALS SIGN AND SLANTED PARALLEL
# 29E4; EQUALS SIGN AND SLANTED PARALLEL WITH TILDE ABOVE
# 29E5; IDENTICAL TO AND SLANTED PARALLEL
# 29F4; RULE-DELAYED
# 29F6; SOLIDUS WITH OVERBAR
# 29F7; REVERSE SOLIDUS WITH HORIZONTAL STROKE
# 2A0A; MODULO TWO SUM
# 2A0B; SUMMATION WITH INTEGRAL
# 2A0C; QUADRUPLE INTEGRAL OPERATOR
# 2A0D; FINITE PART INTEGRAL
# 2A0E; INTEGRAL WITH DOUBLE STROKE
# 2A0F; INTEGRAL AVERAGE WITH SLASH
# 2A10; CIRCULATION FUNCTION
# 2A11; ANTICLOCKWISE INTEGRATION
# 2A12; LINE INTEGRATION WITH RECTANGULAR PATH AROUND POLE
# 2A13; LINE INTEGRATION WITH SEMICIRCULAR PATH AROUND POLE
# 2A14; LINE INTEGRATION NOT INCLUDING THE POLE
# 2A15; INTEGRAL AROUND A POINT OPERATOR
# 2A16; QUATERNION INTEGRAL OPERATOR
# 2A17; INTEGRAL WITH LEFTWARDS ARROW WITH HOOK
# 2A18; INTEGRAL WITH TIMES SIGN
# 2A19; INTEGRAL WITH INTERSECTION
# 2A1A; INTEGRAL WITH UNION
# 2A1B; INTEGRAL WITH OVERBAR
# 2A1C; INTEGRAL WITH UNDERBAR
# 2A1E; LARGE LEFT TRIANGLE OPERATOR
# 2A1F; Z NOTATION SCHEMA COMPOSITION
# 2A20; Z NOTATION SCHEMA PIPING
# 2A21; Z NOTATION SCHEMA PROJECTION
# 2A24; PLUS SIGN WITH TILDE ABOVE
# 2A26; PLUS SIGN WITH TILDE BELOW
# 2A29; MINUS SIGN WITH COMMA ABOVE
# 2A3E; Z NOTATION RELATIONAL COMPOSITION
# 2A57; SLOPING LARGE OR
# 2A58; SLOPING LARGE AND
# 2A6A; TILDE OPERATOR WITH DOT ABOVE
# 2A6B; TILDE OPERATOR WITH RISING DOTS
# 2A6C; SIMILAR MINUS SIMILAR
# 2A6D; CONGRUENT WITH DOT ABOVE
# 2A6F; ALMOST EQUAL TO WITH CIRCUMFLEX ACCENT
# 2A70; APPROXIMATELY EQUAL OR EQUAL TO
# 2A73; EQUALS SIGN ABOVE TILDE OPERATOR
# 2A74; DOUBLE COLON EQUAL
# 2AA3; DOUBLE NESTED LESS-THAN WITH UNDERBAR
# 2ADC; FORKING
# 2AE2; VERTICAL BAR TRIPLE RIGHT TURNSTILE
# 2AE6; LONG DASH FROM LEFT MEMBER OF DOUBLE VERTICAL
# 2AF3; PARALLEL WITH TILDE OPERATOR
# 2AFB; TRIPLE SOLIDUS BINARY RELATION
# 2AFD; DOUBLE SOLIDUS OPERATOR
# 1D6DB; MATHEMATICAL BOLD PARTIAL DIFFERENTIAL
# 1D715; MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL
# 1D74F; MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL
# 1D789; MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL
# 1D7C3; MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL
# EOF

View File

@ -0,0 +1,212 @@
# PropertyAliases-14.0.0.txt
# Date: 2021-03-08, 19:35:48 GMT
# © 2021 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# For documentation, see http://www.unicode.org/reports/tr44/
#
# This file contains aliases for properties used in the UCD.
# These names can be used for XML formats of UCD data, for regular-expression
# property tests, and other programmatic textual descriptions of Unicode data.
#
# The names may be translated in appropriate environments, and additional
# aliases may be useful.
#
# FORMAT
#
# Each line has two or more fields, separated by semicolons.
#
# First Field: The first field is the short name for the property.
# It is typically an abbreviation, but in a number of cases it is simply
# a duplicate of the "long name" in the second field.
# For Unihan database tags, the short name is actually a longer string than
# the tag specified in the second field.
#
# Second Field: The second field is the long name for the property,
# typically the formal name used in documentation about the property.
#
# The above are the preferred aliases. Other aliases may be listed in additional fields.
#
# Loose matching should be applied to all property names and property values, with
# the exception of String Property values. With loose matching of property names and
# values, the case distinctions, whitespace, and '_' are ignored. For Numeric Property
# values, numeric equivalencies are applied: thus "01.00" is equivalent to "1".
#
# NOTE: Property value names are NOT unique across properties. For example:
#
# AL means Arabic Letter for the Bidi_Class property, and
# AL means Above_Left for the Combining_Class property, and
# AL means Alphabetic for the Line_Break property.
#
# In addition, some property names may be the same as some property value names.
# For example:
#
# sc means the Script property, and
# Sc means the General_Category property value Currency_Symbol (Sc)
#
# The combination of property value and property name is, however, unique.
#
# For more information, see UAX #44, Unicode Character Database, and
# UTS #18, Unicode Regular Expressions.
# ================================================
# ================================================
# Numeric Properties
# ================================================
cjkAccountingNumeric ; kAccountingNumeric
cjkOtherNumeric ; kOtherNumeric
cjkPrimaryNumeric ; kPrimaryNumeric
nv ; Numeric_Value
# ================================================
# String Properties
# ================================================
cf ; Case_Folding
cjkCompatibilityVariant ; kCompatibilityVariant
dm ; Decomposition_Mapping
FC_NFKC ; FC_NFKC_Closure
lc ; Lowercase_Mapping
NFKC_CF ; NFKC_Casefold
scf ; Simple_Case_Folding ; sfc
slc ; Simple_Lowercase_Mapping
stc ; Simple_Titlecase_Mapping
suc ; Simple_Uppercase_Mapping
tc ; Titlecase_Mapping
uc ; Uppercase_Mapping
# ================================================
# Miscellaneous Properties
# ================================================
bmg ; Bidi_Mirroring_Glyph
bpb ; Bidi_Paired_Bracket
cjkIICore ; kIICore
cjkIRG_GSource ; kIRG_GSource
cjkIRG_HSource ; kIRG_HSource
cjkIRG_JSource ; kIRG_JSource
cjkIRG_KPSource ; kIRG_KPSource
cjkIRG_KSource ; kIRG_KSource
cjkIRG_MSource ; kIRG_MSource
cjkIRG_SSource ; kIRG_SSource
cjkIRG_TSource ; kIRG_TSource
cjkIRG_UKSource ; kIRG_UKSource
cjkIRG_USource ; kIRG_USource
cjkIRG_VSource ; kIRG_VSource
cjkRSUnicode ; kRSUnicode ; Unicode_Radical_Stroke; URS
EqUIdeo ; Equivalent_Unified_Ideograph
isc ; ISO_Comment
JSN ; Jamo_Short_Name
na ; Name
na1 ; Unicode_1_Name
Name_Alias ; Name_Alias
scx ; Script_Extensions
# ================================================
# Catalog Properties
# ================================================
age ; Age
blk ; Block
sc ; Script
# ================================================
# Enumerated Properties
# ================================================
bc ; Bidi_Class
bpt ; Bidi_Paired_Bracket_Type
ccc ; Canonical_Combining_Class
dt ; Decomposition_Type
ea ; East_Asian_Width
gc ; General_Category
GCB ; Grapheme_Cluster_Break
hst ; Hangul_Syllable_Type
InPC ; Indic_Positional_Category
InSC ; Indic_Syllabic_Category
jg ; Joining_Group
jt ; Joining_Type
lb ; Line_Break
NFC_QC ; NFC_Quick_Check
NFD_QC ; NFD_Quick_Check
NFKC_QC ; NFKC_Quick_Check
NFKD_QC ; NFKD_Quick_Check
nt ; Numeric_Type
SB ; Sentence_Break
vo ; Vertical_Orientation
WB ; Word_Break
# ================================================
# Binary Properties
# ================================================
AHex ; ASCII_Hex_Digit
Alpha ; Alphabetic
Bidi_C ; Bidi_Control
Bidi_M ; Bidi_Mirrored
Cased ; Cased
CE ; Composition_Exclusion
CI ; Case_Ignorable
Comp_Ex ; Full_Composition_Exclusion
CWCF ; Changes_When_Casefolded
CWCM ; Changes_When_Casemapped
CWKCF ; Changes_When_NFKC_Casefolded
CWL ; Changes_When_Lowercased
CWT ; Changes_When_Titlecased
CWU ; Changes_When_Uppercased
Dash ; Dash
Dep ; Deprecated
DI ; Default_Ignorable_Code_Point
Dia ; Diacritic
EBase ; Emoji_Modifier_Base
EComp ; Emoji_Component
EMod ; Emoji_Modifier
Emoji ; Emoji
EPres ; Emoji_Presentation
Ext ; Extender
ExtPict ; Extended_Pictographic
Gr_Base ; Grapheme_Base
Gr_Ext ; Grapheme_Extend
Gr_Link ; Grapheme_Link
Hex ; Hex_Digit
Hyphen ; Hyphen
IDC ; ID_Continue
Ideo ; Ideographic
IDS ; ID_Start
IDSB ; IDS_Binary_Operator
IDST ; IDS_Trinary_Operator
Join_C ; Join_Control
LOE ; Logical_Order_Exception
Lower ; Lowercase
Math ; Math
NChar ; Noncharacter_Code_Point
OAlpha ; Other_Alphabetic
ODI ; Other_Default_Ignorable_Code_Point
OGr_Ext ; Other_Grapheme_Extend
OIDC ; Other_ID_Continue
OIDS ; Other_ID_Start
OLower ; Other_Lowercase
OMath ; Other_Math
OUpper ; Other_Uppercase
Pat_Syn ; Pattern_Syntax
Pat_WS ; Pattern_White_Space
PCM ; Prepended_Concatenation_Mark
QMark ; Quotation_Mark
Radical ; Radical
RI ; Regional_Indicator
SD ; Soft_Dotted
STerm ; Sentence_Terminal
Term ; Terminal_Punctuation
UIdeo ; Unified_Ideograph
Upper ; Uppercase
VS ; Variation_Selector
WSpace ; White_Space ; space
XIDC ; XID_Continue
XIDS ; XID_Start
XO_NFC ; Expands_On_NFC
XO_NFD ; Expands_On_NFD
XO_NFKC ; Expands_On_NFKC
XO_NFKD ; Expands_On_NFKD
# ================================================
# Total: 129
# EOF

View File

@ -0,0 +1,628 @@
# ScriptExtensions-14.0.0.txt
# Date: 2021-06-04, 02:19:38 GMT
# © 2021 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# For documentation, see http://www.unicode.org/reports/tr44/
#
# The Script_Extensions property indicates which characters are commonly used
# with more than one script, but with a limited number of scripts.
# For each code point, there is one or more property values. Each such value is a Script property value.
# For more information, see:
# UAX #24, Unicode Script Property: https://www.unicode.org/reports/tr24/
# Especially the sections:
# https://www.unicode.org/reports/tr24/#Assignment_Script_Values
# https://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
#
# Each Script_Extensions value in this file consists of a set
# of one or more abbreviated Script property values. The ordering of the
# values in that set is not material, but for stability in presentation
# it is given here as alphabetical.
#
# The Script_Extensions values are presented in sorted order in the file.
# They are sorted first by the number of Script property values in their sets,
# and then alphabetically by first differing Script property value.
#
# Following each distinct Script_Extensions value is the list of code
# points associated with that value, listed in code point order.
#
# All code points not explicitly listed for Script_Extensions
# have as their value the corresponding Script property value
#
# @missing: 0000..10FFFF; <script>
# ================================================
# Property: Script_Extensions
# ================================================
# Script_Extensions=Beng
1CF7 ; Beng # Mc VEDIC SIGN ATIKRAMA
# Total code points: 1
# ================================================
# Script_Extensions=Deva
1CD1 ; Deva # Mn VEDIC TONE SHARA
1CD4 ; Deva # Mn VEDIC SIGN YAJURVEDIC MIDLINE SVARITA
1CDB ; Deva # Mn VEDIC TONE TRIPLE SVARITA
1CDE..1CDF ; Deva # Mn [2] VEDIC TONE TWO DOTS BELOW..VEDIC TONE THREE DOTS BELOW
1CE2..1CE8 ; Deva # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
1CEB..1CEC ; Deva # Lo [2] VEDIC SIGN ANUSVARA VAMAGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL
1CEE..1CF1 ; Deva # Lo [4] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ANUSVARA UBHAYATO MUKHA
# Total code points: 18
# ================================================
# Script_Extensions=Dupl
1BCA0..1BCA3 ; Dupl # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
# Total code points: 4
# ================================================
# Script_Extensions=Grek
0342 ; Grek # Mn COMBINING GREEK PERISPOMENI
0345 ; Grek # Mn COMBINING GREEK YPOGEGRAMMENI
1DC0..1DC1 ; Grek # Mn [2] COMBINING DOTTED GRAVE ACCENT..COMBINING DOTTED ACUTE ACCENT
# Total code points: 4
# ================================================
# Script_Extensions=Hani
3006 ; Hani # Lo IDEOGRAPHIC CLOSING MARK
303E..303F ; Hani # So [2] IDEOGRAPHIC VARIATION INDICATOR..IDEOGRAPHIC HALF FILL SPACE
3190..3191 ; Hani # So [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK
3192..3195 ; Hani # No [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK
3196..319F ; Hani # So [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK
31C0..31E3 ; Hani # So [36] CJK STROKE T..CJK STROKE Q
3220..3229 ; Hani # No [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN
322A..3247 ; Hani # So [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO
3280..3289 ; Hani # No [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN
328A..32B0 ; Hani # So [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT
32C0..32CB ; Hani # So [12] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DECEMBER
32FF ; Hani # So SQUARE ERA NAME REIWA
3358..3370 ; Hani # So [25] IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR ZERO..IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR TWENTY-FOUR
337B..337F ; Hani # So [5] SQUARE ERA NAME HEISEI..SQUARE CORPORATION
33E0..33FE ; Hani # So [31] IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY ONE..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
1D360..1D371 ; Hani # No [18] COUNTING ROD UNIT DIGIT ONE..COUNTING ROD TENS DIGIT NINE
1F250..1F251 ; Hani # So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
# Total code points: 238
# ================================================
# Script_Extensions=Latn
0363..036F ; Latn # Mn [13] COMBINING LATIN SMALL LETTER A..COMBINING LATIN SMALL LETTER X
# Total code points: 13
# ================================================
# Script_Extensions=Nand
1CFA ; Nand # Lo VEDIC SIGN DOUBLE ANUSVARA ANTARGOMUKHA
# Total code points: 1
# ================================================
# Script_Extensions=Syrc
1DFA ; Syrc # Mn COMBINING DOT BELOW LEFT
# Total code points: 1
# ================================================
# Script_Extensions=Arab Copt
102E0 ; Arab Copt # Mn COPTIC EPACT THOUSANDS MARK
102E1..102FB ; Arab Copt # No [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED
# Total code points: 28
# ================================================
# Script_Extensions=Arab Rohg
06D4 ; Arab Rohg # Po ARABIC FULL STOP
# Total code points: 1
# ================================================
# Script_Extensions=Arab Nkoo
FD3E ; Arab Nkoo # Pe ORNATE LEFT PARENTHESIS
FD3F ; Arab Nkoo # Ps ORNATE RIGHT PARENTHESIS
# Total code points: 2
# ================================================
# Script_Extensions=Arab Syrc
064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
0670 ; Arab Syrc # Mn ARABIC LETTER SUPERSCRIPT ALEF
# Total code points: 12
# ================================================
# Script_Extensions=Arab Thaa
FDF2 ; Arab Thaa # Lo ARABIC LIGATURE ALLAH ISOLATED FORM
FDFD ; Arab Thaa # So ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
# Total code points: 2
# ================================================
# Script_Extensions=Beng Deva
1CD5..1CD6 ; Beng Deva # Mn [2] VEDIC TONE YAJURVEDIC AGGRAVATED INDEPENDENT SVARITA..VEDIC TONE YAJURVEDIC INDEPENDENT SVARITA
1CD8 ; Beng Deva # Mn VEDIC TONE CANDRA BELOW
1CE1 ; Beng Deva # Mc VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA
1CEA ; Beng Deva # Lo VEDIC SIGN ANUSVARA BAHIRGOMUKHA
1CED ; Beng Deva # Mn VEDIC SIGN TIRYAK
1CF5..1CF6 ; Beng Deva # Lo [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA
A8F1 ; Beng Deva # Mn COMBINING DEVANAGARI SIGN AVAGRAHA
# Total code points: 9
# ================================================
# Script_Extensions=Bopo Hani
302A..302D ; Bopo Hani # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK
# Total code points: 4
# ================================================
# Script_Extensions=Bugi Java
A9CF ; Bugi Java # Lm JAVANESE PANGRANGKEP
# Total code points: 1
# ================================================
# Script_Extensions=Cprt Linb
10102 ; Cprt Linb # Po AEGEAN CHECK MARK
10137..1013F ; Cprt Linb # So [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT
# Total code points: 10
# ================================================
# Script_Extensions=Cyrl Glag
0484 ; Cyrl Glag # Mn COMBINING CYRILLIC PALATALIZATION
0487 ; Cyrl Glag # Mn COMBINING CYRILLIC POKRYTIE
2E43 ; Cyrl Glag # Po DASH WITH LEFT UPTURN
A66F ; Cyrl Glag # Mn COMBINING CYRILLIC VZMET
# Total code points: 4
# ================================================
# Script_Extensions=Cyrl Latn
0485..0486 ; Cyrl Latn # Mn [2] COMBINING CYRILLIC DASIA PNEUMATA..COMBINING CYRILLIC PSILI PNEUMATA
# Total code points: 2
# ================================================
# Script_Extensions=Cyrl Perm
0483 ; Cyrl Perm # Mn COMBINING CYRILLIC TITLO
# Total code points: 1
# ================================================
# Script_Extensions=Cyrl Syrc
1DF8 ; Cyrl Syrc # Mn COMBINING DOT ABOVE LEFT
# Total code points: 1
# ================================================
# Script_Extensions=Deva Gran
1CD3 ; Deva Gran # Po VEDIC SIGN NIHSHVASA
1CF3 ; Deva Gran # Lo VEDIC SIGN ROTATED ARDHAVISARGA
1CF8..1CF9 ; Deva Gran # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
# Total code points: 4
# ================================================
# Script_Extensions=Deva Nand
1CE9 ; Deva Nand # Lo VEDIC SIGN ANUSVARA ANTARGOMUKHA
# Total code points: 1
# ================================================
# Script_Extensions=Deva Shrd
1CD7 ; Deva Shrd # Mn VEDIC TONE YAJURVEDIC KATHAKA INDEPENDENT SVARITA
1CD9 ; Deva Shrd # Mn VEDIC TONE YAJURVEDIC KATHAKA INDEPENDENT SVARITA SCHROEDER
1CDC..1CDD ; Deva Shrd # Mn [2] VEDIC TONE KATHAKA ANUDATTA..VEDIC TONE DOT BELOW
1CE0 ; Deva Shrd # Mn VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA
# Total code points: 5
# ================================================
# Script_Extensions=Deva Taml
A8F3 ; Deva Taml # Lo DEVANAGARI SIGN CANDRABINDU VIRAMA
# Total code points: 1
# ================================================
# Script_Extensions=Geor Latn
10FB ; Geor Latn # Po GEORGIAN PARAGRAPH SEPARATOR
# Total code points: 1
# ================================================
# Script_Extensions=Gran Taml
0BE6..0BEF ; Gran Taml # Nd [10] TAMIL DIGIT ZERO..TAMIL DIGIT NINE
0BF0..0BF2 ; Gran Taml # No [3] TAMIL NUMBER TEN..TAMIL NUMBER ONE THOUSAND
0BF3 ; Gran Taml # So TAMIL DAY SIGN
11301 ; Gran Taml # Mn GRANTHA SIGN CANDRABINDU
11303 ; Gran Taml # Mc GRANTHA SIGN VISARGA
1133B..1133C ; Gran Taml # Mn [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA
11FD0..11FD1 ; Gran Taml # No [2] TAMIL FRACTION ONE QUARTER..TAMIL FRACTION ONE HALF-1
11FD3 ; Gran Taml # No TAMIL FRACTION THREE QUARTERS
# Total code points: 21
# ================================================
# Script_Extensions=Gujr Khoj
0AE6..0AEF ; Gujr Khoj # Nd [10] GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Guru Mult
0A66..0A6F ; Guru Mult # Nd [10] GURMUKHI DIGIT ZERO..GURMUKHI DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Hani Latn
A700..A707 ; Hani Latn # Sk [8] MODIFIER LETTER CHINESE TONE YIN PING..MODIFIER LETTER CHINESE TONE YANG RU
# Total code points: 8
# ================================================
# Script_Extensions=Hira Kana
3031..3035 ; Hira Kana # Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF
3099..309A ; Hira Kana # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
309B..309C ; Hira Kana # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
30A0 ; Hira Kana # Pd KATAKANA-HIRAGANA DOUBLE HYPHEN
30FC ; Hira Kana # Lm KATAKANA-HIRAGANA PROLONGED SOUND MARK
FF70 ; Hira Kana # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
FF9E..FF9F ; Hira Kana # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
# Total code points: 14
# ================================================
# Script_Extensions=Knda Nand
0CE6..0CEF ; Knda Nand # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Latn Mong
202F ; Latn Mong # Zs NARROW NO-BREAK SPACE
# Total code points: 1
# ================================================
# Script_Extensions=Mani Ougr
10AF2 ; Mani Ougr # Po MANICHAEAN PUNCTUATION DOUBLE DOT WITHIN DOT
# Total code points: 1
# ================================================
# Script_Extensions=Mong Phag
1802..1803 ; Mong Phag # Po [2] MONGOLIAN COMMA..MONGOLIAN FULL STOP
1805 ; Mong Phag # Po MONGOLIAN FOUR DOTS
# Total code points: 3
# ================================================
# Script_Extensions=Arab Syrc Thaa
061C ; Arab Syrc Thaa # Cf ARABIC LETTER MARK
# Total code points: 1
# ================================================
# Script_Extensions=Arab Thaa Yezi
0660..0669 ; Arab Thaa Yezi # Nd [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Beng Cakm Sylo
09E6..09EF ; Beng Cakm Sylo # Nd [10] BENGALI DIGIT ZERO..BENGALI DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Cakm Mymr Tale
1040..1049 ; Cakm Mymr Tale # Nd [10] MYANMAR DIGIT ZERO..MYANMAR DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Cpmn Cprt Linb
10100..10101 ; Cpmn Cprt Linb # Po [2] AEGEAN WORD SEPARATOR LINE..AEGEAN WORD SEPARATOR DOT
# Total code points: 2
# ================================================
# Script_Extensions=Cprt Lina Linb
10107..10133 ; Cprt Lina Linb # No [45] AEGEAN NUMBER ONE..AEGEAN NUMBER NINETY THOUSAND
# Total code points: 45
# ================================================
# Script_Extensions=Deva Gran Knda
1CF4 ; Deva Gran Knda # Mn VEDIC TONE CANDRA ABOVE
# Total code points: 1
# ================================================
# Script_Extensions=Deva Gran Latn
20F0 ; Deva Gran Latn # Mn COMBINING ASTERISK ABOVE
# Total code points: 1
# ================================================
# Script_Extensions=Hani Hira Kana
303C ; Hani Hira Kana # Lo MASU MARK
303D ; Hani Hira Kana # Po PART ALTERNATION MARK
# Total code points: 2
# ================================================
# Script_Extensions=Kali Latn Mymr
A92E ; Kali Latn Mymr # Po KAYAH LI SIGN CWI
# Total code points: 1
# ================================================
# Script_Extensions=Beng Deva Gran Knda
1CD0 ; Beng Deva Gran Knda # Mn VEDIC TONE KARSHANA
1CD2 ; Beng Deva Gran Knda # Mn VEDIC TONE PRENKHA
# Total code points: 2
# ================================================
# Script_Extensions=Buhd Hano Tagb Tglg
1735..1736 ; Buhd Hano Tagb Tglg # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION
# Total code points: 2
# ================================================
# Script_Extensions=Deva Dogr Kthi Mahj
0966..096F ; Deva Dogr Kthi Mahj # Nd [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Bopo Hang Hani Hira Kana
3003 ; Bopo Hang Hani Hira Kana # Po DITTO MARK
3013 ; Bopo Hang Hani Hira Kana # So GETA MARK
301C ; Bopo Hang Hani Hira Kana # Pd WAVE DASH
301D ; Bopo Hang Hani Hira Kana # Ps REVERSED DOUBLE PRIME QUOTATION MARK
301E..301F ; Bopo Hang Hani Hira Kana # Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK
3030 ; Bopo Hang Hani Hira Kana # Pd WAVY DASH
3037 ; Bopo Hang Hani Hira Kana # So IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL
FE45..FE46 ; Bopo Hang Hani Hira Kana # Po [2] SESAME DOT..WHITE SESAME DOT
# Total code points: 10
# ================================================
# Script_Extensions=Arab Nkoo Rohg Syrc Thaa Yezi
060C ; Arab Nkoo Rohg Syrc Thaa Yezi # Po ARABIC COMMA
061B ; Arab Nkoo Rohg Syrc Thaa Yezi # Po ARABIC SEMICOLON
# Total code points: 2
# ================================================
# Script_Extensions=Bopo Hang Hani Hira Kana Yiii
3001..3002 ; Bopo Hang Hani Hira Kana Yiii # Po [2] IDEOGRAPHIC COMMA..IDEOGRAPHIC FULL STOP
3008 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT ANGLE BRACKET
3009 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT ANGLE BRACKET
300A ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT DOUBLE ANGLE BRACKET
300B ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT DOUBLE ANGLE BRACKET
300C ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT CORNER BRACKET
300D ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT CORNER BRACKET
300E ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE CORNER BRACKET
300F ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE CORNER BRACKET
3010 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT BLACK LENTICULAR BRACKET
3011 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT BLACK LENTICULAR BRACKET
3014 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT TORTOISE SHELL BRACKET
3015 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT TORTOISE SHELL BRACKET
3016 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE LENTICULAR BRACKET
3017 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE LENTICULAR BRACKET
3018 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE TORTOISE SHELL BRACKET
3019 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE TORTOISE SHELL BRACKET
301A ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE SQUARE BRACKET
301B ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE SQUARE BRACKET
30FB ; Bopo Hang Hani Hira Kana Yiii # Po KATAKANA MIDDLE DOT
FF61 ; Bopo Hang Hani Hira Kana Yiii # Po HALFWIDTH IDEOGRAPHIC FULL STOP
FF62 ; Bopo Hang Hani Hira Kana Yiii # Ps HALFWIDTH LEFT CORNER BRACKET
FF63 ; Bopo Hang Hani Hira Kana Yiii # Pe HALFWIDTH RIGHT CORNER BRACKET
FF64..FF65 ; Bopo Hang Hani Hira Kana Yiii # Po [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDLE DOT
# Total code points: 26
# ================================================
# Script_Extensions=Deva Knda Mlym Orya Taml Telu
1CDA ; Deva Knda Mlym Orya Taml Telu # Mn VEDIC TONE DOUBLE SVARITA
# Total code points: 1
# ================================================
# Script_Extensions=Adlm Arab Nkoo Rohg Syrc Thaa Yezi
061F ; Adlm Arab Nkoo Rohg Syrc Thaa Yezi # Po ARABIC QUESTION MARK
# Total code points: 1
# ================================================
# Script_Extensions=Beng Deva Gran Knda Nand Orya Telu Tirh
1CF2 ; Beng Deva Gran Knda Nand Orya Telu Tirh # Lo VEDIC SIGN ARDHAVISARGA
# Total code points: 1
# ================================================
# Script_Extensions=Adlm Arab Mand Mani Ougr Phlp Rohg Sogd Syrc
0640 ; Adlm Arab Mand Mani Ougr Phlp Rohg Sogd Syrc # Lm ARABIC TATWEEL
# Total code points: 1
# ================================================
# Script_Extensions=Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh
A836..A837 ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # So [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK
A838 ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # Sc NORTH INDIC RUPEE MARK
A839 ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # So NORTH INDIC QUANTITY MARK
# Total code points: 4
# ================================================
# Script_Extensions=Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh
0952 ; Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh # Mn DEVANAGARI STRESS SIGN ANUDATTA
# Total code points: 1
# ================================================
# Script_Extensions=Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh
0951 ; Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh # Mn DEVANAGARI STRESS SIGN UDATTA
# Total code points: 1
# ================================================
# Script_Extensions=Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Nand Sind Takr Tirh
A833..A835 ; Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Nand Sind Takr Tirh # No [3] NORTH INDIC FRACTION ONE SIXTEENTH..NORTH INDIC FRACTION THREE SIXTEENTHS
# Total code points: 3
# ================================================
# Script_Extensions=Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Nand Sind Takr Tirh
A830..A832 ; Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Nand Sind Takr Tirh # No [3] NORTH INDIC FRACTION ONE QUARTER..NORTH INDIC FRACTION THREE QUARTERS
# Total code points: 3
# ================================================
# Script_Extensions=Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh
0964 ; Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh # Po DEVANAGARI DANDA
# Total code points: 1
# ================================================
# Script_Extensions=Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Limb Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh
0965 ; Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Limb Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh # Po DEVANAGARI DOUBLE DANDA
# Total code points: 1
# EOF

View File

@ -0,0 +1,141 @@
const unsigned char _pcre_default_tables[] = {
0,1,2,3,4,5,6,7,
8,9,10,11,12,13,14,15,
16,17,18,19,20,21,22,23,
24,25,26,27,28,29,30,31,
32,33,34,35,36,37,38,39,
40,41,42,43,44,45,46,47,
48,49,50,51,52,53,54,55,
56,57,58,59,60,61,62,63,
64,97,98,99,100,101,102,103,
104,105,106,107,108,109,110,111,
112,113,114,115,116,117,118,119,
120,121,122,91,92,93,94,95,
96,97,98,99,100,101,102,103,
104,105,106,107,108,109,110,111,
112,113,114,115,116,117,118,119,
120,121,122,123,124,125,126,127,
128,129,130,131,132,133,134,135,
136,137,138,139,140,141,142,143,
144,145,146,147,148,149,150,151,
152,153,154,155,156,157,158,159,
160,161,162,163,164,165,166,167,
168,169,170,171,172,173,174,175,
176,177,178,179,180,181,182,183,
184,185,186,187,188,189,190,191,
224,225,226,227,228,229,230,231,
232,233,234,235,236,237,238,239,
240,241,242,243,244,245,246,215,
248,249,250,251,252,253,254,223,
224,225,226,227,228,229,230,231,
232,233,234,235,236,237,238,239,
240,241,242,243,244,245,246,247,
248,249,250,251,252,253,254,255,
0,1,2,3,4,5,6,7,
8,9,10,11,12,13,14,15,
16,17,18,19,20,21,22,23,
24,25,26,27,28,29,30,31,
32,33,34,35,36,37,38,39,
40,41,42,43,44,45,46,47,
48,49,50,51,52,53,54,55,
56,57,58,59,60,61,62,63,
64,97,98,99,100,101,102,103,
104,105,106,107,108,109,110,111,
112,113,114,115,116,117,118,119,
120,121,122,91,92,93,94,95,
96,65,66,67,68,69,70,71,
72,73,74,75,76,77,78,79,
80,81,82,83,84,85,86,87,
88,89,90,123,124,125,126,127,
128,129,130,131,132,133,134,135,
136,137,138,139,140,141,142,143,
144,145,146,147,148,149,150,151,
152,153,154,155,156,157,158,159,
160,161,162,163,164,165,166,167,
168,169,170,171,172,173,174,175,
176,177,178,179,180,181,182,183,
184,185,186,187,188,189,190,191,
224,225,226,227,228,229,230,231,
232,233,234,235,236,237,238,239,
240,241,242,243,244,245,246,215,
248,249,250,251,252,253,254,223,
192,193,194,195,196,197,198,199,
200,201,202,203,204,205,206,207,
208,209,210,211,212,213,214,247,
216,217,218,219,220,221,222,255,
0,62,0,0,1,0,0,0,
0,0,0,0,0,0,0,0,
32,0,0,0,1,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,255,3,
126,0,0,0,126,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,255,3,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,12,2,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
254,255,255,7,0,0,0,0,
0,0,0,0,0,0,0,0,
255,255,127,127,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,254,255,255,7,
0,0,0,0,0,4,32,4,
0,0,0,128,255,255,127,255,
0,0,0,0,0,0,255,3,
254,255,255,135,254,255,255,7,
0,0,0,0,0,4,44,6,
255,255,127,255,255,255,127,255,
0,0,0,0,254,255,255,255,
255,255,255,255,255,255,255,127,
0,0,0,0,254,255,255,255,
255,255,255,255,255,255,255,255,
0,2,0,0,255,255,255,255,
255,255,255,255,255,255,255,127,
0,0,0,0,255,255,255,255,
255,255,255,255,255,255,255,255,
0,0,0,0,254,255,0,252,
1,0,0,248,1,0,0,120,
0,0,0,0,254,255,255,255,
0,0,128,0,0,0,128,0,
255,255,255,255,0,0,0,0,
0,0,0,0,0,0,0,128,
255,255,255,255,0,0,0,0,
0,0,0,0,0,0,0,0,
/* Fiddled by hand when the table bits changed. May be broken! */
128,0,0,0,0,0,0,0,
0,1,1,1,1,1,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
1,0,0,0,128,0,0,0,
128,128,128,128,0,0,128,0,
24,24,24,24,24,24,24,24,
24,24,0,0,0,0,0,128,
0,18,18,18,18,18,18,18,
18,18,18,18,18,18,18,18,
18,18,18,18,18,18,18,18,
18,18,18,128,128,0,128,16,
0,18,18,18,18,18,18,18,
18,18,18,18,18,18,18,18,
18,18,18,18,18,18,18,18,
18,18,18,128,128,0,0,0,
0,0,0,0,0,1,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
1,0,0,0,0,0,0,0,
0,0,18,0,0,0,0,0,
0,0,24,24,0,18,0,0,
0,24,18,0,0,0,0,0,
18,18,18,18,18,18,18,18,
18,18,18,18,18,18,18,18,
18,18,18,18,18,18,18,0,
18,18,18,18,18,18,18,18,
18,18,18,18,18,18,18,18,
18,18,18,18,18,18,18,18,
18,18,18,18,18,18,18,0,
18,18,18,18,18,18,18,18
};

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,50 @@
findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f
findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f
findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f
findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f
findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f
findprop 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f
findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af
findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf
findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf
findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df
findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef
findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff
findprop 0100 0101 0102 0103 0104 0105 0106
findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7
findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
findprop 10000 10001 e01ef f0000 100000
findprop 1b00 12000 7c0 a840 10900
findprop 1d79 a77d
findprop 0800 083e a4d0 a4f7 aa80 aadf
findprop 10b00 10b35 13000 1342e 10840 10855
findprop 11100 1113c 11680 116c0
findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89
findprop 118a0 11ac7 16ad0
findprop 11700 14400 108e0 11280 1d800
findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
findprop a836 a833 1cf4 20f0 1cd0
findprop 32ff
findprop 1f16d
findprop U+10e93 U+10eaa
findprop 0602 202a 202b 202c 2068 2069 202d 202e 2067

View File

@ -0,0 +1,19 @@
find script Han
find type Pe script Common scriptx Hangul
find type Sk
find type Pd
find gbreak LVT
find script Old_Uyghur
find bidi PDF
find bidi CS
find bidi CS type Sm
find bidi B
find bidi FSI
find bidi PDI
find bidi RLI
find bidi RLO
find bidi S
find bidi WS
find script bopo
find bool prependedconcatenationmark
find bool pcm

View File

@ -0,0 +1,409 @@
findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
U+0000 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0001 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0002 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0003 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0004 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0005 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0006 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0007 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0008 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0009 S Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
U+000A B Control: Control, common, LF, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
U+000B S Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
U+000C WS Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
U+000D B Control: Control, common, CR, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
U+000E BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+000F BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
U+0010 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0011 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0012 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0013 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0014 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0015 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0016 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0017 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0018 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0019 BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+001A BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+001B BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+001C B Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+001D B Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+001E B Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+001F S Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
U+0020 WS Separator: Space separator, common, Other, [ascii, emoji, emojicomponent, graphemebase, patternsyntax]
U+0021 ON Punctuation: Other punctuation, common, Other, [ascii, caseignorable, graphemebase, patternsyntax, quotationmark]
U+0022 ON Punctuation: Other punctuation, common, Other, [ascii, graphemebase, math, patternsyntax]
U+0023 ET Punctuation: Other punctuation, common, Other, [ascii, dash, graphemebase, patternsyntax]
U+0024 ET Symbol: Currency symbol, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
U+0025 ET Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
U+0026 ON Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
U+0027 ON Punctuation: Other punctuation, common, Other, [ascii, bidimirrored, graphemebase, math, patternsyntax]
U+0028 ON Punctuation: Open punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
U+0029 ON Punctuation: Close punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
U+002A ON Punctuation: Other punctuation, common, Other, [ascii, dash, graphemebase, patternsyntax]
U+002B ES Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
U+002C CS Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
U+002D ES Punctuation: Dash punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
U+002E CS Punctuation: Other punctuation, common, Other, [graphemebase, whitespace]
U+002F CS Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
U+0030 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
U+0031 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
U+0032 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
U+0033 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
U+0034 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
U+0035 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
U+0036 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
U+0037 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
U+0038 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
U+0039 EN Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
U+003A CS Punctuation: Other punctuation, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
U+003B ON Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
U+003C ON Symbol: Mathematical symbol, common, Other, [graphemebase, math, patternsyntax]
U+003D ON Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
U+003E ON Symbol: Mathematical symbol, common, Other, [graphemebase, math, patternsyntax]
U+003F ON Punctuation: Other punctuation, common, Other, [ascii, caseignorable, graphemebase, patternsyntax, quotationmark]
findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f
U+0040 ON Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
U+0041 L Letter: Upper case letter, latin, Other, U+0061, [graphemebase]
U+0042 L Letter: Upper case letter, latin, Other, U+0062, [graphemebase]
U+0043 L Letter: Upper case letter, latin, Other, U+0063, [graphemebase]
U+0044 L Letter: Upper case letter, latin, Other, U+0064, [graphemebase]
U+0045 L Letter: Upper case letter, latin, Other, U+0065, [graphemebase]
U+0046 L Letter: Upper case letter, latin, Other, U+0066, [graphemebase]
U+0047 L Letter: Upper case letter, latin, Other, U+0067, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+0048 L Letter: Upper case letter, latin, Other, U+0068, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+0049 L Letter: Upper case letter, latin, Other, U+0069, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+004A L Letter: Upper case letter, latin, Other, U+006A, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+004B L Letter: Upper case letter, latin, Other, U+006B, U+212A, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+004C L Letter: Upper case letter, latin, Other, U+006C, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+004D L Letter: Upper case letter, latin, Other, U+006D, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+004E L Letter: Upper case letter, latin, Other, U+006E, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+004F L Letter: Upper case letter, latin, Other, U+006F, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f
U+0050 L Letter: Upper case letter, latin, Other, U+0070, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+0051 L Letter: Upper case letter, latin, Other, U+0071, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+0052 L Letter: Upper case letter, latin, Other, U+0072, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+0053 L Letter: Upper case letter, latin, Other, U+0073, U+017F, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+0054 L Letter: Upper case letter, latin, Other, U+0074, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+0055 L Letter: Upper case letter, latin, Other, U+0075, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+0056 L Letter: Upper case letter, latin, Other, U+0076, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+0057 L Letter: Upper case letter, latin, Other, U+0077, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+0058 L Letter: Upper case letter, latin, Other, U+0078, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+0059 L Letter: Upper case letter, latin, Other, U+0079, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+005A L Letter: Upper case letter, latin, Other, U+007A, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
U+005B ON Punctuation: Open punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
U+005C ON Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
U+005D ON Punctuation: Close punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
U+005E ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
U+005F ON Punctuation: Connector punctuation, common, Other, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, deprecated, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f
U+0060 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, changeswhentitlecased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
U+0061 L Letter: Lower case letter, latin, Other, U+0041, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
U+0062 L Letter: Lower case letter, latin, Other, U+0042, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
U+0063 L Letter: Lower case letter, latin, Other, U+0043, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
U+0064 L Letter: Lower case letter, latin, Other, U+0044, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
U+0065 L Letter: Lower case letter, latin, Other, U+0045, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
U+0066 L Letter: Lower case letter, latin, Other, U+0046, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
U+0067 L Letter: Lower case letter, latin, Other, U+0047, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+0068 L Letter: Lower case letter, latin, Other, U+0048, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+0069 L Letter: Lower case letter, latin, Other, U+0049, [caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
U+006A L Letter: Lower case letter, latin, Other, U+004A, [caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
U+006B L Letter: Lower case letter, latin, Other, U+004B, U+212A, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+006C L Letter: Lower case letter, latin, Other, U+004C, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+006D L Letter: Lower case letter, latin, Other, U+004D, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+006E L Letter: Lower case letter, latin, Other, U+004E, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+006F L Letter: Lower case letter, latin, Other, U+004F, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f
U+0070 L Letter: Lower case letter, latin, Other, U+0050, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+0071 L Letter: Lower case letter, latin, Other, U+0051, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+0072 L Letter: Lower case letter, latin, Other, U+0052, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+0073 L Letter: Lower case letter, latin, Other, U+0053, U+017F, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+0074 L Letter: Lower case letter, latin, Other, U+0054, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+0075 L Letter: Lower case letter, latin, Other, U+0055, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+0076 L Letter: Lower case letter, latin, Other, U+0056, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+0077 L Letter: Lower case letter, latin, Other, U+0057, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+0078 L Letter: Lower case letter, latin, Other, U+0058, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+0079 L Letter: Lower case letter, latin, Other, U+0059, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+007A L Letter: Lower case letter, latin, Other, U+005A, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+007B ON Punctuation: Open punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
U+007C ON Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
U+007D ON Punctuation: Close punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
U+007E ON Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
U+007F BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f
U+0080 BN Control: Control, common, Control
U+0081 BN Control: Control, common, Control
U+0082 BN Control: Control, common, Control
U+0083 BN Control: Control, common, Control
U+0084 BN Control: Control, common, Control
U+0085 B Control: Control, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
U+0086 BN Control: Control, common, Control
U+0087 BN Control: Control, common, Control
U+0088 BN Control: Control, common, Control
U+0089 BN Control: Control, common, Control
U+008A BN Control: Control, common, Control
U+008B BN Control: Control, common, Control
U+008C BN Control: Control, common, Control
U+008D BN Control: Control, common, Control
U+008E BN Control: Control, common, Control
U+008F BN Control: Control, common, Control
findprop 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f
U+0090 BN Control: Control, common, Control
U+0091 BN Control: Control, common, Control
U+0092 BN Control: Control, common, Control
U+0093 BN Control: Control, common, Control
U+0094 BN Control: Control, common, Control
U+0095 BN Control: Control, common, Control
U+0096 BN Control: Control, common, Control
U+0097 BN Control: Control, common, Control
U+0098 BN Control: Control, common, Control
U+0099 BN Control: Control, common, Control
U+009A BN Control: Control, common, Control
U+009B BN Control: Control, common, Control
U+009C BN Control: Control, common, Control
U+009D BN Control: Control, common, Control
U+009E BN Control: Control, common, Control
U+009F BN Control: Control, common, Control
findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af
U+00A0 CS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
U+00A1 ON Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
U+00A2 ET Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
U+00A3 ET Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
U+00A4 ET Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
U+00A5 ET Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
U+00A6 ON Symbol: Other symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
U+00A7 ON Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
U+00A8 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+00A9 ON Symbol: Other symbol, common, Extended Pictographic, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+00AA L Letter: Other letter, latin, Other, [caseignorable, graphemeextend]
U+00AB ON Punctuation: Initial punctuation, common, Other, [graphemebase, sentenceterminal, terminalpunctuation]
U+00AC ON Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
U+00AD BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
U+00AE ON Symbol: Other symbol, common, Extended Pictographic, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+00AF ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf
U+00B0 ET Symbol: Other symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
U+00B1 ET Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
U+00B2 EN Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+00B3 EN Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+00B4 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+00B5 L Letter: Lower case letter, common, Other, U+03BC, U+039C, [alphabetic, deprecated, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+00B6 ON Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
U+00B7 ON Punctuation: Other punctuation, common, Other, [alphabetic, graphemebase, idcontinue, xidcontinue]
U+00B8 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+00B9 EN Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+00BA L Letter: Other letter, latin, Other, [caseignorable, graphemeextend]
U+00BB ON Punctuation: Final punctuation, common, Other, [graphemebase, sentenceterminal, terminalpunctuation]
U+00BC ON Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+00BD ON Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+00BE ON Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+00BF ON Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf
U+00C0 L Letter: Upper case letter, latin, Other, U+00E0, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00C1 L Letter: Upper case letter, latin, Other, U+00E1, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00C2 L Letter: Upper case letter, latin, Other, U+00E2, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00C3 L Letter: Upper case letter, latin, Other, U+00E3, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00C4 L Letter: Upper case letter, latin, Other, U+00E4, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00C5 L Letter: Upper case letter, latin, Other, U+00E5, U+212B, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00C6 L Letter: Upper case letter, latin, Other, U+00E6, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00C7 L Letter: Upper case letter, latin, Other, U+00E7, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00C8 L Letter: Upper case letter, latin, Other, U+00E8, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00C9 L Letter: Upper case letter, latin, Other, U+00E9, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00CA L Letter: Upper case letter, latin, Other, U+00EA, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00CB L Letter: Upper case letter, latin, Other, U+00EB, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00CC L Letter: Upper case letter, latin, Other, U+00EC, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00CD L Letter: Upper case letter, latin, Other, U+00ED, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00CE L Letter: Upper case letter, latin, Other, U+00EE, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00CF L Letter: Upper case letter, latin, Other, U+00EF, [alphabetic, graphemeextend, idcontinue, xidcontinue]
findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df
U+00D0 L Letter: Upper case letter, latin, Other, U+00F0, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00D1 L Letter: Upper case letter, latin, Other, U+00F1, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00D2 L Letter: Upper case letter, latin, Other, U+00F2, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00D3 L Letter: Upper case letter, latin, Other, U+00F3, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00D4 L Letter: Upper case letter, latin, Other, U+00F4, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00D5 L Letter: Upper case letter, latin, Other, U+00F5, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00D6 L Letter: Upper case letter, latin, Other, U+00F6, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00D7 ON Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
U+00D8 L Letter: Upper case letter, latin, Other, U+00F8, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00D9 L Letter: Upper case letter, latin, Other, U+00F9, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00DA L Letter: Upper case letter, latin, Other, U+00FA, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00DB L Letter: Upper case letter, latin, Other, U+00FB, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00DC L Letter: Upper case letter, latin, Other, U+00FC, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00DD L Letter: Upper case letter, latin, Other, U+00FD, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00DE L Letter: Upper case letter, latin, Other, U+00FE, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+00DF L Letter: Lower case letter, latin, Other, U+1E9E, [alphabetic, deprecated, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef
U+00E0 L Letter: Lower case letter, latin, Other, U+00C0, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00E1 L Letter: Lower case letter, latin, Other, U+00C1, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00E2 L Letter: Lower case letter, latin, Other, U+00C2, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00E3 L Letter: Lower case letter, latin, Other, U+00C3, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00E4 L Letter: Lower case letter, latin, Other, U+00C4, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00E5 L Letter: Lower case letter, latin, Other, U+00C5, U+212B, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00E6 L Letter: Lower case letter, latin, Other, U+00C6, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00E7 L Letter: Lower case letter, latin, Other, U+00C7, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00E8 L Letter: Lower case letter, latin, Other, U+00C8, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00E9 L Letter: Lower case letter, latin, Other, U+00C9, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00EA L Letter: Lower case letter, latin, Other, U+00CA, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00EB L Letter: Lower case letter, latin, Other, U+00CB, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00EC L Letter: Lower case letter, latin, Other, U+00CC, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00ED L Letter: Lower case letter, latin, Other, U+00CD, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00EE L Letter: Lower case letter, latin, Other, U+00CE, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00EF L Letter: Lower case letter, latin, Other, U+00CF, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff
U+00F0 L Letter: Lower case letter, latin, Other, U+00D0, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00F1 L Letter: Lower case letter, latin, Other, U+00D1, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00F2 L Letter: Lower case letter, latin, Other, U+00D2, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00F3 L Letter: Lower case letter, latin, Other, U+00D3, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00F4 L Letter: Lower case letter, latin, Other, U+00D4, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00F5 L Letter: Lower case letter, latin, Other, U+00D5, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00F6 L Letter: Lower case letter, latin, Other, U+00D6, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00F7 ON Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
U+00F8 L Letter: Lower case letter, latin, Other, U+00D8, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00F9 L Letter: Lower case letter, latin, Other, U+00D9, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00FA L Letter: Lower case letter, latin, Other, U+00DA, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00FB L Letter: Lower case letter, latin, Other, U+00DB, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00FC L Letter: Lower case letter, latin, Other, U+00DC, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00FD L Letter: Lower case letter, latin, Other, U+00DD, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00FE L Letter: Lower case letter, latin, Other, U+00DE, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00FF L Letter: Lower case letter, latin, Other, U+0178, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
findprop 0100 0101 0102 0103 0104 0105 0106
U+0100 L Letter: Upper case letter, latin, Other, U+0101, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+0101 L Letter: Lower case letter, latin, Other, U+0100, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+0102 L Letter: Upper case letter, latin, Other, U+0103, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+0103 L Letter: Lower case letter, latin, Other, U+0102, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+0104 L Letter: Upper case letter, latin, Other, U+0105, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+0105 L Letter: Lower case letter, latin, Other, U+0104, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+0106 L Letter: Upper case letter, latin, Other, U+0107, [alphabetic, graphemeextend, idcontinue, xidcontinue]
findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7
U+FFE0 ET Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+FFE1 ET Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+FFE2 ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
U+FFE3 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+FFE4 ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+FFE5 ET Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+FFE6 ET Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+FFE7 L Control: Unassigned, unknown, Other
findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
U+FFE8 ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+FFE9 ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
U+FFEA ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
U+FFEB ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
U+FFEC ON Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
U+FFED ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+FFEE ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+FFEF L Control: Unassigned, unknown, Other
findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
U+FFF8 BN Control: Unassigned, unknown, Control, [dash, defaultignorablecodepoint, deprecated, extendedpictographic, joincontrol, lowercase, patternwhitespace, quotationmark, sentenceterminal, softdotted, xidcontinue, xidstart]
U+FFF9 ON Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
U+FFFA ON Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
U+FFFB ON Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
U+FFFC ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+FFFD ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+FFFE BN Control: Unassigned, unknown, Other, [changeswhenuppercased, deprecated, emojicomponent, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+FFFF BN Control: Unassigned, unknown, Other, [changeswhenuppercased, deprecated, emojicomponent, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
findprop 10000 10001 e01ef f0000 100000
U+10000 L Letter: Other letter, linearb, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+10001 L Letter: Other letter, linearb, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+E01EF NSM Mark: Non-spacing mark, inherited, Extend, []
U+F0000 L Control: Private use, unknown, Other
U+100000 L Control: Private use, unknown, Other
findprop 1b00 12000 7c0 a840 10900
U+1B00 NSM Mark: Non-spacing mark, balinese, Extend, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, uppercase]
U+12000 L Letter: Other letter, cuneiform, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+07C0 R Number: Decimal number, nko, Other, [graphemebase, patternsyntax, terminalpunctuation]
U+A840 L Letter: Other letter, phagspa, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+10900 R Letter: Other letter, phoenician, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
findprop 1d79 a77d
U+1D79 L Letter: Lower case letter, latin, Other, U+A77D, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+A77D L Letter: Upper case letter, latin, Other, U+1D79, [alphabetic, graphemeextend, idcontinue, xidcontinue]
findprop 0800 083e a4d0 a4f7 aa80 aadf
U+0800 R Letter: Other letter, samaritan, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+083E R Punctuation: Other punctuation, samaritan, Other, [bidimirrored, graphemebase, math, patternsyntax]
U+A4D0 L Letter: Other letter, lisu, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+A4F7 L Letter: Other letter, lisu, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AA80 L Letter: Other letter, taiviet, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AADF L Punctuation: Other punctuation, taiviet, Other, [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
findprop 10b00 10b35 13000 1342e 10840 10855
U+10B00 R Letter: Other letter, avestan, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+10B35 R Letter: Other letter, avestan, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+13000 L Letter: Other letter, egyptianhieroglyphs, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+1342E L Letter: Other letter, egyptianhieroglyphs, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+10840 R Letter: Other letter, imperialaramaic, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+10855 R Letter: Other letter, imperialaramaic, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
findprop 11100 1113c 11680 116c0
U+11100 NSM Mark: Non-spacing mark, chakma, Extend, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, uppercase]
U+1113C L Number: Decimal number, chakma, Other, [graphemebase, patternsyntax, terminalpunctuation]
U+11680 L Letter: Other letter, takri, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+116C0 L Number: Decimal number, takri, Other, [graphemebase, patternsyntax, terminalpunctuation]
findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89
U+000D B Control: Control, common, CR, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
U+000A B Control: Control, common, LF, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
U+000E BN Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0711 NSM Mark: Non-spacing mark, syriac, Extend, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, uppercase]
U+1B04 L Mark: Spacing mark, balinese, SpacingMark, [dash, emoji, extendedpictographic, graphemebase, patternsyntax]
U+1111 L Letter: Other letter, hangul, Hangul syllable type L, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+1169 L Letter: Other letter, hangul, Hangul syllable type V, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+11FE L Letter: Other letter, hangul, Hangul syllable type T, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AE4C L Letter: Other letter, hangul, Hangul syllable type LV, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AD89 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
findprop 118a0 11ac7 16ad0
U+118A0 L Letter: Upper case letter, warangciti, Other, U+118C0, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+11AC7 L Letter: Other letter, paucinhau, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+16AD0 L Letter: Other letter, bassavah, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
findprop 11700 14400 108e0 11280 1d800
U+11700 L Letter: Other letter, ahom, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+14400 L Letter: Other letter, anatolianhieroglyphs, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+108E0 R Letter: Other letter, hatran, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+11280 L Letter: Other letter, multani, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+1D800 L Symbol: Other symbol, signwriting, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
U+11800 L Letter: Other letter, dogra, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+1E903 R Letter: Upper case letter, adlam, Other, U+1E925, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+11DA9 L Number: Decimal number, gunjalagondi, Other, [graphemebase, patternsyntax, terminalpunctuation]
U+10D27 NSM Mark: Non-spacing mark, hanifirohingya, Extend, [extendedpictographic, graphemebase, patternsyntax]
U+11EE0 L Letter: Other letter, makasar, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+16E48 L Letter: Upper case letter, medefaidrin, Other, U+16E68, [alphabetic, graphemeextend, idcontinue, xidcontinue]
U+10F27 R Letter: Other letter, oldsogdian, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+10F30 AL Letter: Other letter, sogdian, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
findprop a836 a833 1cf4 20f0 1cd0
U+A836 L Symbol: Other symbol, common, Other, [devanagari, gurmukhi, gujarati, kaithi, takri, khojki, mahajani, modi, khudawadi, tirhuta, dogra], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+A833 L Number: Other number, common, Other, [devanagari, gurmukhi, gujarati, kannada, kaithi, takri, khojki, mahajani, modi, khudawadi, tirhuta, dogra, nandinagari], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+1CF4 NSM Mark: Non-spacing mark, inherited, Extend, [devanagari, kannada, grantha], [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
U+20F0 NSM Mark: Non-spacing mark, inherited, Extend, [latin, devanagari, grantha], [caseignorable, graphemebase, patternsyntax, quotationmark]
U+1CD0 NSM Mark: Non-spacing mark, inherited, Extend, [devanagari, bengali, kannada, grantha], [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
findprop 32ff
U+32FF L Symbol: Other symbol, common, Other, [han], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
findprop 1f16d
U+1F16D ON Symbol: Other symbol, common, Extended Pictographic, [ascii, sentenceterminal, unifiedideograph, whitespace, xidcontinue]
findprop U+10e93 U+10eaa
U+10E93 R Letter: Other letter, yezidi, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+10EAA R Control: Unassigned, unknown, Other
findprop 0602 202a 202b 202c 2068 2069 202d 202e 2067
U+0602 AN Control: Format, arabic, Prepend, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, lowercase]
U+202A LRE Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
U+202B RLE Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
U+202C PDF Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
U+2068 FSI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
U+2069 PDI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
U+202D LRO Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
U+202E RLO Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
U+2067 RLI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]

View File

@ -0,0 +1,298 @@
find script Han
U+2E80..U+2E99 ON Symbol: Other symbol, han, Other, [ascii, sentenceterminal, unifiedideograph, whitespace, xidstart]
U+2E9B..U+2EF3 ON Symbol: Other symbol, han, Other, [ascii, sentenceterminal, unifiedideograph, whitespace, xidstart]
U+2F00..U+2FD5 ON Symbol: Other symbol, han, Other, [ascii, sentenceterminal, unifiedideograph, whitespace, xidstart]
U+3005 L Letter: Modifier letter, han, Other, [emoji, emojimodifierbase, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
U+3007 L Number: Letter number, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
U+3021..U+3029 L Number: Letter number, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
U+3038..U+303A L Number: Letter number, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
U+303B L Letter: Modifier letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
U+3400..U+4DBF L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+4E00..U+9FFF L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+F900..U+FA0D L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
U+FA0E..U+FA0F L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+FA10 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
U+FA11 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+FA12 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
U+FA13..U+FA14 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+FA15..U+FA1E L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
U+FA1F L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+FA20 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
U+FA21 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+FA22 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
U+FA23..U+FA24 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+FA25..U+FA26 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
U+FA27..U+FA29 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+FA2A..U+FA6D L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
U+FA70..U+FAD9 L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
U+16FE2 ON Punctuation: Other punctuation, han, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+16FE3 L Letter: Modifier letter, han, Other, [emoji, emojimodifierbase, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
U+16FF0..U+16FF1 L Mark: Spacing mark, han, SpacingMark, [caseignorable, graphemeextend, idcontinue, ideographic, xidcontinue]
U+20000..U+2A6DF L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+2A700..U+2B738 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+2B740..U+2B81D L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+2B820..U+2CEA1 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+2CEB0..U+2EBE0 L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+2F800..U+2FA1D L Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
U+30000..U+3134A L Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
find type Pe script Common scriptx Hangul
U+3009 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
U+300B ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
U+300D ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [graphemebase, sentenceterminal, terminalpunctuation]
U+300F ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [graphemebase, sentenceterminal, terminalpunctuation]
U+3011 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
U+3015 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
U+3017 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
U+3019 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
U+301B ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
U+301E..U+301F ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han], [softdotted, terminalpunctuation, unifiedideograph, xidcontinue, xidstart]
U+FF63 ON Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [changeswhencasemapped, changeswhenlowercased, changeswhentitlecased, emojimodifier, emojimodifierbase]
find type Sk
U+005E ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
U+0060 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, changeswhentitlecased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
U+00A8 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+00AF ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+00B4 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+00B8 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+02C2..U+02C5 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+02D2..U+02DF ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+02E5..U+02E9 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+02EA..U+02EB ON Symbol: Modifier symbol, bopomofo, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+02ED ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+02EF..U+02FF ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+0375 ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+0384 ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+0385 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+0888 AL Symbol: Modifier symbol, arabic, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, math, softdotted, xidcontinue, xidstart]
U+1FBD ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+1FBF..U+1FC1 ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+1FCD..U+1FCF ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+1FDD..U+1FDF ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+1FED..U+1FEF ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+1FFD..U+1FFE ON Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+309B..U+309C ON Symbol: Modifier symbol, common, Other, [hiragana, katakana], [alphabetic, bidimirrored, caseignorable, cased, changeswhencasefolded, changeswhenlowercased, changeswhentitlecased, changeswhenuppercased, dash, defaultignorablecodepoint, deprecated, diacritic, emoji, emojicomponent, emojimodifier, emojimodifierbase, emojipresentation, extendedpictographic, extender, graphemebase, graphemeextend, graphemelink, hexdigit, idsbinaryoperator, idstrinaryoperator, idcontinue, idstart, ideographic, sentenceterminal, unifiedideograph, whitespace, xidcontinue]
U+A700..U+A707 ON Symbol: Modifier symbol, common, Other, [latin, han], [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+A708..U+A716 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+A720..U+A721 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+A789..U+A78A L Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+AB5B L Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+AB6A..U+AB6B ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+FBB2..U+FBC2 AL Symbol: Modifier symbol, arabic, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, math, softdotted, xidcontinue, xidstart]
U+FF3E ON Symbol: Modifier symbol, common, Other, [asciihexdigit, bidicontrol, bidimirrored, cased, changeswhencasefolded, sentenceterminal, unifiedideograph, whitespace, xidstart]
U+FF40 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+FFE3 ON Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+1F3FB..U+1F3FF ON Symbol: Modifier symbol, common, Extend, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, math, patternsyntax, radical, sentenceterminal, terminalpunctuation]
find type Pd
U+002D ES Punctuation: Dash punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
U+058A ON Punctuation: Dash punctuation, armenian, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
U+05BE R Punctuation: Dash punctuation, hebrew, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
U+1400 ON Punctuation: Dash punctuation, canadianaboriginal, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
U+1806 ON Punctuation: Dash punctuation, mongolian, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
U+2010..U+2015 ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
U+2E17 ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
U+2E1A ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
U+2E3A..U+2E3B ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
U+2E40 ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
U+2E5D ON Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
U+301C ON Punctuation: Dash punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han], [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
U+3030 ON Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, lowercase, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
U+30A0 ON Punctuation: Dash punctuation, common, Other, [hiragana, katakana], [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
U+FE31..U+FE32 ON Punctuation: Dash punctuation, common, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
U+FE58 ON Punctuation: Dash punctuation, common, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
U+FE63 ES Punctuation: Dash punctuation, common, Other, [caseignorable, sentenceterminal, unifiedideograph, xidcontinue]
U+FF0D ES Punctuation: Dash punctuation, common, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
U+10EAD R Punctuation: Dash punctuation, yezidi, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
find gbreak LVT
U+AC01..U+AC1B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AC1D..U+AC37 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AC39..U+AC53 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AC55..U+AC6F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AC71..U+AC8B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AC8D..U+ACA7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+ACA9..U+ACC3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+ACC5..U+ACDF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+ACE1..U+ACFB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+ACFD..U+AD17 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AD19..U+AD33 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AD35..U+AD4F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AD51..U+AD6B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AD6D..U+AD87 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AD89..U+ADA3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+ADA5..U+ADBF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+ADC1..U+ADDB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+ADDD..U+ADF7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+ADF9..U+AE13 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AE15..U+AE2F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AE31..U+AE4B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AE4D..U+AE67 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AE69..U+AE83 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AE85..U+AE9F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AEA1..U+AEBB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AEBD..U+AED7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AED9..U+AEF3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AEF5..U+AF0F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AF11..U+AF2B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AF2D..U+AF47 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AF49..U+AF63 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AF65..U+AF7F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AF81..U+AF9B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AF9D..U+AFB7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AFB9..U+AFD3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AFD5..U+AFEF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+AFF1..U+B00B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B00D..U+B027 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B029..U+B043 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B045..U+B05F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B061..U+B07B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B07D..U+B097 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B099..U+B0B3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B0B5..U+B0CF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B0D1..U+B0EB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B0ED..U+B107 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B109..U+B123 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B125..U+B13F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B141..U+B15B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B15D..U+B177 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B179..U+B193 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B195..U+B1AF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B1B1..U+B1CB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B1CD..U+B1E7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B1E9..U+B203 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B205..U+B21F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B221..U+B23B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B23D..U+B257 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B259..U+B273 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B275..U+B28F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B291..U+B2AB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B2AD..U+B2C7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B2C9..U+B2E3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B2E5..U+B2FF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B301..U+B31B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B31D..U+B337 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B339..U+B353 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B355..U+B36F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B371..U+B38B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B38D..U+B3A7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B3A9..U+B3C3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B3C5..U+B3DF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B3E1..U+B3FB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B3FD..U+B417 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B419..U+B433 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B435..U+B44F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B451..U+B46B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B46D..U+B487 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B489..U+B4A3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B4A5..U+B4BF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B4C1..U+B4DB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B4DD..U+B4F7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B4F9..U+B513 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B515..U+B52F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B531..U+B54B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B54D..U+B567 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B569..U+B583 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B585..U+B59F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B5A1..U+B5BB L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B5BD..U+B5D7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B5D9..U+B5F3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B5F5..U+B60F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B611..U+B62B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B62D..U+B647 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B649..U+B663 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B665..U+B67F L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B681..U+B69B L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B69D..U+B6B7 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B6B9..U+B6D3 L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+B6D5..U+B6EF L Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
...
find script Old_Uyghur
U+10F70..U+10F81 R Letter: Other letter, olduyghur, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+10F82..U+10F85 NSM Mark: Non-spacing mark, olduyghur, Extend, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
U+10F86..U+10F89 R Punctuation: Other punctuation, olduyghur, Other, [bidimirrored, graphemebase, math, patternsyntax]
find bidi PDF
U+202C PDF Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
find bidi CS
U+002C CS Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
U+002E CS Punctuation: Other punctuation, common, Other, [graphemebase, whitespace]
U+002F CS Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
U+003A CS Punctuation: Other punctuation, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
U+00A0 CS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
U+060C CS Punctuation: Other punctuation, common, Other, [arabic, syriac, thaana, nko, hanifirohingya, yezidi], [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
U+202F CS Separator: Space separator, common, Other, [latin, mongolian], [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
U+2044 CS Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
U+FE50 CS Punctuation: Other punctuation, common, Other, [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
U+FE52 CS Punctuation: Other punctuation, common, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+FE55 CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
U+FF0C CS Punctuation: Other punctuation, common, Other, [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
U+FF0E CS Punctuation: Other punctuation, common, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+FF0F CS Punctuation: Other punctuation, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+FF1A CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
find bidi CS type Sm
U+2044 CS Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
find bidi B
U+000A B Control: Control, common, LF, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
U+000D B Control: Control, common, CR, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
U+001C..U+001E B Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
U+0085 B Control: Control, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
U+2029 B Separator: Paragraph separator, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
find bidi FSI
U+2068 FSI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
find bidi PDI
U+2069 PDI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
find bidi RLI
U+2067 RLI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
find bidi RLO
U+202E RLO Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
find bidi S
U+0009 S Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
U+000B S Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
U+001F S Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
find bidi WS
U+000C WS Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
U+0020 WS Separator: Space separator, common, Other, [ascii, emoji, emojicomponent, graphemebase, patternsyntax]
U+1680 WS Separator: Space separator, ogham, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
U+2000..U+200A WS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
U+2028 WS Separator: Line separator, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
U+205F WS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
U+3000 WS Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
find script bopo
U+02EA..U+02EB ON Symbol: Modifier symbol, bopomofo, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
U+3105..U+312F L Letter: Other letter, bopomofo, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+31A0..U+31BF L Letter: Other letter, bopomofo, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
find bool prependedconcatenationmark
U+00AD BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
U+180E BN Control: Format, mongolian, Control, [caseignorable, prependedconcatenationmark]
U+200B BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
U+2060 BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
U+2118 ON Symbol: Mathematical symbol, common, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
U+3030 ON Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, lowercase, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
U+AAC0 L Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
U+AAC2 L Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
U+FE0F NSM Mark: Non-spacing mark, inherited, Extend, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, math, patternwhitespace, prependedconcatenationmark]
U+FE55 CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
U+FEFF BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
U+FF1A CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
U+FF21..U+FF26 L Letter: Upper case letter, latin, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
U+10D22..U+10D23 AL Letter: Other letter, hanifirohingya, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
U+1135D L Letter: Other letter, grantha, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, graphemeextend, hexdigit, logicalorderexception, lowercase, math, noncharactercodepoint, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
U+1BCA0..U+1BCA3 BN Control: Format, common, Control, [duployan], [caseignorable, prependedconcatenationmark]
U+1D173..U+1D17A BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
U+1F1E6..U+1F1FF L Symbol: Other symbol, common, Regional Indicator, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, math, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
find bool pcm
U+00AD BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
U+180E BN Control: Format, mongolian, Control, [caseignorable, prependedconcatenationmark]
U+200B BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
U+2060 BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
U+2118 ON Symbol: Mathematical symbol, common, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
U+3030 ON Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, lowercase, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
U+AAC0 L Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
U+AAC2 L Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
U+FE0F NSM Mark: Non-spacing mark, inherited, Extend, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, math, patternwhitespace, prependedconcatenationmark]
U+FE55 CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
U+FEFF BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
U+FF1A CS Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
U+FF21..U+FF26 L Letter: Upper case letter, latin, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
U+10D22..U+10D23 AL Letter: Other letter, hanifirohingya, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
U+1135D L Letter: Other letter, grantha, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, graphemeextend, hexdigit, logicalorderexception, lowercase, math, noncharactercodepoint, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
U+1BCA0..U+1BCA3 BN Control: Format, common, Control, [duployan], [caseignorable, prependedconcatenationmark]
U+1D173..U+1D17A BN Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
U+1F1E6..U+1F1FF L Symbol: Other symbol, common, Regional Indicator, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, math, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]

View File

@ -0,0 +1,347 @@
/****************************************************
* PCRE maintainers' helper program: UTF-8 converter *
****************************************************/
/* This is a test program for converting character code points to UTF-8 and
vice versa. Note that this program conforms to the original definition of
UTF-8, which allows codepoints up to 7fffffff. The more recent definition
limits the validity of Unicode UTF-8 codepoints to a maximum of 10ffffff, and
forbids the "surrogate" code points. This program now gives warnings for these
invalid code points.
The arguments are either single code point values written as U+hh.. or 0xhh..
for conversion to UTF-8, or sequences of hex values, written without 0x and
optionally including spaces (but such arguments must be quoted), for conversion
from UTF-8 to codepoints. For example:
./utf8 0x1234
U+00001234 => e1 88 b4
./utf8 "e1 88 b4"
U+00001234 <= e1 88 b4
In the second case, a number of UTF-8 characters can be present in one
argument. In other words, each such argument is interpreted (after ignoring
spaces) as a string of UTF-8 bytes representing a string of characters:
./utf8 "65 e188b4 77"
0x00000065 <= 65
0x00001234 <= e1 88 b4
0x00000077 <= 77
If the option -s is given, the sequence of UTF-bytes is written out between
angle brackets at the end of the line. On a UTF-8 terminal, this will show the
appropriate graphic for the code point.
Errors provoke error messages, but the program carries on with the next
argument. The return code is always zero.
Philip Hazel
Original creation data: unknown
Code extended and tidied to avoid compiler warnings: 26 March 2020
*/
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
/* The valid ranges for UTF-8 characters are:
0000 0000 to 0000 007f 1 byte (ascii)
0000 0080 to 0000 07ff 2 bytes
0000 0800 to 0000 ffff 3 bytes
0001 0000 to 001f ffff 4 bytes
0020 0000 to 03ff ffff 5 bytes
0400 0000 to 7fff ffff 6 bytes
*/
static const unsigned int utf8_table1[] = {
0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
static const int utf8_table2[] = {
0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
static const int utf8_table3[] = {
0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
/*************************************************
* Convert character value to UTF-8 *
*************************************************/
/* This function takes an unsigned long integer value in the range 0 -
0x7fffffff and encodes it as a UTF-8 character in 1 to 6 bytes.
Arguments:
cvalue the character value
buffer pointer to buffer for result - at least 6 bytes long
Returns: number of bytes placed in the buffer
0 if input code point is too big
*/
static size_t
ord2utf8(unsigned long int cvalue, unsigned char *buffer)
{
size_t i, j;
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
if (cvalue <= utf8_table1[i]) break;
if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
buffer += i;
for (j = i; j > 0; j--)
{
*buffer-- = 0x80 | (cvalue & 0x3f);
cvalue >>= 6;
}
*buffer = utf8_table2[i] | cvalue;
return i + 1;
}
/*************************************************
* Convert UTF-8 string to value *
*************************************************/
/* This function takes one or more bytes that represent a UTF-8 character from
the start of a string of bytes. It returns the value of the character, or the
offset of a malformation. For an overlong encoding that works but is not the
correct (shortest) one, the error offset is just after the last byte.
Argument:
buffer a pointer to the byte vector
buffend a pointer to the end of the buffer
vptr a pointer to a variable to receive the value
lenptr a pointer to a variable to receive the offset when error detected
Returns: > 0 => the number of bytes consumed
0 => invalid UTF-8: first byte missing 0x40 bit
-1 => invalid UTF-8: first byte has too many high-order 1-bits
-2 => incomplete sequence at end of string
-3 => incomplete sequence within string
-4 => overlong code sequence
*/
static int
utf82ord(unsigned char *buffer, unsigned char *buffend,
long unsigned int *vptr, int *lenptr)
{
unsigned int c = *buffer++;
unsigned int d = c;
int i, j, s;
/* Check for an ASCII character, or find the number of additional bytes in a
multibyte character. */
for (i = -1; i < 6; i++)
{
if ((d & 0x80) == 0) break;
d <<= 1;
}
switch (i)
{
case -1: /* ASCII character; first byte does not have 0x80 bit */
*vptr = c;
return 1;
case 0: /* First byte has 0x80 but is missing 0x40 bit */
*lenptr = 0;
return 0;
case 6:
*lenptr = 0; /* Too many high bits */
return -1;
default:
break;
}
/* i now has a value in the range 1-5 */
s = 6*i;
d = (c & utf8_table3[i]) << s;
for (j = 0; j < i; j++)
{
if (buffer >= buffend)
{
*lenptr = j + 1;
return -2;
}
c = *buffer++;
if ((c & 0xc0) != 0x80)
{
*lenptr = j + 1;
return -3;
}
s -= 6;
d |= (c & 0x3f) << s;
}
/* Valid UTF-8 syntax */
*vptr = d;
/* Check that encoding was the correct one, not overlong */
for (j = 0; j < (int)(sizeof(utf8_table1)/sizeof(int)); j++)
if (d <= utf8_table1[j]) break;
if (j != i)
{
*lenptr = i + 1;
return -4;
}
/* Valid value */
return i + 1;
}
/*************************************************
* Main Program *
*************************************************/
int
main(int argc, char **argv)
{
int i = 1;
int show = 0;
unsigned char buffer[64];
if (argc > 1 && strcmp(argv[1], "-s") == 0)
{
show = 1;
i = 2;
}
for (; i < argc; i++)
{
char *x = argv[i];
char *endptr;
if (strncmp(x, "0x", 2) == 0 || strncmp(x, "U+", 2) == 0)
{
size_t rc, j;
unsigned long int d = strtoul(x+2, &endptr, 16);
if (*endptr != 0)
{
printf("** Invalid hex number %s\n", x);
continue; /* With next argument */
}
rc = ord2utf8(d, buffer);
printf("U+%08lx => ", d);
if (rc == 0)
printf("** Code point greater than 0x7fffffff cannot be encoded");
else
{
for (j = 0; j < rc; j++) printf("%02x ", buffer[j]);
if (show)
{
printf(">");
for (j = 0; j < rc; j++) printf("%c", buffer[j]);
printf("< ");
}
if (d >= 0xd800 && d <= 0xdfff)
printf("** Invalid Unicode (surrogate)");
else if (d > 0x10ffff)
printf("** Invalid Unicode (greater than U+10ffff)");
}
printf("\n");
}
else
{
unsigned char *bptr;
unsigned char *buffend;
int len = 0;
int y = 0;
int z = 0;
for (;;)
{
while (*x == ' ') x++;
if (*x == 0 && !z) break;
if (!isxdigit(*x))
{
printf("** Malformed hex string: %s\n", argv[i]);
len = -1;
break;
}
y = y * 16 + tolower(*x) - ((isdigit(*x))? '0' : 'W');
x++;
if (z)
{
buffer[len++] = y;
y = 0;
}
z ^= 1;
}
if (len < 0) continue; /* With next argument after malformation */
bptr = buffer;
buffend = buffer + len;
while (bptr < buffend)
{
unsigned long int d;
int j;
int offset;
int rc = utf82ord(bptr, buffend, &d, &offset);
if (rc > 0)
{
printf("U+%08lx <= ", d);
for (j = 0; j < rc; j++) printf("%02x ", bptr[j]);
if (show)
{
printf(">");
for (j = 0; j < rc; j++) printf("%c", bptr[j]);
printf("<");
}
printf("\n");
bptr += rc;
}
else if (rc == -4)
{
printf("U+%08lx <= ", d);
for (j = 0; j < offset; j++) printf("%02x ", bptr[j]);
printf("** Overlong UTF-8 sequence\n");
bptr += offset;
}
else
{
switch (rc)
{
case 0: printf("** First byte missing 0x40 bit");
break;
case -1: printf("** First byte has too many high-order bits");
break;
case -2: printf("** Incomplete UTF-8 sequence at end of string");
break;
case -3: printf("** Incomplete UTF-8 sequence");
break;
default: printf("** Unexpected return %d from utf82ord()", rc);
break;
}
printf(" at offset %d in string ", offset);
while (bptr < buffend) printf("%02x ", *bptr++);
printf("\n");
break;
}
}
}
}
return 0;
}
/* End */