forked from LeenkxTeam/LNXSDK
		
	
		
			
	
	
		
			924 lines
		
	
	
		
			33 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			924 lines
		
	
	
		
			33 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
|  | #! /usr/bin/python | ||
|  | 
 | ||
|  | #                   PCRE2 UNICODE PROPERTY SUPPORT | ||
|  | #                   ------------------------------ | ||
|  | # | ||
|  | # This script generates the pcre2_ucd.c file from Unicode data files. This is | ||
|  | # the compressed Unicode property data used by PCRE2. The script was created in | ||
|  | # December 2021 as part of the Unicode data generation refactoring. It is | ||
|  | # basically a re-working of the MultiStage2.py script that was submitted to the | ||
|  | # PCRE project by Peter Kankowski in 2008 as part of a previous upgrading of | ||
|  | # Unicode property support. A number of extensions have since been added. The | ||
|  | # main difference in the 2021 upgrade (apart from comments and layout) is that | ||
|  | # the data tables (e.g. list of script names) are now listed in or generated by | ||
|  | # a separate Python module that is shared with the other Generate scripts. | ||
|  | # | ||
|  | # This script must be run in the "maint" directory. It requires the following | ||
# Unicode data tables: BidiMirroring.txt, CaseFolding.txt,
|  | # DerivedBidiClass.txt, DerivedCoreProperties.txt, DerivedGeneralCategory.txt, | ||
|  | # GraphemeBreakProperty.txt, PropList.txt, PropertyAliases.txt, | ||
|  | # PropertyValueAliases.txt, ScriptExtensions.txt, Scripts.txt, and | ||
|  | # emoji-data.txt. These must be in the Unicode.tables subdirectory. | ||
|  | # | ||
|  | # The emoji-data.txt file is found in the "emoji" subdirectory even though it | ||
|  | # is technically part of a different (but coordinated) standard as shown | ||
|  | # in files associated with Unicode Technical Standard #51 ("Unicode Emoji"), | ||
|  | # for example: | ||
|  | # | ||
|  | # http://unicode.org/Public/emoji/13.0/ReadMe.txt | ||
|  | # | ||
|  | # DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted" | ||
|  | # subdirectory of the Unicode database (UCD) on the Unicode web site; | ||
|  | # GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. The other files | ||
|  | # are in the top-level UCD directory. | ||
|  | # | ||
|  | # ----------------------------------------------------------------------------- | ||
|  | # Minor modifications made to the original script: | ||
|  | #  Added #! line at start | ||
|  | #  Removed tabs | ||
|  | #  Made it work with Python 2.4 by rewriting two statements that needed 2.5 | ||
|  | #  Consequent code tidy | ||
|  | #  Adjusted data file names to take from the Unicode.tables directory | ||
|  | #  Adjusted global table names by prefixing _pcre_. | ||
|  | #  Commented out stuff relating to the casefolding table, which isn't used; | ||
|  | #    removed completely in 2012. | ||
|  | #  Corrected size calculation | ||
|  | #  Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed. | ||
|  | #  Update for PCRE2: name changes, and SUPPORT_UCP is abolished. | ||
|  | # | ||
|  | # Major modifications made to the original script: | ||
|  | #  Added code to add a grapheme break property field to records. | ||
|  | # | ||
|  | #  Added code to search for sets of more than two characters that must match | ||
|  | #  each other caselessly. A new table is output containing these sets, and | ||
|  | #  offsets into the table are added to the main output records. This new | ||
|  | #  code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer | ||
|  | #  used. | ||
|  | # | ||
|  | #  Update for Python3: | ||
|  | #    . Processed with 2to3, but that didn't fix everything | ||
|  | #    . Changed string.strip to str.strip | ||
|  | #    . Added encoding='utf-8' to the open() call | ||
|  | #    . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is | ||
|  | #        required and the result of the division is a float | ||
|  | # | ||
|  | #  Added code to scan the emoji-data.txt file to find the Extended Pictographic | ||
|  | #  property, which is used by PCRE2 as a grapheme breaking property. This was | ||
|  | #  done when updating to Unicode 11.0.0 (July 2018). | ||
|  | # | ||
|  | #  Added code to add a Script Extensions field to records. This has increased | ||
|  | #  their size from 8 to 12 bytes, only 10 of which are currently used. | ||
|  | # | ||
|  | #  Added code to add a bidi class field to records by scanning the | ||
|  | #  DerivedBidiClass.txt and PropList.txt files. This uses one of the two spare | ||
|  | #  bytes, so now 11 out of 12 are in use. | ||
|  | # | ||
|  | # 01-March-2010:     Updated list of scripts for Unicode 5.2.0 | ||
|  | # 30-April-2011:     Updated list of scripts for Unicode 6.0.0 | ||
|  | #     July-2012:     Updated list of scripts for Unicode 6.1.0 | ||
|  | # 20-August-2012:    Added scan of GraphemeBreakProperty.txt and added a new | ||
|  | #                      field in the record to hold the value. Luckily, the | ||
|  | #                      structure had a hole in it, so the resulting table is | ||
|  | #                      not much bigger than before. | ||
|  | # 18-September-2012: Added code for multiple caseless sets. This uses the | ||
|  | #                      final hole in the structure. | ||
|  | # 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0 | ||
|  | # 13-May-2014:       Updated for PCRE2 | ||
|  | # 03-June-2014:      Updated for Python 3 | ||
|  | # 20-June-2014:      Updated for Unicode 7.0.0 | ||
|  | # 12-August-2014:    Updated to put Unicode version into the file | ||
|  | # 19-June-2015:      Updated for Unicode 8.0.0 | ||
|  | # 02-July-2017:      Updated for Unicode 10.0.0 | ||
|  | # 03-July-2018:      Updated for Unicode 11.0.0 | ||
|  | # 07-July-2018:      Added code to scan emoji-data.txt for the Extended | ||
|  | #                      Pictographic property. | ||
|  | # 01-October-2018:   Added the 'Unknown' script name | ||
|  | # 03-October-2018:   Added new field for Script Extensions | ||
|  | # 27-July-2019:      Updated for Unicode 12.1.0 | ||
|  | # 10-March-2020:     Updated for Unicode 13.0.0 | ||
|  | # PCRE2-10.39:       Updated for Unicode 14.0.0 | ||
|  | # 05-December-2021:  Added code to scan DerivedBidiClass.txt for bidi class, | ||
|  | #                      and also PropList.txt for the Bidi_Control property | ||
|  | # 19-December-2021:  Reworked script extensions lists to be bit maps instead | ||
|  | #                      of zero-terminated lists of script numbers. | ||
|  | # ---------------------------------------------------------------------------- | ||
|  | # | ||
|  | # Changes to the refactored script: | ||
|  | # | ||
|  | # 26-December-2021:  Refactoring completed | ||
|  | # 10-January-2022:   Addition of general Boolean property support | ||
|  | # 12-January-2022:   Merge scriptx and bidiclass fields | ||
|  | # 14-January-2022:   Enlarge Boolean property offset to 12 bits | ||
|  | # | ||
|  | # ---------------------------------------------------------------------------- | ||
|  | # | ||
|  | # | ||
|  | # The main tables generated by this script are used by macros defined in | ||
|  | # pcre2_internal.h. They look up Unicode character properties using short | ||
|  | # sequences of code that contains no branches, which makes for greater speed. | ||
|  | # | ||
|  | # Conceptually, there is a table of records (of type ucd_record), one for each | ||
|  | # Unicode character. Each record contains the script number, script extension | ||
|  | # value, character type, grapheme break type, offset to caseless matching set, | ||
|  | # offset to the character's other case, the bidi class, and offset to bitmap of | ||
|  | # Boolean properties. | ||
|  | # | ||
|  | # A real table covering all Unicode characters would be far too big. It can be | ||
|  | # efficiently compressed by observing that many characters have the same | ||
|  | # record, and many blocks of characters (taking 128 characters in a block) have | ||
|  | # the same set of records as other blocks. This leads to a 2-stage lookup | ||
|  | # process. | ||
|  | # | ||
|  | # This script constructs seven tables. The ucd_caseless_sets table contains | ||
|  | # lists of characters that all match each other caselessly. Each list is | ||
|  | # in order, and is terminated by NOTACHAR (0xffffffff), which is larger than | ||
|  | # any valid character. The first list is empty; this is used for characters | ||
|  | # that are not part of any list. | ||
|  | # | ||
|  | # The ucd_digit_sets table contains the code points of the '9' characters in | ||
|  | # each set of 10 decimal digits in Unicode. This is used to ensure that digits | ||
|  | # in script runs all come from the same set. The first element in the vector | ||
|  | # contains the number of subsequent elements, which are in ascending order. | ||
|  | # | ||
|  | # Scripts are partitioned into two groups. Scripts that appear in at least one | ||
|  | # character's script extension list come first, followed by "Unknown" and then | ||
|  | # all the rest. This sorting is done automatically in the GenerateCommon.py | ||
|  | # script. A script's number is its index in the script_names list. | ||
|  | # | ||
|  | # The ucd_script_sets table contains bitmaps that represent lists of scripts | ||
|  | # for Script Extensions properties. Each bitmap consists of a fixed number of | ||
|  | # unsigned 32-bit numbers, enough to allocate a bit for every script that is | ||
|  | # used in any character's extension list, that is, enough for every script | ||
|  | # whose number is less than ucp_Unknown. A character's script extension value | ||
|  | # in its ucd record is an offset into the ucd_script_sets vector. The first | ||
|  | # bitmap has no bits set; characters that have no script extensions have zero | ||
|  | # as their script extensions value so that they use this map. | ||
|  | # | ||
|  | # The ucd_boolprop_sets table contains bitmaps that represent lists of Boolean | ||
|  | # properties. Each bitmap consists of a fixed number of unsigned 32-bit | ||
|  | # numbers, enough to allocate a bit for each supported Boolean property. | ||
|  | # | ||
|  | # The ucd_records table contains one instance of every unique character record | ||
|  | # that is required. The ucd_stage1 table is indexed by a character's block | ||
|  | # number, which is the character's code point divided by 128, since 128 is the | ||
# size of each block. The result of a lookup in ucd_stage1 is a "virtual" block
|  | # number. | ||
|  | # | ||
|  | # The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by | ||
|  | # the offset of a character within its own block, and the result is the index | ||
|  | # number of the required record in the ucd_records vector. | ||
|  | # | ||
|  | # The following examples are correct for the Unicode 14.0.0 database. Future | ||
# updates may change the actual lookup values.
|  | # | ||
|  | # Example: lowercase "a" (U+0061) is in block 0 | ||
|  | #          lookup 0 in stage1 table yields 0 | ||
|  | #          lookup 97 (0x61) in the first table in stage2 yields 35 | ||
|  | #          record 35 is { 0, 5, 12, 0, -32, 18432, 44 } | ||
|  | #             0 = ucp_Latin   => Latin script | ||
|  | #             5 = ucp_Ll      => Lower case letter | ||
|  | #            12 = ucp_gbOther => Grapheme break property "Other" | ||
|  | #             0               => Not part of a caseless set | ||
|  | #           -32 (-0x20)       => Other case is U+0041 | ||
|  | #         18432 = 0x4800      => Combined Bidi class + script extension values | ||
|  | #            44               => Offset to Boolean properties | ||
|  | # | ||
|  | # The top 5 bits of the sixth field are the Bidi class, with the rest being the | ||
|  | # script extension value, giving: | ||
|  | # | ||
|  | #             9 = ucp_bidiL   => Bidi class left-to-right | ||
|  | #             0               => No special script extension property | ||
|  | # | ||
|  | # Almost all lowercase latin characters resolve to the same record. One or two | ||
|  | # are different because they are part of a multi-character caseless set (for | ||
|  | # example, k, K and the Kelvin symbol are such a set). | ||
|  | # | ||
|  | # Example: hiragana letter A (U+3042) is in block 96 (0x60) | ||
|  | #          lookup 96 in stage1 table yields 93 | ||
|  | #          lookup 66 (0x42) in table 93 in stage2 yields 819 | ||
|  | #          record 819 is { 20, 7, 12, 0, 0, 18432, 82 } | ||
|  | #            20 = ucp_Hiragana => Hiragana script | ||
|  | #             7 = ucp_Lo       => Other letter | ||
|  | #            12 = ucp_gbOther  => Grapheme break property "Other" | ||
|  | #             0                => Not part of a caseless set | ||
|  | #             0                => No other case | ||
|  | #         18432 = 0x4800       => Combined Bidi class + script extension values | ||
|  | #            82                => Offset to Boolean properties | ||
|  | # | ||
|  | # The top 5 bits of the sixth field are the Bidi class, with the rest being the | ||
|  | # script extension value, giving: | ||
|  | # | ||
|  | #             9 = ucp_bidiL   => Bidi class left-to-right | ||
|  | #             0               => No special script extension property | ||
|  | # | ||
|  | # Example: vedic tone karshana (U+1CD0) is in block 57 (0x39) | ||
|  | #          lookup 57 in stage1 table yields 55 | ||
|  | #          lookup 80 (0x50) in table 55 in stage2 yields 621 | ||
|  | #          record 621 is { 84, 12, 3, 0, 0, 26762, 96 } | ||
|  | #            84 = ucp_Inherited => Script inherited from predecessor | ||
|  | #            12 = ucp_Mn        => Non-spacing mark | ||
|  | #             3 = ucp_gbExtend  => Grapheme break property "Extend" | ||
|  | #             0                 => Not part of a caseless set | ||
|  | #             0                 => No other case | ||
|  | #         26762 = 0x688A        => Combined Bidi class + script extension values | ||
|  | #            96                 => Offset to Boolean properties | ||
|  | # | ||
|  | # The top 5 bits of the sixth field are the Bidi class, with the rest being the | ||
|  | # script extension value, giving: | ||
|  | # | ||
|  | #            13 = ucp_bidiNSM   => Bidi class non-spacing mark | ||
|  | #           138                 => Script Extension list offset = 138 | ||
|  | # | ||
|  | # At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8, | ||
|  | # 18, and 47 set. This means that this character is expected to be used with | ||
|  | # any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha. | ||
|  | # | ||
|  | #  Philip Hazel, last updated 14 January 2022. | ||
|  | ############################################################################## | ||
|  | 
 | ||
|  | 
 | ||
|  | # Import standard modules | ||
|  | 
 | ||
|  | import re | ||
|  | import string | ||
|  | import sys | ||
|  | 
 | ||
|  | # Import common data lists and functions | ||
|  | 
 | ||
|  | from GenerateCommon import \ | ||
|  |   bidi_classes, \ | ||
|  |   bool_properties, \ | ||
|  |   bool_propsfiles, \ | ||
|  |   bool_props_list_item_size, \ | ||
|  |   break_properties, \ | ||
|  |   category_names, \ | ||
|  |   general_category_names, \ | ||
|  |   script_abbrevs, \ | ||
|  |   script_list_item_size, \ | ||
|  |   script_names, \ | ||
|  |   open_output | ||
|  | 
 | ||
|  | # Some general parameters | ||
|  | 
 | ||
|  | MAX_UNICODE = 0x110000 | ||
|  | NOTACHAR = 0xffffffff | ||
|  | 
 | ||
|  | 
 | ||
|  | # --------------------------------------------------------------------------- | ||
|  | #                         DEFINE FUNCTIONS | ||
|  | # --------------------------------------------------------------------------- | ||
|  | 
 | ||
|  | 
 | ||
|  | # Parse a line of Scripts.txt, GraphemeBreakProperty.txt, DerivedBidiClass.txt | ||
|  | # or DerivedGeneralCategory.txt | ||
|  | 
 | ||
def make_get_names(enum):
  # Build a parser for data-file lines: it maps the second semicolon-
  # separated field of a line to that name's index in 'enum'.
  def get_name(chardata):
    return enum.index(chardata[1])
  return get_name
|  | 
 | ||
|  | 
 | ||
|  | # Parse a line of CaseFolding.txt | ||
|  | 
 | ||
def get_other_case(chardata):
  # CaseFolding.txt lines with status C (common) or S (simple) describe a
  # one-to-one folding; return the signed offset from this character to
  # its other case. Every other status yields no mapping.
  status = chardata[1]
  if status in ('C', 'S'):
    here = int(chardata[0], 16)
    there = int(chardata[2], 16)
    return there - here
  return 0
|  | 
 | ||
|  | 
 | ||
|  | # Parse a line of ScriptExtensions.txt | ||
|  | 
 | ||
def get_script_extension(chardata):
  """Return the offset in the script_lists table for this line's Script
  Extensions value, appending a new list of script numbers when the value
  has not been seen on the immediately preceding line."""
  global last_script_extension

  # Offset that a newly appended list would occupy.
  offset = len(script_lists) * script_list_item_size

  # Consecutive lines frequently repeat the same extension string; in that
  # case reuse the entry appended for the previous line.
  if last_script_extension == chardata[1]:
    return offset - script_list_item_size

  last_script_extension = chardata[1]
  abbrevs = last_script_extension.split(' ')
  script_lists.append(tuple(script_abbrevs.index(a) for a in abbrevs))
  return offset
|  | 
 | ||
|  | 
 | ||
|  | # Read a whole table in memory, setting/checking the Unicode version | ||
|  | 
 | ||
def read_table(file_name, get_value, default_value):
  """Read a whole Unicode data file into a per-code-point table.

  file_name:     path such as 'Unicode.tables/Scripts.txt'
  get_value:     function mapping a semicolon-split, stripped line to a value
  default_value: value for code points the file does not mention

  Returns a list of MAX_UNICODE values. The Unicode version is extracted
  from the file's first line and stored in the global unicode_version (or
  checked against it if already set).
  """
  global unicode_version

  f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
  file_base = f.group(1)
  version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"
  file = open(file_name, 'r', encoding='utf-8')
  f = re.match(version_pat, file.readline())
  version = f.group(1)
  if unicode_version == "":
    unicode_version = version
  elif unicode_version != version:
    # Bug fix: the original passed file_name as a separate print() argument,
    # so the %s placeholder was never substituted.
    print("WARNING: Unicode version differs in %s" % file_name, file=sys.stderr)

  table = [default_value] * MAX_UNICODE
  for line in file:
    line = re.sub(r'#.*', '', line)
    chardata = list(map(str.strip, line.split(';')))
    if len(chardata) <= 1:
      continue
    value = get_value(chardata)
    # The first field is a single code point or a range xxxx..yyyy.
    m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
    char = int(m.group(1), 16)
    if m.group(3) is None:
      last = char
    else:
      last = int(m.group(3), 16)
    for i in range(char, last + 1):
      # It is important not to overwrite a previously set value because in the
      # CaseFolding file there are lines to be ignored (returning the default
      # value of 0) which often come after a line which has already set data.
      if table[i] == default_value:
        table[i] = value
  file.close()
  return table
|  | 
 | ||
|  | 
 | ||
|  | # Get the smallest possible C language type for the values in a table | ||
|  | 
 | ||
def get_type_size(table):
  """Return (C type name, size in bytes) of the narrowest C integer type
  that can hold every value in 'table'; unsigned types are tried first."""
  candidates = (
    ("uint8_t", 1, 0, 255),
    ("uint16_t", 2, 0, 65535),
    ("uint32_t", 4, 0, 4294967295),
    ("signed char", 1, -128, 127),
    ("int16_t", 2, -32768, 32767),
    ("int32_t", 4, -2147483648, 2147483647),
  )
  lo = min(table)
  hi = max(table)
  for name, size, minlimit, maxlimit in candidates:
    if minlimit <= lo and hi <= maxlimit:
      return (name, size)
  raise OverflowError("Too large to fit into C types")
|  | 
 | ||
|  | 
 | ||
|  | # Get the total size of a list of tables | ||
|  | 
 | ||
def get_tables_size(*tables):
  # Total bytes required when each table is stored using its narrowest
  # possible C element type.
  return sum(get_type_size(t)[1] * len(t) for t in tables)
|  | 
 | ||
|  | 
 | ||
|  | # Compress a table into the two stages | ||
|  | 
 | ||
def compress_table(table, block_size):
  """Compress a per-code-point table into a two-stage lookup.

  Returns (stage1, stage2). stage1[b] is the "virtual" block number of
  input block b (its data starts at stage2[stage1[b] * block_size]);
  identical blocks are stored in stage2 only once.
  """
  blocks = {} # Dictionary for finding identical blocks
  stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
  stage2 = [] # Stage 2 table contains the blocks with property values
  table = tuple(table)
  for i in range(0, len(table), block_size):
    block = table[i:i+block_size]
    start = blocks.get(block)
    if start is None:
      # Allocate a new block. Bug fix: use integer division; the original
      # used '/', which stores Python 3 floats in the stage 1 table.
      start = len(stage2) // block_size
      stage2 += block
      blocks[block] = start
    stage1.append(start)
  return stage1, stage2
|  | 
 | ||
|  | 
 | ||
|  | # Output a table | ||
|  | 
 | ||
def write_table(table, table_name, block_size = None):
  """Write 'table' to the output file (global f) as a C array.

  With block_size None the table is written 16 entries per line, each line
  annotated with the code point of its first entry (used for stage 1
  tables). Otherwise the table is written block by block, each block
  preceded by a comment giving its number (used for stage 2 tables).
  """
  type, size = get_type_size(table)
  ELEMS_PER_LINE = 16

  s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
  if block_size:
    s += ", block = %d" % block_size
  f.write(s + " */\n")
  table = tuple(table)
  if block_size is None:
    fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */\n"
    # Scale a stage 1 index back to its code point for the annotation.
    mult = MAX_UNICODE / len(table)
    for i in range(0, len(table), ELEMS_PER_LINE):
      f.write(fmt % (table[i:i+ELEMS_PER_LINE] + (int(i * mult),)))
  else:
    if block_size > ELEMS_PER_LINE:
      el = ELEMS_PER_LINE
    else:
      el = block_size
    fmt = "%3d," * el + "\n"
    if block_size > ELEMS_PER_LINE:
      fmt = fmt * int(block_size / ELEMS_PER_LINE)
    for i in range(0, len(table), block_size):
      # Bug fix: integer division for the block number; '/' yields a float
      # (output was the same via %d, but only by accident).
      f.write(("\n/* block %d */\n" + fmt) % ((i // block_size,) + table[i:i+block_size]))
  f.write("};\n\n")
|  | 
 | ||
|  | 
 | ||
|  | # Extract the unique combinations of properties into records | ||
|  | 
 | ||
def combine_tables(*tables):
  """Zip the per-character tables together and deduplicate the resulting
  property tuples. Returns (index, records): index[c] is the record number
  for character c, records maps each unique tuple to its number."""
  records = {}
  index = []
  for t in zip(*tables):
    if t not in records:
      records[t] = len(records)
    index.append(records[t])
  return index, records
|  | 
 | ||
|  | 
 | ||
|  | # Create a record struct | ||
|  | 
 | ||
def get_record_size_struct(records):
  """Compute the padded C size of a ucd_record and build the text of the
  struct definition that documents it (the caller embeds it in a comment,
  hence the trailing '*/')."""
  size = 0
  lines = ['typedef struct {']
  for i in range(len(records[0])):
    column = [record[i] for record in records]
    col_type, col_size = get_type_size(column)
    # Pad to the field's natural alignment before placing it.
    size = (size + col_size - 1) & -col_size
    size += col_size
    lines.append('%s property_%d;' % (col_type, i))

  # Round the total up to the alignment of the first field so consecutive
  # records in an array stay aligned.
  first_column = [record[0] for record in records]
  _, first_size = get_type_size(first_column)
  size = (size + first_size - 1) & -first_size

  lines.append('} ucd_record;\n*/')
  return size, '\n'.join(lines) + '\n'
|  | 
 | ||
|  | 
 | ||
|  | # Write records | ||
|  | 
 | ||
def write_records(records, record_size):
  """Write the ucd_records array to the output file (global f). 'records'
  maps each unique property tuple to its record number; entries are
  emitted in record-number order."""
  f.write('const ucd_record PRIV(ucd_records)[] = { ' + \
    '/* %d bytes, record size %d */\n' % (len(records) * record_size, record_size))
  ordered = sorted(records.items(), key=lambda item: item[1])
  for i, (props, _) in enumerate(ordered):
    f.write(('  {' + '%6d, ' * len(props) + '}, /* %3d */\n') % (props + (i,)))
  f.write('};\n\n')
|  | 
 | ||
|  | 
 | ||
|  | # Write a bit set | ||
|  | 
 | ||
def write_bitsets(list, item_size):
  """Write each member of 'list' (a list of bit numbers) to the output
  file (global f) as a bitmap of item_size unsigned 32-bit words, then
  close the enclosing C array."""
  for bits in list:
    words = [0] * item_size
    for bit in bits:
      words[bit // 32] |= 1 << (bit & 31)
    row = ", ".join("0x%08xu" % w for w in words)
    f.write(" " + row + ",\n")
  f.write("};\n\n")
|  | 
 | ||
|  | 
 | ||
|  | # --------------------------------------------------------------------------- | ||
|  | # This bit of code must have been useful when the original script was being | ||
|  | # developed. Retain it just in case it is ever needed again. | ||
|  | 
 | ||
|  | # def test_record_size(): | ||
|  | #   tests = [ \ | ||
|  | #     ( [(3,), (6,), (6,), (1,)], 1 ), \ | ||
|  | #     ( [(300,), (600,), (600,), (100,)], 2 ), \ | ||
|  | #     ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \ | ||
|  | #     ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \ | ||
|  | #     ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \ | ||
|  | #     ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \ | ||
|  | #     ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \ | ||
|  | #     ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \ | ||
|  | #   ] | ||
|  | #   for test in tests: | ||
|  | #     size, struct = get_record_size_struct(test[0]) | ||
|  | #     assert(size == test[1]) | ||
|  | # test_record_size() | ||
|  | # --------------------------------------------------------------------------- | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | # --------------------------------------------------------------------------- | ||
|  | #                       MAIN CODE FOR CREATING TABLES | ||
|  | # --------------------------------------------------------------------------- | ||
|  | 
 | ||
# Filled in by the first read_table() call and checked against every
# subsequent data file.
unicode_version = ""

# Some of the tables imported from GenerateCommon.py have alternate comment
# strings for use by GenerateUcpHeader. The comments are not wanted here, so
# remove them.

bidi_classes = bidi_classes[::2]
break_properties = break_properties[::2]
category_names = category_names[::2]

# Create the various tables from Unicode data files

# Each table has one entry per Unicode code point; the third argument is the
# default value used for code points the file does not mention.
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_properties), break_properties.index('Other'))
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', make_get_names(bidi_classes), bidi_classes.index('L'))
|  | 
 | ||
|  | # The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now | ||
|  | # we need to find the Extended_Pictographic property for emoji characters. This | ||
|  | # can be set as an additional grapheme break property, because the default for | ||
|  | # all the emojis is "other". We scan the emoji-data.txt file and modify the | ||
|  | # break-props table. | ||
|  | 
 | ||
# Scan emoji-data.txt and set the Extended_Pictographic grapheme break
# property on the listed characters, warning if a character already has a
# non-default break property.
file = open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8')
for line in file:
  line = re.sub(r'#.*', '', line)
  chardata = list(map(str.strip, line.split(';')))
  if len(chardata) <= 1:
    continue
  if chardata[1] != "Extended_Pictographic":
    continue
  # The first field is a single code point or a range xxxx..yyyy.
  m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
  char = int(m.group(1), 16)
  if m.group(3) is None:
    last = char
  else:
    last = int(m.group(3), 16)
  for i in range(char, last + 1):
    if break_props[i] != break_properties.index('Other'):
      # Bug fix: the original passed the values as extra print() arguments,
      # so the %x/%s placeholders were never substituted.
      print("WARNING: Emoji 0x%x has break property %s, not 'Other'" %
        (i, break_properties[break_props[i]]), file=sys.stderr)
    break_props[i] = break_properties.index('Extended_Pictographic')
file.close()
|  | 
 | ||
# Handle script extensions. The get_script_extension() function maintains a
|  | # list of unique bitmaps representing lists of scripts, returning the offset | ||
|  | # in that list. Initialize the list with an empty set, which is used for | ||
|  | # characters that have no script extensions. | ||
|  | 
 | ||
# First entry is the empty list, used by characters with no extensions.
script_lists = [[]]
last_script_extension = ""
scriptx_bidi_class = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, 0)

# Merge the bidi class into the same field: the top 5 bits hold the bidi
# class, the low 11 bits the script extension offset.
for idx in range(len(scriptx_bidi_class)):
  scriptx_bidi_class[idx] = scriptx_bidi_class[idx] | (bidi_class[idx] << 11)
bidi_class = None  # No longer needed; release the per-code-point list
|  | 
 | ||
|  | # Find the Boolean properties of each character. This next bit of magic creates | ||
|  | # a list of empty lists. Using [[]] * MAX_UNICODE gives a list of references to | ||
|  | # the *same* list, which is not what we want. | ||
|  | 
 | ||
# One list of Boolean property numbers per code point.
bprops = [[] for _ in range(MAX_UNICODE)]

# Collect the properties from the various files

for filename in bool_propsfiles:
  try:
    # NOTE(review): opened without encoding='utf-8', unlike read_table() --
    # confirm these files are pure ASCII.
    file = open('Unicode.tables/' + filename, 'r')
  except IOError:
    print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
    sys.exit(1)

  for line in file:
    line = re.sub(r'#.*', '', line)
    data = list(map(str.strip, line.split(';')))
    if len(data) <= 1:
      continue

    # Skip properties that PCRE2 does not support.
    try:
      ix = bool_properties.index(data[1])
    except ValueError:
      continue

    # The first field is a single code point or a range xxxx..yyyy.
    m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', data[0])
    char = int(m.group(1), 16)
    if m.group(3) is None:
      last = char
    else:
      last = int(m.group(3), 16)

    for i in range(char, last + 1):
      bprops[i].append(ix)

  file.close()

# The ASCII property isn't listed in any files, but it is easy enough to add
# it manually.

ix = bool_properties.index("ASCII")
for i in range(128):
  bprops[i].append(ix)
|  | 
 | ||
|  | # The Bidi_Mirrored property isn't listed in any property files. We have to | ||
|  | # deduce it from the file that lists the mirrored characters. | ||
|  | 
 | ||
ix = bool_properties.index("Bidi_Mirrored")

try:
  # NOTE(review): opened without encoding='utf-8', unlike read_table() --
  # confirm the file is pure ASCII.
  file = open('Unicode.tables/BidiMirroring.txt', 'r')
except IOError:
  print(f"** Couldn't open {'Unicode.tables/BidiMirroring.txt'}\n")
  sys.exit(1)

for line in file:
  line = re.sub(r'#.*', '', line)
  data = list(map(str.strip, line.split(';')))
  if len(data) <= 1:
    continue
  # Only the first field (the mirrored character itself) is needed; its
  # presence in the file marks it as Bidi_Mirrored.
  c = int(data[0], 16)
  bprops[c].append(ix)

file.close()
|  | 
 | ||
# Scan each character's boolean property list and create a list of unique
|  | # lists, at the same time, setting the index in that list for each property in | ||
|  | # the bool_props vector. | ||
|  | 
 | ||
bool_props = [0] * MAX_UNICODE   # Per-character offset into the bitmap table
bool_props_lists = [[]]          # Unique property lists; first entry is empty

for c in range(MAX_UNICODE):
  s = set(bprops[c])
  # Linear search for an existing list with the same membership. The
  # for/else idiom is load-bearing: on a miss the else clause appends a
  # new list and bumps i (left over from the loop) to index it.
  for i in range(len(bool_props_lists)):
    if s == set(bool_props_lists[i]):
      break;
  else:
    bool_props_lists.append(bprops[c])
    i += 1

  bool_props[c] = i * bool_props_list_item_size
|  | 
 | ||
# This block of code was added by PH in September 2012. It scans the other_case
# table to find sets of more than two characters that must all match each other
# caselessly. Later in this script a table of these sets is written out.
# However, we have to do this work here in order to compute the offsets in the
# table that are inserted into the main table.

# The CaseFolding.txt file lists pairs, but the common logic for reading data
# sets only one value, so first we go through the table and set "return"
# offsets for those that are not already set.

for c in range(MAX_UNICODE):
  oc = other_case[c]
  if oc != 0 and other_case[c + oc] == 0:
    other_case[c + oc] = -oc
 | ||
# Now scan again and create equivalence sets.

caseless_sets = []

for c in range(MAX_UNICODE):
  o = c + other_case[c]

  # Trigger when this character's other case does not point back here. We
  # now have three characters that are case-equivalent.

  if other_case[o] != -other_case[c]:
    t = o + other_case[o]

    # Scan the existing sets to see if any of the three characters are already
    # part of a set. If so, add the missing ones to that set.

    appended = 0
    for s in caseless_sets:
      if c in s or o in s or t in s:
        # NOTE(review): the previous code used a single "found" flag that was
        # never reset between members, so once one of (c, o, t) was present in
        # the set the remaining missing members were silently skipped. Test
        # each member independently so the set really becomes the union.
        for y in (c, o, t):
          if y not in s:
            s.append(y)
        appended = 1

    # If we have not added to an existing set, create a new one.

    if not appended:
      caseless_sets.append([c, o, t])

# End of loop looking for caseless sets.

# Now scan the sets and set appropriate offsets for the characters. Offset 0
# means "no caseless set", so real offsets start at 1; each set also reserves
# one extra slot for its NOTACHAR terminator in the output table.

caseless_offsets = [0] * MAX_UNICODE

offset = 1
for s in caseless_sets:
  for x in s:
    caseless_offsets[x] = offset
  offset += len(s) + 1

# End of block of code for creating offsets for caseless matching sets.
|  | 
 | ||
|  | 
 | ||
# Combine all the per-character tables into one table of records.

table, records = combine_tables(script, category, break_props,
  caseless_offsets, other_case, scriptx_bidi_class, bool_props)

# Find the record size and create a string definition of the structure for
# outputting as a comment.

record_size, record_struct = get_record_size_struct(list(records.keys()))

# Find the optimum block size for the two-stage table by trying each power of
# two from 32 to 512 and keeping whichever minimizes the total byte count.

min_size = sys.maxsize
for candidate in (2 ** power for power in range(5, 10)):
  first, second = compress_table(table, candidate)
  total = len(records) * record_size + get_tables_size(first, second)
  if total < min_size:
    min_size = total
    min_stage1, min_stage2 = first, second
    min_block_size = candidate
 | ||
|  | 
 | ||
# ---------------------------------------------------------------------------
#                   MAIN CODE FOR WRITING THE OUTPUT FILE
# ---------------------------------------------------------------------------

# Open the output file (no return on failure). This call also writes standard
# header boilerplate.

f = open_output("pcre2_ucd.c")

# Output this file's heading text

# The triple-quoted strings below are emitted verbatim as C source; do not
# edit them without checking the effect on the generated pcre2_ucd.c.

f.write("""\
/* This file contains tables of Unicode properties that are extracted from
Unicode data files. See the comments at the start of maint/GenerateUcd.py for
details.

As well as being part of the PCRE2 library, this file is #included by the
pcre2test program, which redefines the PRIV macro to change table names from
_pcre2_xxx to xxxx, thereby avoiding name clashes with the library. At present,
just one of these tables is actually needed. When compiling the library, some
headers are needed. */

#ifndef PCRE2_PCRE2TEST
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "pcre2_internal.h"
#endif /* PCRE2_PCRE2TEST */

/* The tables herein are needed only when UCP support is built, and in PCRE2
that happens automatically with UTF support. This module should not be
referenced otherwise, so it should not matter whether it is compiled or not.
However a comment was received about space saving - maybe the guy linked all
the modules rather than using a library - so we include a condition to cut out
the tables when not needed. But don't leave a totally empty module because some
compilers barf at that. Instead, just supply some small dummy tables. */

#ifndef SUPPORT_UNICODE
const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0}};
const uint16_t PRIV(ucd_stage1)[] = {0};
const uint16_t PRIV(ucd_stage2)[] = {0};
const uint32_t PRIV(ucd_caseless_sets)[] = {0};
#else
\n""")

# --- Output some variable heading stuff ---

# min_size/min_block_size come from the block-size search above; the Unicode
# version string is read from the data files earlier in the script.
f.write("/* Total size: %d bytes, block size: %d. */\n\n" % (min_size, min_block_size))
f.write('const char *PRIV(unicode_version) = "{}";\n\n'.format(unicode_version))
|  | 
 | ||
# Emit a reminder comment, then the generated C struct definition built by
# get_record_size_struct() above (record_struct is expected to close the
# comment opened here).

f.write("""\
/* When recompiling tables with a new Unicode version, please check the types
in this structure definition with those in pcre2_internal.h (the actual field
names will be different).
\n""")

f.write(record_struct)

# Dummy record used by the 32-bit library for out-of-range code points.
# NOTE: this string intentionally has no backslash after the opening quotes,
# so it begins with a newline.

f.write("""
/* If the 32-bit library is run in non-32-bit mode, character values greater
than 0x10ffff may be encountered. For these we set up a special record. */

#if PCRE2_CODE_UNIT_WIDTH == 32
const ucd_record PRIV(dummy_ucd_record)[] = {{
  ucp_Unknown,    /* script */
  ucp_Cn,         /* type unassigned */
  ucp_gbOther,    /* grapheme break property */
  0,              /* case set */
  0,              /* other case */
  0 | (ucp_bidiL << UCD_BIDICLASS_SHIFT), /* script extension and bidi class */
  0,              /* bool properties offset */
  }};
#endif
\n""")
 | ||
|  | 
 | ||
# --- Output the table of caseless character sets ---

f.write("""\
/* This table contains lists of characters that are caseless sets of
more than one character. Each list is terminated by NOTACHAR. */

const uint32_t PRIV(ucd_caseless_sets)[] = {
  NOTACHAR,
""")

# Each set is written in ascending code-point order followed by a NOTACHAR
# terminator; the leading NOTACHAR above lets offset 0 mean "no set".
for cset in caseless_sets:
  for ch in sorted(cset):
    f.write('  0x%04x,' % ch)
  f.write('  NOTACHAR,\n')
f.write('};\n\n')
|  | 
 | ||
# --- Other tables are not needed by pcre2test ---

# Everything from here to the matching #endif is compiled out when this file
# is #included by pcre2test.
# NOTE(review): the emitted comment below contains a duplicated "the"
# ("nor the / the large") — cosmetic typo in the generated C file.

f.write("""\
/* When #included in pcre2test, we don't need the table of digit sets, nor the
the large main UCD tables. */

#ifndef PCRE2_PCRE2TEST
\n""")
|  | 
 | ||
# --- Read Scripts.txt again for the sets of 10 digits. ---
# Each "Nd" range in Scripts.txt is expected to be one or more complete runs
# of 10 decimal digits; we collect the code point of the '9' in each run.

digitsets = []
file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8')

for line in file:
  m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
  if m is None:
    continue
  first = int(m.group(1),16)
  last  = int(m.group(2),16)
  if ((last - first + 1) % 10) != 0:
    # BUG FIX: this used f.write(..., file=sys.stderr), which raises
    # TypeError (file objects' write() takes no 'file' keyword) and would
    # have targeted the generated C file anyway. Report on stderr instead.
    print("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
      file=sys.stderr)
  while first < last:
    digitsets.append(first + 9)   # code point of the '9' in this run of 10
    first += 10
file.close()
digitsets.sort()
|  | 
 | ||
# Write the digit-set table: a count followed by the collected '9' code
# points, eight values per line.

f.write("""\
/* This table lists the code points for the '9' characters in each set of
decimal digits. It is used to ensure that all the digits in a script run come
from the same set. */

const uint32_t PRIV(ucd_digit_sets)[] = {
""")

f.write("  %d,  /* Number of subsequent values */" % len(digitsets))
for index, nine in enumerate(digitsets):
  if index % 8 == 0:          # start a fresh line every eight values
    f.write("\n ")
  f.write(" 0x%05x," % nine)
f.write("\n};\n\n")
|  | 
 | ||
# Write the Script Extension bitsets, then the Boolean property bitsets.
# write_bitsets() (defined earlier in this script) emits the array body and
# the closing brace.
# NOTE(review): "32_bit" in the second emitted comment is a typo in the
# generated output; it is runtime text, so it is left untouched here.

f.write("""\
/* This vector is a list of script bitsets for the Script Extension property.
The number of 32-bit words in each bitset is #defined in pcre2_ucp.h as
ucd_script_sets_item_size. */

const uint32_t PRIV(ucd_script_sets)[] = {
""")
write_bitsets(script_lists, script_list_item_size)

f.write("""\
/* This vector is a list of bitsets for Boolean properties. The number of
32_bit words in each bitset is #defined as ucd_boolprop_sets_item_size in
pcre2_ucp.h. */

const uint32_t PRIV(ucd_boolprop_sets)[] = {
""")
write_bitsets(bool_props_lists, bool_props_list_item_size)
|  | 
 | ||
|  | 
 | ||
# Output the main UCD tables.

f.write("""\
/* These are the main two-stage UCD tables. The fields in each record are:
script (8 bits), character type (8 bits), grapheme break property (8 bits),
offset to multichar other cases or zero (8 bits), offset to other case or zero
(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed
into a 16-bit field, and offset in binary properties table (16 bits). */
\n""")

# write_records/write_table are defined earlier in this script; min_stage1,
# min_stage2 and min_block_size come from the block-size search above.
write_records(records, record_size)
write_table(min_stage1, 'PRIV(ucd_stage1)')
write_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)

# Emit a compile-time guard so the C sources fail to build if the block size
# chosen here ever disagrees with UCD_BLOCK_SIZE in pcre2_internal.h.
f.write("#if UCD_BLOCK_SIZE != %d\n" % min_block_size)
f.write("""\
#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h
#endif
#endif  /* SUPPORT_UNICODE */

#endif  /* PCRE2_PCRE2TEST */

/* End of pcre2_ucd.c */
""")
 | ||
|  | 
 | ||
# BUG FIX: this was "f.close" without parentheses — a bare attribute
# reference that never ran, leaving the output file to be flushed/closed
# only by interpreter shutdown. Call it.
f.close()

# End