356 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			356 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
|  | #! /usr/bin/python | ||
|  | 
 | ||
|  | #                   PCRE2 UNICODE PROPERTY SUPPORT | ||
|  | #                   ------------------------------ | ||
|  | 
 | ||
|  | # This file is a Python module containing common lists and functions for the | ||
|  | # GenerateXXX scripts that create various .c and .h files from Unicode data | ||
|  | # files. It was created as part of a re-organization of these scripts in | ||
|  | # December 2021. | ||
|  | 
 | ||
|  | 
 | ||
|  | import re | ||
|  | 
 | ||
|  | 
 | ||
|  | # --------------------------------------------------------------------------- | ||
|  | #                             DATA LISTS | ||
|  | # --------------------------------------------------------------------------- | ||
|  | 
 | ||
# BIDI classes in the DerivedBidiClass.txt file, with comments. This is a flat
# list of pairs: each class abbreviation is immediately followed by the string
# that describes it, so the abbreviations are at the even indexes.

bidi_classes = [
  'AL',  'Arabic letter',
  'AN',  'Arabic number',
  'B',   'Paragraph separator',
  'BN',  'Boundary neutral',
  'CS',  'Common separator',
  'EN',  'European number',
  'ES',  'European separator',
  'ET',  'European terminator',
  'FSI', 'First strong isolate',
  'L',   'Left to right',
  'LRE', 'Left to right embedding',
  'LRI', 'Left to right isolate',
  'LRO', 'Left to right override',
  'NSM', 'Non-spacing mark',
  'ON',  'Other neutral',
  'PDF', 'Pop directional format',
  'PDI', 'Pop directional isolate',
  'R',   'Right to left',
  'RLE', 'Right to left embedding',
  'RLI', 'Right to left isolate',
  'RLO', 'Right to left override',
  'S',   'Segment separator',
  'WS',  'White space'
  ]
|  | 
 | ||
# Particular category property names, with comments. NOTE: If ever this list
# is changed, the table called "catposstab" in the pcre2_auto_possess.c file
# must be edited to keep in step.
#
# The list is flat: each two-letter name is immediately followed by its
# descriptive comment. The general category letters further down this file
# are derived from it by stepping through the even indexes.

category_names = [
  'Cc', 'Control',
  'Cf', 'Format',
  'Cn', 'Unassigned',
  'Co', 'Private use',
  'Cs', 'Surrogate',
  'Ll', 'Lower case letter',
  'Lm', 'Modifier letter',
  'Lo', 'Other letter',
  'Lt', 'Title case letter',
  'Lu', 'Upper case letter',
  'Mc', 'Spacing mark',
  'Me', 'Enclosing mark',
  'Mn', 'Non-spacing mark',
  'Nd', 'Decimal number',
  'Nl', 'Letter number',
  'No', 'Other number',
  'Pc', 'Connector punctuation',
  'Pd', 'Dash punctuation',
  'Pe', 'Close punctuation',
  'Pf', 'Final punctuation',
  'Pi', 'Initial punctuation',
  'Po', 'Other punctuation',
  'Ps', 'Open punctuation',
  'Sc', 'Currency symbol',
  'Sk', 'Modifier symbol',
  'Sm', 'Mathematical symbol',
  'So', 'Other symbol',
  'Zl', 'Line separator',
  'Zp', 'Paragraph separator',
  'Zs', 'Space separator'
  ]
|  | 
 | ||
# The Extended_Pictographic property is not found in the file where all the
# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
# file, but we list it here so that the name has the correct index value.
#
# The list is flat: each property name is immediately followed by a string
# holding its index value (space-padded to two characters) and, for the
# Hangul syllable types, a short description.

break_properties = [
  'CR',                    ' 0',
  'LF',                    ' 1',
  'Control',               ' 2',
  'Extend',                ' 3',
  'Prepend',               ' 4',
  'SpacingMark',           ' 5',
  'L',                     ' 6 Hangul syllable type L',
  'V',                     ' 7 Hangul syllable type V',
  'T',                     ' 8 Hangul syllable type T',
  'LV',                    ' 9 Hangul syllable type LV',
  'LVT',                   '10 Hangul syllable type LVT',
  'Regional_Indicator',    '11',
  'Other',                 '12',
  'ZWJ',                   '13',
  'Extended_Pictographic', '14'
  ]
|  | 
 | ||
# List of files from which the names of Boolean properties are obtained, along
# with a list of regex patterns for properties to be ignored, and a list of
# extra property names to add. The files are looked for in the Unicode.tables
# directory. The ignore patterns are applied with re.match(), so they are
# anchored at the start of the property name. The extra names are appended
# as they are, without being checked against the ignore patterns.

bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
bool_propsignore = [r'^Other_', r'^Hyphen$']
bool_propsextras = ['ASCII', 'Bidi_Mirrored']
|  | 
 | ||
|  | 
 | ||
|  | # --------------------------------------------------------------------------- | ||
|  | #                   GET BOOLEAN PROPERTY NAMES | ||
|  | # --------------------------------------------------------------------------- | ||
|  | 
 | ||
|  | # Get a list of Boolean property names from a number of files. | ||
|  | 
 | ||
def getbpropslist():
  """Return the sorted list of Boolean property names.

  Each file named in bool_propsfiles is read from the Unicode.tables
  directory. Comments (everything from '#' onwards) are removed, the line is
  split at semicolons, and the second field is taken as the property name.
  A name equal to the one seen on the previous data line is skipped, as is
  any name matching one of the bool_propsignore patterns. The names in
  bool_propsextras are then added and the whole list is sorted.

  If a file cannot be opened, a message is printed and the program exits
  with status 1.
  """
  # This function is called while the module is still being loaded, before
  # the file-level "import sys" further down has been executed, so the name
  # must be brought into scope here for the error exit to work.
  import sys

  # Compile the ignore patterns once instead of on every data line.
  ignore_res = [re.compile(pat) for pat in bool_propsignore]

  bplist = []
  bplast = ""

  for filename in bool_propsfiles:
    path = 'Unicode.tables/' + filename
    try:
      # The Unicode data files are UTF-8 (emoji-data.txt has non-ASCII
      # characters in its comments), so do not rely on the locale default.
      file = open(path, 'r', encoding='utf-8')
    except IOError:
      print(f"** Couldn't open {path}\n")
      sys.exit(1)

    # The context manager closes the file even if reading raises.
    with file:
      for line in file:
        line = re.sub(r'#.*', '', line)
        data = [field.strip() for field in line.split(';')]
        if len(data) <= 1 or data[1] == bplast:
          continue
        # Remember the name even when it is ignored, so that the rest of a
        # run of lines for an ignored property is skipped cheaply.
        bplast = data[1]
        if not any(rx.match(bplast) for rx in ignore_res):
          bplist.append(bplast)

  bplist.extend(bool_propsextras)
  bplist.sort()
  return bplist
|  | 
 | ||
# Build the sorted list of Boolean property names once, when this module is
# loaded.
bool_properties = getbpropslist()
# The number of properties divided by 32, rounded up: the number of 32-item
# groups needed to cover every Boolean property.
bool_props_list_item_size = (len(bool_properties) + 31) // 32
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | # --------------------------------------------------------------------------- | ||
|  | #                  COLLECTING PROPERTY NAMES AND ALIASES | ||
|  | # --------------------------------------------------------------------------- | ||
|  | 
 | ||
# Full script names. The list is seeded with 'Unknown'; the names found in
# Scripts.txt are appended by collect_property_names().
script_names = ['Unknown']
# Maps a full script name or Boolean property name to a tuple of its
# abbreviations (the tuple may be empty). Filled by collect_property_names().
abbreviations = {}
|  | 
 | ||
def collect_property_names():
  """Fill script_names and abbreviations from the Unicode data files.

  Three files in the Unicode.tables directory are read:

  Scripts.txt: the script name on each data line is appended to
    script_names, unless it is the same as the name on the previous
    matching line.

  PropertyValueAliases.txt: for each "sc" (script) line, the long name
    (third field) becomes a key in abbreviations. Its value is an empty
    tuple when the short and long names are identical, otherwise a tuple
    of the short name and, when a fourth field is present, that field too.

  PropertyAliases.txt: for each line whose long name (second field) is in
    bool_properties, the long name becomes a key in abbreviations whose
    value is a tuple of the short name and, when present, the third field.

  The module-level list and dictionary are only modified in place, so no
  "global" declarations are needed.
  """
  names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')

  # The Unicode data files are UTF-8; do not depend on the locale default.
  last_script_name = ""
  with open("Unicode.tables/Scripts.txt", encoding="utf-8") as f:
    for line in f:
      match_obj = names_re.match(line)
      if match_obj is None or match_obj.group(1) == last_script_name:
        continue
      last_script_name = match_obj.group(1)
      script_names.append(last_script_name)

  # Sometimes there is a comment in the line, so splitting around semicolons
  # is not enough.
  value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')

  with open("Unicode.tables/PropertyValueAliases.txt", encoding="utf-8") as f:
    for line in f:
      match_obj = value_alias_re.match(line)
      if match_obj is None:
        continue

      prop, short_name, long_name, extra = match_obj.groups()
      if prop != "sc":
        continue

      if short_name == long_name:
        abbreviations[long_name] = ()
      elif extra is None:
        abbreviations[long_name] = (short_name,)
      else:
        abbreviations[long_name] = (short_name, extra)

  # We can also collect Boolean property abbreviations into the same
  # dictionary. A set makes the membership test on each line constant-time.
  bool_property_set = set(bool_properties)

  bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')
  with open("Unicode.tables/PropertyAliases.txt", encoding="utf-8") as f:
    for line in f:
      match_obj = bin_alias_re.match(line)
      if match_obj is None:
        continue

      short_name, long_name, extra = match_obj.groups()
      if long_name not in bool_property_set:
        continue

      if extra is None:
        abbreviations[long_name] = (short_name,)
      else:
        abbreviations[long_name] = (short_name, extra)
|  | 
 | ||
# Run when this module is loaded, so that script_names and abbreviations are
# populated before reorder_scripts() below reads them.
collect_property_names()
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | # --------------------------------------------------------------------------- | ||
|  | #                      REORDERING SCRIPT NAMES | ||
|  | # --------------------------------------------------------------------------- | ||
|  | 
 | ||
# One abbreviation per entry of script_names, in the same order. Filled and
# then replaced by reorder_scripts().
script_abbrevs = []
|  | 
 | ||
def reorder_scripts():
  """Move the scripts listed in ScriptExtensions.txt to the front.

  First, script_abbrevs receives one entry per script name: the first
  abbreviation recorded for it in abbreviations, or the name itself when it
  has none. Then every script abbreviation that appears on a data line of
  Unicode.tables/ScriptExtensions.txt is collected. Finally script_names and
  script_abbrevs are both replaced by reordered lists in which the scripts
  whose abbreviation was collected come first and all the others follow,
  the relative order inside each group being unchanged.
  """
  global script_names, script_abbrevs

  for full_name in script_names:
    aliases = abbreviations[full_name]
    script_abbrevs.append(aliases[0] if len(aliases) != 0 else full_name)

  # A data line may name several scripts, separated by single spaces.
  in_extensions = set()
  with open("Unicode.tables/ScriptExtensions.txt") as f:
    line_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')
    for line in f:
      found = line_re.match(line)
      if found is not None:
        in_extensions.update(found.group(1).split(" "))

  # Stable partition by position: indexes of the scripts that are in the
  # extensions file first, then the indexes of the rest.
  front = [pos for pos, abbrev in enumerate(script_abbrevs) if abbrev in in_extensions]
  back = [pos for pos, abbrev in enumerate(script_abbrevs) if abbrev not in in_extensions]
  order = front + back

  reordered_names = [script_names[pos] for pos in order]
  reordered_abbrevs = [script_abbrevs[pos] for pos in order]

  script_names = reordered_names
  script_abbrevs = reordered_abbrevs
|  | 
 | ||
# Reorder the script lists when this module is loaded.
reorder_scripts()
# The position of 'Unknown' in the reordered list, divided by 32 and rounded
# up. NOTE(review): presumably 'Unknown' is not named in ScriptExtensions.txt,
# in which case its position equals the number of scripts that are - confirm.
script_list_item_size = (script_names.index('Unknown') + 31) // 32
|  | 
 | ||
|  | 
 | ||
|  | # --------------------------------------------------------------------------- | ||
|  | #                         DERIVED LISTS | ||
|  | # --------------------------------------------------------------------------- | ||
|  | 
 | ||
|  | # Create general character property names from the first letters of the | ||
|  | # particular categories. | ||
|  | 
 | ||
# The names are at the even indexes of category_names; keep the distinct
# first letters, then list them in sorted order.
gcn_set = {category_names[pos][0] for pos in range(0, len(category_names), 2)}
general_category_names = sorted(gcn_set)
|  | 
 | ||
|  | 
 | ||
|  | # --------------------------------------------------------------------------- | ||
|  | #                           FUNCTIONS | ||
|  | # --------------------------------------------------------------------------- | ||
|  | 
 | ||
|  | import sys | ||
|  | 
 | ||
|  | # Open an output file, using the command's argument or a default. Write common | ||
|  | # preliminary header information. | ||
|  | 
 | ||
def open_output(default):
  """Open the output file and write the common header comment to it.

  The file name is the command's single optional argument (sys.argv[1]); if
  no argument was given, `default` is used instead. The file is opened for
  writing and the standard PCRE2 header is written to it, including a line
  that names the running script (the final path component of sys.argv[0])
  as the one to modify.

  Returns the open file object, ready for the caller to write the rest of
  the generated code.

  If more than one argument was given, or the file cannot be opened, a
  message is printed and the program exits with status 1.
  """
  # Imported here so that this function does not rely on anything beyond
  # the "import sys" that precedes it in the file.
  import os.path

  if len(sys.argv) > 2:
    print('** Too many arguments: just give a file name')
    sys.exit(1)

  output_name = sys.argv[1] if len(sys.argv) == 2 else default

  try:
    file = open(output_name, "w")
  except IOError:
    print("** Couldn't open %s" % output_name)
    sys.exit(1)

  # Only the file name of the script is wanted, however it was invoked.
  # os.path.basename() copes with the platform's own path separators, not
  # only with '/'.
  script_name = os.path.basename(sys.argv[0])

  file.write("""\
/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2022 University of Cambridge

This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
""")

  file.write("Instead, modify the maint/%s script and run it to generate\n"
  "a new version of this code.\n\n" % script_name)

  file.write("""\
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
\n""")
  return file
|  | 
 | ||
|  | # End of UcpCommon.py |