forked from LeenkxTeam/LNXSDK
		
	
		
			
				
	
	
		
			356 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			356 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #! /usr/bin/python
 | |
| 
 | |
| #                   PCRE2 UNICODE PROPERTY SUPPORT
 | |
| #                   ------------------------------
 | |
| 
 | |
# This file is a Python module containing common lists and functions for the
# GenerateXXX scripts that create various .c and .h files from Unicode data
# files. It was created as part of a re-organization of these scripts in
# December 2021.
 | |
| 
 | |
| 
 | |
import os
import re
import sys
 | |
| 
 | |
| 
 | |
| # ---------------------------------------------------------------------------
 | |
| #                             DATA LISTS
 | |
| # ---------------------------------------------------------------------------
 | |
| 
 | |
# BIDI classes in the DerivedBidiClass.txt file, with comments. The list is
# flat: each class abbreviation is immediately followed by its description, so
# the abbreviations are at the even indexes.

bidi_classes = [
  'AL',  'Arabic letter',
  'AN',  'Arabic number',
  'B',   'Paragraph separator',
  'BN',  'Boundary neutral',
  'CS',  'Common separator',
  'EN',  'European number',
  'ES',  'European separator',
  'ET',  'European terminator',
  'FSI', 'First strong isolate',
  'L',   'Left to right',
  'LRE', 'Left to right embedding',
  'LRI', 'Left to right isolate',
  'LRO', 'Left to right override',
  'NSM', 'Non-spacing mark',
  'ON',  'Other neutral',
  'PDF', 'Pop directional format',
  'PDI', 'Pop directional isolate',
  'R',   'Right to left',
  'RLE', 'Right to left embedding',
  'RLI', 'Right to left isolate',
  'RLO', 'Right to left override',
  'S',   'Segment separator',
  'WS',  'White space'
  ]
 | |
| 
 | |
# Particular category property names, with comments. NOTE: If ever this list
# is changed, the table called "catposstab" in the pcre2_auto_possess.c file
# must be edited to keep in step.
#
# The list is flat: each two-letter category name is immediately followed by
# its description, so the names are at the even indexes.

category_names = [
  'Cc', 'Control',
  'Cf', 'Format',
  'Cn', 'Unassigned',
  'Co', 'Private use',
  'Cs', 'Surrogate',
  'Ll', 'Lower case letter',
  'Lm', 'Modifier letter',
  'Lo', 'Other letter',
  'Lt', 'Title case letter',
  'Lu', 'Upper case letter',
  'Mc', 'Spacing mark',
  'Me', 'Enclosing mark',
  'Mn', 'Non-spacing mark',
  'Nd', 'Decimal number',
  'Nl', 'Letter number',
  'No', 'Other number',
  'Pc', 'Connector punctuation',
  'Pd', 'Dash punctuation',
  'Pe', 'Close punctuation',
  'Pf', 'Final punctuation',
  'Pi', 'Initial punctuation',
  'Po', 'Other punctuation',
  'Ps', 'Open punctuation',
  'Sc', 'Currency symbol',
  'Sk', 'Modifier symbol',
  'Sm', 'Mathematical symbol',
  'So', 'Other symbol',
  'Zl', 'Line separator',
  'Zp', 'Paragraph separator',
  'Zs', 'Space separator'
  ]
 | |
| 
 | |
# The Extended_Pictographic property is not found in the file where all the
# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
# file, but we list it here so that the name has the correct index value.
#
# The list is flat: each property name is immediately followed by a comment
# string that starts with the property's index value.

break_properties = [
  'CR',                    ' 0',
  'LF',                    ' 1',
  'Control',               ' 2',
  'Extend',                ' 3',
  'Prepend',               ' 4',
  'SpacingMark',           ' 5',
  'L',                     ' 6 Hangul syllable type L',
  'V',                     ' 7 Hangul syllable type V',
  'T',                     ' 8 Hangul syllable type T',
  'LV',                    ' 9 Hangul syllable type LV',
  'LVT',                   '10 Hangul syllable type LVT',
  'Regional_Indicator',    '11',
  'Other',                 '12',
  'ZWJ',                   '13',
  'Extended_Pictographic', '14'
  ]
 | |
| 
 | |
# List of files from which the names of Boolean properties are obtained, along
# with a list of regex patterns for properties to be ignored, and a list of
# extra pattern names to add.

bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
bool_propsignore = [r'^Other_', r'^Hyphen$']
bool_propsextras = ['ASCII', 'Bidi_Mirrored']


# ---------------------------------------------------------------------------
#                   GET BOOLEAN PROPERTY NAMES
# ---------------------------------------------------------------------------

# Get a list of Boolean property names from a number of files.
#
# Each file named in bool_propsfiles is read from the Unicode.tables directory
# (relative to the current directory). On every line, anything from '#' onwards
# is discarded and the remainder is split on ';'; the second field is taken as
# the property name. A name equal to the previously seen one is skipped, and
# so is any name matching one of the bool_propsignore patterns. The names in
# bool_propsextras are then added and the sorted list is returned.
#
# If a file cannot be opened, a message is printed and the script exits with
# status 1.

def getbpropslist():
  bplist = []
  bplast = ""   # Last name seen; deliberately carried over from file to file

  for filename in bool_propsfiles:
    path = 'Unicode.tables/' + filename
    try:
      file = open(path, 'r')
    except IOError:
      print(f"** Couldn't open {path}\n")
      sys.exit(1)

    # The with statement closes the file even if processing a line fails.
    with file:
      for line in file:
        line = re.sub(r'#.*', '', line)
        data = list(map(str.strip, line.split(';')))
        if len(data) <= 1 or data[1] == bplast:
          continue
        bplast = data[1]
        if not any(re.match(pat, bplast) for pat in bool_propsignore):
          bplist.append(bplast)

  bplist.extend(bool_propsextras)
  bplist.sort()
  return bplist
 | |
| 
 | |
# Build the Boolean property list once, when the module is loaded. The item
# size is the property count divided by 32, rounded up (presumably the number
# of 32-bit words needed for one bit per property - confirm in the generator
# scripts that use it).

bool_properties = getbpropslist()
bool_props_list_item_size = (len(bool_properties) + 31) // 32
 | |
| 
 | |
| 
 | |
| 
 | |
| # ---------------------------------------------------------------------------
 | |
| #                  COLLECTING PROPERTY NAMES AND ALIASES
 | |
| # ---------------------------------------------------------------------------
 | |
| 
 | |
script_names = ['Unknown']
abbreviations = {}

# Fill in script_names and abbreviations from the Unicode data files. Both
# objects are updated in place.
#
# Script names are appended to script_names in the order in which they first
# appear on consecutive lines of Scripts.txt. The abbreviations dictionary maps
# a long name to a tuple of its alternative names:
#
#   script names (the "sc" lines of PropertyValueAliases.txt):
#     ()               when the short and long names are the same
#     (short,)         when there is no further alias
#     (short, extra)   when a fourth field is present
#
#   Boolean properties listed in bool_properties (PropertyAliases.txt):
#     (short,) or (short, extra)

def collect_property_names():
  names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')

  last_script_name = ""
  with open("Unicode.tables/Scripts.txt") as f:
    for line in f:
      match_obj = names_re.match(line)
      if match_obj is None:
        continue

      # Only the immediately preceding name is checked for repeats.
      name = match_obj.group(1)
      if name == last_script_name:
        continue

      last_script_name = name
      script_names.append(name)

  # Sometimes there is a comment in the line, so splitting around semicolons
  # is not enough.
  value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')

  with open("Unicode.tables/PropertyValueAliases.txt") as f:
    for line in f:
      match_obj = value_alias_re.match(line)
      if match_obj is None:
        continue

      prop, short_name, long_name, extra = match_obj.groups()
      if prop != "sc":
        continue

      if short_name == long_name:
        abbreviations[long_name] = ()
      elif extra is None:
        abbreviations[long_name] = (short_name,)
      else:
        abbreviations[long_name] = (short_name, extra)

  # We can also collect Boolean property abbreviations into the same dictionary

  bool_names = set(bool_properties)   # Built once for fast membership tests
  bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')
  with open("Unicode.tables/PropertyAliases.txt") as f:
    for line in f:
      match_obj = bin_alias_re.match(line)
      if match_obj is None:
        continue

      short_name, long_name, extra = match_obj.groups()
      if long_name not in bool_names:
        continue

      if extra is None:
        abbreviations[long_name] = (short_name,)
      else:
        abbreviations[long_name] = (short_name, extra)
 | |
| 
 | |
# Populate script_names and abbreviations when the module is loaded.

collect_property_names()
 | |
| 
 | |
| 
 | |
| 
 | |
| # ---------------------------------------------------------------------------
 | |
| #                      REORDERING SCRIPT NAMES
 | |
| # ---------------------------------------------------------------------------
 | |
| 
 | |
script_abbrevs = []

# Set script_abbrevs to one abbreviation per entry of script_names (the first
# alias recorded in abbreviations, or the name itself when there is none), then
# reorder both lists in step: scripts whose abbreviation appears in
# ScriptExtensions.txt come first, followed by all the others. The relative
# order within each of the two groups is unchanged. Both globals are rebound to
# new lists.
#
# A script name that is missing from abbreviations raises KeyError.

def reorder_scripts():
  global script_names
  global script_abbrevs

  # Build the abbreviations from scratch so that the result depends only on
  # script_names and not on any previous content of script_abbrevs.
  abbrevs = []
  for name in script_names:
    aliases = abbreviations[name]
    abbrevs.append(aliases[0] if aliases else name)

  extended_script_abbrevs = set()
  names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')
  with open("Unicode.tables/ScriptExtensions.txt") as f:
    for line in f:
      match_obj = names_re.match(line)
      if match_obj is not None:
        # The captured field is a space-separated list of abbreviations.
        extended_script_abbrevs.update(match_obj.group(1).split(" "))

  pairs = list(zip(script_names, abbrevs))
  ordered = [p for p in pairs if p[1] in extended_script_abbrevs]
  ordered += [p for p in pairs if p[1] not in extended_script_abbrevs]

  script_names = [name for name, _ in ordered]
  script_abbrevs = [abbrev for _, abbrev in ordered]
 | |
| 
 | |
# Reorder the scripts when the module is loaded. The item size is the index of
# 'Unknown' in the reordered list divided by 32, rounded up. 'Unknown' starts
# out as the first script name, so provided it is not itself listed in
# ScriptExtensions.txt its index is the number of scripts that are.

reorder_scripts()
script_list_item_size = (script_names.index('Unknown') + 31) // 32
 | |
| 
 | |
| 
 | |
| # ---------------------------------------------------------------------------
 | |
| #                         DERIVED LISTS
 | |
| # ---------------------------------------------------------------------------
 | |
| 
 | |
# Create general character property names from the first letters of the
# particular categories. The names are at the even indexes of category_names;
# the set removes duplicate first letters and the result is sorted.

gcn_set = {name[0] for name in category_names[::2]}
general_category_names = sorted(gcn_set)
 | |
| 
 | |
| 
 | |
| # ---------------------------------------------------------------------------
 | |
| #                           FUNCTIONS
 | |
| # ---------------------------------------------------------------------------
 | |
| 
 | |
| import sys
 | |
| 
 | |
# Open an output file, using the command's argument or a default. Write common
# preliminary header information.
#
# The file name is the command's single argument if one was given, otherwise
# the value of "default". More than one argument, or a failure to open the
# file, prints a message and exits with status 1. The text written is the start
# of a C comment block (banner, copyright, a pointer to the generating script
# in maint/, and the licence), including the closing "*/". The open file is
# returned for the caller to continue writing to and to close.

def open_output(default):
  if len(sys.argv) > 2:
    print('** Too many arguments: just give a file name')
    sys.exit(1)

  output_name = sys.argv[1] if len(sys.argv) == 2 else default
  try:
    file = open(output_name, "w")
  except IOError:
    print ("** Couldn't open %s" % output_name)
    sys.exit(1)

  # Only the script's file name goes into the header. os.path.basename copes
  # with whatever path separators the platform uses.
  script_name = os.path.basename(sys.argv[0])

  file.write("""\
/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2022 University of Cambridge

This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
""")

  file.write("Instead, modify the maint/%s script and run it to generate\n"
  "a new version of this code.\n\n" % script_name)

  file.write("""\
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
\n""")
  return file
 | |
| 
 | |
| # End of UcpCommon.py
 |