forked from LeenkxTeam/LNXSDK
		
	
		
			
	
	
		
			204 lines
		
	
	
		
			7.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			204 lines
		
	
	
		
			7.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| 
								 | 
							
								#! /usr/bin/python
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#                   PCRE2 UNICODE PROPERTY SUPPORT
							 | 
						||
| 
								 | 
							
								#                   ------------------------------
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# This script generates the pcre2_ucptables.c file, which contains tables for
							 | 
						||
| 
								 | 
							
								# recognizing Unicode property names. It is #included by pcre2_tables.c. In
							 | 
						||
| 
								 | 
							
								# order to reduce the number of relocations when loading the PCRE2 library, the
							 | 
						||
| 
								 | 
							
								# names are held as a single large string, with offsets in the table. This is
							 | 
						||
| 
								 | 
							
								# tedious to maintain by hand. Therefore, a script is used to generate the
							 | 
						||
| 
								 | 
							
								# table.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# This script was created in December 2021 based on the previous GenerateUtt
							 | 
						||
| 
								 | 
							
								# script, whose output had to be manually edited into pcre2_tables.c. Here is
							 | 
						||
| 
								 | 
							
								# the history of the original script:
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# -----------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								# Modified by PH 17-March-2009 to generate the more verbose form that works
							 | 
						||
| 
								 | 
							
								# for UTF-support in EBCDIC as well as ASCII environments.
							 | 
						||
| 
								 | 
							
								# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
							 | 
						||
| 
								 | 
							
								# Modified by PH 04-May-2010 to add new "X.." special categories.
							 | 
						||
| 
								 | 
							
								# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
							 | 
						||
| 
								 | 
							
								# Modified by ChPe 30-September-2012 to add this note; no other changes were
							 | 
						||
| 
								 | 
							
								# necessary for Unicode 6.2.0 support.
							 | 
						||
| 
								 | 
							
								# Modified by PH 26-February-2013 to add the Xuc special category.
							 | 
						||
| 
								 | 
							
								# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
							 | 
						||
| 
								 | 
							
								# Script updated to Python 3 by running it through the 2to3 converter.
							 | 
						||
| 
								 | 
							
								# Added script names for Unicode 7.0.0, 20-June-2014.
							 | 
						||
| 
								 | 
							
								# Added script names for Unicode 8.0.0, 19-June-2015.
							 | 
						||
| 
								 | 
							
								# Added script names for Unicode 10.0.0, 02-July-2017.
							 | 
						||
| 
								 | 
							
								# Added script names for Unicode 11.0.0, 03-July-2018.
							 | 
						||
| 
								 | 
							
								# Added 'Unknown' script, 01-October-2018.
							 | 
						||
| 
								 | 
							
								# Added script names for Unicode 12.1.0, 27-July-2019.
							 | 
						||
| 
								 | 
							
								# Added script names for Unicode 13.0.0, 10-March-2020.
							 | 
						||
| 
								 | 
							
								# Added Script names for Unicode 14.0.0, PCRE2-10.39
							 | 
						||
| 
								 | 
							
								# Added support for bidi class and bidi control, 06-December-2021
							 | 
						||
| 
								 | 
							
								#   This also involved lower casing strings and removing underscores, in
							 | 
						||
| 
								 | 
							
								#   accordance with Unicode's "loose matching" rules, which Perl observes.
							 | 
						||
| 
								 | 
							
								# Changed default script type from PT_SC to PT_SCX, 18-December-2021
							 | 
						||
| 
								 | 
							
								# -----------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Note subsequent changes here:
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# 27-December-2021: Added support for 4-letter script abbreviations.
							 | 
						||
| 
								 | 
							
								# 10-January-2022:  Further updates for Boolean property support
							 | 
						||
| 
								 | 
							
								# -----------------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Import common data lists and functions
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								from GenerateCommon import \
							 | 
						||
| 
								 | 
							
								  abbreviations, \
							 | 
						||
| 
								 | 
							
								  bool_properties, \
							 | 
						||
| 
								 | 
							
								  bidi_classes, \
							 | 
						||
| 
								 | 
							
								  category_names, \
							 | 
						||
| 
								 | 
							
								  general_category_names, \
							 | 
						||
| 
								 | 
							
								  script_names, \
							 | 
						||
| 
								 | 
							
								  open_output
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Open the output file (no return on failure). This call also writes standard
								# header boilerplate.

								f = open_output("pcre2_ucptables.c")

								# The list in bidi_classes contains just the Unicode classes such as AN, LRE,
								# etc., along with comments: the class names are the even-indexed items, so
								# a step-2 slice picks them out. We need to add "bidi" in front of each value,
								# in order to create names that don't clash with other types of property.

								bidi_class_names = ["bidi" + bidi_class for bidi_class in bidi_classes[::2]]

								# Remove the comments from other lists that contain them (same step-2 slice).

								category_names = category_names[::2]
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Create standardized versions of the names by lowercasing and removing
							 | 
						||
| 
								 | 
							
								# underscores.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								def stdname(x):
								  """Return x lowercased with every underscore removed.

								  This is the standardized spelling used for the "loose matching" of
								  Unicode property names.
								  """
								  lowered = x.lower()
								  return ''.join(lowered.split('_'))
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								def stdnames(x):
								  """Return a new list with the standardized form of each name in x.

								  Every element is converted with stdname(); x itself is left unmodified.
								  Any iterable of strings is accepted, and an empty input gives [].
								  """
								  return [stdname(name) for name in x]
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Standardized (lower case, no underscores) copies of the property name lists.

								std_category_names = stdnames(category_names)
								std_general_category_names = stdnames(general_category_names)
								std_bidi_class_names = stdnames(bidi_class_names)
								std_bool_properties = stdnames(bool_properties)

								# Create the table, starting with the Unicode script, category and bidi class
								# names. We keep both the standardized name and the original, because the
								# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
								# still use the full original names.

								utt_table = []

								# Scripts listed before 'Unknown' are entered as PT_SCX; 'Unknown' and any
								# script after it are entered as PT_SC.

								scx_end = script_names.index('Unknown')

								for position, script in enumerate(script_names):
								  if position < scx_end:
								    pt_type = 'PT_SCX'
								  else:
								    pt_type = 'PT_SC'
								  # The full name comes first, followed by each of its abbreviations; every
								  # entry records the full script name as its original.
								  for alias in [script] + list(abbreviations[script]):
								    utt_table.append((stdname(alias), script, pt_type))
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Add the remaining property lists. Each group pairs a standardized list with
								# its original list and the property type that all of its entries share.

								for std_list, orig_list, pt_type in (
								    (std_category_names, category_names, 'PT_PC'),
								    (std_general_category_names, general_category_names, 'PT_GC'),
								    (std_bidi_class_names, bidi_class_names, 'PT_BIDICL')):
								  utt_table.extend(
								    (std, orig, pt_type) for std, orig in zip(std_list, orig_list))

								# Boolean properties: the full name, plus any abbreviations that are listed
								# for it. All entries record the full property name as the original.

								for prop in bool_properties:
								  aliases = [prop]
								  if prop in abbreviations:
								    aliases.extend(abbreviations[prop])
								  utt_table.extend((stdname(alias), prop, 'PT_BOOL') for alias in aliases)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Now add specials and synonyms. Note both the standardized and capitalized
								# forms are needed.

								utt_table.extend([
								  ('any', 'Any', 'PT_ANY'),
								  ('l&',  'L&',  'PT_LAMP'),
								  ('lc',  'LC',  'PT_LAMP'),
								  ('xan', 'Xan', 'PT_ALNUM'),
								  ('xps', 'Xps', 'PT_PXSPACE'),
								  ('xsp', 'Xsp', 'PT_SPACE'),
								  ('xuc', 'Xuc', 'PT_UCNC'),
								  ('xwd', 'Xwd', 'PT_WORD'),
								])

								# Remove duplicates from the table and then sort it.

								utt_table = sorted(set(utt_table))
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Output file-specific heading. The text is given as adjacent string literals
								# with explicit newlines, so it does not depend on source indentation.

								f.write(
								  '#ifdef SUPPORT_UNICODE\n'
								  '\n'
								  '/* The PRIV(utt)[] table below translates Unicode property names into type and\n'
								  'code values. It is searched by binary chop, so must be in collating sequence of\n'
								  'name. Originally, the table contained pointers to the name strings in the first\n'
								  'field of each entry. However, that leads to a large number of relocations when\n'
								  'a shared library is dynamically loaded. A significant reduction is made by\n'
								  'putting all the names into a single, large string and using offsets instead.\n'
								  'All letters are lower cased, and underscores are removed, in accordance with\n'
								  'the "loose matching" rules that Unicode advises and Perl uses. */\n'
								  '\n'
								)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# We have to use STR_ macros to define the strings so that it all works in
								# UTF-8 mode on EBCDIC platforms.

								for entry in utt_table:
								  std = entry[0]
								  # One STR_ token per character of the name. '&' is not valid in a C
								  # identifier, so it is spelled out as AMPERSAND.
								  tokens = ''.join(
								    ' STR_AMPERSAND' if ch == '&' else ' STR_%s' % ch for ch in std)
								  f.write('#define STRING_%s0%s "\\0"\n' %
								    (std.replace('&', '_AMPERSAND'), tokens))
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Output the long string of concatenated names. Only the final STRING_ line
								# is followed by the ';' that ends the C definition.

								f.write('\nconst char PRIV(utt_names)[] =\n')
								final_index = len(utt_table) - 1
								for index, entry in enumerate(utt_table):
								  terminator = ';' if index == final_index else ''
								  macro = entry[0].replace('&', '_AMPERSAND')
								  f.write('  STRING_%s0%s\n' % (macro, terminator))
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Output the property type table

								# Property types for which '0' is written as the value instead of a ucp_ name.
								valueless_types = frozenset((
								  'PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
								  'PT_SPACE', 'PT_UCNC', 'PT_WORD'))

								f.write('\nconst ucp_type_table PRIV(utt)[] = {\n')
								offset = 0
								final_index = len(utt_table) - 1
								for index, (std, name, pt_type) in enumerate(utt_table):
								  value = '0' if pt_type in valueless_types else 'ucp_' + name
								  # Every entry except the final one is followed by a comma.
								  separator = '' if index == final_index else ','
								  f.write('  { %3d, %s, %s }%s\n' % (offset, pt_type, value, separator))
								  # Advance past this name plus one extra character (each STRING_ macro
								  # ends with "\0").
								  offset += len(std) + 1
								f.write('};\n\n')
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Ending text

								f.write(
								  'const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);\n'
								  '\n'
								  '#endif /* SUPPORT_UNICODE */\n'
								  '\n'
								  '/* End of pcre2_ucptables.c */\n'
								)

								# close() must actually be called: a bare "f.close" only references the
								# method, leaving the output file to be closed implicitly at exit.

								f.close()
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# End
							 |