204 lines
		
	
	
		
			7.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			204 lines
		
	
	
		
			7.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
|  | #! /usr/bin/python | ||
|  | 
 | ||
|  | #                   PCRE2 UNICODE PROPERTY SUPPORT | ||
|  | #                   ------------------------------ | ||
|  | 
 | ||
|  | # This script generates the pcre2_ucptables.c file, which contains tables for | ||
|  | # recognizing Unicode property names. It is #included by pcre2_tables.c. In | ||
|  | # order to reduce the number of relocations when loading the PCRE2 library, the | ||
|  | # names are held as a single large string, with offsets in the table. This is | ||
|  | # tedious to maintain by hand. Therefore, a script is used to generate the | ||
|  | # table. | ||
|  | 
 | ||
|  | # This script was created in December 2021 based on the previous GenerateUtt | ||
|  | # script, whose output had to be manually edited into pcre2_tables.c. Here is | ||
|  | # the history of the original script: | ||
|  | 
 | ||
|  | # ----------------------------------------------------------------------------- | ||
|  | # Modified by PH 17-March-2009 to generate the more verbose form that works | ||
|  | # for UTF-support in EBCDIC as well as ASCII environments. | ||
|  | # Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0. | ||
|  | # Modified by PH 04-May-2010 to add new "X.." special categories. | ||
|  | # Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0 | ||
|  | # Modified by ChPe 30-September-2012 to add this note; no other changes were | ||
|  | # necessary for Unicode 6.2.0 support. | ||
# Modified by PH 26-February-2013 to add the Xuc special category.
|  | # Comment modified by PH 13-May-2014 to update to PCRE2 file names. | ||
|  | # Script updated to Python 3 by running it through the 2to3 converter. | ||
|  | # Added script names for Unicode 7.0.0, 20-June-2014. | ||
|  | # Added script names for Unicode 8.0.0, 19-June-2015. | ||
|  | # Added script names for Unicode 10.0.0, 02-July-2017. | ||
|  | # Added script names for Unicode 11.0.0, 03-July-2018. | ||
|  | # Added 'Unknown' script, 01-October-2018. | ||
|  | # Added script names for Unicode 12.1.0, 27-July-2019. | ||
|  | # Added script names for Unicode 13.0.0, 10-March-2020. | ||
|  | # Added Script names for Unicode 14.0.0, PCRE2-10.39 | ||
|  | # Added support for bidi class and bidi control, 06-December-2021 | ||
|  | #   This also involved lower casing strings and removing underscores, in | ||
|  | #   accordance with Unicode's "loose matching" rules, which Perl observes. | ||
|  | # Changed default script type from PT_SC to PT_SCX, 18-December-2021 | ||
|  | # ----------------------------------------------------------------------------- | ||
|  | # | ||
|  | # Note subsequent changes here: | ||
|  | # | ||
|  | # 27-December-2021: Added support for 4-letter script abbreviations. | ||
|  | # 10-January-2022:  Further updates for Boolean property support | ||
|  | # ----------------------------------------------------------------------------- | ||
|  | 
 | ||
|  | 
 | ||
|  | # Import common data lists and functions | ||
|  | 
 | ||
|  | from GenerateCommon import \ | ||
|  |   abbreviations, \ | ||
|  |   bool_properties, \ | ||
|  |   bidi_classes, \ | ||
|  |   category_names, \ | ||
|  |   general_category_names, \ | ||
|  |   script_names, \ | ||
|  |   open_output | ||
|  | 
 | ||
|  | # Open the output file (no return on failure). This call also writes standard | ||
|  | # header boilerplate. | ||
|  | 
 | ||
f = open_output("pcre2_ucptables.c")

# The list in bidi_classes contains just the Unicode classes such as AN, LRE,
# etc., along with comments: names sit at the even indexes. Take every second
# item and put "bidi" in front of it, in order to create names that don't
# clash with other types of property.

bidi_class_names = ["bidi" + bidi_class for bidi_class in bidi_classes[::2]]

# Remove the comments from other lists that contain them (again, only the
# even-indexed items are kept).

category_names = category_names[::2]
|  | 
 | ||
|  | # Create standardized versions of the names by lowercasing and removing | ||
|  | # underscores. | ||
|  | 
 | ||
def stdname(x):
  """Return the standardized form of name x: lower case, no underscores."""
  # Lower-case first, then drop every underscore by splitting on it and
  # gluing the pieces back together.
  lowered = x.lower()
  return ''.join(lowered.split('_'))
|  | 
 | ||
def stdnames(x):
  """Return a new list holding stdname() of every name in x, in order.

  x may be any iterable of strings (the script passes lists); it is not
  modified.
  """
  return [stdname(name) for name in x]
|  | 
 | ||
# Standardized (lower case, underscore-free) copies of the name lists, built
# in the same order as the originals so that they can be paired up by index.
# NOTE(review): std_bool_properties is not read anywhere in the visible part
# of this script; it is kept so that the set of module-level names is
# unchanged.
(std_category_names,
 std_general_category_names,
 std_bidi_class_names,
 std_bool_properties) = map(stdnames, (
  category_names,
  general_category_names,
  bidi_class_names,
  bool_properties))
|  | 
 | ||
|  | # Create the table, starting with the Unicode script, category and bidi class | ||
|  | # names. We keep both the standardized name and the original, because the | ||
|  | # latter is used for the ucp_xx names. NOTE: for the script abbreviations, we | ||
|  | # still use the full original names. | ||
|  | 
 | ||
# Each table entry is a tuple (standardized name, original name, PT_ type).
utt_table = []

# Scripts listed before 'Unknown' are given type PT_SCX; 'Unknown' and any
# script listed after it are given PT_SC.
scx_end = script_names.index('Unknown')

for position, script in enumerate(script_names):
  if position < scx_end:
    script_type = 'PT_SCX'
  else:
    script_type = 'PT_SC'
  # The full name comes first, followed by one entry per abbreviation. The
  # abbreviation entries still carry the full original script name.
  utt_table.append((stdname(script), script, script_type))
  utt_table.extend(
    (stdname(short_form), script, script_type)
    for short_form in abbreviations[script])
|  | 
 | ||
|  | # Add the remaining property lists | ||
|  | 
 | ||
# Pair each standardized name with its original spelling and tag the entry
# with the property type that belongs to the list it came from.
for std_list, original_list, property_type in (
    (std_category_names, category_names, 'PT_PC'),
    (std_general_category_names, general_category_names, 'PT_GC'),
    (std_bidi_class_names, bidi_class_names, 'PT_BIDICL')):
  utt_table.extend(
    (std, original, property_type)
    for std, original in zip(std_list, original_list))
|  | 
 | ||
# Boolean properties: one entry for the property name itself and, when the
# property has an entry in abbreviations, one more per abbreviation. Every
# entry refers back to the full property name.
for bool_property in bool_properties:
  utt_table.append((stdname(bool_property), bool_property, 'PT_BOOL'))
  if bool_property not in abbreviations:
    continue
  utt_table.extend(
    (stdname(short_form), bool_property, 'PT_BOOL')
    for short_form in abbreviations[bool_property])
|  | 
 | ||
|  | # Now add specials and synonyms. Note both the standardized and capitalized | ||
|  | # forms are needed. | ||
|  | 
 | ||
# Special names and synonyms, given as (standardized, capitalized, type).
# 'l&' and 'lc' both map to PT_LAMP.
utt_table.extend([
  ('any', 'Any', 'PT_ANY'),
  ('l&',  'L&',  'PT_LAMP'),
  ('lc',  'LC',  'PT_LAMP'),
  ('xan', 'Xan', 'PT_ALNUM'),
  ('xps', 'Xps', 'PT_PXSPACE'),
  ('xsp', 'Xsp', 'PT_SPACE'),
  ('xuc', 'Xuc', 'PT_UCNC'),
  ('xwd', 'Xwd', 'PT_WORD'),
])
|  | 
 | ||
|  | # Remove duplicates from the table and then sort it. | ||
|  | 
 | ||
# A set drops identical tuples; sorted() then orders the survivors, which
# puts the table in order of standardized name (the first tuple field).
utt_table = sorted(set(utt_table))
|  | 
 | ||
|  | # Output file-specific heading | ||
|  | 
 | ||
# Text that opens the SUPPORT_UNICODE section of the generated C file and
# explains the layout of the tables that follow.
file_heading = """\
#ifdef SUPPORT_UNICODE

/* The PRIV(utt)[] table below translates Unicode property names into type and
code values. It is searched by binary chop, so must be in collating sequence of
name. Originally, the table contained pointers to the name strings in the first
field of each entry. However, that leads to a large number of relocations when
a shared library is dynamically loaded. A significant reduction is made by
putting all the names into a single, large string and using offsets instead.
All letters are lower cased, and underscores are removed, in accordance with
the "loose matching" rules that Unicode advises and Perl uses. */
\n"""

f.write(file_heading)
|  | 
 | ||
|  | # We have to use STR_ macros to define the strings so that it all works in | ||
|  | # UTF-8 mode on EBCDIC platforms. | ||
|  | 
 | ||
# One "#define STRING_<name>0" line per table entry. The value spells the
# standardized name one character at a time with STR_ macros and ends with an
# explicit "\0". '&' cannot appear in a macro name, so it is written as
# _AMPERSAND in the macro name and STR_AMPERSAND in the value.
for utt in utt_table:
  std_name = utt[0]
  pieces = ['#define STRING_%s0' % std_name.replace('&', '_AMPERSAND')]
  for ch in std_name:
    pieces.append(' STR_AMPERSAND' if ch == '&' else ' STR_%s' % ch)
  pieces.append(' "\\0"\n')
  f.write(''.join(pieces))
|  | 
 | ||
|  | # Output the long string of concatenated names | ||
|  | 
 | ||
# Emit PRIV(utt_names): the concatenation of every STRING_<name>0 macro, one
# per line, with the closing ';' on the final line only. The final line is
# recognized by its position rather than by comparing entries for equality,
# so the result does not depend on the table entries being unique.
f.write('\nconst char PRIV(utt_names)[] =\n')
final_index = len(utt_table) - 1
for index, utt in enumerate(utt_table):
  terminator = ';' if index == final_index else ''
  f.write('  STRING_%s0%s\n' % (utt[0].replace('&', '_AMPERSAND'), terminator))
|  | 
 | ||
|  | # Output the property type table | ||
|  | 
 | ||
# Emit PRIV(utt): one { name offset, property type, value } row per entry.
# Rows are separated by commas; the final row, recognized by its position
# rather than by comparing entries for equality, has none.
f.write('\nconst ucp_type_table PRIV(utt)[] = {\n')
offset = 0
final_index = len(utt_table) - 1
for index, utt in enumerate(utt_table):
  std_name, original_name, property_type = utt
  # The special types carry no ucp_ code, so their value field is 0; every
  # other entry refers to the ucp_<original name> symbol.
  if property_type in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
      'PT_SPACE', 'PT_UCNC', 'PT_WORD'):
    value = '0'
  else:
    value = 'ucp_' + original_name
  separator = '' if index == final_index else ','
  f.write('  { %3d, %s, %s }%s\n' % (offset, property_type, value, separator))
  # Each name in the names string is followed by a "\0", hence the + 1.
  offset += len(std_name) + 1
f.write('};\n\n')
|  | 
 | ||
|  | # Ending text | ||
|  | 
 | ||
# Closing text: the table-size constant, the end of the SUPPORT_UNICODE
# section, and the end-of-file marker comment.
file_ending = """\
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);

#endif /* SUPPORT_UNICODE */

/* End of pcre2_ucptables.c */
"""

f.write(file_ending)
|  | 
 | ||
# Close the output file explicitly. Without the parentheses the statement
# only names the bound method and never calls it.
f.close()
|  | 
 | ||
|  | # End |