189 lines
		
	
	
		
			5.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			189 lines
		
	
	
		
			5.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
|  | #! /usr/bin/python | ||
|  | 
 | ||
|  | #                   PCRE2 UNICODE PROPERTY SUPPORT | ||
|  | #                   ------------------------------ | ||
|  | # | ||
|  | # This file auto-generates unicode property tests and their expected output. | ||
|  | # It is recommended to re-run this generator after the unicode files are | ||
|  | # updated. The names of the generated files are `testinput26` and `testoutput26` | ||
|  | 
 | ||
|  | import re | ||
|  | import sys | ||
|  | 
 | ||
|  | from GenerateCommon import \ | ||
|  |   script_names, \ | ||
|  |   script_abbrevs | ||
|  | 
 | ||
def write_both(text):
  """Write `text` to the generated test input file and then to the expected-output file.

  Both files are the module-level `input_file` and `output_file` objects.
  """
  for stream in (input_file, output_file):
    stream.write(text)
|  | 
 | ||
def to_string_char(ch_idx):
  """Return the text used for code point `ch_idx` in the generated test files.

  Code points 32..127 are returned as the character itself. Everything else
  becomes a lowercase hexadecimal escape of the form \\x{...}; values below
  16 get a leading zero so that the escape always has at least two digits.
  """
  if 32 <= ch_idx < 128:
    return chr(ch_idx)

  escape_format = "\\x{0%x}" if ch_idx < 16 else "\\x{%x}"
  return escape_format % ch_idx
|  | 
 | ||
# At most one command line argument is accepted: the directory that receives
# the generated files. Without it the files are created in the current
# directory.
if len(sys.argv) > 2:
  print('** Too many arguments: just give a directory name')
  sys.exit(1)

if len(sys.argv) == 2:
  output_directory = sys.argv[1]
  # The file names are appended by plain concatenation, so make sure the
  # directory ends with a separator.
  if not output_directory.endswith("/"):
    output_directory += "/"
else:
  output_directory = ""

# Both files stay open for the remainder of the script; write_both() and the
# test generator write to them directly.
try:
  input_file = open(output_directory + "testinput26", "w")
  output_file = open(output_directory + "testoutput26", "w")
except IOError:
  print("** Couldn't open output files")
  sys.exit(1)

# Header line shared by both generated files.
write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")

# ---------------------------------------------------------------------------
#                      UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------

write_both("# Unicode Script Extension tests.\n\n")
|  | 
 | ||
def _parse_code_point_range(match_obj):
  """Return the inclusive (low, high) code point range captured by a data line match.

  Group 1 holds the first code point; group 2 holds the end of the range and
  is absent when the line describes a single code point.
  """
  low = int(match_obj.group(1), 16)
  high_text = match_obj.group(2)
  high = low if high_text is None else int(high_text, 16)
  return low, high


def _write_property_test(property_spec, ch_idx, matches=True):
  """Write one anchored property test for the subject character `ch_idx`.

  The pattern line and the subject line go to both files. The expected
  result (the matched character, or "No match" when `matches` is false) is
  written to the output file only.
  """
  subject = to_string_char(ch_idx)
  write_both("/^\\p{%s}/utf\n" % property_spec)
  write_both("  %s\n" % subject)
  if matches:
    output_file.write(" 0: %s\n" % subject)
  else:
    output_file.write("No match\n")
  write_both("\n")


def gen_script_tests():
  """Generate the script and script extension property tests.

  Reads Unicode.tables/Scripts.txt and Unicode.tables/ScriptExtensions.txt
  (paths relative to the current directory) and, for every script in
  script_names except "Unknown", writes tests to the generated files:

    * a base script check using the first code point and the end of the last
      range listed for the script in Scripts.txt;
    * if the script occurs in ScriptExtensions.txt, checks using the lowest
      and highest code point listed for it there, plus a check with a code
      point that is listed for the script there but has a different base
      script (a message is printed when no such code point was found);
    * a negative check with the code point following the highest one used.
  """
  # One record per script, indexed like script_names:
  #   [0] first code point listed for the script in Scripts.txt
  #   [1] end of the last range listed for the script in Scripts.txt
  #   [2] lowest code point listed for the script in ScriptExtensions.txt
  #   [3] highest code point listed for the script in ScriptExtensions.txt
  #   [4] a code point listed in ScriptExtensions.txt for the script whose
  #       base script (from Scripts.txt) is a different one
  script_data = [None] * len(script_names)
  # Base script name of every code point; None when Scripts.txt has no entry.
  char_data = [None] * 0x110000

  property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
  prev_name = ""
  script_idx = -1

  with open("Unicode.tables/Scripts.txt") as f:
    for line in f:
      match_obj = property_re.match(line)
      if match_obj is None:
        continue

      # Only look the index up again when the script name changes.
      name = match_obj.group(3)
      if name != prev_name:
        script_idx = script_names.index(name)
        prev_name = name

      low, high = _parse_code_point_range(match_obj)
      for ch_idx in range(low, high + 1):
        char_data[ch_idx] = name

      if script_data[script_idx] is None:
        script_data[script_idx] = [low, None, None, None, None]
      script_data[script_idx][1] = high

  # Abbreviations already seen in ScriptExtensions.txt, mapped to their index.
  extended_script_indices = {}

  with open("Unicode.tables/ScriptExtensions.txt") as f:
    for line in f:
      match_obj = property_re.match(line)
      if match_obj is None:
        continue

      low, high = _parse_code_point_range(match_obj)

      # Here group 3 is a space separated list of script abbreviations.
      for abbrev in match_obj.group(3).split(" "):
        if abbrev not in extended_script_indices:
          script_idx = script_abbrevs.index(abbrev)
          extended_script_indices[abbrev] = script_idx
          rec = script_data[script_idx]
          rec[2] = low
          rec[3] = high
        else:
          script_idx = extended_script_indices[abbrev]
          rec = script_data[script_idx]
          rec[2] = min(rec[2], low)
          rec[3] = max(rec[3], high)

        if rec[4] is None:
          # Remember the first code point of this range that belongs to the
          # script only through its extension list, if there is one.
          name = script_names[script_idx]
          rec[4] = next(
            (ch_idx for ch_idx in range(low, high + 1) if char_data[ch_idx] != name),
            None)

  # Alternate between the short and the long property name in the tests.
  long_property_name = False

  for idx, rec in enumerate(script_data):
    script_name = script_names[idx]
    if script_name == "Unknown":
      continue

    script_abbrev = script_abbrevs[idx]

    write_both("# Base script check\n")
    _write_property_test("sc=%s" % script_name, rec[0])
    _write_property_test("Script=%s" % script_abbrev, rec[1])

    if rec[2] is not None:
      property_name = "Script_Extensions" if long_property_name else "scx"

      write_both("# Script extension check\n")
      _write_property_test(script_name, rec[2])
      _write_property_test("%s=%s" % (property_name, script_abbrev), rec[3])

      long_property_name = not long_property_name

      if rec[4] is not None:
        write_both("# Script extension only character\n")
        _write_property_test(script_name, rec[4])
        _write_property_test("sc=%s" % script_name, rec[4], matches=False)
      else:
        print("External character has not found for %s" % script_name)

    # Use the code point after the highest one recorded for the script.
    high = rec[1]
    if rec[3] is not None and rec[3] > rec[1]:
      high = rec[3]
    write_both("# Character not in script\n")
    _write_property_test(script_name, high + 1, matches=False)
|  | 
 | ||
|  | 
 | ||
gen_script_tests()

write_both("# End of testinput26\n")

# Close both generated files explicitly so that all buffered data is written
# out here rather than whenever the interpreter finalises the file objects.
input_file.close()
output_file.close()