189 lines
		
	
	
		
			5.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			189 lines
		
	
	
		
			5.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #! /usr/bin/python
 | |
| 
 | |
| #                   PCRE2 UNICODE PROPERTY SUPPORT
 | |
| #                   ------------------------------
 | |
| #
 | |
| # This file auto-generates unicode property tests and their expected output.
 | |
| # It is recommended to re-run this generator after the unicode files are
 | |
| # updated. The names of the generated files are `testinput26` and `testoutput26`
 | |
| 
 | |
| import re
 | |
| import sys
 | |
| 
 | |
| from GenerateCommon import \
 | |
|   script_names, \
 | |
|   script_abbrevs
 | |
| 
 | |
def write_both(text):
  """Write text to the module-level input_file and then to output_file."""
  for stream in (input_file, output_file):
    stream.write(text)
 | |
| 
 | |
def to_string_char(ch_idx):
  r"""Return the textual form of code point ch_idx.

  Code points 32..127 are returned as the character itself. Every other
  value becomes a \x{...} hexadecimal escape; values below 16 get a
  leading zero so the escape always has at least two digits.
  """
  if 32 <= ch_idx < 128:
    return chr(ch_idx)
  if ch_idx < 16:
    return "\\x{0%x}" % ch_idx
  return "\\x{%x}" % ch_idx
 | |
| 
 | |
# At most one command line argument is accepted: the directory that receives
# the generated files. Without it the files are created relative to the
# current directory.
if len(sys.argv) > 2:
  print('** Too many arguments: just give a directory name')
  sys.exit(1)

output_directory = ""
if len(sys.argv) == 2:
  output_directory = sys.argv[1]
  # Make sure the directory can be used as a plain prefix of the file name.
  if output_directory[-1:] != "/":
    output_directory = output_directory + "/"

# Both files stay open for the rest of the script; write_both() and the
# generator below write to them through these module-level names.
try:
  input_file = open("%stestinput26" % output_directory, "w")
  output_file = open("%stestoutput26" % output_directory, "w")
except IOError:
  print("** Couldn't open output files")
  sys.exit(1)

write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")

# ---------------------------------------------------------------------------
#                      UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------

write_both("# Unicode Script Extension tests.\n\n")
 | |
| 
 | |
def _parse_code_point_range(match_obj):
  """Return (low, high) for a matched data line.

  Group 1 holds the first code point in hexadecimal and the optional group 2
  the last one; when group 2 is absent the line covers a single code point.
  """
  low = int(match_obj.group(1), 16)
  last = match_obj.group(2)
  high = low if last is None else int(last, 16)
  return low, high


def _load_scripts(property_re, script_data, char_data):
  """Read Unicode.tables/Scripts.txt into script_data and char_data.

  For every script seen, script_data[index] becomes a five element record
  whose slot 0 is the first code point listed for the script and slot 1 the
  last code point of the most recent line for it. char_data[code point] is
  set to the script name for every listed code point.
  """
  prev_name = ""
  script_idx = -1

  with open("Unicode.tables/Scripts.txt") as f:
    for line in f:
      match_obj = property_re.match(line)
      if match_obj is None:
        continue

      name = match_obj.group(3)
      if name != prev_name:
        # Only repeat the list search when the script name changes.
        script_idx = script_names.index(name)
        prev_name = name

      low, high = _parse_code_point_range(match_obj)
      char_data[low] = name
      for code_point in range(low + 1, high + 1):
        char_data[code_point] = name

      rec = script_data[script_idx]
      if rec is None:
        rec = script_data[script_idx] = [low, None, None, None, None]
      rec[1] = high


def _load_script_extensions(property_re, script_data, char_data):
  """Read Unicode.tables/ScriptExtensions.txt into script_data.

  For every script abbreviation listed on a line, slots 2 and 3 of the
  script's record are widened to the lowest and highest code point seen in
  any extension range. Slot 4 receives the first code point found in such a
  range whose script according to char_data is a different one.
  """
  # Cache of abbreviation -> index into script_abbrevs / script_data.
  extended_script_indices = {}

  with open("Unicode.tables/ScriptExtensions.txt") as f:
    for line in f:
      match_obj = property_re.match(line)
      if match_obj is None:
        continue

      low, high = _parse_code_point_range(match_obj)

      for abbrev in match_obj.group(3).split(" "):
        script_idx = extended_script_indices.get(abbrev)
        if script_idx is None:
          script_idx = script_abbrevs.index(abbrev)
          extended_script_indices[abbrev] = script_idx

        rec = script_data[script_idx]
        if rec[2] is None:
          # First extension range for this script.
          rec[2] = low
          rec[3] = high
        else:
          rec[2] = min(rec[2], low)
          rec[3] = max(rec[3], high)

        if rec[4] is None:
          name = script_names[script_idx]
          for code_point in range(low, high + 1):
            if char_data[code_point] != name:
              rec[4] = code_point
              break


def _write_property_test(prop, ch_idx, matches=True):
  r"""Emit one test: the pattern /^\p{prop}/utf applied to code point ch_idx.

  The pattern and the subject line go to both files. The expected result
  goes to the output file only: " 0: <char>" when matches is true, otherwise
  "No match". A blank line written to both files ends the test.
  """
  subject = to_string_char(ch_idx)
  write_both("/^\\p{%s}/utf\n" % prop)
  write_both("  %s\n" % subject)
  if matches:
    output_file.write(" 0: %s\n" % subject)
  else:
    output_file.write("No match\n")
  write_both("\n")


def gen_script_tests():
  """Generate the script and script extension tests.

  Reads Unicode.tables/Scripts.txt and Unicode.tables/ScriptExtensions.txt
  (relative to the current directory) and, for every script in script_names
  except "Unknown", writes tests through write_both() and output_file.

  Raises ValueError (from list.index) when a data file names a script or an
  abbreviation that is missing from script_names / script_abbrevs.
  """
  # One record per script, filled in by the two loaders:
  #   [0] first code point in Scripts.txt   [1] last code point in Scripts.txt
  #   [2] lowest extension code point       [3] highest extension code point
  #   [4] a code point that belongs to the script only through an extension
  script_data = [None] * len(script_names)
  # Script name of every code point according to Scripts.txt.
  char_data = [None] * 0x110000

  property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")

  _load_scripts(property_re, script_data, char_data)
  _load_script_extensions(property_re, script_data, char_data)

  # Alternates between the "scx" and "Script_Extensions" spellings so that
  # both forms of the property name appear in the generated tests.
  long_property_name = False

  for idx, rec in enumerate(script_data):
    script_name = script_names[idx]

    if script_name == "Unknown":
      continue

    script_abbrev = script_abbrevs[idx]

    write_both("# Base script check\n")
    _write_property_test("sc=%s" % script_name, rec[0])
    _write_property_test("Script=%s" % script_abbrev, rec[1])

    if rec[2] is not None:
      property_name = "scx"
      if long_property_name:
        property_name = "Script_Extensions"

      write_both("# Script extension check\n")
      _write_property_test(script_name, rec[2])
      _write_property_test("%s=%s" % (property_name, script_abbrev), rec[3])

      long_property_name = not long_property_name

      if rec[4] is not None:
        write_both("# Script extension only character\n")
        _write_property_test(script_name, rec[4])
        _write_property_test("sc=%s" % script_name, rec[4], matches=False)
      else:
        print("External character has not found for %s" % script_name)

    # The code point just past every range recorded for the script.
    high = rec[1]
    if rec[3] is not None and rec[3] > rec[1]:
      high = rec[3]
    write_both("# Character not in script\n")
    _write_property_test(script_name, high + 1, matches=False)
 | |
| 
 | |
| 
 | |
# Run the generator, then close both output files explicitly so the data is
# flushed to disk even when generation stops with an exception.
try:
  gen_script_tests()

  write_both("# End of testinput26\n")
finally:
  input_file.close()
  output_file.close()
 |