189 lines
		
	
	
		
			5.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			189 lines
		
	
	
		
			5.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| 
								 | 
							
								#! /usr/bin/python
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#                   PCRE2 UNICODE PROPERTY SUPPORT
							 | 
						||
| 
								 | 
							
								#                   ------------------------------
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# This file auto-generates unicode property tests and their expected output.
							 | 
						||
| 
								 | 
							
								# It is recommended to re-run this generator after the unicode files are
							 | 
						||
| 
								 | 
							
								# updated. The names of the generated files are `testinput26` and `testoutput26`.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								import re
							 | 
						||
| 
								 | 
							
								import sys
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								from GenerateCommon import \
							 | 
						||
| 
								 | 
							
								  script_names, \
							 | 
						||
| 
								 | 
							
								  script_abbrevs
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								def write_both(text):
								  """Write `text` to the test input file and then to the expected output file."""
								  # The input file is written first, exactly as before.
								  for stream in (input_file, output_file):
								    stream.write(text)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								def to_string_char(ch_idx):
								  """Return the text used for code point `ch_idx` in the generated test files.

								  Printable ASCII characters (32..126) are returned as themselves. Every
								  other code point is returned as a `\\x{...}` escape with at least two
								  lower-case hexadecimal digits.

								  DEL (127) is not printable, so it is escaped as well; it used to be
								  emitted as a raw control character.
								  """
								  if 32 <= ch_idx < 127:
								    return chr(ch_idx)
								  # %02x pads values below 0x10 with a leading zero and leaves larger
								  # values unchanged, so one format covers both escape forms.
								  return "\\x{%02x}" % ch_idx
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# At most one command line argument is accepted: the directory in which
								# the two generated files are created.
								if len(sys.argv) > 2:
								  print('** Too many arguments: just give a directory name')
								  sys.exit(1)

								if len(sys.argv) == 2:
								  output_directory = sys.argv[1]
								  # Make sure the directory can be used as a plain prefix of the file name.
								  if output_directory[-1:] != "/":
								    output_directory += "/"
								else:
								  output_directory = ""

								# Both files stay open for the rest of the script; write_both() uses them.
								try:
								  input_file = open("%stestinput26" % output_directory, "w")
								  output_file = open("%stestoutput26" % output_directory, "w")
								except IOError:
								  print ("** Couldn't open output files")
								  sys.exit(1)

								write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")

								# ---------------------------------------------------------------------------
								#                      UNICODE SCRIPT EXTENSION TESTS
								# ---------------------------------------------------------------------------

								write_both("# Unicode Script Extension tests.\n\n")
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								def _parse_code_point_range(match_obj):
								  # Return (low, high) for a matched data line; high equals low when the
								  # line names a single code point instead of a `low..high` range.
								  low = int(match_obj.group(1), 16)
								  if match_obj.group(2) is None:
								    return low, low
								  return low, int(match_obj.group(2), 16)

								def _write_property_test(property_text, ch_idx, matches):
								  # Emit one `/^\p{...}/utf` pattern with a one-character subject to both
								  # files, followed by the expected result in the output file only.
								  write_both("/^\\p{%s}/utf\n" % property_text)
								  write_both("  %s\n" % to_string_char(ch_idx))
								  if matches:
								    output_file.write(" 0: %s\n" % to_string_char(ch_idx))
								  else:
								    output_file.write("No match\n")
								  write_both("\n")

								def gen_script_tests():
								  """Generate the script (sc) and script extension (scx) property tests.

								  Reads `Unicode.tables/Scripts.txt` and `Unicode.tables/ScriptExtensions.txt`
								  relative to the current directory, then writes tests for every entry of
								  `script_names` except "Unknown" to the two output files.

								  Raises ValueError if a data file names a script or abbreviation that is
								  missing from `script_names` / `script_abbrevs`.
								  """
								  # One record per script, indexed like script_names:
								  #   [0] low code point of the first Scripts.txt line for the script
								  #   [1] high code point of the last Scripts.txt line for the script
								  #   [2] lowest code point seen for the script in ScriptExtensions.txt
								  #   [3] highest code point seen for the script in ScriptExtensions.txt
								  #   [4] first code point found that has the script as an extension
								  #       but a different base script
								  script_data = [None] * len(script_names)
								  # Base script name of every code point (None when not listed).
								  char_data = [None] * 0x110000

								  property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
								  prev_name = ""
								  script_idx = -1

								  with open("Unicode.tables/Scripts.txt") as f:
								    for line in f:
								      match_obj = property_re.match(line)
								      if match_obj is None:
								        continue

								      name = match_obj.group(3)
								      # Only look the index up again when the script name changes.
								      if name != prev_name:
								        script_idx = script_names.index(name)
								        prev_name = name

								      low, high = _parse_code_point_range(match_obj)
								      for code_point in range(low, high + 1):
								        char_data[code_point] = name

								      if script_data[script_idx] is None:
								        script_data[script_idx] = [low, None, None, None, None]
								      script_data[script_idx][1] = high

								  # Maps each abbreviation already seen in ScriptExtensions.txt to its index.
								  extended_script_indices = {}

								  with open("Unicode.tables/ScriptExtensions.txt") as f:
								    for line in f:
								      match_obj = property_re.match(line)
								      if match_obj is None:
								        continue

								      low, high = _parse_code_point_range(match_obj)

								      # The property field is a space separated list of abbreviations.
								      for abbrev in match_obj.group(3).split(" "):
								        if abbrev not in extended_script_indices:
								          script_idx = script_abbrevs.index(abbrev)
								          extended_script_indices[abbrev] = script_idx
								          rec = script_data[script_idx]
								          rec[2] = low
								          rec[3] = high
								        else:
								          script_idx = extended_script_indices[abbrev]
								          rec = script_data[script_idx]
								          if rec[2] > low:
								            rec[2] = low
								          if rec[3] < high:
								            rec[3] = high

								        if rec[4] is None:
								          # Look for a character whose base script differs from this one.
								          name = script_names[script_idx]
								          for code_point in range(low, high + 1):
								            if char_data[code_point] != name:
								              rec[4] = code_point
								              break

								  # Alternates between the short and long spelling of the scx property.
								  long_property_name = False

								  for idx, rec in enumerate(script_data):
								    script_name = script_names[idx]

								    if script_name == "Unknown":
								      continue

								    script_abbrev = script_abbrevs[idx]

								    write_both("# Base script check\n")
								    _write_property_test("sc=%s" % script_name, rec[0], True)
								    _write_property_test("Script=%s" % script_abbrev, rec[1], True)

								    if rec[2] is not None:
								      property_name = "Script_Extensions" if long_property_name else "scx"
								      long_property_name = not long_property_name

								      write_both("# Script extension check\n")
								      _write_property_test(script_name, rec[2], True)
								      _write_property_test("%s=%s" % (property_name, script_abbrev), rec[3], True)

								      if rec[4] is not None:
								        write_both("# Script extension only character\n")
								        _write_property_test(script_name, rec[4], True)
								        _write_property_test("sc=%s" % script_name, rec[4], False)
								      else:
								        print("External character has not found for %s" % script_name)

								    # Use the code point just above the highest one recorded for the script.
								    high = rec[1]
								    if rec[3] is not None and rec[3] > rec[1]:
								      high = rec[3]
								    write_both("# Character not in script\n")
								    _write_property_test(script_name, high + 1, False)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# Generate all tests, append the trailer line, then close both files
								# explicitly rather than leaving them open until the interpreter exits.
								gen_script_tests()

								write_both("# End of testinput26\n")

								input_file.close()
								output_file.close()
							 |