diff --git a/core/string/char_range.inc b/core/string/char_range.inc index efae7578028..6cd1e6ac8c1 100644 --- a/core/string/char_range.inc +++ b/core/string/char_range.inc @@ -28,6 +28,8 @@ /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /**************************************************************************/ +// This file was generated using the `misc/scripts/char_range_fetch.py` script. + #ifndef CHAR_RANGE_INC #define CHAR_RANGE_INC @@ -43,7 +45,7 @@ struct CharRange { constexpr inline CharRange xid_start[] = { { 0x41, 0x5a }, - { 0x5f, 0x5f }, // Underscore technically isn't in XID_Start, but for our purposes it's included. + { 0x5f, 0x5f }, { 0x61, 0x7a }, { 0xaa, 0xaa }, { 0xb5, 0xb5 }, diff --git a/misc/scripts/char_range_fetch.py b/misc/scripts/char_range_fetch.py new file mode 100755 index 00000000000..fda8da37f08 --- /dev/null +++ b/misc/scripts/char_range_fetch.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 + +# Script used to dump char ranges for specific properties from +# the Unicode Character Database to the `char_range.inc` file. +# NOTE: This script is deliberately not integrated into the build system; +# you should run it manually whenever you want to update the data. 
import os
import sys
from typing import Dict, Final, Iterable, List, Tuple
from urllib.request import urlopen

if __name__ == "__main__":
    # When run directly, add the directory two levels above this script to the
    # import path so that the `methods` module can be resolved later.
    sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))

URL: Final[str] = "https://www.unicode.org/Public/16.0.0/ucd/DerivedCoreProperties.txt"


# Inclusive (start, end) code point ranges collected for each tracked property.
xid_start: List[Tuple[int, int]] = []
xid_continue: List[Tuple[int, int]] = []
uppercase_letter: List[Tuple[int, int]] = []
lowercase_letter: List[Tuple[int, int]] = []
unicode_letter: List[Tuple[int, int]] = []

# Maps each property name read from the data file to the list that collects it.
_PROPERTY_TABLES: Final[Dict[str, List[Tuple[int, int]]]] = {
    "XID_Start": xid_start,
    "XID_Continue": xid_continue,
    "Uppercase": uppercase_letter,
    "Lowercase": lowercase_letter,
    "Alphabetic": unicode_letter,
}


def merge_ranges(ranges: List[Tuple[int, int]]) -> None:
    """Coalesce directly adjacent ranges of `ranges` in place.

    Two consecutive entries are joined when the second one starts exactly one
    past the end of the first. Entries are processed in list order; the list is
    not sorted here, and overlapping entries are left as they are.

    Args:
        ranges: Inclusive `(start, end)` tuples. Modified in place.
    """
    if len(ranges) < 2:
        return

    merged: List[Tuple[int, int]] = [ranges[0]]
    for start, end in ranges[1:]:
        prev_start, prev_end = merged[-1]
        if prev_end + 1 == start:
            # Contiguous with the previous range: extend it.
            merged[-1] = (prev_start, end)
        else:
            merged.append((start, end))

    # Slice-assign so that every existing reference to the list sees the result.
    ranges[:] = merged


def _collect_ranges(lines: Iterable[str]) -> None:
    """Append the ranges of every tracked property found in `lines`.

    Each data line is expected to have the form
    `<code point or start..end> ; <property> [# comment]`. Lines starting with
    `#` and blank lines are skipped, as are properties that are not tracked.
    """
    for line in lines:
        if line.startswith("#") or not line.strip():
            continue

        fields: List[str] = line.split(";")
        code_points: str = fields[0].strip()
        char_property: str = fields[1].strip().split("#")[0].strip()

        table = _PROPERTY_TABLES.get(char_property)
        if table is None:
            continue

        # A single code point has no ".." separator and is its own range end.
        first, _, last = code_points.partition("..")
        table.append((int(first, 16), int(last or first, 16)))


def parse_unicode_data() -> None:
    """Download the data file at `URL` and fill the module-level range tables.

    The tables are emptied first, so calling this more than once does not
    accumulate duplicate entries.
    """
    for table in _PROPERTY_TABLES.values():
        table.clear()

    # The context manager closes the HTTP response once it has been read.
    with urlopen(URL) as response:
        lines: List[str] = [raw_line.decode("utf-8") for raw_line in response]

    _collect_ranges(lines)

    # Underscore technically isn't in XID_Start, but for our purposes it's included.
    xid_start.append((0x005F, 0x005F))
    xid_start.sort(key=lambda x: x[0])

    for table in _PROPERTY_TABLES.values():
        merge_ranges(table)


def make_array(array_name: str, range_list: List[Tuple[int, int]]) -> str:
    """Return the C++ source of a `CharRange` array definition.

    Args:
        array_name: Identifier of the generated array.
        range_list: Inclusive `(start, end)` tuples, emitted in order as
            lowercase hexadecimal literals.

    Returns:
        The array definition followed by a blank line.
    """
    entries: str = "".join(f"\t{{ 0x{start:x}, 0x{end:x} }},\n" for start, end in range_list)
    return f"constexpr inline CharRange {array_name}[] = {{\n{entries}}};\n\n"


def generate_char_range_inc() -> None:
    """Fetch the Unicode data and write `core/string/char_range.inc`.

    The output path is resolved relative to this script's directory.
    """
    # Imported here rather than at module level so that the helpers above stay
    # importable when the directory providing `methods` is not on `sys.path`.
    from methods import generate_copyright_header

    parse_unicode_data()

    source: str = generate_copyright_header("char_range.inc")

    source += f"""
// This file was generated using the `misc/scripts/char_range_fetch.py` script.

#ifndef CHAR_RANGE_INC
#define CHAR_RANGE_INC

#include "core/typedefs.h"

// Unicode Derived Core Properties
// Source: {URL}

struct CharRange {{
\tchar32_t start;
\tchar32_t end;
}};\n\n"""

    arrays: List[Tuple[str, List[Tuple[int, int]]]] = [
        ("xid_start", xid_start),
        ("xid_continue", xid_continue),
        ("uppercase_letter", uppercase_letter),
        ("lowercase_letter", lowercase_letter),
        ("unicode_letter", unicode_letter),
    ]
    source += "".join(make_array(name, table) for name, table in arrays)

    source += "#endif // CHAR_RANGE_INC\n"

    char_range_path: str = os.path.join(os.path.dirname(__file__), "../../core/string/char_range.inc")
    # Explicit encoding and newline keep the output identical on every platform.
    with open(char_range_path, "w", encoding="utf-8", newline="\n") as f:
        f.write(source)

    print("`char_range.inc` generated successfully.")


if __name__ == "__main__":
    generate_char_range_inc()