/*
 * Copyright (C) 2016 Southern Storm Software, Pty Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

// Generates the Terminal::isWideCharacter() function from the data at:
// http://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
// http://www.unicode.org/reports/tr11/
//
// Usage: genwcwidth EastAsianWidth.txt
//
// The program writes a human-readable dump of the wide-character bitmap to
// stdout, followed by the generated C++ source for the recognizer function.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_UNICODE 0x10FFFF
#define NUM_UNICODE (MAX_UNICODE + 1)

// The dump groups code points into pages of 0x100, i.e. 32 bitmap bytes.
#define PAGE_CODES 0x100
#define PAGE_BYTES (PAGE_CODES / 8)

// Code points below this value are rejected first by the generated function,
// unless the data marks a wide character below it (see recognizer()).
#define DEFAULT_BAIL_OUT 0x2300L

// Bitmap with one bit per code point; a set bit means "wide character".
static unsigned char *masks;

// Returns non-zero if "code" is currently marked in the bitmap.
// The caller must pass a value in the range 0..MAX_UNICODE.
static int is_marked(long code)
{
    return (masks[code / 8] & (1u << (code % 8))) != 0;
}

// Sets the bits for every code point in code1..code2 (inclusive).
// The range is clamped to 0..MAX_UNICODE; an empty range is a no-op.
static void mark_range(long code1, long code2)
{
    if (code1 < 0) {
        code1 = 0;
    }
    if (code2 > MAX_UNICODE) {
        code2 = MAX_UNICODE;
    }
    for (; code1 <= code2; ++code1) {
        masks[code1 / 8] |= (unsigned char)(1u << (code1 % 8));
    }
}

// Clears the bits for every code point in code1..code2 (inclusive).
// The range is clamped to 0..MAX_UNICODE; an empty range is a no-op.
static void unmark_range(long code1, long code2)
{
    if (code1 < 0) {
        code1 = 0;
    }
    if (code2 > MAX_UNICODE) {
        code2 = MAX_UNICODE;
    }
    for (; code1 <= code2; ++code1) {
        masks[code1 / 8] &= (unsigned char)~(1u << (code1 % 8));
    }
}

// Returns the lowest marked code point, or -1 if nothing is marked.
static long first_marked(void)
{
    long code;
    for (code = 0; code <= MAX_UNICODE; ++code) {
        if (is_marked(code)) {
            return code;
        }
    }
    return -1;
}

// Prints the bitmap to stdout, one line of hexadecimal bytes per page of
// 0x100 code points.  A run of all-clear or all-set pages identical to the
// most recently printed page is collapsed into a single ".." line.
static void dump_ranges(void)
{
    long code;
    int index, sum;
    unsigned char *prevptr = NULL;
    unsigned char *ptr;
    int dotdot = 0;

    for (code = 0; code <= MAX_UNICODE; code += PAGE_CODES) {
        ptr = masks + (code / 8);

        // The byte sum is 0 only for an all-clear page and 0xFF * 32 only
        // for an all-set page; only those uniform pages are collapsed.
        sum = 0;
        for (index = 0; index < PAGE_BYTES; ++index) {
            sum += ptr[index];
        }
        if ((sum == 0 || sum == (0xFF * PAGE_BYTES)) &&
                prevptr && !memcmp(ptr, prevptr, PAGE_BYTES)) {
            if (!dotdot) {
                dotdot = 1;
                printf("..\n");
            }
            continue;
        }

        dotdot = 0;
        printf("%06lX: ", code);
        for (index = 0; index < PAGE_BYTES; ++index) {
            printf("%02X", ptr[index]);
        }
        printf("\n");
        prevptr = ptr;
    }
    printf("\n");
}

// Emits a PROGMEM byte-array declaration called "name" holding the bitmap
// bytes for the code points first..last (inclusive).  Both "first" and
// "last + 1" are expected to be multiples of 8 so that the table starts and
// ends on a byte boundary.
static void print_lookup_table(const char *name, long first, long last)
{
    long index, size;
    unsigned char *ptr = masks + first / 8;

    size = (last - first + 1) / 8;
    printf(" static unsigned char const %s[%ld] PROGMEM = {\n", name, size);
    for (index = 0; index < size; ++index) {
        // Eight bytes per output line.
        if ((index % 8) == 0) {
            printf(" ");
        }
        printf("0x%02X", ptr[index]);
        if (index < (size - 1)) {
            if ((index % 8) == 7) {
                printf(",\n");
            } else {
                printf(", ");
            }
        } else {
            printf("\n");
        }
    }
    printf(" };\n");
}

// Emits the source code for Terminal::isWideCharacter() to stdout.
//
// Expects the five "special" all-wide ranges to have been unmarked already
// by the caller because they are emitted as fixed range tests.  This function
// additionally unmarks the two densely populated ranges after emitting their
// lookup tables, so the bitmap is modified as a side effect.
static void recognizer(void)
{
    long code;
    long start;
    long lowest;
    long bail_out = DEFAULT_BAIL_OUT;
    int first = 1;

    // Never bail out above the lowest wide code point, otherwise the tests
    // emitted further down for that code point would be unreachable.
    lowest = first_marked();
    if (lowest >= 0 && lowest < bail_out) {
        bail_out = lowest;
    }

    printf("bool Terminal::isWideCharacter(long code)\n{\n");
    printf(" // This function was automatically generated by genwcwidth.c\n");
    print_lookup_table("range3000", 0x3000, 0x30FF);
    print_lookup_table("rangeFE00", 0xFE00, 0xFFFF);
    printf(" unsigned c;\n");

    // Bail out early for Latin character sets.
    printf(" if (code < 0x%04lX) {\n", bail_out);
    printf(" return false;\n");

    // Densely populated ranges.
    printf(" } else if (code >= 0x3000 && code <= 0x30FF) {\n");
    printf(" c = (unsigned)(code - 0x3000);\n");
    printf(" return (pgm_read_byte(range3000 + (c / 8)) & (1 << (c %% 8))) != 0;\n");
    printf(" } else if (code >= 0xFE00 && code <= 0xFFFF) {\n");
    printf(" c = (unsigned)(code - 0xFE00);\n");
    printf(" return (pgm_read_byte(rangeFE00 + (c / 8)) & (1 << (c %% 8))) != 0;\n");

    // Deal with the main wide character ranges.
    printf(" } else if (code >= 0x3400 && code <= 0x4DBF) {\n");
    printf(" return true;\n");
    printf(" } else if (code >= 0x4E00 && code <= 0x9FFF) {\n");
    printf(" return true;\n");
    printf(" } else if (code >= 0xF900 && code <= 0xFAFF) {\n");
    printf(" return true;\n");
    printf(" } else if (code >= 0x20000 && code <= 0x2FFFD) {\n");
    printf(" return true;\n");
    printf(" } else if (code >= 0x30000 && code <= 0x3FFFD) {\n");
    printf(" return true;\n");
    printf(" } else if (");

    // Deal with the left-overs.  Runs of consecutive code points are
    // emitted as a single range test to keep the generated code small.
    unmark_range(0x3000, 0x30FF);
    unmark_range(0xFE00, 0xFFFF);
    code = 0;
    while (code <= MAX_UNICODE) {
        if (!is_marked(code)) {
            ++code;
            continue;
        }
        start = code;
        while (code < MAX_UNICODE && is_marked(code + 1)) {
            ++code;
        }
        if (!first) {
            printf(" ||\n ");
        }
        first = 0;
        if (start == code) {
            printf("code == 0x%04lX", code);
        } else {
            printf("(code >= 0x%04lX && code <= 0x%04lX)", start, code);
        }
        ++code;
    }
    if (first) {
        // No left-overs: keep the generated condition syntactically valid.
        printf("false");
    }
    printf(") {\n");
    printf(" return true;\n");
    printf(" }\n");
    printf(" return false;\n");
    printf("}\n");
}

// Advances past spaces and tabs and returns the first other character.
static const char *skip_blanks(const char *p)
{
    while (*p == ' ' || *p == '\t') {
        ++p;
    }
    return p;
}

// Parses one line of EastAsianWidth.txt and marks its code point(s) if the
// width property is 'W' (wide) or 'F' (fullwidth).  Lines that do not start
// with an upper-case hexadecimal digit (comments, blank lines) are ignored.
// Accepted forms are "XXXX;P" and "XXXX..YYYY;P", with optional blanks
// around the semicolon.
static void parse_line(const char *line)
{
    char *end = NULL;
    const char *p;
    long code1, code2;

    if (!((line[0] >= '0' && line[0] <= '9') ||
          (line[0] >= 'A' && line[0] <= 'F'))) {
        return;
    }

    code1 = strtol(line, &end, 16);
    if (end[0] == '.' && end[1] == '.') {
        // Continue parsing after the "..", not from the start of the line.
        code2 = strtol(end + 2, &end, 16);
    } else {
        code2 = code1;
    }

    p = skip_blanks(end);
    if (*p != ';') {
        return;
    }
    p = skip_blanks(p + 1);

    // Recognise 'W' and 'F' as wide characters.  It is possible
    // that 'A' (ambiguous) characters may also be wide but only
    // in East Asian contexts, which we assume we're not for now.
    if (*p == 'W' || *p == 'F') {
        mark_range(code1, code2);
    }
}

// Entry point.  argv[1] names the EastAsianWidth.txt file to read.
// Returns 0 on success and 1 on a usage, I/O or allocation error.
int main(int argc, char *argv[])
{
    FILE *file;
    char buffer[BUFSIZ];
    int at_line_start = 1;

    // Validate the arguments before acquiring any resources.
    if (argc < 2) {
        fprintf(stderr, "Usage: %s EastAsianWidth.txt\n",
                argc > 0 ? argv[0] : "genwcwidth");
        return 1;
    }
    if ((file = fopen(argv[1], "r")) == NULL) {
        perror(argv[1]);
        return 1;
    }

    // Allocate memory for the "is this a wide character?" mask array.
    masks = calloc(NUM_UNICODE / 8, sizeof *masks);
    if (!masks) {
        fprintf(stderr, "out of memory\n");
        fclose(file);
        return 1;
    }

    // Load the contents of "EastAsianWidth.txt".
    while (fgets(buffer, sizeof(buffer), file)) {
        // Only parse chunks that begin a line; the tail of an over-long
        // line must not be mistaken for a new record.
        if (at_line_start) {
            parse_line(buffer);
        }
        at_line_start = (strchr(buffer, '\n') != NULL);
    }
    if (ferror(file)) {
        perror(argv[1]);
        fclose(file);
        free(masks);
        return 1;
    }
    fclose(file);

    // Some special ranges that are implicitly all-wide even if the
    // code points aren't currently allocated by the Unicode standard.
    mark_range(0x3400, 0x4DBF);
    mark_range(0x4E00, 0x9FFF);
    mark_range(0xF900, 0xFAFF);
    mark_range(0x20000, 0x2FFFD);
    mark_range(0x30000, 0x3FFFD);

    // Dump the ranges.
    dump_ranges();

    // Unmark the special ranges to make it easier to find the left-overs.
    unmark_range(0x3400, 0x4DBF);
    unmark_range(0x4E00, 0x9FFF);
    unmark_range(0xF900, 0xFAFF);
    unmark_range(0x20000, 0x2FFFD);
    unmark_range(0x30000, 0x3FFFD);

    // Create the recognition tree.
    recognizer();

    // Clean up and exit.
    free(masks);
    masks = NULL;
    return 0;
}