1
0
mirror of https://github.com/taigrr/arduinolibs synced 2025-01-18 04:33:12 -08:00
arduinolibs/gen/genwcwidth.c
2016-03-05 13:21:25 +10:00

236 lines
7.4 KiB
C

/*
* Copyright (C) 2016 Southern Storm Software, Pty Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
// Generates the Terminal::isWideCharacter() function from the data at:
// http://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
// http://www.unicode.org/reports/tr11/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Highest code point covered by the bitmap.
#define MAX_UNICODE 0x10FFFF
// Number of code points in the range 0..MAX_UNICODE inclusive.
#define NUM_UNICODE (MAX_UNICODE + 1)
// Bitmap with one bit per code point: bit (code % 8) of masks[code / 8]
// is set when the code point is currently flagged as wide.  The storage
// is allocated (zero-filled) and released by main().
static unsigned char *masks;
// Sets the "wide" bit in the masks bitmap for every code point in the
// inclusive range code1..code2.  Any part of the range that lies above
// MAX_UNICODE is ignored.
static void mark_range(long code1, long code2)
{
    // Clamp the upper bound once instead of testing it on every iteration.
    long last = (code2 > MAX_UNICODE) ? MAX_UNICODE : code2;
    long cp;

    for (cp = code1; cp <= last; ++cp)
        masks[cp / 8] |= (1 << (cp % 8));
}
// Clears the "wide" bit in the masks bitmap for every code point in the
// inclusive range code1..code2.  Any part of the range that lies above
// MAX_UNICODE is ignored.
static void unmark_range(long code1, long code2)
{
    // Clamp the upper bound once instead of testing it on every iteration.
    long last = (code2 > MAX_UNICODE) ? MAX_UNICODE : code2;
    long cp;

    for (cp = code1; cp <= last; ++cp)
        masks[cp / 8] &= ~(1 << (cp % 8));
}
// Writes a human-readable hex dump of the masks bitmap to stdout, one row
// per 0x100 code points (32 bitmap bytes).  A row that is entirely clear or
// entirely set, and identical to the most recently printed row, is not
// printed; each run of such rows is replaced by a single ".." line.
static void dump_ranges(void)
{
    const unsigned char *last_printed = 0;
    int elided = 0;
    long base;

    for (base = 0; base <= MAX_UNICODE; base += 0x100) {
        const unsigned char *row = masks + (base / 8);
        int total = 0;
        int uniform;
        int i;

        // The byte sum is 0 for an all-clear row and 0xFF * 32 for an
        // all-set row; anything else is a mixed row.
        for (i = 0; i < 32; ++i)
            total += row[i];
        uniform = (total == 0 || total == (0xFF * 32));

        if (uniform && last_printed && memcmp(row, last_printed, 32) == 0) {
            // Repeat of the previously printed uniform row: collapse it.
            if (!elided) {
                elided = 1;
                printf("..\n");
            }
            continue;
        }

        elided = 0;
        printf("%06lX: ", base);
        for (i = 0; i < 32; ++i)
            printf("%02X", row[i]);
        printf("\n");
        last_printed = row;
    }
    printf("\n");
}
// Prints a PROGMEM byte-array definition called "name" to stdout, holding
// the bitmap bytes that cover code points first..last (inclusive).  The
// array has (last - first + 1) / 8 elements, printed eight per line.
static void print_lookup_table(const char *name, long first, long last)
{
    const unsigned char *table = masks + first / 8;
    long count = (last - first + 1) / 8;
    long i;

    printf(" static unsigned char const %s[%ld] PROGMEM = {\n", name, count);
    for (i = 0; i < count; ++i) {
        const char *separator;

        // Each output line starts with the indent prefix.
        if ((i % 8) == 0)
            printf(" ");
        printf("0x%02X", table[i]);

        // The final element ends its line without a comma; every eighth
        // element ends its line with one; the rest are comma-separated.
        if (i >= (count - 1))
            separator = "\n";
        else if ((i % 8) == 7)
            separator = ",\n";
        else
            separator = ", ";
        fputs(separator, stdout);
    }
    printf(" };\n");
}
// Returns non-zero if the bit for "code" is currently set in the masks bitmap.
static int is_marked(long code)
{
    return (masks[code / 8] & (1 << (code % 8))) != 0;
}

// Prints the C++ source of Terminal::isWideCharacter() to stdout.
//
// The generated function consists of:
//   - two bitmap lookup tables for 0x3000..0x30FF and 0xFE00..0xFFFF,
//   - an early "return false" for code points below 0x2300,
//   - hard-coded tests for the main wide ranges, and
//   - a final test listing every code point still marked in the bitmap
//     (the "left-overs").
//
// Side effect: the bits for 0x3000..0x30FF and 0xFE00..0xFFFF are cleared
// from the masks bitmap so that they are not repeated among the left-overs.
static void recognizer(void)
{
    long code;
    int first = 1;
    printf("bool Terminal::isWideCharacter(long code)\n{\n");
    printf(" // This function was automatically generated by genwcwidth.c\n");
    print_lookup_table("range3000", 0x3000, 0x30FF);
    print_lookup_table("rangeFE00", 0xFE00, 0xFFFF);
    printf(" unsigned c;\n");
    // Bail out early for Latin character sets.
    printf(" if (code < 0x2300) {\n");
    printf(" return false;\n");
    // Densely populated ranges.
    printf(" } else if (code >= 0x3000 && code <= 0x30FF) {\n");
    printf(" c = (unsigned)(code - 0x3000);\n");
    printf(" return (pgm_read_byte(range3000 + (c / 8)) & (1 << (c %% 8))) != 0;\n");
    printf(" } else if (code >= 0xFE00 && code <= 0xFFFF) {\n");
    printf(" c = (unsigned)(code - 0xFE00);\n");
    printf(" return (pgm_read_byte(rangeFE00 + (c / 8)) & (1 << (c %% 8))) != 0;\n");
    // Deal with the main wide character ranges.
    printf(" } else if (code >= 0x3400 && code <= 0x4DBF) {\n");
    printf(" return true;\n");
    printf(" } else if (code >= 0x4E00 && code <= 0x9FFF) {\n");
    printf(" return true;\n");
    printf(" } else if (code >= 0xF900 && code <= 0xFAFF) {\n");
    printf(" return true;\n");
    printf(" } else if (code >= 0x20000 && code <= 0x2FFFD) {\n");
    printf(" return true;\n");
    printf(" } else if (code >= 0x30000 && code <= 0x3FFFD) {\n");
    printf(" return true;\n");
    printf(" } else if (");
    // Deal with the left-overs.  The table-driven ranges are removed first
    // because they are already handled by the lookups above.
    unmark_range(0x3000, 0x30FF);
    unmark_range(0xFE00, 0xFFFF);
    // Walk the bitmap and emit one clause per run of consecutive marked
    // code points: an equality test for an isolated code point, a range
    // test for a longer run.  This keeps the generated condition compact
    // when whole blocks are marked.
    // NOTE(review): clauses for code points below 0x2300 are unreachable in
    // the generated function because of the early bail-out emitted above.
    code = 0;
    while (code <= MAX_UNICODE) {
        long run_end;
        if (!is_marked(code)) {
            ++code;
            continue;
        }
        // Extend the run as far as the marked bits continue.
        run_end = code;
        while (run_end < MAX_UNICODE && is_marked(run_end + 1))
            ++run_end;
        if (!first)
            printf(" ||\n ");
        first = 0;
        if (run_end == code) {
            printf("code == 0x%04lX", (unsigned long)code);
        } else {
            printf("(code >= 0x%04lX && code <= 0x%04lX)",
                   (unsigned long)code, (unsigned long)run_end);
        }
        code = run_end + 1;
    }
    // With no left-overs an empty "else if ()" would not compile, so emit
    // a condition that is never true instead.
    if (first)
        printf("false");
    printf(") {\n");
    printf(" return true;\n");
    printf(" }\n");
    printf(" return false;\n");
    printf("}\n");
}
// Entry point: genwcwidth EastAsianWidth.txt
//
// Reads the East Asian Width data file named by argv[1], marks every code
// point whose width property starts with 'W' (wide) or 'F' (fullwidth) in
// the masks bitmap, prints a hex dump of the bitmap followed by the
// generated Terminal::isWideCharacter() source on stdout.
//
// Returns 0 on success, or 1 on a usage error, allocation failure, or
// failure to open or read the input file.
int main(int argc, char *argv[])
{
    FILE *file;
    char buffer[BUFSIZ];

    // Validate the command line before acquiring any resources.
    if (argc < 2) {
        fprintf(stderr, "Usage: %s EastAsianWidth.txt\n",
                (argc > 0 && argv[0]) ? argv[0] : "genwcwidth");
        return 1;
    }

    // Allocate memory for the "is this a wide character?" mask array.
    masks = calloc(NUM_UNICODE / 8, sizeof(unsigned char));
    if (!masks) {
        fprintf(stderr, "out of memory\n");
        return 1;
    }

    // Load the contents of "EastAsianWidth.txt".
    if ((file = fopen(argv[1], "r")) == NULL) {
        perror(argv[1]);
        free(masks);
        return 1;
    }
    while (fgets(buffer, sizeof(buffer), file)) {
        char *endptr = NULL;
        long code1;
        long code2;

        // Data lines start with an upper-case hexadecimal code point;
        // comments and blank lines are skipped.
        if (!((buffer[0] >= '0' && buffer[0] <= '9') ||
              (buffer[0] >= 'A' && buffer[0] <= 'F')))
            continue;

        // Parse either "XXXX" or "XXXX..YYYY".
        code1 = strtol(buffer, &endptr, 16);
        code2 = code1;
        if (endptr[0] == '.' && endptr[1] == '.') {
            // The end of the range must be parsed from just after the
            // "..", not from the start of the line again.
            char *range_end = endptr + 2;
            code2 = strtol(range_end, &endptr, 16);
            if (endptr == range_end)
                continue;   // ".." was not followed by a code point.
        }

        // Fields are separated by ';'.  Optional blanks around the
        // separator are tolerated.
        while (*endptr == ' ' || *endptr == '\t')
            ++endptr;
        if (*endptr != ';')
            continue;
        ++endptr;
        while (*endptr == ' ' || *endptr == '\t')
            ++endptr;

        // Recognise 'W' and 'F' as wide characters.  It is possible
        // that 'A' (ambiguous) characters may also be wide but only
        // in East Asian contexts, which we assume we're not for now.
        if (*endptr == 'W' || *endptr == 'F')
            mark_range(code1, code2);
    }
    if (ferror(file)) {
        // A read error would otherwise silently produce a truncated table.
        perror(argv[1]);
        fclose(file);
        free(masks);
        return 1;
    }
    fclose(file);

    // Some special ranges that are implicitly all-wide even if the
    // code points aren't currently allocated by the Unicode standard.
    mark_range(0x3400, 0x4DBF);
    mark_range(0x4E00, 0x9FFF);
    mark_range(0xF900, 0xFAFF);
    mark_range(0x20000, 0x2FFFD);
    mark_range(0x30000, 0x3FFFD);

    // Dump the ranges.
    dump_ranges();

    // Unmark the special ranges to make it easier to find the left-overs.
    unmark_range(0x3400, 0x4DBF);
    unmark_range(0x4E00, 0x9FFF);
    unmark_range(0xF900, 0xFAFF);
    unmark_range(0x20000, 0x2FFFD);
    unmark_range(0x30000, 0x3FFFD);

    // Create the recognition tree.
    recognizer();

    // Clean up and exit.
    free(masks);
    return 0;
}