arduinolibs/gen/genwcwidth.c

/*
 * Copyright (C) 2016 Southern Storm Software, Pty Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

// Generates the Terminal::isWideCharacter() function from the data at:
// http://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
// http://www.unicode.org/reports/tr11/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Highest Unicode code point handled by this tool, and the total number
// of code points (0 through MAX_UNICODE inclusive).
#define MAX_UNICODE 0x10FFFF
#define NUM_UNICODE (MAX_UNICODE + 1)

// Bitmap with one bit per code point: bit (code % 8) of masks[code / 8]
// is set while the code point is marked.  Allocated by main() with
// NUM_UNICODE / 8 zeroed bytes.
static unsigned char *masks;

// Sets the bit in "masks" for every code point in the inclusive range
// code1..code2.  Code points beyond MAX_UNICODE are silently ignored.
static void mark_range(long code1, long code2)
{
    long code;
    long stop = code2;

    // Clamp the end of the range so that we never index past the bitmap.
    if (stop > MAX_UNICODE)
        stop = MAX_UNICODE;

    for (code = code1; code <= stop; ++code)
        masks[code / 8] |= (1 << (code % 8));
}

// Clears the bit in "masks" for every code point in the inclusive range
// code1..code2.  Code points beyond MAX_UNICODE are silently ignored.
static void unmark_range(long code1, long code2)
{
    long code;
    long stop = code2;

    // Clamp the end of the range so that we never index past the bitmap.
    if (stop > MAX_UNICODE)
        stop = MAX_UNICODE;

    for (code = code1; code <= stop; ++code)
        masks[code / 8] &= ~(1 << (code % 8));
}

// Prints the bitmap to stdout as one hexadecimal row per 256 code points
// (32 bytes), prefixed with the first code point of the row.  A row that
// is entirely clear or entirely set, and identical to the most recently
// printed row, is elided; each run of elided rows shows up as a single
// ".." line.
static void dump_ranges(void)
{
    unsigned char *last_printed = 0;
    int elided = 0;
    long base;

    for (base = 0; base <= MAX_UNICODE; base += 0x100) {
        unsigned char *row = masks + (base / 8);
        int total = 0;
        int i;

        // The byte sum is 0 for an all-clear row and 0xFF * 32 for an
        // all-set row; only such uniform rows are candidates for eliding.
        for (i = 0; i < 32; ++i)
            total += row[i];

        if ((total == 0 || total == (0xFF * 32)) &&
                last_printed && !memcmp(row, last_printed, 32)) {
            if (!elided) {
                elided = 1;
                printf("..\n");
            }
            continue;
        }

        elided = 0;
        printf("%06lX: ", base);
        for (i = 0; i < 32; ++i)
            printf("%02X", row[i]);
        printf("\n");
        last_printed = row;
    }
    printf("\n");
}

// Prints a C array declaration called "name" to stdout whose initializer
// holds the bitmap bytes covering code points first..last, eight bytes
// per output line.  The byte count is (last - first + 1) / 8 and the
// bytes are read starting at masks[first / 8].
static void print_lookup_table(const char *name, long first, long last)
{
    const unsigned char *bytes = masks + first / 8;
    long count = (last - first + 1) / 8;
    long pos;

    printf("    static unsigned char const %s[%ld] PROGMEM = {\n", name, count);
    for (pos = 0; pos < count; ++pos) {
        const char *separator;

        // Indent the start of every group of eight values.
        if ((pos % 8) == 0)
            printf("        ");
        printf("0x%02X", bytes[pos]);

        // The final value is followed by a bare newline; all others get a
        // comma, with a line break after every eighth value.
        if (pos >= (count - 1))
            separator = "\n";
        else if ((pos % 8) == 7)
            separator = ",\n";
        else
            separator = ", ";
        fputs(separator, stdout);
    }
    printf("    };\n");
}

// Prints the source code of Terminal::isWideCharacter() to stdout.
//
// The generated function consists of, in order:
//   - two bitmap lookup tables for 0x3000..0x30FF and 0xFE00..0xFFFF,
//     emitted from the current contents of "masks";
//   - an early "return false" for every code below 0x2300;
//   - lookups into the two tables;
//   - hard-wired "return true" tests for the big all-wide ranges;
//   - a final clause listing whatever is still marked in "masks".
//
// The caller is expected to have already unmarked the hard-wired ranges.
// This function itself clears 0x3000..0x30FF and 0xFE00..0xFFFF from
// "masks" once their tables have been printed, so "masks" is modified.
static void recognizer(void)
{
    long code;
    int first = 1;

    printf("bool Terminal::isWideCharacter(long code)\n{\n");
    printf("    // This function was automatically generated by genwcwidth.c\n");
    print_lookup_table("range3000", 0x3000, 0x30FF);
    print_lookup_table("rangeFE00", 0xFE00, 0xFFFF);
    printf("    unsigned c;\n");

    // Bail out early for Latin character sets.
    printf("    if (code < 0x2300) {\n");
    printf("        return false;\n");

    // Densely populated ranges.
    printf("    } else if (code >= 0x3000 && code <= 0x30FF) {\n");
    printf("        c = (unsigned)(code - 0x3000);\n");
    printf("        return (pgm_read_byte(range3000 + (c / 8)) & (1 << (c %% 8))) != 0;\n");
    printf("    } else if (code >= 0xFE00 && code <= 0xFFFF) {\n");
    printf("        c = (unsigned)(code - 0xFE00);\n");
    printf("        return (pgm_read_byte(rangeFE00 + (c / 8)) & (1 << (c %% 8))) != 0;\n");

    // Deal with the main wide character ranges.
    printf("    } else if (code >= 0x3400 && code <= 0x4DBF) {\n");
    printf("        return true;\n");
    printf("    } else if (code >= 0x20000 && code <= 0x2FFFD) {\n");
    printf("        return true;\n");
    printf("    } else if (code >= 0x30000 && code <= 0x3FFFD) {\n");
    printf("        return true;\n");
    printf("    } else if (code >= 0x4E00 && code <= 0x9FFF) {\n");
    printf("        return true;\n");
    printf("    } else if (code >= 0xF900 && code <= 0xFAFF) {\n");
    printf("        return true;\n");
    printf("    } else if (");

    // Deal with the left-overs.  The two table ranges are handled by the
    // lookups above, so remove them before scanning.
    unmark_range(0x3000, 0x30FF);
    unmark_range(0xFE00, 0xFFFF);

    // Walk the bitmap and emit one term per run of consecutive marked
    // code points: "code == X" for an isolated code point, or a range
    // test for a longer run.  Emitting one equality test per code point
    // would blow up the generated function for any contiguous run.
    // NOTE: the generated function returns false for anything below
    // 0x2300 before this clause is reached, so terms below that value
    // are never evaluated in the generated code.
    code = 0;
    while (code <= MAX_UNICODE) {
        long start;

        if (!(masks[code / 8] & (1 << (code % 8)))) {
            ++code;
            continue;
        }

        // Extend the run while the next code point is also marked.
        start = code;
        while (code < MAX_UNICODE &&
               (masks[(code + 1) / 8] & (1 << ((code + 1) % 8))))
            ++code;

        if (!first)
            printf(" ||\n               ");
        else
            first = 0;
        if (start == code)
            printf("code == 0x%04lX", start);
        else
            printf("(code >= 0x%04lX && code <= 0x%04lX)", start, code);
        ++code;
    }

    // With nothing left over, an empty "else if ()" would not compile.
    if (first)
        printf("false");
    printf(") {\n");
    printf("        return true;\n");
    printf("    }\n");

    printf("    return false;\n");
    printf("}\n");
}

// Entry point: genwcwidth EastAsianWidth.txt
//
// Reads the East Asian Width data file named by argv[1], marks every code
// point whose width property starts with 'W' or 'F' in the "masks" bitmap,
// dumps the bitmap, and then prints the generated recognition function.
// Returns 0 on success, or 1 on a usage error, allocation failure, or
// failure to open or read the input file.
int main(int argc, char *argv[])
{
    FILE *file;
    char buffer[BUFSIZ];

    // Check the command line before acquiring any resources.
    if (argc < 2) {
        fprintf(stderr, "Usage: %s EastAsianWidth.txt\n",
                argc > 0 ? argv[0] : "genwcwidth");
        return 1;
    }

    // Allocate memory for the "is this a wide character?" mask array.
    masks = calloc(NUM_UNICODE / 8, sizeof(unsigned char));
    if (!masks) {
        fprintf(stderr, "out of memory\n");
        return 1;
    }

    // Load the contents of "EastAsianWidth.txt".
    if ((file = fopen(argv[1], "r")) == NULL) {
        perror(argv[1]);
        free(masks);
        return 1;
    }
    while (fgets(buffer, sizeof(buffer), file)) {
        long code1;
        long code2;
        char *endptr = NULL;

        // Data lines start with an upper-case hexadecimal code point;
        // comments and blank lines are skipped.
        if (!((buffer[0] >= '0' && buffer[0] <= '9') ||
              (buffer[0] >= 'A' && buffer[0] <= 'F')))
            continue;

        code1 = strtol(buffer, &endptr, 16);
        if (endptr[0] == '.' && endptr[1] == '.') {
            // "XXXX..YYYY": parse the end of the range from the text that
            // follows the dots.  Parsing from the start of the buffer
            // again would leave endptr on the ".." and cause every range
            // line to be dropped by the ';' check below.
            code2 = strtol(endptr + 2, &endptr, 16);
        } else {
            code2 = code1;
        }

        // UCD data files allow spaces around the ';' field separator,
        // so accept both "3000;F" and "3000   ; F".
        while (*endptr == ' ' || *endptr == '\t')
            ++endptr;
        if (*endptr != ';')
            continue;
        ++endptr;
        while (*endptr == ' ' || *endptr == '\t')
            ++endptr;

        // Recognise 'W' and 'F' as wide characters.  It is possible
        // that 'A' (ambiguous) characters may also be wide but only
        // in East Asian contexts, which we assume we're not for now.
        if (*endptr == 'W' || *endptr == 'F')
            mark_range(code1, code2);
    }
    if (ferror(file)) {
        // A read error would otherwise silently produce truncated tables.
        perror(argv[1]);
        fclose(file);
        free(masks);
        return 1;
    }
    fclose(file);

    // Some special ranges that are implicitly all-wide even if the
    // code points aren't currently allocated by the Unicode standard.
    mark_range(0x3400, 0x4DBF);
    mark_range(0x4E00, 0x9FFF);
    mark_range(0xF900, 0xFAFF);
    mark_range(0x20000, 0x2FFFD);
    mark_range(0x30000, 0x3FFFD);

    // Dump the ranges.
    dump_ranges();

    // Unmark the special ranges to make it easier to find the left-overs.
    unmark_range(0x3400, 0x4DBF);
    unmark_range(0x4E00, 0x9FFF);
    unmark_range(0xF900, 0xFAFF);
    unmark_range(0x20000, 0x2FFFD);
    unmark_range(0x30000, 0x3FFFD);

    // Create the recognition tree.
    recognizer();

    // Clean up and exit.
    free(masks);
    return 0;
}