/*
* Copyright (C) 2012-2017 Reece H. Dunn
*
* This file is part of ucd-tools.
*
* ucd-tools is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* ucd-tools is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
*/
#include "ucd/ucd.h"
#include <stdio.h>
#include <string.h>
/* Write the codepoint `c` to `out` using a UTF-8 style byte layout.
 *
 * Values below 0x80 take one byte, below 0x800 two bytes, below 0x10000
 * three bytes and below 0x200000 four bytes. Values of 0x200000 and above
 * produce no output at all.
 */
void fput_utf8c(FILE *out, ucd::codepoint_t c)
{
    uint8_t encoded[4];
    int length = 0;

    if (c < 0x80)
    {
        encoded[length++] = static_cast<uint8_t>(c);
    }
    else if (c < 0x800)
    {
        encoded[length++] = static_cast<uint8_t>(0xC0 | (c >> 6));
        encoded[length++] = static_cast<uint8_t>(0x80 | (c & 0x3F));
    }
    else if (c < 0x10000)
    {
        encoded[length++] = static_cast<uint8_t>(0xE0 | (c >> 12));
        encoded[length++] = static_cast<uint8_t>(0x80 | ((c >> 6) & 0x3F));
        encoded[length++] = static_cast<uint8_t>(0x80 | (c & 0x3F));
    }
    else if (c < 0x200000)
    {
        encoded[length++] = static_cast<uint8_t>(0xF0 | (c >> 18));
        encoded[length++] = static_cast<uint8_t>(0x80 | ((c >> 12) & 0x3F));
        encoded[length++] = static_cast<uint8_t>(0x80 | ((c >> 6) & 0x3F));
        encoded[length++] = static_cast<uint8_t>(0x80 | (c & 0x3F));
    }

    // Emit the bytes in order; nothing is written when length stayed at 0.
    for (int i = 0; i < length; ++i)
        fputc(encoded[i], out);
}
bool fget_utf8c(FILE *in, ucd::codepoint_t &c)
{
int ch = EOF;
if ((ch = fgetc(in)) == EOF) return false;
if (uint8_t(ch) < 0x80)
c = uint8_t(ch);
else switch (uint8_t(ch) & 0xF0)
{
default:
c = uint8_t(ch) & 0x1F;
if ((ch = fgetc(in)) == EOF) return false;
c = (c << 6) + (uint8_t(ch) & 0x3F);
break;
case 0xE0:
c = uint8_t(ch) & 0x0F;
if ((ch = fgetc(in)) == EOF) return false;
c = (c << 6) + (uint8_t(ch) & 0x3F);
if ((ch = fgetc(in)) == EOF) return false;
c = (c << 6) + (uint8_t(ch) & 0x3F);
break;
case 0xF0:
c = uint8_t(ch) & 0x07;
if ((ch = fgetc(in)) == EOF) return false;
c = (c << 6) + (uint8_t(ch) & 0x3F);
if ((ch = fgetc(in)) == EOF) return false;
c = (c << 6) + (uint8_t(ch) & 0x3F);
if ((ch = fgetc(in)) == EOF) return false;
c = (c << 6) + (uint8_t(ch) & 0x3F);
break;
}
return true;
}
/* Print the codepoint `c` to `out` in the representation chosen by `mode`:
 *
 *   'c'  the character itself via fput_utf8c, except that tab, carriage
 *        return and newline are written as the two-character escapes
 *        \t, \r and \n
 *   'h'  six-digit zero-padded lower-case hexadecimal
 *   'H'  six-digit zero-padded upper-case hexadecimal
 *
 * Any other mode prints nothing.
 */
void uprintf_codepoint(FILE *out, ucd::codepoint_t c, char mode)
{
    if (mode == 'h')
    {
        fprintf(out, "%06x", c);
        return;
    }
    if (mode == 'H')
    {
        fprintf(out, "%06X", c);
        return;
    }
    if (mode != 'c')
        return;

    // Character mode: keep the output on one line by escaping \t, \r and \n.
    if (c == '\t')
        fputs("\\t", out);
    else if (c == '\r')
        fputs("\\r", out);
    else if (c == '\n')
        fputs("\\n", out);
    else
        fput_utf8c(out, c);
}
/* Print '1' or '0' to `out` depending on whether the ucd::is* predicate
 * selected by `mode` holds for the codepoint `c`:
 *
 *   'A' isalnum   'a' isalpha   'b' isblank   'c' iscntrl
 *   'd' isdigit   'g' isgraph   'l' islower   'P' isprint
 *   'p' ispunct   's' isspace   'u' isupper
 *
 * Any other mode prints nothing.
 */
void uprintf_is(FILE *out, ucd::codepoint_t c, char mode)
{
    bool matches = false;
    switch (mode)
    {
    case 'A': matches = ucd::isalnum(c) != 0; break; // alpha-numeric
    case 'a': matches = ucd::isalpha(c) != 0; break; // alpha
    case 'b': matches = ucd::isblank(c) != 0; break; // blank
    case 'c': matches = ucd::iscntrl(c) != 0; break; // control
    case 'd': matches = ucd::isdigit(c) != 0; break; // numeric
    case 'g': matches = ucd::isgraph(c) != 0; break; // glyph
    case 'l': matches = ucd::islower(c) != 0; break; // lower case
    case 'P': matches = ucd::isprint(c) != 0; break; // printable
    case 'p': matches = ucd::ispunct(c) != 0; break; // punctuation
    case 's': matches = ucd::isspace(c) != 0; break; // whitespace
    case 'u': matches = ucd::isupper(c) != 0; break; // upper case
    default:
        return; // unknown mode: no output
    }
    fputc(matches ? '1' : '0', out);
}
/* Print properties of the codepoint `c` to `out` as directed by `format`.
 *
 * Conversion specifiers (`<m>` is a mode character passed on to
 * uprintf_codepoint or uprintf_is):
 *
 *   %c     category string
 *   %C     category group string
 *   %p<m>  the codepoint itself
 *   %i<m>  result of an is* predicate
 *   %L<m>  lower-case mapping of the codepoint
 *   %s     script string
 *   %T<m>  title-case mapping of the codepoint
 *   %U<m>  upper-case mapping of the codepoint
 *   %W     "White_Space" if ucd::isspace(c) holds, otherwise nothing
 *
 * An unrecognised specifier character is consumed and prints nothing.
 *
 * Backslash escapes: \t, \r and \n print tab, carriage return and newline;
 * a backslash followed by any other character prints that character; a
 * backslash at the very end of the format is ignored.
 *
 * All other characters are copied to `out` unchanged.
 *
 * A format that ends in the middle of a conversion (a lone trailing '%',
 * or a two-character specifier without its mode) simply stops producing
 * output; the terminating NUL is never stepped over.
 */
void uprintf(FILE *out, ucd::codepoint_t c, const char *format)
{
    while (*format) switch (*format)
    {
    case '%':
        switch (*++format)
        {
        case 0: // lone '%' at the end of the format
            return;
        case 'c': // category
            fputs(ucd::get_category_string(ucd::lookup_category(c)), out);
            break;
        case 'C': // category group
            fputs(ucd::get_category_group_string(ucd::lookup_category_group(c)), out);
            break;
        case 'p': // codepoint
            if (*++format == 0) return; // mode character missing
            uprintf_codepoint(out, c, *format);
            break;
        case 'i': // is*
            if (*++format == 0) return; // mode character missing
            uprintf_is(out, c, *format);
            break;
        case 'L': // lowercase
            if (*++format == 0) return; // mode character missing
            uprintf_codepoint(out, ucd::tolower(c), *format);
            break;
        case 's': // script
            fputs(ucd::get_script_string(ucd::lookup_script(c)), out);
            break;
        case 'T': // titlecase
            if (*++format == 0) return; // mode character missing
            uprintf_codepoint(out, ucd::totitle(c), *format);
            break;
        case 'U': // uppercase
            if (*++format == 0) return; // mode character missing
            uprintf_codepoint(out, ucd::toupper(c), *format);
            break;
        case 'W': // whitespace
            if (ucd::isspace(c))
                fputs("White_Space", out);
            break;
        }
        // Step past the last character of the specifier; every path that
        // reaches here has *format != 0, so this cannot pass the terminator.
        ++format;
        break;
    case '\\':
        switch (*++format) {
        case 0: // trailing backslash: leave format on the terminator
            break;
        case 't':
            fputc('\t', out);
            ++format;
            break;
        case 'r':
            fputc('\r', out);
            ++format;
            break;
        case 'n':
            fputc('\n', out);
            ++format;
            break;
        default: // unknown escape: print the escaped character itself
            fputc(*format, out);
            ++format;
            break;
        }
        break;
    default:
        fputc(*format, out);
        ++format;
        break;
    }
}
void print_file(FILE *in, const char *format)
{
ucd::codepoint_t c = 0;
while (fget_utf8c(in, c))
uprintf(stdout, c, format ? format : "%pc\t%pH\t%s\t%c\t%Uc\t%Lc\t%Tc\t%W\n");
}
/* Command-line entry point.
 *
 * Arguments:
 *   --stdin, -      read codepoints from standard input
 *   --format=FMT    use FMT as the uprintf format instead of the default
 *   anything else   treated as an input file name; only consulted while no
 *                   input has been selected yet
 *
 * With an input selected, every codepoint read from it is printed via
 * print_file. Without one, every codepoint from 0 to 0x10FFFF is printed.
 *
 * Always returns 0.
 */
int main(int argc, char **argv)
{
    FILE *in = NULL;
    const char *format = NULL;
    for (int argn = 1; argn < argc; ++argn)
    {
        const char *arg = argv[argn];
        if (!strcmp(arg, "--stdin") || !strcmp(arg, "-"))
        {
            // Do not leak a file opened by an earlier argument.
            if (in != NULL && in != stdin)
                fclose(in);
            in = stdin;
        }
        else if (!strncmp(arg, "--format=", 9))
            format = arg + 9;
        else if (in == NULL)
        {
            in = fopen(arg, "r");
            if (!in)
                fprintf(stdout, "cannot open `%s`\n", arg); // name the file that actually failed
        }
    }
    if (in == stdin)
        print_file(stdin, format);
    else if (in != NULL)
    {
        print_file(in, format);
        fclose(in);
    }
    else
    {
        // No input selected: dump the whole codepoint range.
        for (ucd::codepoint_t c = 0; c <= 0x10FFFF; ++c)
            uprintf(stdout, c, format ? format : "%pH %s %C %c %UH %LH %TH %W\n");
    }
    return 0;
}