eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

printcdata.c 6.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. /*
  2. * Copyright (C) 2012-2017 Reece H. Dunn
  3. *
  4. * This file is part of ucd-tools.
  5. *
  6. * ucd-tools is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * ucd-tools is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #include "config.h"
  20. #include "ucd/ucd.h"
  21. #include <locale.h>
  22. #include <string.h>
  23. #include <stdio.h>
  24. #include <wchar.h>
  25. #include <wctype.h>
  #ifndef HAVE_ISWBLANK
  /*
   * Local iswblank(), compiled only when HAVE_ISWBLANK is not defined
   * (presumably the build configuration defines it when the C library
   * already supplies iswblank() -- confirm against config.h).
   *
   * A character counts as blank when iswspace() accepts it and it is not one
   * of the code points 0x0A..0x0D.
   *
   * Returns 1 for a blank character and 0 otherwise.
   */
  static int iswblank(wint_t c)
  {
      /* 0x0A..0x0D are never blank, whatever iswspace() says about them. */
      if (c >= 0x0A && c <= 0x0D)
          return 0;

      return iswspace(c) != 0;
  }
  #endif
  32. static void fput_utf8c(FILE *out, codepoint_t c)
  33. {
  34. if (c < 0x80)
  35. fputc((uint8_t)c, out);
  36. else if (c < 0x800)
  37. {
  38. fputc(0xC0 | (c >> 6), out);
  39. fputc(0x80 + (c & 0x3F), out);
  40. }
  41. else if (c < 0x10000)
  42. {
  43. fputc(0xE0 | (c >> 12), out);
  44. fputc(0x80 + ((c >> 6) & 0x3F), out);
  45. fputc(0x80 + (c & 0x3F), out);
  46. }
  47. else if (c < 0x200000)
  48. {
  49. fputc(0xF0 | (c >> 18), out);
  50. fputc(0x80 + ((c >> 12) & 0x3F), out);
  51. fputc(0x80 + ((c >> 6) & 0x3F), out);
  52. fputc(0x80 + (c & 0x3F), out);
  53. }
  54. }
  55. static int fget_utf8c(FILE *in, codepoint_t *c)
  56. {
  57. int ch = EOF;
  58. if ((ch = fgetc(in)) == EOF) return 0;
  59. if ((uint8_t)ch < 0x80)
  60. *c = (uint8_t)ch;
  61. else switch ((uint8_t)ch & 0xF0)
  62. {
  63. default:
  64. *c = (uint8_t)ch & 0x1F;
  65. if ((ch = fgetc(in)) == EOF) return 0;
  66. *c = (*c << 6) + ((uint8_t)ch & 0x3F);
  67. break;
  68. case 0xE0:
  69. *c = (uint8_t)ch & 0x0F;
  70. if ((ch = fgetc(in)) == EOF) return 0;
  71. *c = (*c << 6) + ((uint8_t)ch & 0x3F);
  72. if ((ch = fgetc(in)) == EOF) return 0;
  73. *c = (*c << 6) + ((uint8_t)ch & 0x3F);
  74. break;
  75. case 0xF0:
  76. *c = (uint8_t)ch & 0x07;
  77. if ((ch = fgetc(in)) == EOF) return 0;
  78. *c = (*c << 6) + ((uint8_t)ch & 0x3F);
  79. if ((ch = fgetc(in)) == EOF) return 0;
  80. *c = (*c << 6) + ((uint8_t)ch & 0x3F);
  81. if ((ch = fgetc(in)) == EOF) return 0;
  82. *c = (*c << 6) + ((uint8_t)ch & 0x3F);
  83. break;
  84. }
  85. return 1;
  86. }
  87. static void uprintf_codepoint(FILE *out, codepoint_t c, char mode)
  88. {
  89. switch (mode)
  90. {
  91. case 'c': /* character */
  92. switch (c)
  93. {
  94. case '\t': fputs("\\t", out); break;
  95. case '\r': fputs("\\r", out); break;
  96. case '\n': fputs("\\n", out); break;
  97. default: fput_utf8c(out, c); break;
  98. }
  99. break;
  100. case 'h': /* hexadecimal (lower) */
  101. fprintf(out, "%06x", c);
  102. break;
  103. case 'H': /* hexadecimal (upper) */
  104. fprintf(out, "%06X", c);
  105. break;
  106. }
  107. }
  108. static void uprintf_is(FILE *out, codepoint_t c, char mode)
  109. {
  110. switch (mode)
  111. {
  112. case 'A': /* alpha-numeric */
  113. fputc(iswalnum(c) ? '1' : '0', out);
  114. break;
  115. case 'a': /* alpha */
  116. fputc(iswalpha(c) ? '1' : '0', out);
  117. break;
  118. case 'b': /* blank */
  119. fputc(iswblank(c) ? '1' : '0', out);
  120. break;
  121. case 'c': /* control */
  122. fputc(iswcntrl(c) ? '1' : '0', out);
  123. break;
  124. case 'd': /* numeric */
  125. fputc(iswdigit(c) ? '1' : '0', out);
  126. break;
  127. case 'g': /* glyph */
  128. fputc(iswgraph(c) ? '1' : '0', out);
  129. break;
  130. case 'l': /* lower case */
  131. fputc(iswlower(c) ? '1' : '0', out);
  132. break;
  133. case 'P': /* printable */
  134. fputc(iswprint(c) ? '1' : '0', out);
  135. break;
  136. case 'p': /* punctuation */
  137. fputc(iswpunct(c) ? '1' : '0', out);
  138. break;
  139. case 's': /* whitespace */
  140. fputc(iswspace(c) ? '1' : '0', out);
  141. break;
  142. case 'u': /* upper case */
  143. fputc(iswupper(c) ? '1' : '0', out);
  144. break;
  145. case 'x': /* xdigit */
  146. fputc(iswxdigit(c) ? '1' : '0', out);
  147. break;
  148. }
  149. }
  150. static void uprintf(FILE *out, codepoint_t c, const char *format)
  151. {
  152. while (*format) switch (*format)
  153. {
  154. case '%':
  155. switch (*++format)
  156. {
  157. case 'c': /* category */
  158. fputs(ucd_get_category_string(ucd_lookup_category(c)), out);
  159. break;
  160. case 'C': /* category group */
  161. fputs(ucd_get_category_group_string(ucd_lookup_category_group(c)), out);
  162. break;
  163. case 'p': /* codepoint */
  164. uprintf_codepoint(out, c, *++format);
  165. break;
  166. case 'P': /* properties */
  167. fprintf(out, "%016llx", ucd_properties(c, ucd_lookup_category(c)));
  168. break;
  169. case 'i': /* is* */
  170. uprintf_is(out, c, *++format);
  171. break;
  172. case 'L': /* lowercase */
  173. uprintf_codepoint(out, towlower(c), *++format);
  174. break;
  175. case 's': /* script */
  176. fputs(ucd_get_script_string(ucd_lookup_script(c)), out);
  177. break;
  178. case 'T': /* titlecase */
  179. uprintf_codepoint(out, ucd_totitle(c), *++format);
  180. break;
  181. case 'U': /* uppercase */
  182. uprintf_codepoint(out, towupper(c), *++format);
  183. break;
  184. }
  185. ++format;
  186. break;
  187. case '\\':
  188. switch (*++format) {
  189. case 0:
  190. break;
  191. case 't':
  192. fputc('\t', out);
  193. ++format;
  194. break;
  195. case 'r':
  196. fputc('\r', out);
  197. ++format;
  198. break;
  199. case 'n':
  200. fputc('\n', out);
  201. ++format;
  202. break;
  203. default:
  204. fputc(*format, out);
  205. ++format;
  206. break;
  207. }
  208. break;
  209. default:
  210. fputc(*format, out);
  211. ++format;
  212. break;
  213. }
  214. }
  215. static void print_file(FILE *in, const char *format)
  216. {
  217. codepoint_t c = 0;
  218. while (fget_utf8c(in, &c))
  219. uprintf(stdout, c, format ? format : "%pc\t%pH\t%s\t%c\t%Uc\t%Lc\t%Tc\t%is\n");
  220. }
  221. int main(int argc, char **argv)
  222. {
  223. FILE *in = NULL;
  224. const char *format = NULL;
  225. int argn;
  226. for (argn = 1; argn != argc; ++argn)
  227. {
  228. const char *arg = argv[argn];
  229. if (!strcmp(arg, "--stdin") || !strcmp(arg, "-"))
  230. in = stdin;
  231. else if (!strncmp(arg, "--format=", 9))
  232. format = arg + 9;
  233. else if (!strncmp(arg, "--locale=", 9))
  234. setlocale(LC_CTYPE, arg + 9);
  235. else if (in == NULL)
  236. {
  237. in = fopen(arg, "r");
  238. if (!in)
  239. fprintf(stdout, "cannot open `%s`\n", argv[1]);
  240. }
  241. }
  242. if (in == stdin)
  243. print_file(stdin, format);
  244. else if (in != NULL)
  245. {
  246. print_file(in, format);
  247. fclose(in);
  248. }
  249. else
  250. {
  251. codepoint_t c;
  252. for (c = 0; c <= 0x10FFFF; ++c)
  253. uprintf(stdout, c, format ? format :
  254. "%pH %s %C %c %UH %LH %TH %id %ix %ic %is %ib %ip %iP %ig %iA %ia %iu %il %P\n");
  255. }
  256. return 0;
  257. }