eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

printcdata.c 5.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258
  1. /*
  2. * Copyright (C) 2012-2017 Reece H. Dunn
  3. *
  4. * This file is part of ucd-tools.
  5. *
  6. * ucd-tools is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * ucd-tools is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #include "ucd/ucd.h"
  20. #include <locale.h>
  21. #include <string.h>
  22. #include <stdio.h>
  23. #include <wchar.h>
  24. #include <wctype.h>
  25. void fput_utf8c(FILE *out, codepoint_t c)
  26. {
  27. if (c < 0x80)
  28. fputc((uint8_t)c, out);
  29. else if (c < 0x800)
  30. {
  31. fputc(0xC0 | (c >> 6), out);
  32. fputc(0x80 + (c & 0x3F), out);
  33. }
  34. else if (c < 0x10000)
  35. {
  36. fputc(0xE0 | (c >> 12), out);
  37. fputc(0x80 + ((c >> 6) & 0x3F), out);
  38. fputc(0x80 + (c & 0x3F), out);
  39. }
  40. else if (c < 0x200000)
  41. {
  42. fputc(0xF0 | (c >> 18), out);
  43. fputc(0x80 + ((c >> 12) & 0x3F), out);
  44. fputc(0x80 + ((c >> 6) & 0x3F), out);
  45. fputc(0x80 + (c & 0x3F), out);
  46. }
  47. }
  48. int fget_utf8c(FILE *in, codepoint_t *c)
  49. {
  50. int ch = EOF;
  51. if ((ch = fgetc(in)) == EOF) return 0;
  52. if ((uint8_t)ch < 0x80)
  53. *c = (uint8_t)ch;
  54. else switch ((uint8_t)ch & 0xF0)
  55. {
  56. default:
  57. *c = (uint8_t)ch & 0x1F;
  58. if ((ch = fgetc(in)) == EOF) return 0;
  59. *c = (*c << 6) + ((uint8_t)ch & 0x3F);
  60. break;
  61. case 0xE0:
  62. *c = (uint8_t)ch & 0x0F;
  63. if ((ch = fgetc(in)) == EOF) return 0;
  64. *c = (*c << 6) + ((uint8_t)ch & 0x3F);
  65. if ((ch = fgetc(in)) == EOF) return 0;
  66. *c = (*c << 6) + ((uint8_t)ch & 0x3F);
  67. break;
  68. case 0xF0:
  69. *c = (uint8_t)ch & 0x07;
  70. if ((ch = fgetc(in)) == EOF) return 0;
  71. *c = (*c << 6) + ((uint8_t)ch & 0x3F);
  72. if ((ch = fgetc(in)) == EOF) return 0;
  73. *c = (*c << 6) + ((uint8_t)ch & 0x3F);
  74. if ((ch = fgetc(in)) == EOF) return 0;
  75. *c = (*c << 6) + ((uint8_t)ch & 0x3F);
  76. break;
  77. }
  78. return 1;
  79. }
  80. void uprintf_codepoint(FILE *out, codepoint_t c, char mode)
  81. {
  82. switch (mode)
  83. {
  84. case 'c': // character
  85. switch (c)
  86. {
  87. case '\t': fputs("\\t", out); break;
  88. case '\r': fputs("\\r", out); break;
  89. case '\n': fputs("\\n", out); break;
  90. default: fput_utf8c(out, c); break;
  91. }
  92. break;
  93. case 'h': // hexadecimal (lower)
  94. fprintf(out, "%06x", c);
  95. break;
  96. case 'H': // hexadecimal (upper)
  97. fprintf(out, "%06X", c);
  98. break;
  99. }
  100. }
  101. void uprintf_is(FILE *out, codepoint_t c, char mode)
  102. {
  103. switch (mode)
  104. {
  105. case 'A': // alpha-numeric
  106. fputc(iswalnum(c) ? '1' : '0', out);
  107. break;
  108. case 'a': // alpha
  109. fputc(iswalpha(c) ? '1' : '0', out);
  110. break;
  111. case 'b': // blank
  112. fputc(iswblank(c) ? '1' : '0', out);
  113. break;
  114. case 'c': // control
  115. fputc(iswcntrl(c) ? '1' : '0', out);
  116. break;
  117. case 'd': // numeric
  118. fputc(iswdigit(c) ? '1' : '0', out);
  119. break;
  120. case 'g': // glyph
  121. fputc(iswgraph(c) ? '1' : '0', out);
  122. break;
  123. case 'l': // lower case
  124. fputc(iswlower(c) ? '1' : '0', out);
  125. break;
  126. case 'P': // printable
  127. fputc(iswprint(c) ? '1' : '0', out);
  128. break;
  129. case 'p': // punctuation
  130. fputc(iswpunct(c) ? '1' : '0', out);
  131. break;
  132. case 's': // whitespace
  133. fputc(iswspace(c) ? '1' : '0', out);
  134. break;
  135. case 'u': // upper case
  136. fputc(iswupper(c) ? '1' : '0', out);
  137. break;
  138. case 'x': // xdigit
  139. fputc(iswxdigit(c) ? '1' : '0', out);
  140. break;
  141. }
  142. }
  143. void uprintf(FILE *out, codepoint_t c, const char *format)
  144. {
  145. while (*format) switch (*format)
  146. {
  147. case '%':
  148. switch (*++format)
  149. {
  150. case 'c': // category
  151. fputs(ucd_get_category_string(ucd_lookup_category(c)), out);
  152. break;
  153. case 'C': // category group
  154. fputs(ucd_get_category_group_string(ucd_lookup_category_group(c)), out);
  155. break;
  156. case 'p': // codepoint
  157. uprintf_codepoint(out, c, *++format);
  158. break;
  159. case 'P': // properties
  160. fprintf(out, "%016llx", ucd_properties(c, ucd_lookup_category(c)));
  161. break;
  162. case 'i': // is*
  163. uprintf_is(out, c, *++format);
  164. break;
  165. case 'L': // lowercase
  166. uprintf_codepoint(out, towlower(c), *++format);
  167. break;
  168. case 's': // script
  169. fputs(ucd_get_script_string(ucd_lookup_script(c)), out);
  170. break;
  171. case 'T': // titlecase
  172. uprintf_codepoint(out, ucd_totitle(c), *++format);
  173. break;
  174. case 'U': // uppercase
  175. uprintf_codepoint(out, towupper(c), *++format);
  176. break;
  177. }
  178. ++format;
  179. break;
  180. case '\\':
  181. switch (*++format) {
  182. case 0:
  183. break;
  184. case 't':
  185. fputc('\t', out);
  186. ++format;
  187. break;
  188. case 'r':
  189. fputc('\r', out);
  190. ++format;
  191. break;
  192. case 'n':
  193. fputc('\n', out);
  194. ++format;
  195. break;
  196. default:
  197. fputc(*format, out);
  198. ++format;
  199. break;
  200. }
  201. break;
  202. default:
  203. fputc(*format, out);
  204. ++format;
  205. break;
  206. }
  207. }
  208. void print_file(FILE *in, const char *format)
  209. {
  210. codepoint_t c = 0;
  211. while (fget_utf8c(in, &c))
  212. uprintf(stdout, c, format ? format : "%pc\t%pH\t%s\t%c\t%Uc\t%Lc\t%Tc\t%is\n");
  213. }
  214. int main(int argc, char **argv)
  215. {
  216. FILE *in = NULL;
  217. const char *format = NULL;
  218. for (int argn = 1; argn != argc; ++argn)
  219. {
  220. const char *arg = argv[argn];
  221. if (!strcmp(arg, "--stdin") || !strcmp(arg, "-"))
  222. in = stdin;
  223. else if (!strncmp(arg, "--format=", 9))
  224. format = arg + 9;
  225. else if (!strncmp(arg, "--locale=", 9))
  226. setlocale(LC_CTYPE, arg + 9);
  227. else if (in == NULL)
  228. {
  229. in = fopen(arg, "r");
  230. if (!in)
  231. fprintf(stdout, "cannot open `%s`\n", argv[1]);
  232. }
  233. }
  234. if (in == stdin)
  235. print_file(stdin, format);
  236. else if (in != NULL)
  237. {
  238. print_file(in, format);
  239. fclose(in);
  240. }
  241. else
  242. {
  243. for (codepoint_t c = 0; c <= 0x10FFFF; ++c)
  244. uprintf(stdout, c, format ? format :
  245. "%pH %s %C %c %UH %LH %TH %id %ix %ic %is %ib %ip %iP %ig %iA %ia %iu %il %P\n");
  246. }
  247. return 0;
  248. }