eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

printucddata_cpp.cpp 4.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. /*
  2. * Copyright (C) 2012-2017 Reece H. Dunn
  3. *
  4. * This file is part of ucd-tools.
  5. *
  6. * ucd-tools is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * ucd-tools is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #include "ucd/ucd.h"
  20. #include <string.h>
  21. #include <stdio.h>
  22. void fput_utf8c(FILE *out, ucd::codepoint_t c)
  23. {
  24. if (c < 0x80)
  25. fputc((uint8_t)c, out);
  26. else if (c < 0x800)
  27. {
  28. fputc(0xC0 | (c >> 6), out);
  29. fputc(0x80 + (c & 0x3F), out);
  30. }
  31. else if (c < 0x10000)
  32. {
  33. fputc(0xE0 | (c >> 12), out);
  34. fputc(0x80 + ((c >> 6) & 0x3F), out);
  35. fputc(0x80 + (c & 0x3F), out);
  36. }
  37. else if (c < 0x200000)
  38. {
  39. fputc(0xF0 | (c >> 18), out);
  40. fputc(0x80 + ((c >> 12) & 0x3F), out);
  41. fputc(0x80 + ((c >> 6) & 0x3F), out);
  42. fputc(0x80 + (c & 0x3F), out);
  43. }
  44. }
  45. bool fget_utf8c(FILE *in, ucd::codepoint_t &c)
  46. {
  47. int ch = EOF;
  48. if ((ch = fgetc(in)) == EOF) return false;
  49. if (uint8_t(ch) < 0x80)
  50. c = uint8_t(ch);
  51. else switch (uint8_t(ch) & 0xF0)
  52. {
  53. default:
  54. c = uint8_t(ch) & 0x1F;
  55. if ((ch = fgetc(in)) == EOF) return false;
  56. c = (c << 6) + (uint8_t(ch) & 0x3F);
  57. break;
  58. case 0xE0:
  59. c = uint8_t(ch) & 0x0F;
  60. if ((ch = fgetc(in)) == EOF) return false;
  61. c = (c << 6) + (uint8_t(ch) & 0x3F);
  62. if ((ch = fgetc(in)) == EOF) return false;
  63. c = (c << 6) + (uint8_t(ch) & 0x3F);
  64. break;
  65. case 0xF0:
  66. c = uint8_t(ch) & 0x07;
  67. if ((ch = fgetc(in)) == EOF) return false;
  68. c = (c << 6) + (uint8_t(ch) & 0x3F);
  69. if ((ch = fgetc(in)) == EOF) return false;
  70. c = (c << 6) + (uint8_t(ch) & 0x3F);
  71. if ((ch = fgetc(in)) == EOF) return false;
  72. c = (c << 6) + (uint8_t(ch) & 0x3F);
  73. break;
  74. }
  75. return true;
  76. }
  77. void uprintf_codepoint(FILE *out, ucd::codepoint_t c, char mode)
  78. {
  79. switch (mode)
  80. {
  81. case 'c': // character
  82. switch (c)
  83. {
  84. case '\t': fputs("\\t", out); break;
  85. case '\r': fputs("\\r", out); break;
  86. case '\n': fputs("\\n", out); break;
  87. default: fput_utf8c(out, c); break;
  88. }
  89. break;
  90. case 'h': // hexadecimal (lower)
  91. fprintf(out, "%06x", c);
  92. break;
  93. case 'H': // hexadecimal (upper)
  94. fprintf(out, "%06X", c);
  95. break;
  96. }
  97. }
  98. void uprintf(FILE *out, ucd::codepoint_t c, const char *format)
  99. {
  100. while (*format) switch (*format)
  101. {
  102. case '%':
  103. switch (*++format)
  104. {
  105. case 'c': // category
  106. fputs(ucd::get_category_string(ucd::lookup_category(c)), out);
  107. break;
  108. case 'C': // category group
  109. fputs(ucd::get_category_group_string(ucd::lookup_category_group(c)), out);
  110. break;
  111. case 'p': // codepoint
  112. uprintf_codepoint(out, c, *++format);
  113. break;
  114. case 'L': // lowercase
  115. uprintf_codepoint(out, ucd::tolower(c), *++format);
  116. break;
  117. case 's': // script
  118. fputs(ucd::get_script_string(ucd::lookup_script(c)), out);
  119. break;
  120. case 'T': // titlecase
  121. uprintf_codepoint(out, ucd::totitle(c), *++format);
  122. break;
  123. case 'U': // uppercase
  124. uprintf_codepoint(out, ucd::toupper(c), *++format);
  125. break;
  126. case 'W': // whitespace
  127. if (ucd::isspace(c))
  128. fputs("White_Space", out);
  129. break;
  130. }
  131. ++format;
  132. break;
  133. case '\\':
  134. switch (*++format) {
  135. case 0:
  136. break;
  137. case 't':
  138. fputc('\t', out);
  139. ++format;
  140. break;
  141. case 'r':
  142. fputc('\r', out);
  143. ++format;
  144. break;
  145. case 'n':
  146. fputc('\n', out);
  147. ++format;
  148. break;
  149. default:
  150. fputc(*format, out);
  151. ++format;
  152. break;
  153. }
  154. break;
  155. default:
  156. fputc(*format, out);
  157. ++format;
  158. break;
  159. }
  160. }
  161. void print_file(FILE *in, const char *format)
  162. {
  163. ucd::codepoint_t c = 0;
  164. while (fget_utf8c(in, c))
  165. uprintf(stdout, c, format ? format : "%pc\t%pH\t%s\t%c\t%Uc\t%Lc\t%Tc\t%W\n");
  166. }
  167. int main(int argc, char **argv)
  168. {
  169. FILE *in = NULL;
  170. const char *format = NULL;
  171. for (int argn = 1; argn != argc; ++argn)
  172. {
  173. const char *arg = argv[argn];
  174. if (!strcmp(arg, "--stdin") || !strcmp(arg, "-"))
  175. in = stdin;
  176. else if (!strncmp(arg, "--format=", 9))
  177. format = arg + 9;
  178. else if (in == NULL)
  179. {
  180. in = fopen(arg, "r");
  181. if (!in)
  182. fprintf(stdout, "cannot open `%s`\n", argv[1]);
  183. }
  184. }
  185. if (in == stdin)
  186. print_file(stdin, format);
  187. else if (in != NULL)
  188. {
  189. print_file(in, format);
  190. fclose(in);
  191. }
  192. else
  193. {
  194. for (ucd::codepoint_t c = 0; c <= 0x10FFFF; ++c)
  195. uprintf(stdout, c, format ? format : "%pH %s %C %c %UH %LH %TH %W\n");
  196. }
  197. return 0;
  198. }