eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

printucddata.cpp 3.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
  1. /*
  2. * Copyright (C) 2012-2015 Reece H. Dunn
  3. *
  4. * This file is part of ucd-tools.
  5. *
  6. * ucd-tools is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * ucd-tools is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #include "ucd/ucd.h"
  20. #include <string.h>
  21. #include <stdio.h>
  22. bool fget_utf8c(FILE *in, ucd::codepoint_t &c)
  23. {
  24. int ch = EOF;
  25. if ((ch = fgetc(in)) == EOF) return false;
  26. if (uint8_t(ch) < 0x80)
  27. c = uint8_t(ch);
  28. else switch (uint8_t(ch) & 0xF0)
  29. {
  30. default:
  31. c = uint8_t(ch) & 0x1F;
  32. if ((ch = fgetc(in)) == EOF) return false;
  33. c = (c << 6) + (uint8_t(ch) & 0x3F);
  34. break;
  35. case 0xE0:
  36. c = uint8_t(ch) & 0x0F;
  37. if ((ch = fgetc(in)) == EOF) return false;
  38. c = (c << 6) + (uint8_t(ch) & 0x3F);
  39. if ((ch = fgetc(in)) == EOF) return false;
  40. c = (c << 6) + (uint8_t(ch) & 0x3F);
  41. break;
  42. case 0xF0:
  43. c = uint8_t(ch) & 0x07;
  44. if ((ch = fgetc(in)) == EOF) return false;
  45. c = (c << 6) + (uint8_t(ch) & 0x3F);
  46. if ((ch = fgetc(in)) == EOF) return false;
  47. c = (c << 6) + (uint8_t(ch) & 0x3F);
  48. if ((ch = fgetc(in)) == EOF) return false;
  49. c = (c << 6) + (uint8_t(ch) & 0x3F);
  50. break;
  51. }
  52. return true;
  53. }
  54. void uprintf_codepoint(FILE *out, ucd::codepoint_t c, char mode)
  55. {
  56. switch (mode)
  57. {
  58. case 'h': // hexadecimal (lower)
  59. fprintf(out, "%06x", c);
  60. break;
  61. case 'H': // hexadecimal (upper)
  62. fprintf(out, "%06X", c);
  63. break;
  64. }
  65. }
  66. void uprintf(FILE *out, ucd::codepoint_t c, const char *format)
  67. {
  68. while (*format) switch (*format)
  69. {
  70. case '%':
  71. switch (*++format)
  72. {
  73. case 'c': // category
  74. fputs(ucd::get_category_string(ucd::lookup_category(c)), out);
  75. break;
  76. case 'C': // category group
  77. fputs(ucd::get_category_group_string(ucd::lookup_category_group(c)), out);
  78. break;
  79. case 'p': // codepoint
  80. uprintf_codepoint(out, c, *++format);
  81. break;
  82. case 'L': // lowercase
  83. uprintf_codepoint(out, ucd::tolower(c), *++format);
  84. break;
  85. case 's': // script
  86. fputs(ucd::get_script_string(ucd::lookup_script(c)), out);
  87. break;
  88. case 'T': // titlecase
  89. uprintf_codepoint(out, ucd::totitle(c), *++format);
  90. break;
  91. case 'U': // uppercase
  92. uprintf_codepoint(out, ucd::toupper(c), *++format);
  93. break;
  94. case 'W': // whitespace
  95. if (ucd::isspace(c))
  96. fputs("White_Space", out);
  97. break;
  98. }
  99. ++format;
  100. break;
  101. default:
  102. fputc(*format, out);
  103. ++format;
  104. break;
  105. }
  106. }
  107. void print_file(FILE *in)
  108. {
  109. ucd::codepoint_t c = 0;
  110. while (fget_utf8c(in, c))
  111. uprintf(stdout, c, "%pH %s %C %c %UH %LH %TH %W\n");
  112. }
  113. int main(int argc, char **argv)
  114. {
  115. if (argc == 2)
  116. {
  117. if (!strcmp(argv[1], "--stdin") || !strcmp(argv[1], "-"))
  118. print_file(stdin);
  119. else
  120. {
  121. FILE *in = fopen(argv[1], "r");
  122. if (in)
  123. {
  124. print_file(in);
  125. fclose(in);
  126. }
  127. else
  128. fprintf(stdout, "cannot open `%s`\n", argv[1]);
  129. }
  130. }
  131. else
  132. {
  133. for (ucd::codepoint_t c = 0; c <= 0x10FFFF; ++c)
  134. uprintf(stdout, c, "%pH %s %C %c %UH %LH %TH %W\n");
  135. }
  136. return 0;
  137. }