eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

common.c 7.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. /*
  2. * Copyright (C) 2005 to 2013 by Jonathan Duddington
  3. * email: [email protected]
  4. * Copyright (C) 2013-2017 Reece H. Dunn
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program; if not, see: <http://www.gnu.org/licenses/>.
  18. */
  19. #include "config.h"
  20. #include <ctype.h>
  21. #include <errno.h>
  22. #include <locale.h>
  23. #include <stdbool.h>
  24. #include <stdint.h>
  25. #include <stdio.h>
  26. #include <stdlib.h>
  27. #include <string.h>
  28. #include <sys/stat.h>
  29. #include <wctype.h>
  30. #include <espeak-ng/espeak_ng.h>
  31. #include <espeak-ng/speak_lib.h>
  32. #include <espeak-ng/encoding.h>
  33. #include <ucd/ucd.h>
  34. #include "common.h"
  35. #include "translate.h"
  36. #pragma GCC visibility push(default)
  int GetFileLength(const char *filename)
  {
      // Report the size in bytes of the file named by filename.
      // On failure a negative errno value is returned instead:
      //   -errno   when stat() fails
      //   -EISDIR  when the path names a directory
      // The size comes from st_size and is narrowed to the int return type.
      struct stat info;
      const int stat_result = stat(filename, &info);

      if (stat_result != 0)
          return -errno;

      return S_ISDIR(info.st_mode) ? -EISDIR : info.st_size;
  }
  void strncpy0(char *to, const char *from, int size)
  {
      // strcpy with limit, ensures a zero terminator.
      //   to:   destination buffer with room for at least size bytes
      //   from: zero-terminated source string
      //   size: capacity of the destination in bytes, terminator included
      // At most size-1 characters are copied; any remaining bytes up to
      // size are zero-filled by strncpy() and the last byte is always 0.

      // A non-positive size leaves no room even for the terminator. Without
      // this guard to[size-1] would be written in front of the buffer and a
      // negative size would be converted to an enormous size_t count.
      if (size <= 0)
          return;

      strncpy(to, from, (size_t)size);
      to[size - 1] = 0;
  }
  int utf8_in(int *c, const char *buf)
  {
      /* Read one Unicode character from a UTF-8 string, scanning forwards.
       * Returns the number of UTF-8 bytes used, as reported by utf8_in2().
       * buf: points into the UTF-8 text; it is passed by value, so the
       *      caller advances its own pointer using the return value
       * c:   receives the character's code point, built from the payload
       *      bits that remain once the UTF-8 header bits of each byte
       *      are dropped:
       *
       *      2-byte character "ā":  c4 81
       *          110|00100 10|000001          -> 00100 000001
       *          = 0x0101
       *      3-byte character "ꙅ":  ea 99 85
       *          1110|1010 10|011001 10|000101 -> 1010 011001 000101
       *          = 0xA645
       *      4-byte character "𠜎":  f0 a0 9c 8e
       *          11110|000 10|100000 10|011100 10|001110
       *                                       -> 000 100000 011100 001110
       *          = 0x02070E
       */
      const int backwards = 0; // always scan towards the end of the string

      return utf8_in2(c, buf, backwards);
  }
  78. #pragma GCC visibility pop
  int utf8_out(unsigned int c, char *buf)
  {
      // Encode the character code c into buf as UTF-8.
      // Returns the number of bytes written (1 to 4). No terminator is added.
      // Codes of 0x110000 and above are out of range and are written as a
      // single space.
      static const unsigned char lead_bits[4] = { 0, 0xc0, 0xe0, 0xf0 };
      int n_trailing; // continuation bytes that follow the lead byte
      int pos;

      if (c < 0x80) {
          buf[0] = c; // plain ASCII, stored unchanged
          return 1;
      }
      if (c >= 0x110000) {
          buf[0] = ' '; // out of range character code
          return 1;
      }

      n_trailing = (c < 0x0800) ? 1 : ((c < 0x10000) ? 2 : 3);

      // The lead byte holds the length marker plus the topmost payload bits.
      buf[0] = lead_bits[n_trailing] | (c >> (6 * n_trailing));

      // Each continuation byte carries the next six bits, highest first.
      for (pos = 1; pos <= n_trailing; pos++) {
          const int shift = 6 * (n_trailing - pos);
          buf[pos] = 0x80 + ((c >> shift) & 0x3f);
      }
      return n_trailing + 1;
  }
  int utf8_in2(int *c, const char *buf, int backwards)
  {
      // Read one Unicode character from a UTF-8 string.
      // Returns 1 plus the number of continuation bytes actually consumed;
      // bytes skipped while searching for the start are not counted.
      // c: receives the integer value of the (possibly multi-byte) character
      // buf: where to start; when it points at a continuation byte the
      //      start of a character is searched for first
      // backwards: non-zero to search towards the beginning of the string,
      //      zero to search towards the end
      static const unsigned char payload_mask[4] = { 0xff, 0x1f, 0x0f, 0x07 };
      const int step = backwards ? -1 : 1;
      int value;
      int expected = 0; // continuation bytes announced by the lead byte
      int consumed = 0; // continuation bytes actually read

      // skip over non-initial bytes of a multi-byte utf8 character
      while ((*buf & 0xc0) == 0x80)
          buf += step;

      value = *buf++;
      if ((value & 0x80) == 0) {
          // single-byte (ASCII) character, including the terminating zero
          *c = value;
          return 1;
      }

      if ((value & 0xe0) == 0xc0)
          expected = 1;
      else if ((value & 0xf0) == 0xe0)
          expected = 2;
      else if ((value & 0xf8) == 0xf0)
          expected = 3;

      // Drop the length marker. An unrecognised lead byte keeps expected at 0
      // and is passed through as its plain 8-bit value.
      value &= payload_mask[expected];

      // Stop early if the string ends in the middle of the sequence.
      while (consumed < expected && *buf != 0) {
          value = (value << 6) + (*buf++ & 0x3f);
          consumed++;
      }

      *c = value;
      return consumed + 1;
  }
  int IsAlpha(unsigned int c)
  {
      // Replacement for iswalpha() which also accepts some in-word symbols.
      // Returns 1 when c is accepted as alphabetic, otherwise 0.
      static const unsigned short extra_indic_alphas[] = {
          0xa70, 0xa71, // Gurmukhi: tippi, addak
          0
      };
      // Inclusive character ranges accepted in addition to iswalpha().
      static const struct {
          unsigned int first;
          unsigned int last;
      } extra_ranges[] = {
          { 0x5b0, 0x5c2 },   // Hebrew vowel marks
          { 0x605, 0x605 },
          { 0x670, 0x670 },   // arabic vowel mark
          { 0x64b, 0x65e },   // arabic vowel marks
          { 0x300, 0x36f },   // combining accents
          { 0xf40, 0xfbc },   // tibetan
          { 0x1100, 0x11ff }, // Korean jamo
          { 0x2800, 0x28ff }, // braille
          // Chinese/Japanese. Should never get here, but Mac OS 10.4's
          // iswalpha seems to be broken, so just make sure
          { 0x3041, 0xa700 },
      };
      const unsigned int n_ranges = sizeof(extra_ranges) / sizeof(extra_ranges[0]);
      unsigned int ix;

      if (iswalpha(c))
          return 1;

      // nothing below 0x300 is accepted beyond what iswalpha() allows
      if (c < 0x300)
          return 0;

      if ((c >= 0x901) && (c <= 0xdf7)) {
          // Indic scripts: Devanagari, Tamil, etc
          const int indic_alpha =
              ((c & 0x7f) < 0x64)
              || (lookupwchar(extra_indic_alphas, c) != 0)
              || ((c >= 0xd7a) && (c <= 0xd7f)); // malayalam chillu characters
          return indic_alpha ? 1 : 0;
      }

      for (ix = 0; ix < n_ranges; ix++) {
          if ((c >= extra_ranges[ix].first) && (c <= extra_ranges[ix].last))
              return 1;
      }
      return 0;
  }
  // Characters treated as brackets by IsBracket(). The range 0x2014 to 0x201f
  // is tested separately in IsBracket(), so it doesn't need to be in this list.
  static const unsigned short brackets[] = {
      '(', ')', '[', ']', '{', '}', '<', '>', '"', '\'', '`',
      0xab, 0xbb,     // double angle brackets
      0x300a, 0x300b, // double angle brackets (ideograph)
      0xe000+'<',     // private usage area
      0               // end of list
  };
  int IsBracket(int c)
  {
      // Returns 1 for characters 0x2014 to 0x201f; for anything else the
      // result of looking c up in the brackets list is returned unchanged.
      // NOTE(review): presumably lookupwchar() gives 0 when c is not in the
      // list and non-zero when it is - confirm against its definition.
      const int in_fixed_range = (c >= 0x2014) && (c <= 0x201f);

      return in_fixed_range ? 1 : lookupwchar(brackets, c);
  }
  int IsDigit09(unsigned int c)
  {
      // Returns 1 only for the ASCII digits '0' to '9', otherwise 0.
      // The subtraction is unsigned, so characters below '0' wrap round to
      // large values and fail the comparison as well.
      const unsigned int offset = c - '0';

      return offset < 10;
  }
  int IsDigit(unsigned int c)
  {
      // Returns 1 when iswdigit() accepts c, or when c lies in the range
      // 0x966 to 0x96f; otherwise 0.
      const int in_extra_range = (c >= 0x966) && (c <= 0x96f);

      return (iswdigit(c) || in_extra_range) ? 1 : 0;
  }
  int IsSpace(unsigned int c)
  {
      // Extended white-space test.
      // Returns 0 for the zero character and 1 for box drawing characters
      // (0x2500 to 0x259f) and unicode specials (0xfff9 to 0xffff). For
      // every other character the result of iswspace() is returned as is,
      // so "true" may be any non-zero value.
      const int box_drawing = (c >= 0x2500) && (c < 0x25a0);
      const int unicode_special = (c >= 0xfff9) && (c <= 0xffff);

      if (c == 0)
          return 0;
      if (box_drawing || unicode_special)
          return 1;
      return iswspace(c);
  }
  int isspace2(unsigned int c)
  {
      // Returns 1 when c is at or below the space character and its low
      // byte is non-zero, otherwise 0.
      // can't use isspace() because on Windows, isspace(0xe1) gives TRUE !
      const int low_byte_set = (c & 0xff) != 0;

      return (low_byte_set && (c <= ' ')) ? 1 : 0;
  }
  int is_str_totally_null(const char* str, int size) {
      // Tests if all bytes of str are null up to size.
      // Returns 1 when the first size bytes are all zero, otherwise 0.
      // This should never be reimplemented with integers, because
      // this function has to work with unaligned char*
      // (casting to int when unaligned may result in unguaranteed behaviors)

      // An empty (or negative) range contains no non-null byte. It must be
      // handled here: otherwise *str would be read although it lies outside
      // the range, and size-1 would turn into a huge memcmp() length.
      if (size <= 0)
          return 1;

      // Comparing the buffer with itself shifted by one byte checks that
      // every byte equals its successor; with a zero first byte that means
      // every byte in the range is zero.
      return (str[0] == 0 && memcmp(str, str + 1, (size_t)size - 1) == 0);
  }
  int Read4Bytes(FILE *f)
  {
      // Read 4 bytes (least significant first) into a word.
      //   f: stream to read from
      // Returns the four bytes combined into one int.
      // The result of fgetc() is not checked: at end of file or on a read
      // error each missing byte is taken as 0xff (EOF masked to 8 bits),
      // so a short read cannot be told apart from data.
      unsigned int acc = 0;
      int ix;

      for (ix = 0; ix < 4; ix++) {
          const unsigned int byte = (unsigned int)fgetc(f) & 0xff;

          // Shift as unsigned: moving a byte >= 0x80 into the top position
          // of a signed int would overflow, which is undefined behaviour.
          acc |= byte << (ix * 8);
      }
      return (int)acc;
  }
  unsigned int StringToWord(const char *string)
  {
      // Pack up to 4 characters into a word, first character in the least
      // significant byte.
      //   string: zero-terminated text, may be NULL
      // Returns 0 for NULL or an empty string. Packing stops at the
      // terminator, so shorter strings leave the upper bytes zero;
      // characters after the fourth are ignored.
      unsigned int word = 0;
      int ix;

      if (string == NULL)
          return 0;

      for (ix = 0; (ix < 4) && (string[ix] != 0); ix++) {
          // Go through unsigned char so bytes >= 0x80 are not sign-extended,
          // and shift as unsigned int so that moving such a byte into the
          // top position does not overflow a signed int.
          const unsigned int byte = (unsigned char)string[ix];
          word |= byte << (ix * 8);
      }
      return word;
  }
  268. int towlower2(unsigned int c, Translator *translator)
  269. {
  270. // check for non-standard upper to lower case conversions
  271. if (c == 'I' && translator->langopts.dotless_i)
  272. return 0x131; // I -> ı
  273. return ucd_tolower(c);
  274. }