eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

encoding.c 37KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818
  1. /*
  2. * Copyright (C) 2017 Reece H. Dunn
  3. *
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License as published by
  6. * the Free Software Foundation; either version 3 of the License, or
  7. * (at your option) any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, see: <http://www.gnu.org/licenses/>.
  16. */
  17. #include "config.h"
  18. #include <string.h>
  19. #include <stdint.h>
  20. #include <stdlib.h>
  21. #include <wchar.h>
  22. #include <espeak-ng/espeak_ng.h>
  23. #include "speech.h"
  24. #include "encoding.h"
  25. #define LEADING_2_BITS 0xC0 // 0b11000000
  26. #define UTF8_TAIL_BITS 0x80 // 0b10000000
  27. int utf8_out(unsigned int c, char *buf)
  28. {
  29. // write a unicode character into a buffer as utf8
  30. // returns the number of bytes written
  31. int n_bytes;
  32. int j;
  33. int shift;
  34. static char unsigned code[4] = { 0, 0xc0, 0xe0, 0xf0 };
  35. if (c < 0x80) {
  36. buf[0] = c;
  37. return 1;
  38. }
  39. if (c >= 0x110000) {
  40. buf[0] = ' '; // out of range character code
  41. return 1;
  42. }
  43. if (c < 0x0800)
  44. n_bytes = 1;
  45. else if (c < 0x10000)
  46. n_bytes = 2;
  47. else
  48. n_bytes = 3;
  49. shift = 6*n_bytes;
  50. buf[0] = code[n_bytes] | (c >> shift);
  51. for (j = 0; j < n_bytes; j++) {
  52. shift -= 6;
  53. buf[j+1] = 0x80 + ((c >> shift) & 0x3f);
  54. }
  55. return n_bytes+1;
  56. }
  57. // http://www.iana.org/assignments/character-sets/character-sets.xhtml
  58. MNEM_TAB mnem_encoding[] = {
  59. { "ANSI_X3.4-1968", ESPEAKNG_ENCODING_US_ASCII },
  60. { "ANSI_X3.4-1986", ESPEAKNG_ENCODING_US_ASCII },
  61. { "ASMO-708", ESPEAKNG_ENCODING_ISO_8859_6 },
  62. { "ECMA-114", ESPEAKNG_ENCODING_ISO_8859_6 },
  63. { "ECMA-118", ESPEAKNG_ENCODING_ISO_8859_7 },
  64. { "ELOT_928", ESPEAKNG_ENCODING_ISO_8859_7 },
  65. { "IBM367", ESPEAKNG_ENCODING_US_ASCII },
  66. { "IBM819", ESPEAKNG_ENCODING_ISO_8859_1 },
  67. { "ISCII", ESPEAKNG_ENCODING_ISCII },
  68. { "ISO_646.irv:1991", ESPEAKNG_ENCODING_US_ASCII },
  69. { "ISO_8859-1", ESPEAKNG_ENCODING_ISO_8859_1 },
  70. { "ISO_8859-1:1987", ESPEAKNG_ENCODING_ISO_8859_1 },
  71. { "ISO_8859-2", ESPEAKNG_ENCODING_ISO_8859_2 },
  72. { "ISO_8859-2:1987", ESPEAKNG_ENCODING_ISO_8859_2 },
  73. { "ISO_8859-3", ESPEAKNG_ENCODING_ISO_8859_3 },
  74. { "ISO_8859-3:1988", ESPEAKNG_ENCODING_ISO_8859_3 },
  75. { "ISO_8859-4", ESPEAKNG_ENCODING_ISO_8859_4 },
  76. { "ISO_8859-4:1988", ESPEAKNG_ENCODING_ISO_8859_4 },
  77. { "ISO_8859-5", ESPEAKNG_ENCODING_ISO_8859_5 },
  78. { "ISO_8859-5:1988", ESPEAKNG_ENCODING_ISO_8859_5 },
  79. { "ISO_8859-6", ESPEAKNG_ENCODING_ISO_8859_6 },
  80. { "ISO_8859-6:1987", ESPEAKNG_ENCODING_ISO_8859_6 },
  81. { "ISO_8859-7", ESPEAKNG_ENCODING_ISO_8859_7 },
  82. { "ISO_8859-7:1987", ESPEAKNG_ENCODING_ISO_8859_7 },
  83. { "ISO_8859-8", ESPEAKNG_ENCODING_ISO_8859_8 },
  84. { "ISO_8859-8:1988", ESPEAKNG_ENCODING_ISO_8859_8 },
  85. { "ISO_8859-9", ESPEAKNG_ENCODING_ISO_8859_9 },
  86. { "ISO_8859-9:1989", ESPEAKNG_ENCODING_ISO_8859_9 },
  87. { "ISO_8859-10", ESPEAKNG_ENCODING_ISO_8859_10 },
  88. { "ISO_8859-10:1992", ESPEAKNG_ENCODING_ISO_8859_10 },
  89. { "ISO_8859-14", ESPEAKNG_ENCODING_ISO_8859_14 },
  90. { "ISO_8859-14:1998", ESPEAKNG_ENCODING_ISO_8859_14 },
  91. { "ISO_8859-15", ESPEAKNG_ENCODING_ISO_8859_15 },
  92. { "ISO_8859-16", ESPEAKNG_ENCODING_ISO_8859_16 },
  93. { "ISO_8859-16:2001", ESPEAKNG_ENCODING_ISO_8859_16 },
  94. { "ISO646-US", ESPEAKNG_ENCODING_US_ASCII },
  95. { "ISO-10646-UCS-2", ESPEAKNG_ENCODING_ISO_10646_UCS_2 },
  96. { "ISO-8859-1", ESPEAKNG_ENCODING_ISO_8859_1 },
  97. { "ISO-8859-2", ESPEAKNG_ENCODING_ISO_8859_2 },
  98. { "ISO-8859-3", ESPEAKNG_ENCODING_ISO_8859_3 },
  99. { "ISO-8859-4", ESPEAKNG_ENCODING_ISO_8859_4 },
  100. { "ISO-8859-5", ESPEAKNG_ENCODING_ISO_8859_5 },
  101. { "ISO-8859-6", ESPEAKNG_ENCODING_ISO_8859_6 },
  102. { "ISO-8859-7", ESPEAKNG_ENCODING_ISO_8859_7 },
  103. { "ISO-8859-8", ESPEAKNG_ENCODING_ISO_8859_8 },
  104. { "ISO-8859-9", ESPEAKNG_ENCODING_ISO_8859_9 },
  105. { "ISO-8859-10", ESPEAKNG_ENCODING_ISO_8859_10 },
  106. { "ISO-8859-11", ESPEAKNG_ENCODING_ISO_8859_11 },
  107. { "ISO-8859-13", ESPEAKNG_ENCODING_ISO_8859_13 },
  108. { "ISO-8859-14", ESPEAKNG_ENCODING_ISO_8859_14 },
  109. { "ISO-8859-15", ESPEAKNG_ENCODING_ISO_8859_15 },
  110. { "ISO-8859-16", ESPEAKNG_ENCODING_ISO_8859_16 },
  111. { "KOI8-R", ESPEAKNG_ENCODING_KOI8_R },
  112. { "Latin-9", ESPEAKNG_ENCODING_ISO_8859_15 },
  113. { "TIS-620", ESPEAKNG_ENCODING_ISO_8859_11 },
  114. { "US-ASCII", ESPEAKNG_ENCODING_US_ASCII },
  115. { "UTF-8", ESPEAKNG_ENCODING_UTF_8 },
  116. { "cp367", ESPEAKNG_ENCODING_US_ASCII },
  117. { "cp819", ESPEAKNG_ENCODING_ISO_8859_1 },
  118. { "csASCII", ESPEAKNG_ENCODING_US_ASCII },
  119. { "csISO885913", ESPEAKNG_ENCODING_ISO_8859_13 },
  120. { "csISO885914", ESPEAKNG_ENCODING_ISO_8859_14 },
  121. { "csISO885915", ESPEAKNG_ENCODING_ISO_8859_15 },
  122. { "csISO885916", ESPEAKNG_ENCODING_ISO_8859_16 },
  123. { "csISOLatin1", ESPEAKNG_ENCODING_ISO_8859_1 },
  124. { "csISOLatin2", ESPEAKNG_ENCODING_ISO_8859_2 },
  125. { "csISOLatin3", ESPEAKNG_ENCODING_ISO_8859_3 },
  126. { "csISOLatin4", ESPEAKNG_ENCODING_ISO_8859_4 },
  127. { "csISOLatin5", ESPEAKNG_ENCODING_ISO_8859_9 },
  128. { "csISOLatin6", ESPEAKNG_ENCODING_ISO_8859_10 },
  129. { "csISOLatinArabic", ESPEAKNG_ENCODING_ISO_8859_6 },
  130. { "csISOLatinCyrillic",ESPEAKNG_ENCODING_ISO_8859_5 },
  131. { "csISOLatinGreek", ESPEAKNG_ENCODING_ISO_8859_7 },
  132. { "csISOLatinHebrew", ESPEAKNG_ENCODING_ISO_8859_8 },
  133. { "csKOI8R", ESPEAKNG_ENCODING_KOI8_R },
  134. { "csTIS620", ESPEAKNG_ENCODING_ISO_8859_11 },
  135. { "csUTF8", ESPEAKNG_ENCODING_UTF_8 },
  136. { "csUnicode", ESPEAKNG_ENCODING_ISO_10646_UCS_2 },
  137. { "arabic", ESPEAKNG_ENCODING_ISO_8859_6 },
  138. { "cyrillic", ESPEAKNG_ENCODING_ISO_8859_5 },
  139. { "greek", ESPEAKNG_ENCODING_ISO_8859_7 },
  140. { "greek8", ESPEAKNG_ENCODING_ISO_8859_7 },
  141. { "hebrew", ESPEAKNG_ENCODING_ISO_8859_8 },
  142. { "iso-celtic", ESPEAKNG_ENCODING_ISO_8859_14 },
  143. { "iso-ir-6", ESPEAKNG_ENCODING_US_ASCII },
  144. { "iso-ir-100", ESPEAKNG_ENCODING_ISO_8859_1 },
  145. { "iso-ir-101", ESPEAKNG_ENCODING_ISO_8859_2 },
  146. { "iso-ir-109", ESPEAKNG_ENCODING_ISO_8859_3 },
  147. { "iso-ir-110", ESPEAKNG_ENCODING_ISO_8859_4 },
  148. { "iso-ir-126", ESPEAKNG_ENCODING_ISO_8859_7 },
  149. { "iso-ir-127", ESPEAKNG_ENCODING_ISO_8859_6 },
  150. { "iso-ir-138", ESPEAKNG_ENCODING_ISO_8859_8 },
  151. { "iso-ir-144", ESPEAKNG_ENCODING_ISO_8859_5 },
  152. { "iso-ir-148", ESPEAKNG_ENCODING_ISO_8859_9 },
  153. { "iso-ir-157", ESPEAKNG_ENCODING_ISO_8859_10 },
  154. { "iso-ir-199", ESPEAKNG_ENCODING_ISO_8859_14 },
  155. { "iso-ir-226", ESPEAKNG_ENCODING_ISO_8859_16 },
  156. { "latin1", ESPEAKNG_ENCODING_ISO_8859_1 },
  157. { "latin2", ESPEAKNG_ENCODING_ISO_8859_2 },
  158. { "latin3", ESPEAKNG_ENCODING_ISO_8859_3 },
  159. { "latin4", ESPEAKNG_ENCODING_ISO_8859_4 },
  160. { "latin5", ESPEAKNG_ENCODING_ISO_8859_9 },
  161. { "latin6", ESPEAKNG_ENCODING_ISO_8859_10 },
  162. { "latin8", ESPEAKNG_ENCODING_ISO_8859_14 },
  163. { "latin10", ESPEAKNG_ENCODING_ISO_8859_16 },
  164. { "l1", ESPEAKNG_ENCODING_ISO_8859_1 },
  165. { "l2", ESPEAKNG_ENCODING_ISO_8859_2 },
  166. { "l3", ESPEAKNG_ENCODING_ISO_8859_3 },
  167. { "l4", ESPEAKNG_ENCODING_ISO_8859_4 },
  168. { "l5", ESPEAKNG_ENCODING_ISO_8859_9 },
  169. { "l6", ESPEAKNG_ENCODING_ISO_8859_10 },
  170. { "l8", ESPEAKNG_ENCODING_ISO_8859_14 },
  171. { "l10", ESPEAKNG_ENCODING_ISO_8859_16 },
  172. { "us", ESPEAKNG_ENCODING_US_ASCII },
  173. { NULL, ESPEAKNG_ENCODING_UNKNOWN }
  174. };
  175. #pragma GCC visibility push(default)
  176. espeak_ng_ENCODING
  177. espeak_ng_EncodingFromName(const char *encoding)
  178. {
  179. return LookupMnem(mnem_encoding, encoding);
  180. }
  181. #pragma GCC visibility pop
  182. struct espeak_ng_TEXT_DECODER_
  183. {
  184. const uint8_t *current;
  185. const uint8_t *end;
  186. uint32_t (*get)(espeak_ng_TEXT_DECODER *decoder);
  187. const uint16_t *codepage;
  188. };
  189. // Reference: http://www.iana.org/go/rfc1345
  190. // Reference: http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT
  191. static const uint16_t ISO_8859_1[0x80] = {
  192. 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, // 80
  193. 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, // 88
  194. 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, // 90
  195. 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, // 98
  196. 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, // a0
  197. 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, // a8
  198. 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, // b0
  199. 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, // b8
  200. 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // c0
  201. 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // c8
  202. 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, // d0
  203. 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, // d8
  204. 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, // e0
  205. 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // e8
  206. 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, // f0
  207. 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, // f8
  208. };
  209. // Reference: http://www.iana.org/go/rfc1345
  210. // Reference: http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-2.TXT
  211. static const uint16_t ISO_8859_2[0x80] = {
  212. 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, // 80
  213. 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, // 88
  214. 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, // 90
  215. 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, // 98
  216. 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7, // a0
  217. 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b, // a8
  218. 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7, // b0
  219. 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c, // b8
  220. 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, // c0
  221. 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, // c8
  222. 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, // d0
  223. 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, // d8
  224. 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, // e0
  225. 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, // e8
  226. 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, // f0
  227. 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9, // f8
  228. };
  229. // Reference: http://www.iana.org/go/rfc1345
  230. // Reference: http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-3.TXT
  231. static const uint16_t ISO_8859_3[0x80] = {
  232. 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, // 80
  233. 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, // 88
  234. 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, // 90
  235. 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, // 98
  236. 0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, 0xfffd, 0x0124, 0x00a7, // a0
  237. 0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, 0xfffd, 0x017b, // a8
  238. 0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7, // b0
  239. 0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, 0xfffd, 0x017c, // b8
  240. 0x00c0, 0x00c1, 0x00c2, 0xfffd, 0x00c4, 0x010a, 0x0108, 0x00c7, // c0
  241. 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // c8
  242. 0xfffd, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7, // d0
  243. 0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df, // d8
  244. 0x00e0, 0x00e1, 0x00e2, 0xfffd, 0x00e4, 0x010b, 0x0109, 0x00e7, // e0
  245. 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // e8
  246. 0xfffd, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7, // f0
  247. 0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9, // f8
  248. };
  249. // Reference: http://www.iana.org/go/rfc1345
  250. // Reference: http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-4.TXT
  251. static const uint16_t ISO_8859_4[0x80] = {
  252. 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, // 80
  253. 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, // 88
  254. 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, // 90
  255. 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, // 98
  256. 0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7, // a0
  257. 0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af, // a8
  258. 0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7, // b0
  259. 0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b, // b8
  260. 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, // c0
  261. 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a, // c8
  262. 0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7, // d0
  263. 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df, // d8
  264. 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, // e0
  265. 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b, // e8
  266. 0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7, // f0
  267. 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9, // f8
  268. };
  269. // Reference: http://www.iana.org/go/rfc1345
  270. // Reference: http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-5.TXT
  271. static const uint16_t ISO_8859_5[0x80] = {
  272. 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, // 80
  273. 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, // 88
  274. 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, // 90
  275. 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, // 98
  276. 0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, // a0
  277. 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x00ad, 0x040e, 0x040f, // a8
  278. 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, // b0
  279. 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, // b8
  280. 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, // c0
  281. 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, // c8
  282. 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, // d0
  283. 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, // d8
  284. 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, // e0
  285. 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, // e8
  286. 0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, // f0
  287. 0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f, // f8
  288. };
  289. // Reference: http://www.iana.org/go/rfc1345
  290. // Reference: http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-6.TXT
  291. static const uint16_t ISO_8859_6[0x80] = {
  292. 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, // 80
  293. 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, // 88
  294. 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, // 90
  295. 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, // 98
  296. 0x00a0, 0xfffd, 0xfffd, 0xfffd, 0x00a4, 0xfffd, 0xfffd, 0xfffd, // a0
  297. 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x060c, 0x00ad, 0xfffd, 0xfffd, // a8
  298. 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, // b0
  299. 0xfffd, 0xfffd, 0xfffd, 0x061b, 0xfffd, 0xfffd, 0xfffd, 0x061f, // b8
  300. 0xfffd, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627, // c0
  301. 0x0628, 0x0629, 0x062a, 0x062b, 0x062c, 0x062d, 0x062e, 0x062f, // c8
  302. 0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637, // d0
  303. 0x0638, 0x0639, 0x063a, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, // d8
  304. 0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647, // e0
  305. 0x0648, 0x0649, 0x064a, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, // e8
  306. 0x0650, 0x0651, 0x0652, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, // f0
  307. 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, // f8
  308. };
  309. // Reference: http://www.iana.org/go/rfc1345
  310. // Reference: http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT
  311. static const uint16_t ISO_8859_7[0x80] = {
  312. 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, // 80
  313. 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, // 88
  314. 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, // 90
  315. 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, // 98
  316. 0x00a0, 0x2018, 0x2019, 0x00a3, 0x20ac, 0x20af, 0x00a6, 0x00a7, // a0
  317. 0x00a8, 0x00a9, 0x037a, 0x00ab, 0x00ac, 0x00ad, 0xfffd, 0x2015, // a8
  318. 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x0385, 0x0386, 0x00b7, // b0
  319. 0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f, // b8
  320. 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, // c0
  321. 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, // c8
  322. 0x03a0, 0x03a1, 0xfffd, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, // d0
  323. 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af, // d8
  324. 0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7, // e0
  325. 0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf, // e8
  326. 0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7, // f0
  327. 0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce, 0xfffd, // f8
  328. };
  329. // Reference: http://www.iana.org/go/rfc1345
  330. // Reference: http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-8.TXT
  331. static const uint16_t ISO_8859_8[0x80] = {
  332. 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, // 80
  333. 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, // 88
  334. 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, // 90
  335. 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, // 98
  336. 0x00a0, 0xfffd, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, // a0
  337. 0x00a8, 0x00a9, 0x00d7, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, // a8
  338. 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, // b0
  339. 0x00b8, 0x00b9, 0x00f7, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0xfffd, // b8
  340. 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, // c0
  341. 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, // c8
  342. 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, // d0
  343. 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x2017, // d8
  344. 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x05d5, 0x05d6, 0x05d7, // e0
  345. 0x05d8, 0x05d9, 0x05da, 0x05db, 0x05dc, 0x05dd, 0x05de, 0x05df, // e8
  346. 0x05e0, 0x05e1, 0x05e2, 0x05e3, 0x05e4, 0x05e5, 0x05e6, 0x05e7, // f0
  347. 0x05e8, 0x05e9, 0x05ea, 0xfffd, 0xfffd, 0x200e, 0x200f, 0xfffd, // f8
  348. };
  349. // Reference: http://www.iana.org/go/rfc1345
  350. // Reference: http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-9.TXT
  351. static const uint16_t ISO_8859_9[0x80] = {
  352. 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, // 80
  353. 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, // 88
  354. 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, // 90
  355. 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, // 98
  356. 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, // a0
  357. 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, // a8
  358. 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, // b0
  359. 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, // b8
  360. 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // c0
  361. 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // c8
  362. 0x011e, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, // d0
  363. 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0130, 0x015e, 0x00df, // d8
  364. 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, // e0
  365. 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // e8
  366. 0x011f, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, // f0
  367. 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff, // f8
  368. };
  369. // Reference: http://www.iana.org/go/rfc1345
  370. // Reference: http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-10.TXT
  371. static const uint16_t ISO_8859_10[0x80] = {
  372. 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, // 80
  373. 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, // 88
  374. 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, // 90
  375. 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, // 98
  376. 0x00a0, 0x0104, 0x0112, 0x0122, 0x012a, 0x0128, 0x0136, 0x00a7, // a0
  377. 0x013b, 0x0110, 0x0160, 0x0166, 0x017d, 0x00ad, 0x016a, 0x014a, // a8
  378. 0x00b0, 0x0105, 0x0113, 0x0123, 0x012b, 0x0129, 0x0137, 0x00b7, // b0
  379. 0x013c, 0x0111, 0x0161, 0x0167, 0x017e, 0x2015, 0x016b, 0x014b, // b8
  380. 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, // c0
  381. 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x00cf, // c8
  382. 0x00d0, 0x0145, 0x014c, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x0168, // d0
  383. 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, // d8
  384. 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, // e0
  385. 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x00ef, // e8
  386. 0x00f0, 0x0146, 0x014d, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x0169, // f0
  387. 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x0138, // f8
  388. };
  389. // Reference: http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-11.TXT
  390. static const uint16_t ISO_8859_11[0x80] = {
  391. 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, // 80
  392. 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, // 88
  393. 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, // 90
  394. 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, // 98
  395. 0x00a0, 0x0e01, 0x0e02, 0x0e03, 0x0e04, 0x0e05, 0x0e06, 0x0e07, // a0
  396. 0x0e08, 0x0e09, 0x0e0a, 0x0e0b, 0x0e0c, 0x0e0d, 0x0e0e, 0x0e0f, // a8
  397. 0x0e10, 0x0e11, 0x0e12, 0x0e13, 0x0e14, 0x0e15, 0x0e16, 0x0e17, // b0
  398. 0x0e18, 0x0e19, 0x0e1a, 0x0e1b, 0x0e1c, 0x0e1d, 0x0e1e, 0x0e1f, // b8
  399. 0x0e20, 0x0e21, 0x0e22, 0x0e23, 0x0e24, 0x0e25, 0x0e26, 0x0e27, // c0
  400. 0x0e28, 0x0e29, 0x0e2a, 0x0e2b, 0x0e2c, 0x0e2d, 0x0e2e, 0x0e2f, // c8
  401. 0x0e30, 0x0e31, 0x0e32, 0x0e33, 0x0e34, 0x0e35, 0x0e36, 0x0e37, // d0
  402. 0x0e38, 0x0e39, 0x0e3a, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x0e3f, // d8
  403. 0x0e40, 0x0e41, 0x0e42, 0x0e43, 0x0e44, 0x0e45, 0x0e46, 0x0e47, // e0
  404. 0x0e48, 0x0e49, 0x0e4a, 0x0e4b, 0x0e4c, 0x0e4d, 0x0e4e, 0x0e4f, // e8
  405. 0x0e50, 0x0e51, 0x0e52, 0x0e53, 0x0e54, 0x0e55, 0x0e56, 0x0e57, // f0
  406. 0x0e58, 0x0e59, 0x0e5a, 0x0e5b, 0xfffd, 0xfffd, 0xfffd, 0xfffd, // f8
  407. };
  408. // Reference: http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-13.TXT
  409. static const uint16_t ISO_8859_13[0x80] = {
  410. 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, // 80
  411. 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, // 88
  412. 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, // 90
  413. 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, // 98
  414. 0x00a0, 0x201d, 0x00a2, 0x00a3, 0x00a4, 0x201e, 0x00a6, 0x00a7, // a0
  415. 0x00d8, 0x00a9, 0x0156, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00c6, // a8
  416. 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x201c, 0x00b5, 0x00b6, 0x00b7, // b0
  417. 0x00f8, 0x00b9, 0x0157, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00e6, // b8
  418. 0x0104, 0x012e, 0x0100, 0x0106, 0x00c4, 0x00c5, 0x0118, 0x0112, // c0
  419. 0x010c, 0x00c9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012a, 0x013b, // c8
  420. 0x0160, 0x0143, 0x0145, 0x00d3, 0x014c, 0x00d5, 0x00d6, 0x00d7, // d0
  421. 0x0172, 0x0141, 0x015a, 0x016a, 0x00dc, 0x017b, 0x017d, 0x00df, // d8
  422. 0x0105, 0x012f, 0x0101, 0x0107, 0x00e4, 0x00e5, 0x0119, 0x0113, // e0
  423. 0x010d, 0x00e9, 0x017a, 0x0117, 0x0123, 0x0137, 0x012b, 0x013c, // e8
  424. 0x0161, 0x0144, 0x0146, 0x00f3, 0x014d, 0x00f5, 0x00f6, 0x00f7, // f0
  425. 0x0173, 0x0142, 0x015b, 0x016b, 0x00fc, 0x017c, 0x017e, 0x2019, // f8
  426. };
  427. // Reference: http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-14.TXT
  428. static const uint16_t ISO_8859_14[0x80] = {
  429. 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, // 80
  430. 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, // 88
  431. 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, // 90
  432. 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, // 98
  433. 0x00a0, 0x1e02, 0x1e03, 0x00a3, 0x010a, 0x010b, 0x1e0a, 0x00a7, // a0
  434. 0x1e80, 0x00a9, 0x1e82, 0x1e0b, 0x1ef2, 0x00ad, 0x00ae, 0x0178, // a8
  435. 0x1e1e, 0x1e1f, 0x0120, 0x0121, 0x1e40, 0x1e41, 0x00b6, 0x1e56, // b0
  436. 0x1e81, 0x1e57, 0x1e83, 0x1e60, 0x1ef3, 0x1e84, 0x1e85, 0x1e61, // b8
  437. 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // c0
  438. 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // c8
  439. 0x0174, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x1e6a, // d0
  440. 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x0176, 0x00df, // d8
  441. 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, // e0
  442. 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // e8
  443. 0x0175, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x1e6b, // f0
  444. 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x0177, 0x00ff, // f8
  445. };
  446. // Reference: http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-15.TXT
  447. static const uint16_t ISO_8859_15[0x80] = {
  448. 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, // 80
  449. 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, // 88
  450. 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, // 90
  451. 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, // 98
  452. 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x20ac, 0x00a5, 0x0160, 0x00a7, // a0
  453. 0x0161, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, // a8
  454. 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x017d, 0x00b5, 0x00b6, 0x00b7, // b0
  455. 0x017e, 0x00b9, 0x00ba, 0x00bb, 0x0152, 0x0153, 0x0178, 0x00bf, // b8
  456. 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // c0
  457. 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // c8
  458. 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, // d0
  459. 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, // d8
  460. 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, // e0
  461. 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // e8
  462. 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, // f0
  463. 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, // f8
  464. };
  465. // Unicode code points for ISO-8859-16 bytes 0x80-0xff; entry [b - 0x80] is the mapping for byte b. Reference: http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-16.TXT
  466. static const uint16_t ISO_8859_16[0x80] = {
  467. 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, // bytes 80-87 (map to themselves)
  468. 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, // bytes 88-8f (map to themselves)
  469. 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, // bytes 90-97 (map to themselves)
  470. 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, // bytes 98-9f (map to themselves)
  471. 0x00a0, 0x0104, 0x0105, 0x0141, 0x20ac, 0x201e, 0x0160, 0x00a7, // bytes a0-a7
  472. 0x0161, 0x00a9, 0x0218, 0x00ab, 0x0179, 0x00ad, 0x017a, 0x017b, // bytes a8-af
  473. 0x00b0, 0x00b1, 0x010c, 0x0142, 0x017d, 0x201d, 0x00b6, 0x00b7, // bytes b0-b7
  474. 0x017e, 0x010d, 0x0219, 0x00bb, 0x0152, 0x0153, 0x0178, 0x017c, // bytes b8-bf
  475. 0x00c0, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0106, 0x00c6, 0x00c7, // bytes c0-c7
  476. 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // bytes c8-cf
  477. 0x0110, 0x0143, 0x00d2, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x015a, // bytes d0-d7
  478. 0x0170, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0118, 0x021a, 0x00df, // bytes d8-df
  479. 0x00e0, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x0107, 0x00e6, 0x00e7, // bytes e0-e7
  480. 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // bytes e8-ef
  481. 0x0111, 0x0144, 0x00f2, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x015b, // bytes f0-f7
  482. 0x0171, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0119, 0x021b, 0x00ff, // bytes f8-ff
  483. };
  // Reference: http://www.iana.org/go/rfc1489
  // Reference: http://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-R.TXT
  //
  // Unicode code points for KOI8-R bytes 0x80-0xff; entry [b - 0x80] is the
  // mapping for byte b. Bytes 0x80-0xbf are box drawing, block and symbol
  // characters (plus the letter IO at 0xa3/0xb3); bytes 0xc0-0xdf are the
  // lowercase and 0xe0-0xff the uppercase Cyrillic letters.
  static const uint16_t KOI8_R[0x80] = {
      0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 0x2518, 0x251c, 0x2524, // 80
      0x252c, 0x2534, 0x253c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590, // 88
      0x2591, 0x2592, 0x2593, 0x2320, 0x25a0, 0x2219, 0x221a, 0x2248, // 90
      0x2264, 0x2265, 0x00a0, 0x2321, 0x00b0, 0x00b2, 0x00b7, 0x00f7, // 98
      0x2550, 0x2551, 0x2552, 0x0451, 0x2553, 0x2554, 0x2555, 0x2556, // a0
      0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 0x255e, // a8
      0x255f, 0x2560, 0x2561, 0x0401, 0x2562, 0x2563, 0x2564, 0x2565, // b0
      0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x00a9, // b8
      0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433, // c0
      0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, // c8
      0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432, // d0
      0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a, // d8
      0x042e, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413, // e0
      0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, // e8
      0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412, // f0
      0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a, // f8
  };
  504. // Unicode code points for ISCII bytes 0x80-0xff; entry [b - 0x80] is the mapping for byte b. Slots holding 0xfffd (the value the decoders in this file return for invalid input) have no mapping here. Reference: http://varamozhi.sourceforge.net/iscii91.pdf
  505. static const uint16_t ISCII[0x80] = {
  506. 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, // bytes 80-87 (no mapping)
  507. 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, // bytes 88-8f (no mapping)
  508. 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, // bytes 90-97 (no mapping)
  509. 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, // bytes 98-9f (no mapping)
  510. 0xfffd, 0x0901, 0x0902, 0x0903, 0x0905, 0x0906, 0x0907, 0x0908, // bytes a0-a7
  511. 0x0909, 0x090a, 0x090b, 0x090e, 0x090f, 0x0910, 0x090d, 0x0912, // bytes a8-af
  512. 0x0913, 0x0914, 0x0911, 0x0915, 0x0916, 0x0917, 0x0918, 0x0919, // bytes b0-b7
  513. 0x091a, 0x091b, 0x091c, 0x091d, 0x091e, 0x091f, 0x0920, 0x0921, // bytes b8-bf
  514. 0x0922, 0x0923, 0x0924, 0x0925, 0x0926, 0x0927, 0x0928, 0x0929, // bytes c0-c7
  515. 0x092a, 0x092b, 0x092c, 0x092d, 0x092e, 0x092f, 0x095f, 0x0930, // bytes c8-cf
  516. 0x0931, 0x0932, 0x0933, 0x0934, 0x0935, 0x0936, 0x0937, 0x0938, // bytes d0-d7
  517. 0x0939, 0x0020, 0x093e, 0x093f, 0x0940, 0x0941, 0x0942, 0x0943, // bytes d8-df (d9 is mapped to an ASCII space)
  518. 0x0946, 0x0947, 0x0948, 0x0945, 0x094a, 0x094b, 0x094c, 0x0949, // bytes e0-e7
  519. 0x094d, 0x093c, 0x0964, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, // bytes e8-ef
  520. 0x0020, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, // bytes f0-f7 (f0 is an ASCII space, f1-f7 are ASCII digits 0-6)
  521. 0x0037, 0x0038, 0x0039, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, // bytes f8-ff (f8-fa are ASCII digits 7-9) -- NOTE(review): digits map to ASCII rather than U+0966-U+096F; confirm this is intended
  522. };
  523. static uint32_t
  524. string_decoder_getc_us_ascii(espeak_ng_TEXT_DECODER *decoder)
  525. {
  526. uint8_t c = *decoder->current++;
  527. return (c >= 0x80) ? 0xFFFD : c;
  528. }
  529. static uint32_t
  530. string_decoder_getc_codepage(espeak_ng_TEXT_DECODER *decoder)
  531. {
  532. uint8_t c = *decoder->current++;
  533. return (c >= 0x80) ? decoder->codepage[c - 0x80] : c;
  534. }
  535. static uint32_t
  536. string_decoder_getc_utf_8(espeak_ng_TEXT_DECODER *decoder)
  537. {
  538. uint8_t c = *decoder->current++;
  539. uint32_t ret;
  540. switch (c & 0xF0)
  541. {
  542. // 1-byte UTF-8 sequence
  543. case 0x00: case 0x10: case 0x20: case 0x30:
  544. case 0x40: case 0x50: case 0x60: case 0x70:
  545. return c;
  546. // UTF-8 tail byte -- invalid in isolation
  547. case 0x80: case 0x90: case 0xA0: case 0xB0:
  548. return 0xFFFD;
  549. // 2-byte UTF-8 sequence
  550. case 0xC0: case 0xD0:
  551. if (decoder->current + 1 >= decoder->end) goto eof;
  552. ret = c & 0x1F;
  553. if (((c = *decoder->current++) & LEADING_2_BITS) != UTF8_TAIL_BITS) goto error;
  554. ret = (ret << 6) + (c & 0x3F);
  555. return ret;
  556. // 3-byte UTF-8 sequence
  557. case 0xE0:
  558. if (decoder->current + 2 >= decoder->end) goto eof;
  559. ret = c & 0x0F;
  560. if (((c = *decoder->current++) & LEADING_2_BITS) != UTF8_TAIL_BITS) goto error;
  561. ret = (ret << 6) + (c & 0x3F);
  562. if (((c = *decoder->current++) & LEADING_2_BITS) != UTF8_TAIL_BITS) goto error;
  563. ret = (ret << 6) + (c & 0x3F);
  564. return ret;
  565. // 4-byte UTF-8 sequence
  566. case 0xF0:
  567. if (decoder->current + 3 >= decoder->end) goto eof;
  568. ret = c & 0x0F;
  569. if (((c = *decoder->current++) & LEADING_2_BITS) != UTF8_TAIL_BITS) goto error;
  570. ret = (ret << 6) + (c & 0x3F);
  571. if (((c = *decoder->current++) & LEADING_2_BITS) != UTF8_TAIL_BITS) goto error;
  572. ret = (ret << 6) + (c & 0x3F);
  573. if (((c = *decoder->current++) & LEADING_2_BITS) != UTF8_TAIL_BITS) goto error;
  574. ret = (ret << 6) + (c & 0x3F);
  575. return (ret <= 0x10FFFF) ? ret : 0xFFFD;
  576. }
  577. error:
  578. --decoder->current;
  579. return 0xFFFD;
  580. eof:
  581. decoder->current = decoder->end;
  582. return 0xFFFD;
  583. }
  584. static uint32_t
  585. string_decoder_getc_iso_10646_ucs_2(espeak_ng_TEXT_DECODER *decoder)
  586. {
  587. if (decoder->current + 1 >= decoder->end) {
  588. decoder->current = decoder->end;
  589. return 0xFFFD;
  590. }
  591. uint8_t c1 = *decoder->current++;
  592. uint8_t c2 = *decoder->current++;
  593. return c1 + (c2 << 8);
  594. }
  595. static uint32_t
  596. string_decoder_getc_wchar(espeak_ng_TEXT_DECODER *decoder)
  597. {
  598. wchar_t c = *(const wchar_t *)decoder->current;
  599. decoder->current += sizeof(wchar_t);
  600. return c;
  601. }
  602. static uint32_t
  603. string_decoder_getc_auto(espeak_ng_TEXT_DECODER *decoder)
  604. {
  605. const uint8_t *ptr = decoder->current;
  606. uint32_t c = string_decoder_getc_utf_8(decoder);
  607. if (c == 0xFFFD) {
  608. decoder->get = string_decoder_getc_codepage;
  609. decoder->current = ptr;
  610. c = decoder->get(decoder);
  611. }
  612. return c;
  613. }
  614. static uint32_t
  615. null_decoder_getc(espeak_ng_TEXT_DECODER *decoder)
  616. {
  617. return 0;
  618. }
  619. typedef struct
  620. {
  621. uint32_t (*get)(espeak_ng_TEXT_DECODER *decoder);
  622. const uint16_t *codepage;
  623. } encoding_t;
  624. static const encoding_t string_decoders[] = {
  625. { NULL, NULL },
  626. { string_decoder_getc_us_ascii, NULL },
  627. { string_decoder_getc_codepage, ISO_8859_1 },
  628. { string_decoder_getc_codepage, ISO_8859_2 },
  629. { string_decoder_getc_codepage, ISO_8859_3 },
  630. { string_decoder_getc_codepage, ISO_8859_4 },
  631. { string_decoder_getc_codepage, ISO_8859_5 },
  632. { string_decoder_getc_codepage, ISO_8859_6 },
  633. { string_decoder_getc_codepage, ISO_8859_7 },
  634. { string_decoder_getc_codepage, ISO_8859_8 },
  635. { string_decoder_getc_codepage, ISO_8859_9 },
  636. { string_decoder_getc_codepage, ISO_8859_10 },
  637. { string_decoder_getc_codepage, ISO_8859_11 },
  638. // ISO-8859-12 is not a valid encoding.
  639. { string_decoder_getc_codepage, ISO_8859_13 },
  640. { string_decoder_getc_codepage, ISO_8859_14 },
  641. { string_decoder_getc_codepage, ISO_8859_15 },
  642. { string_decoder_getc_codepage, ISO_8859_16 },
  643. { string_decoder_getc_codepage, KOI8_R },
  644. { string_decoder_getc_codepage, ISCII },
  645. { string_decoder_getc_utf_8, NULL },
  646. { string_decoder_getc_iso_10646_ucs_2, NULL },
  647. };
  648. espeak_ng_TEXT_DECODER *
  649. create_text_decoder(void)
  650. {
  651. espeak_ng_TEXT_DECODER *decoder = malloc(sizeof(espeak_ng_TEXT_DECODER));
  652. if (!decoder) return NULL;
  653. decoder->current = NULL;
  654. decoder->end = NULL;
  655. decoder->get = NULL;
  656. decoder->codepage = NULL;
  657. return decoder;
  658. }
  659. void
  660. destroy_text_decoder(espeak_ng_TEXT_DECODER *decoder)
  661. {
  662. if (decoder) free(decoder);
  663. }
  664. espeak_ng_STATUS
  665. text_decoder_decode_string(espeak_ng_TEXT_DECODER *decoder,
  666. const char *string,
  667. int length,
  668. espeak_ng_ENCODING encoding)
  669. {
  670. if (encoding > ESPEAKNG_ENCODING_ISO_10646_UCS_2)
  671. return ENS_UNKNOWN_TEXT_ENCODING;
  672. const encoding_t *enc = string_decoders + encoding;
  673. if (enc->get == NULL)
  674. return ENS_UNKNOWN_TEXT_ENCODING;
  675. if (length < 0) length = string ? strlen(string) + 1 : 0;
  676. decoder->get = string ? enc->get : null_decoder_getc;
  677. decoder->codepage = enc->codepage;
  678. decoder->current = (const uint8_t *)string;
  679. decoder->end = (const uint8_t *)(string ? string + length : string);
  680. return ENS_OK;
  681. }
  682. espeak_ng_STATUS
  683. text_decoder_decode_string_auto(espeak_ng_TEXT_DECODER *decoder,
  684. const char *string,
  685. int length,
  686. espeak_ng_ENCODING encoding)
  687. {
  688. if (encoding > ESPEAKNG_ENCODING_ISO_10646_UCS_2)
  689. return ENS_UNKNOWN_TEXT_ENCODING;
  690. const encoding_t *enc = string_decoders + encoding;
  691. if (enc->get == NULL)
  692. return ENS_UNKNOWN_TEXT_ENCODING;
  693. if (length < 0) length = string ? strlen(string) + 1 : 0;
  694. decoder->get = string ? string_decoder_getc_auto : null_decoder_getc;
  695. decoder->codepage = enc->codepage;
  696. decoder->current = (const uint8_t *)string;
  697. decoder->end = (const uint8_t *)(string ? string + length : string);
  698. return ENS_OK;
  699. }
  700. espeak_ng_STATUS
  701. text_decoder_decode_wstring(espeak_ng_TEXT_DECODER *decoder,
  702. const wchar_t *string,
  703. int length)
  704. {
  705. if (length < 0) length = string ? wcslen(string) + 1 : 0;
  706. decoder->get = string ? string_decoder_getc_wchar : null_decoder_getc;
  707. decoder->codepage = NULL;
  708. decoder->current = (const uint8_t *)string;
  709. decoder->end = (const uint8_t *)(string ? string + length : string);
  710. return ENS_OK;
  711. }
  712. espeak_ng_STATUS
  713. text_decoder_decode_string_multibyte(espeak_ng_TEXT_DECODER *decoder,
  714. const void *input,
  715. espeak_ng_ENCODING encoding,
  716. int flags)
  717. {
  718. switch (flags & 7)
  719. {
  720. case espeakCHARS_WCHAR:
  721. return text_decoder_decode_wstring(decoder, (const wchar_t *)input, -1);
  722. case espeakCHARS_AUTO:
  723. return text_decoder_decode_string_auto(decoder, (const char *)input, -1, encoding);
  724. case espeakCHARS_UTF8:
  725. return text_decoder_decode_string(decoder, (const char *)input, -1, ESPEAKNG_ENCODING_UTF_8);
  726. case espeakCHARS_8BIT:
  727. return text_decoder_decode_string(decoder, (const char *)input, -1, encoding);
  728. case espeakCHARS_16BIT:
  729. return text_decoder_decode_string(decoder, (const char *)input, -1, ESPEAKNG_ENCODING_ISO_10646_UCS_2);
  730. default:
  731. return ENS_UNKNOWN_TEXT_ENCODING;
  732. }
  733. }
  734. int
  735. text_decoder_eof(espeak_ng_TEXT_DECODER *decoder)
  736. {
  737. return decoder->current == decoder->end;
  738. }
  739. uint32_t
  740. text_decoder_getc(espeak_ng_TEXT_DECODER *decoder)
  741. {
  742. return decoder->get(decoder);
  743. }
  744. uint32_t
  745. text_decoder_peekc(espeak_ng_TEXT_DECODER *decoder)
  746. {
  747. const uint8_t *current = decoder->current;
  748. uint32_t c = decoder->get(decoder);
  749. decoder->current = current;
  750. return c;
  751. }
  752. const void *
  753. text_decoder_get_buffer(espeak_ng_TEXT_DECODER *decoder)
  754. {
  755. if (text_decoder_eof(decoder))
  756. return NULL;
  757. return decoder->current;
  758. }