eSpeak NG is an open source speech synthesizer that supports more than hundred languages and accents.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

tokenizer.c 29KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788
  1. /*
  2. * Copyright (C) 2017 Reece H. Dunn
  3. *
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License as published by
  6. * the Free Software Foundation; either version 3 of the License, or
  7. * (at your option) any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, see:
  16. * <http://www.gnu.org/licenses/>.
  17. */
  18. #include "config.h"
  19. #include <assert.h>
  20. #include <errno.h>
  21. #include <stdint.h>
  22. #include <stdlib.h>
  23. #include <string.h>
  24. #include <stdio.h>
  25. #include <sys/stat.h>
  26. #include <espeak-ng/espeak_ng.h>
  27. #include <espeak-ng/encoding.h>
  28. #include <espeak-ng/tokenizer.h>
  29. #include "speech.h"
  30. #include "phoneme.h"
  31. #include "synthesize.h"
  32. #include "translate.h"
  33. // TODO: Find a better place for this than speech.c, so it can be implemented
  34. // in one place without having to include all of speech.c.
  35. int GetFileLength(const char *filename)
  36. {
  37. struct stat statbuf;
  38. if (stat(filename, &statbuf) != 0)
  39. return -errno;
  40. if (S_ISDIR(statbuf.st_mode))
  41. return -EISDIR;
  42. return statbuf.st_size;
  43. }
  44. void
  45. test_latin_common()
  46. {
  47. printf("testing Latin/Common (Latn/Zyyy) script classification\n");
  48. assert(clause_type_from_codepoint('a') == CLAUSE_NONE);
  49. assert(clause_type_from_codepoint('.') == CLAUSE_PERIOD);
  50. assert(clause_type_from_codepoint('?') == CLAUSE_QUESTION);
  51. assert(clause_type_from_codepoint('!') == CLAUSE_EXCLAMATION);
  52. assert(clause_type_from_codepoint(',') == CLAUSE_COMMA);
  53. assert(clause_type_from_codepoint(':') == CLAUSE_COLON);
  54. assert(clause_type_from_codepoint(';') == CLAUSE_SEMICOLON);
  55. assert(clause_type_from_codepoint(0x00A1) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER));
  56. assert(clause_type_from_codepoint(0x00Bf) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER));
  57. assert(clause_type_from_codepoint(0x2013) == CLAUSE_SEMICOLON);
  58. assert(clause_type_from_codepoint(0x2014) == CLAUSE_SEMICOLON);
  59. assert(clause_type_from_codepoint(0x2026) == (CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER));
  60. }
  61. void
  62. test_greek()
  63. {
  64. printf("testing Greek (Grek) script classification\n");
  65. assert(clause_type_from_codepoint(0x037E) == CLAUSE_QUESTION);
  66. assert(clause_type_from_codepoint(0x0387) == CLAUSE_SEMICOLON);
  67. }
  68. void
  69. test_armenian()
  70. {
  71. printf("testing Armenian (Armn) script classification\n");
  72. assert(clause_type_from_codepoint(0x055B) == (CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD));
  73. assert(clause_type_from_codepoint(0x055C) == (CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD));
  74. assert(clause_type_from_codepoint(0x055D) == CLAUSE_COMMA);
  75. assert(clause_type_from_codepoint(0x055E) == (CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD));
  76. assert(clause_type_from_codepoint(0x0589) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
  77. }
  78. void
  79. test_arabic()
  80. {
  81. printf("testing Arabic (Arab) script classification\n");
  82. assert(clause_type_from_codepoint(0x060C) == CLAUSE_COMMA);
  83. assert(clause_type_from_codepoint(0x061B) == CLAUSE_SEMICOLON);
  84. assert(clause_type_from_codepoint(0x061F) == CLAUSE_QUESTION);
  85. assert(clause_type_from_codepoint(0x06D4) == CLAUSE_PERIOD);
  86. }
  87. void
  88. test_devanagari()
  89. {
  90. printf("testing Devanagari (Deva) script classification\n");
  91. assert(clause_type_from_codepoint(0x0964) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
  92. }
  93. void
  94. test_tibetan()
  95. {
  96. printf("testing Tibetan (Tibt) script classification\n");
  97. assert(clause_type_from_codepoint(0x0F0D) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
  98. assert(clause_type_from_codepoint(0x0F0E) == CLAUSE_PARAGRAPH);
  99. }
  100. void
  101. test_sinhala()
  102. {
  103. printf("testing Sinhala (Sinh) script classification\n");
  104. assert(clause_type_from_codepoint(0x0DF4) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
  105. }
  106. void
  107. test_georgian()
  108. {
  109. printf("testing Georgian (Geor) script classification\n");
  110. assert(clause_type_from_codepoint(0x10FB) == CLAUSE_PARAGRAPH);
  111. }
  112. void
  113. test_ethiopic()
  114. {
  115. printf("testing Ethiopic (Ethi) script classification\n");
  116. assert(clause_type_from_codepoint(0x1362) == CLAUSE_PERIOD);
  117. assert(clause_type_from_codepoint(0x1363) == CLAUSE_COMMA);
  118. assert(clause_type_from_codepoint(0x1364) == CLAUSE_SEMICOLON);
  119. assert(clause_type_from_codepoint(0x1365) == CLAUSE_COLON);
  120. assert(clause_type_from_codepoint(0x1366) == CLAUSE_COLON);
  121. assert(clause_type_from_codepoint(0x1367) == CLAUSE_QUESTION);
  122. assert(clause_type_from_codepoint(0x1368) == CLAUSE_PARAGRAPH);
  123. }
  124. void
  125. test_ideographic()
  126. {
  127. printf("testing Ideographic (Hani) script classification\n");
  128. assert(clause_type_from_codepoint(0x3001) == (CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER));
  129. assert(clause_type_from_codepoint(0x3002) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
  130. }
  131. void
  132. test_fullwidth()
  133. {
  134. printf("testing Full Width/Common (Zyyy) script classification\n");
  135. assert(clause_type_from_codepoint(0xFF01) == (CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER));
  136. assert(clause_type_from_codepoint(0xFF0C) == (CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER));
  137. assert(clause_type_from_codepoint(0xFF0E) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
  138. assert(clause_type_from_codepoint(0xFF1A) == (CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER));
  139. assert(clause_type_from_codepoint(0xFF1B) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER));
  140. assert(clause_type_from_codepoint(0xFF1F) == (CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER));
  141. }
  142. void
  143. test_unbound_tokenizer()
  144. {
  145. printf("testing unbound tokenizer\n");
  146. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  147. assert(tokenizer != NULL);
  148. assert(tokenizer_get_token_text(tokenizer) != NULL);
  149. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  150. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  151. assert(tokenizer_get_token_text(tokenizer) != NULL);
  152. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  153. assert(tokenizer_reset(tokenizer, NULL, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  154. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  155. assert(tokenizer_get_token_text(tokenizer) != NULL);
  156. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  157. destroy_tokenizer(tokenizer);
  158. }
  159. void
  160. test_linux_newline_tokens()
  161. {
  162. printf("testing linux newline tokens\n");
  163. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  164. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  165. assert(text_decoder_decode_string(decoder, "\n\n", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
  166. assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  167. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  168. assert(tokenizer_get_token_text(tokenizer) != NULL);
  169. assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);
  170. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  171. assert(tokenizer_get_token_text(tokenizer) != NULL);
  172. assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);
  173. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  174. assert(tokenizer_get_token_text(tokenizer) != NULL);
  175. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  176. destroy_text_decoder(decoder);
  177. destroy_tokenizer(tokenizer);
  178. }
  179. void
  180. test_mac_newline_tokens()
  181. {
  182. printf("testing mac newline tokens\n");
  183. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  184. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  185. assert(text_decoder_decode_string(decoder, "\r\r", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
  186. assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  187. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  188. assert(tokenizer_get_token_text(tokenizer) != NULL);
  189. assert(strcmp(tokenizer_get_token_text(tokenizer), "\r") == 0);
  190. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  191. assert(tokenizer_get_token_text(tokenizer) != NULL);
  192. assert(strcmp(tokenizer_get_token_text(tokenizer), "\r") == 0);
  193. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  194. assert(tokenizer_get_token_text(tokenizer) != NULL);
  195. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  196. destroy_text_decoder(decoder);
  197. destroy_tokenizer(tokenizer);
  198. }
  199. void
  200. test_windows_newline_tokens()
  201. {
  202. printf("testing windows newline tokens\n");
  203. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  204. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  205. assert(text_decoder_decode_string(decoder, "\r\n\r\n", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
  206. assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  207. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  208. assert(tokenizer_get_token_text(tokenizer) != NULL);
  209. assert(strcmp(tokenizer_get_token_text(tokenizer), "\r\n") == 0);
  210. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  211. assert(tokenizer_get_token_text(tokenizer) != NULL);
  212. assert(strcmp(tokenizer_get_token_text(tokenizer), "\r\n") == 0);
  213. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  214. assert(tokenizer_get_token_text(tokenizer) != NULL);
  215. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  216. destroy_text_decoder(decoder);
  217. destroy_tokenizer(tokenizer);
  218. }
  219. void
  220. test_unicode_newline_tokens()
  221. {
  222. printf("testing unicode newline tokens\n");
  223. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  224. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  225. assert(text_decoder_decode_string(decoder, "\x0C\x0C\xC2\x85\xC2\x85\xE2\x80\xA8\xE2\x80\xA8", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
  226. assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  227. // FORM FEED (FF) -- Used as a page (not paragraph) break.
  228. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  229. assert(tokenizer_get_token_text(tokenizer) != NULL);
  230. assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0C") == 0);
  231. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  232. assert(tokenizer_get_token_text(tokenizer) != NULL);
  233. assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0C") == 0);
  234. // NEXT LINE (NEL) [U+0085] -- Used in EBCDIC systems as a combined CR+LF character.
  235. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  236. assert(tokenizer_get_token_text(tokenizer) != NULL);
  237. assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\x85") == 0);
  238. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  239. assert(tokenizer_get_token_text(tokenizer) != NULL);
  240. assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\x85") == 0);
  241. // General Category: Zl -- LINE SEPARATOR [U+2028]
  242. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  243. assert(tokenizer_get_token_text(tokenizer) != NULL);
  244. assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA8") == 0);
  245. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  246. assert(tokenizer_get_token_text(tokenizer) != NULL);
  247. assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA8") == 0);
  248. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  249. assert(tokenizer_get_token_text(tokenizer) != NULL);
  250. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  251. destroy_text_decoder(decoder);
  252. destroy_tokenizer(tokenizer);
  253. }
  254. void
  255. test_paragraph_tokens()
  256. {
  257. printf("testing paragraph tokens\n");
  258. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  259. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  260. assert(text_decoder_decode_string(decoder, "\xE2\x80\xA9\xE2\x80\xA9", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
  261. assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  262. // General Category: Zp, PARAGRAPH SEPARATOR [U+2029]
  263. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PARAGRAPH);
  264. assert(tokenizer_get_token_text(tokenizer) != NULL);
  265. assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA9") == 0);
  266. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PARAGRAPH);
  267. assert(tokenizer_get_token_text(tokenizer) != NULL);
  268. assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA9") == 0);
  269. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  270. assert(tokenizer_get_token_text(tokenizer) != NULL);
  271. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  272. destroy_text_decoder(decoder);
  273. destroy_tokenizer(tokenizer);
  274. }
  275. void
  276. test_whitespace_tokens()
  277. {
  278. printf("testing whitespace tokens\n");
  279. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  280. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  281. assert(text_decoder_decode_string(decoder, "\t\t\n\x0B\x0B\n \xE3\x80\x80 \n\xC2\xA0\xC2\xA0", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
  282. assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  283. // General Category: Cc, Property: White_Space
  284. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  285. assert(tokenizer_get_token_text(tokenizer) != NULL);
  286. assert(strcmp(tokenizer_get_token_text(tokenizer), "\t\t") == 0);
  287. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  288. assert(tokenizer_get_token_text(tokenizer) != NULL);
  289. assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);
  290. // General Category: Cc, Property: White_Space, VERTICAL TAB (VT) -- Not treated as newline tokens.
  291. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  292. assert(tokenizer_get_token_text(tokenizer) != NULL);
  293. assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0B\x0B") == 0);
  294. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  295. assert(tokenizer_get_token_text(tokenizer) != NULL);
  296. assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);
  297. // General Category: Zs, Property: White_Space
  298. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  299. assert(tokenizer_get_token_text(tokenizer) != NULL);
  300. assert(strcmp(tokenizer_get_token_text(tokenizer), " \xE3\x80\x80 ") == 0);
  301. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  302. assert(tokenizer_get_token_text(tokenizer) != NULL);
  303. assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);
  304. // General Category: Zs, Property: White_Space, Decomposition: <noBreak>, NO-BREAK SPACE [U+00A0]
  305. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  306. assert(tokenizer_get_token_text(tokenizer) != NULL);
  307. assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xA0\xC2\xA0") == 0);
  308. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  309. assert(tokenizer_get_token_text(tokenizer) != NULL);
  310. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  311. destroy_text_decoder(decoder);
  312. destroy_tokenizer(tokenizer);
  313. }
  314. void
  315. test_Latn_word_tokens()
  316. {
  317. printf("testing Latin (Latn) script word tokens\n");
  318. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  319. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  320. assert(text_decoder_decode_string(decoder, "One one ONE OneTwo ONETwo", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
  321. assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  322. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_CAPITALIZED);
  323. assert(tokenizer_get_token_text(tokenizer) != NULL);
  324. assert(strcmp(tokenizer_get_token_text(tokenizer), "One") == 0);
  325. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  326. assert(tokenizer_get_token_text(tokenizer) != NULL);
  327. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  328. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_LOWERCASE);
  329. assert(tokenizer_get_token_text(tokenizer) != NULL);
  330. assert(strcmp(tokenizer_get_token_text(tokenizer), "one") == 0);
  331. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  332. assert(tokenizer_get_token_text(tokenizer) != NULL);
  333. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  334. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_UPPERCASE);
  335. assert(tokenizer_get_token_text(tokenizer) != NULL);
  336. assert(strcmp(tokenizer_get_token_text(tokenizer), "ONE") == 0);
  337. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  338. assert(tokenizer_get_token_text(tokenizer) != NULL);
  339. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  340. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_MIXEDCASE);
  341. assert(tokenizer_get_token_text(tokenizer) != NULL);
  342. assert(strcmp(tokenizer_get_token_text(tokenizer), "OneTwo") == 0);
  343. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  344. assert(tokenizer_get_token_text(tokenizer) != NULL);
  345. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  346. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_MIXEDCASE);
  347. assert(tokenizer_get_token_text(tokenizer) != NULL);
  348. assert(strcmp(tokenizer_get_token_text(tokenizer), "ONETwo") == 0);
  349. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  350. assert(tokenizer_get_token_text(tokenizer) != NULL);
  351. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  352. destroy_text_decoder(decoder);
  353. destroy_tokenizer(tokenizer);
  354. }
  355. void
  356. test_Latn_punctuation_tokens()
  357. {
  358. printf("testing Latin (Latn) script punctuation tokens\n");
  359. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  360. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  361. assert(text_decoder_decode_string(decoder, ". ? .. ! ... , .... : ; \xE2\x80\xA6", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
  362. assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  363. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
  364. assert(tokenizer_get_token_text(tokenizer) != NULL);
  365. assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);
  366. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  367. assert(tokenizer_get_token_text(tokenizer) != NULL);
  368. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  369. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_QUESTION_MARK);
  370. assert(tokenizer_get_token_text(tokenizer) != NULL);
  371. assert(strcmp(tokenizer_get_token_text(tokenizer), "?") == 0);
  372. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  373. assert(tokenizer_get_token_text(tokenizer) != NULL);
  374. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  375. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
  376. assert(tokenizer_get_token_text(tokenizer) != NULL);
  377. assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);
  378. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
  379. assert(tokenizer_get_token_text(tokenizer) != NULL);
  380. assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);
  381. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  382. assert(tokenizer_get_token_text(tokenizer) != NULL);
  383. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  384. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_EXCLAMATION_MARK);
  385. assert(tokenizer_get_token_text(tokenizer) != NULL);
  386. assert(strcmp(tokenizer_get_token_text(tokenizer), "!") == 0);
  387. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  388. assert(tokenizer_get_token_text(tokenizer) != NULL);
  389. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  390. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_ELLIPSIS);
  391. assert(tokenizer_get_token_text(tokenizer) != NULL);
  392. assert(strcmp(tokenizer_get_token_text(tokenizer), "...") == 0);
  393. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  394. assert(tokenizer_get_token_text(tokenizer) != NULL);
  395. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  396. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_COMMA);
  397. assert(tokenizer_get_token_text(tokenizer) != NULL);
  398. assert(strcmp(tokenizer_get_token_text(tokenizer), ",") == 0);
  399. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  400. assert(tokenizer_get_token_text(tokenizer) != NULL);
  401. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  402. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_ELLIPSIS);
  403. assert(tokenizer_get_token_text(tokenizer) != NULL);
  404. assert(strcmp(tokenizer_get_token_text(tokenizer), "...") == 0);
  405. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
  406. assert(tokenizer_get_token_text(tokenizer) != NULL);
  407. assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);
  408. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  409. assert(tokenizer_get_token_text(tokenizer) != NULL);
  410. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  411. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_COLON);
  412. assert(tokenizer_get_token_text(tokenizer) != NULL);
  413. assert(strcmp(tokenizer_get_token_text(tokenizer), ":") == 0);
  414. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  415. assert(tokenizer_get_token_text(tokenizer) != NULL);
  416. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  417. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_SEMICOLON);
  418. assert(tokenizer_get_token_text(tokenizer) != NULL);
  419. assert(strcmp(tokenizer_get_token_text(tokenizer), ";") == 0);
  420. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  421. assert(tokenizer_get_token_text(tokenizer) != NULL);
  422. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  423. // HORIZONTAL ELLIPSIS [U+2026]
  424. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_ELLIPSIS);
  425. assert(tokenizer_get_token_text(tokenizer) != NULL);
  426. assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA6") == 0);
  427. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  428. assert(tokenizer_get_token_text(tokenizer) != NULL);
  429. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  430. destroy_text_decoder(decoder);
  431. destroy_tokenizer(tokenizer);
  432. }
  433. void
  434. test_Latn_general_punctuation_tokens()
  435. {
  436. printf("testing Latin (Latn) script general punctuation tokens\n");
  437. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  438. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  439. assert(text_decoder_decode_string(decoder, "\" () - _ \xC2\xAB\xC2\xBB", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
  440. assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  441. // General Category: Po
  442. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
  443. assert(tokenizer_get_token_text(tokenizer) != NULL);
  444. assert(strcmp(tokenizer_get_token_text(tokenizer), "\"") == 0);
  445. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  446. assert(tokenizer_get_token_text(tokenizer) != NULL);
  447. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  448. // General Category: Ps
  449. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
  450. assert(tokenizer_get_token_text(tokenizer) != NULL);
  451. assert(strcmp(tokenizer_get_token_text(tokenizer), "(") == 0);
  452. // General Category: Pe
  453. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
  454. assert(tokenizer_get_token_text(tokenizer) != NULL);
  455. assert(strcmp(tokenizer_get_token_text(tokenizer), ")") == 0);
  456. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  457. assert(tokenizer_get_token_text(tokenizer) != NULL);
  458. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  459. // General Category: Pd
  460. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
  461. assert(tokenizer_get_token_text(tokenizer) != NULL);
  462. assert(strcmp(tokenizer_get_token_text(tokenizer), "-") == 0);
  463. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  464. assert(tokenizer_get_token_text(tokenizer) != NULL);
  465. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  466. // General Category: Pc
  467. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
  468. assert(tokenizer_get_token_text(tokenizer) != NULL);
  469. assert(strcmp(tokenizer_get_token_text(tokenizer), "_") == 0);
  470. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  471. assert(tokenizer_get_token_text(tokenizer) != NULL);
  472. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  473. // General Category: Pi, LEFT-POINTING DOUBLE ANGLE QUOTATION MARK [U+00AB]
  474. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
  475. assert(tokenizer_get_token_text(tokenizer) != NULL);
  476. assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xAB") == 0);
  477. // General Category: Pf, RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK [U+00BB]
  478. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
  479. assert(tokenizer_get_token_text(tokenizer) != NULL);
  480. assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xBB") == 0);
  481. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  482. assert(tokenizer_get_token_text(tokenizer) != NULL);
  483. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  484. destroy_text_decoder(decoder);
  485. destroy_tokenizer(tokenizer);
  486. }
  487. void
  488. run_tests()
  489. {
  490. test_latin_common();
  491. test_greek();
  492. test_armenian();
  493. test_arabic();
  494. test_devanagari();
  495. test_tibetan();
  496. test_sinhala();
  497. test_georgian();
  498. test_ethiopic();
  499. test_ideographic();
  500. test_fullwidth();
  501. test_unbound_tokenizer();
  502. test_linux_newline_tokens();
  503. test_mac_newline_tokens();
  504. test_windows_newline_tokens();
  505. test_unicode_newline_tokens();
  506. test_paragraph_tokens();
  507. test_whitespace_tokens();
  508. test_Latn_word_tokens();
  509. test_Latn_punctuation_tokens();
  510. test_Latn_general_punctuation_tokens();
  511. printf("done\n");
  512. }
  513. void
  514. escape_newline(const char *s)
  515. {
  516. for ( ; *s; ++s) switch (*s)
  517. {
  518. case '\r': printf("\\r"); break;
  519. case '\n': printf("\\n"); break;
  520. default: putc(*s, stdout); break;
  521. }
  522. }
  523. void
  524. print_tokens(espeak_ng_TEXT_DECODER *decoder)
  525. {
  526. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  527. if (!tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT)) {
  528. destroy_tokenizer(tokenizer);
  529. return;
  530. }
  531. while (1) switch (tokenizer_read_next_token(tokenizer))
  532. {
  533. case ESPEAKNG_TOKEN_END_OF_BUFFER:
  534. destroy_tokenizer(tokenizer);
  535. return;
  536. case ESPEAKNG_TOKEN_UNKNOWN:
  537. printf("unknown : %s\n", tokenizer_get_token_text(tokenizer));
  538. break;
  539. case ESPEAKNG_TOKEN_NEWLINE:
  540. printf("newline : ");
  541. escape_newline(tokenizer_get_token_text(tokenizer));
  542. putc('\n', stdout);
  543. break;
  544. case ESPEAKNG_TOKEN_PARAGRAPH:
  545. printf("paragraph : %s\n", tokenizer_get_token_text(tokenizer));
  546. break;
  547. case ESPEAKNG_TOKEN_WHITESPACE:
  548. printf("whitespace : %s\n", tokenizer_get_token_text(tokenizer));
  549. break;
  550. case ESPEAKNG_TOKEN_WORD_UPPERCASE:
  551. printf("word (upper case) : %s\n", tokenizer_get_token_text(tokenizer));
  552. break;
  553. case ESPEAKNG_TOKEN_WORD_LOWERCASE:
  554. printf("word (lower case) : %s\n", tokenizer_get_token_text(tokenizer));
  555. break;
  556. case ESPEAKNG_TOKEN_WORD_MIXEDCASE:
  557. printf("word (mixed case) : %s\n", tokenizer_get_token_text(tokenizer));
  558. break;
  559. case ESPEAKNG_TOKEN_WORD_CAPITALIZED:
  560. printf("word (capitalized) : %s\n", tokenizer_get_token_text(tokenizer));
  561. break;
  562. case ESPEAKNG_TOKEN_FULL_STOP:
  563. printf("full stop : %s\n", tokenizer_get_token_text(tokenizer));
  564. break;
  565. case ESPEAKNG_TOKEN_QUESTION_MARK:
  566. printf("question mark : %s\n", tokenizer_get_token_text(tokenizer));
  567. break;
  568. case ESPEAKNG_TOKEN_EXCLAMATION_MARK:
  569. printf("exclamation mark : %s\n", tokenizer_get_token_text(tokenizer));
  570. break;
  571. case ESPEAKNG_TOKEN_COMMA:
  572. printf("comma : %s\n", tokenizer_get_token_text(tokenizer));
  573. break;
  574. case ESPEAKNG_TOKEN_COLON:
  575. printf("colon : %s\n", tokenizer_get_token_text(tokenizer));
  576. break;
  577. case ESPEAKNG_TOKEN_SEMICOLON:
  578. printf("semicolon : %s\n", tokenizer_get_token_text(tokenizer));
  579. break;
  580. }
  581. }
  582. void
  583. print_tokens_from_file(const char *filename, const char *encoding_name)
  584. {
  585. espeak_ng_ENCODING encoding = espeak_ng_EncodingFromName(encoding_name);
  586. if (encoding == ESPEAKNG_ENCODING_UNKNOWN) {
  587. printf("Unknown encoding \"%s\".\n", encoding_name);
  588. return;
  589. }
  590. int length = GetFileLength(filename);
  591. FILE *f = (length > 0) ? fopen(filename, "rb") : NULL;
  592. if (!f) {
  593. printf("Cannot open file: %s\n", filename);
  594. return;
  595. }
  596. char *buffer = malloc(length);
  597. if (!buffer) {
  598. fclose(f);
  599. printf("Out of memory!\n");
  600. return;
  601. }
  602. fread(buffer, 1, length, f);
  603. fclose(f);
  604. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  605. if (text_decoder_decode_string(decoder, buffer, length, encoding) == ENS_OK)
  606. print_tokens(decoder);
  607. destroy_text_decoder(decoder);
  608. }
  609. void
  610. usage(const char *program)
  611. {
  612. printf("%s -- Run the tokenizer tests.\n", program);
  613. printf("%s ENCODING FILENAME -- Print the tokens for FILENAME.\n", program);
  614. }
  615. int
  616. main(int argc, char **argv)
  617. {
  618. switch (argc)
  619. {
  620. case 1: run_tests(); break;
  621. case 3: print_tokens_from_file(argv[2], argv[1]); break;
  622. default: usage(argv[0]); return EXIT_FAILURE;
  623. }
  624. return EXIT_SUCCESS;
  625. }