eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

tokenizer.c 22KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632
  1. /*
  2. * Copyright (C) 2017 Reece H. Dunn
  3. *
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License as published by
  6. * the Free Software Foundation; either version 3 of the License, or
  7. * (at your option) any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write see:
  16. * <http://www.gnu.org/licenses/>.
  17. */
  18. #include "config.h"
  19. #include <assert.h>
  20. #include <errno.h>
  21. #include <stdint.h>
  22. #include <stdlib.h>
  23. #include <string.h>
  24. #include <stdio.h>
  25. #include <sys/stat.h>
  26. #include <espeak-ng/espeak_ng.h>
  27. #include "encoding.h"
  28. #include "tokenizer.h"
  29. #include "speech.h"
  30. #include "phoneme.h"
  31. #include "synthesize.h"
  32. #include "translate.h"
  33. // TODO: Find a better place for this than speech.c, so it can be implemented
  34. // in one place without having to include all of speech.c.
  35. int GetFileLength(const char *filename)
  36. {
  37. struct stat statbuf;
  38. if (stat(filename, &statbuf) != 0)
  39. return -errno;
  40. if (S_ISDIR(statbuf.st_mode))
  41. return -EISDIR;
  42. return statbuf.st_size;
  43. }
  44. void
  45. test_latin_common()
  46. {
  47. printf("testing Latin/Common (Latn/Zyyy) script classification\n");
  48. assert(clause_type_from_codepoint('a') == CLAUSE_NONE);
  49. assert(clause_type_from_codepoint('.') == CLAUSE_PERIOD);
  50. assert(clause_type_from_codepoint('?') == CLAUSE_QUESTION);
  51. assert(clause_type_from_codepoint('!') == CLAUSE_EXCLAMATION);
  52. assert(clause_type_from_codepoint(',') == CLAUSE_COMMA);
  53. assert(clause_type_from_codepoint(':') == CLAUSE_COLON);
  54. assert(clause_type_from_codepoint(';') == CLAUSE_SEMICOLON);
  55. assert(clause_type_from_codepoint(0x00A1) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER));
  56. assert(clause_type_from_codepoint(0x00Bf) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER));
  57. assert(clause_type_from_codepoint(0x2013) == CLAUSE_SEMICOLON);
  58. assert(clause_type_from_codepoint(0x2014) == CLAUSE_SEMICOLON);
  59. assert(clause_type_from_codepoint(0x2026) == (CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER));
  60. }
  61. void
  62. test_greek()
  63. {
  64. printf("testing Greek (Grek) script classification\n");
  65. assert(clause_type_from_codepoint(0x037E) == CLAUSE_QUESTION);
  66. assert(clause_type_from_codepoint(0x0387) == CLAUSE_SEMICOLON);
  67. }
  68. void
  69. test_armenian()
  70. {
  71. printf("testing Armenian (Armn) script classification\n");
  72. assert(clause_type_from_codepoint(0x055B) == (CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD));
  73. assert(clause_type_from_codepoint(0x055C) == (CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD));
  74. assert(clause_type_from_codepoint(0x055D) == CLAUSE_COMMA);
  75. assert(clause_type_from_codepoint(0x055E) == (CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD));
  76. assert(clause_type_from_codepoint(0x0589) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
  77. }
  78. void
  79. test_arabic()
  80. {
  81. printf("testing Arabic (Arab) script classification\n");
  82. assert(clause_type_from_codepoint(0x060C) == CLAUSE_COMMA);
  83. assert(clause_type_from_codepoint(0x061B) == CLAUSE_SEMICOLON);
  84. assert(clause_type_from_codepoint(0x061F) == CLAUSE_QUESTION);
  85. assert(clause_type_from_codepoint(0x06D4) == CLAUSE_PERIOD);
  86. }
  87. void
  88. test_devanagari()
  89. {
  90. printf("testing Devanagari (Deva) script classification\n");
  91. assert(clause_type_from_codepoint(0x0964) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
  92. }
  93. void
  94. test_tibetan()
  95. {
  96. printf("testing Tibetan (Tibt) script classification\n");
  97. assert(clause_type_from_codepoint(0x0F0D) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
  98. assert(clause_type_from_codepoint(0x0F0E) == CLAUSE_PARAGRAPH);
  99. }
  100. void
  101. test_sinhala()
  102. {
  103. printf("testing Sinhala (Sinh) script classification\n");
  104. assert(clause_type_from_codepoint(0x0DF4) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
  105. }
  106. void
  107. test_georgian()
  108. {
  109. printf("testing Georgian (Geor) script classification\n");
  110. assert(clause_type_from_codepoint(0x10FB) == CLAUSE_PARAGRAPH);
  111. }
  112. void
  113. test_ethiopic()
  114. {
  115. printf("testing Ethiopic (Ethi) script classification\n");
  116. assert(clause_type_from_codepoint(0x1362) == CLAUSE_PERIOD);
  117. assert(clause_type_from_codepoint(0x1363) == CLAUSE_COMMA);
  118. assert(clause_type_from_codepoint(0x1364) == CLAUSE_SEMICOLON);
  119. assert(clause_type_from_codepoint(0x1365) == CLAUSE_COLON);
  120. assert(clause_type_from_codepoint(0x1366) == CLAUSE_COLON);
  121. assert(clause_type_from_codepoint(0x1367) == CLAUSE_QUESTION);
  122. assert(clause_type_from_codepoint(0x1368) == CLAUSE_PARAGRAPH);
  123. }
  124. void
  125. test_ideographic()
  126. {
  127. printf("testing Ideographic (Hani) script classification\n");
  128. assert(clause_type_from_codepoint(0x3001) == (CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER));
  129. assert(clause_type_from_codepoint(0x3002) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
  130. }
  131. void
  132. test_fullwidth()
  133. {
  134. printf("testing Full Width/Common (Zyyy) script classification\n");
  135. assert(clause_type_from_codepoint(0xFF01) == (CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER));
  136. assert(clause_type_from_codepoint(0xFF0C) == (CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER));
  137. assert(clause_type_from_codepoint(0xFF0E) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER));
  138. assert(clause_type_from_codepoint(0xFF1A) == (CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER));
  139. assert(clause_type_from_codepoint(0xFF1B) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER));
  140. assert(clause_type_from_codepoint(0xFF1F) == (CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER));
  141. }
  142. void
  143. test_unbound_tokenizer()
  144. {
  145. printf("testing unbound tokenizer\n");
  146. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  147. assert(tokenizer != NULL);
  148. assert(tokenizer_get_token_text(tokenizer) != NULL);
  149. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  150. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  151. assert(tokenizer_get_token_text(tokenizer) != NULL);
  152. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  153. assert(tokenizer_reset(tokenizer, NULL, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  154. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  155. assert(tokenizer_get_token_text(tokenizer) != NULL);
  156. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  157. destroy_tokenizer(tokenizer);
  158. }
  159. void
  160. test_linux_newline_tokens()
  161. {
  162. printf("testing linux newline tokens\n");
  163. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  164. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  165. assert(text_decoder_decode_string(decoder, "\n\n", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
  166. assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  167. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  168. assert(tokenizer_get_token_text(tokenizer) != NULL);
  169. assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);
  170. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  171. assert(tokenizer_get_token_text(tokenizer) != NULL);
  172. assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);
  173. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  174. assert(tokenizer_get_token_text(tokenizer) != NULL);
  175. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  176. destroy_text_decoder(decoder);
  177. destroy_tokenizer(tokenizer);
  178. }
  179. void
  180. test_mac_newline_tokens()
  181. {
  182. printf("testing mac newline tokens\n");
  183. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  184. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  185. assert(text_decoder_decode_string(decoder, "\r\r", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
  186. assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  187. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  188. assert(tokenizer_get_token_text(tokenizer) != NULL);
  189. assert(strcmp(tokenizer_get_token_text(tokenizer), "\r") == 0);
  190. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  191. assert(tokenizer_get_token_text(tokenizer) != NULL);
  192. assert(strcmp(tokenizer_get_token_text(tokenizer), "\r") == 0);
  193. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  194. assert(tokenizer_get_token_text(tokenizer) != NULL);
  195. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  196. destroy_text_decoder(decoder);
  197. destroy_tokenizer(tokenizer);
  198. }
  199. void
  200. test_windows_newline_tokens()
  201. {
  202. printf("testing windows newline tokens\n");
  203. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  204. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  205. assert(text_decoder_decode_string(decoder, "\r\n\r\n", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
  206. assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  207. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  208. assert(tokenizer_get_token_text(tokenizer) != NULL);
  209. assert(strcmp(tokenizer_get_token_text(tokenizer), "\r\n") == 0);
  210. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  211. assert(tokenizer_get_token_text(tokenizer) != NULL);
  212. assert(strcmp(tokenizer_get_token_text(tokenizer), "\r\n") == 0);
  213. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  214. assert(tokenizer_get_token_text(tokenizer) != NULL);
  215. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  216. destroy_text_decoder(decoder);
  217. destroy_tokenizer(tokenizer);
  218. }
  219. void
  220. test_unicode_newline_tokens()
  221. {
  222. printf("testing unicode newline tokens\n");
  223. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  224. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  225. assert(text_decoder_decode_string(decoder, "\x0C\x0C\xC2\x85\xC2\x85\xE2\x80\xA8\xE2\x80\xA8", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
  226. assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  227. // U+000C : FORM FEED (FF) -- Used as a page (not paragraph) break.
  228. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  229. assert(tokenizer_get_token_text(tokenizer) != NULL);
  230. assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0C") == 0);
  231. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  232. assert(tokenizer_get_token_text(tokenizer) != NULL);
  233. assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0C") == 0);
  234. // U+0085 : NEXT LINE (NEL) -- Used in EBCDIC systems as a combined CR+LF character.
  235. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  236. assert(tokenizer_get_token_text(tokenizer) != NULL);
  237. assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\x85") == 0);
  238. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  239. assert(tokenizer_get_token_text(tokenizer) != NULL);
  240. assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\x85") == 0);
  241. // General Category: Zl -- LINE SEPARATOR
  242. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  243. assert(tokenizer_get_token_text(tokenizer) != NULL);
  244. assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA8") == 0);
  245. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  246. assert(tokenizer_get_token_text(tokenizer) != NULL);
  247. assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA8") == 0);
  248. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  249. assert(tokenizer_get_token_text(tokenizer) != NULL);
  250. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  251. destroy_text_decoder(decoder);
  252. destroy_tokenizer(tokenizer);
  253. }
  254. void
  255. test_paragraph_tokens()
  256. {
  257. printf("testing paragraph tokens\n");
  258. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  259. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  260. assert(text_decoder_decode_string(decoder, "\xE2\x80\xA9\xE2\x80\xA9", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
  261. assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  262. // General Category: Zp -- PARAGRAPH SEPARATOR
  263. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PARAGRAPH);
  264. assert(tokenizer_get_token_text(tokenizer) != NULL);
  265. assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA9") == 0);
  266. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PARAGRAPH);
  267. assert(tokenizer_get_token_text(tokenizer) != NULL);
  268. assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA9") == 0);
  269. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  270. assert(tokenizer_get_token_text(tokenizer) != NULL);
  271. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  272. destroy_text_decoder(decoder);
  273. destroy_tokenizer(tokenizer);
  274. }
  275. void
  276. test_whitespace_tokens()
  277. {
  278. printf("testing whitespace tokens\n");
  279. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  280. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  281. assert(text_decoder_decode_string(decoder, "\t\t\n\x0B\x0B\n \xE3\x80\x80 \n\xC2\xA0\xC2\xA0", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
  282. assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  283. // General Category: Cc, Property: White_Space
  284. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  285. assert(tokenizer_get_token_text(tokenizer) != NULL);
  286. assert(strcmp(tokenizer_get_token_text(tokenizer), "\t\t") == 0);
  287. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  288. assert(tokenizer_get_token_text(tokenizer) != NULL);
  289. assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);
  290. // General Category: Cc, Property: White_Space, VERTICAL TAB (VT) -- Not treated as newline tokens.
  291. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  292. assert(tokenizer_get_token_text(tokenizer) != NULL);
  293. assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0B\x0B") == 0);
  294. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  295. assert(tokenizer_get_token_text(tokenizer) != NULL);
  296. assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);
  297. // General Category: Zs, Property: White_Space
  298. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  299. assert(tokenizer_get_token_text(tokenizer) != NULL);
  300. assert(strcmp(tokenizer_get_token_text(tokenizer), " \xE3\x80\x80 ") == 0);
  301. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
  302. assert(tokenizer_get_token_text(tokenizer) != NULL);
  303. assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);
  304. // General Category: Zs, Property: White_Space, Decomposition: <noBreak>
  305. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  306. assert(tokenizer_get_token_text(tokenizer) != NULL);
  307. assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xA0\xC2\xA0") == 0);
  308. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  309. assert(tokenizer_get_token_text(tokenizer) != NULL);
  310. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  311. destroy_text_decoder(decoder);
  312. destroy_tokenizer(tokenizer);
  313. }
  314. void
  315. test_Latn_word_tokens()
  316. {
  317. printf("testing Latin (Latn) script word tokens\n");
  318. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  319. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  320. assert(text_decoder_decode_string(decoder, "One one ONE OneTwo ONETwo", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
  321. assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  322. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_CAPITALIZED);
  323. assert(tokenizer_get_token_text(tokenizer) != NULL);
  324. assert(strcmp(tokenizer_get_token_text(tokenizer), "One") == 0);
  325. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  326. assert(tokenizer_get_token_text(tokenizer) != NULL);
  327. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  328. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_LOWERCASE);
  329. assert(tokenizer_get_token_text(tokenizer) != NULL);
  330. assert(strcmp(tokenizer_get_token_text(tokenizer), "one") == 0);
  331. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  332. assert(tokenizer_get_token_text(tokenizer) != NULL);
  333. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  334. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_UPPERCASE);
  335. assert(tokenizer_get_token_text(tokenizer) != NULL);
  336. assert(strcmp(tokenizer_get_token_text(tokenizer), "ONE") == 0);
  337. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  338. assert(tokenizer_get_token_text(tokenizer) != NULL);
  339. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  340. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_MIXEDCASE);
  341. assert(tokenizer_get_token_text(tokenizer) != NULL);
  342. assert(strcmp(tokenizer_get_token_text(tokenizer), "OneTwo") == 0);
  343. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  344. assert(tokenizer_get_token_text(tokenizer) != NULL);
  345. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  346. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_MIXEDCASE);
  347. assert(tokenizer_get_token_text(tokenizer) != NULL);
  348. assert(strcmp(tokenizer_get_token_text(tokenizer), "ONETwo") == 0);
  349. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  350. assert(tokenizer_get_token_text(tokenizer) != NULL);
  351. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  352. destroy_text_decoder(decoder);
  353. destroy_tokenizer(tokenizer);
  354. }
  355. void
  356. test_Latn_punctuation_tokens()
  357. {
  358. printf("testing Latin (Latn) script punctuation tokens\n");
  359. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  360. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  361. assert(text_decoder_decode_string(decoder, ". ?", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
  362. assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);
  363. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
  364. assert(tokenizer_get_token_text(tokenizer) != NULL);
  365. assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);
  366. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
  367. assert(tokenizer_get_token_text(tokenizer) != NULL);
  368. assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
  369. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_QUESTION_MARK);
  370. assert(tokenizer_get_token_text(tokenizer) != NULL);
  371. assert(strcmp(tokenizer_get_token_text(tokenizer), "?") == 0);
  372. assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
  373. assert(tokenizer_get_token_text(tokenizer) != NULL);
  374. assert(*tokenizer_get_token_text(tokenizer) == '\0');
  375. destroy_text_decoder(decoder);
  376. destroy_tokenizer(tokenizer);
  377. }
  378. void
  379. run_tests()
  380. {
  381. test_latin_common();
  382. test_greek();
  383. test_armenian();
  384. test_arabic();
  385. test_devanagari();
  386. test_tibetan();
  387. test_sinhala();
  388. test_georgian();
  389. test_ethiopic();
  390. test_ideographic();
  391. test_fullwidth();
  392. test_unbound_tokenizer();
  393. test_linux_newline_tokens();
  394. test_mac_newline_tokens();
  395. test_windows_newline_tokens();
  396. test_unicode_newline_tokens();
  397. test_paragraph_tokens();
  398. test_whitespace_tokens();
  399. test_Latn_word_tokens();
  400. test_Latn_punctuation_tokens();
  401. printf("done\n");
  402. }
  403. void
  404. escape_newline(const char *s)
  405. {
  406. for ( ; *s; ++s) switch (*s)
  407. {
  408. case '\r': printf("\\r"); break;
  409. case '\n': printf("\\n"); break;
  410. default: putc(*s, stdout); break;
  411. }
  412. }
  413. void
  414. print_tokens(espeak_ng_TEXT_DECODER *decoder)
  415. {
  416. espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
  417. if (!tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT)) {
  418. destroy_tokenizer(tokenizer);
  419. return;
  420. }
  421. while (1) switch (tokenizer_read_next_token(tokenizer))
  422. {
  423. case ESPEAKNG_TOKEN_END_OF_BUFFER:
  424. destroy_tokenizer(tokenizer);
  425. return;
  426. case ESPEAKNG_TOKEN_UNKNOWN:
  427. printf("unknown : %s\n", tokenizer_get_token_text(tokenizer));
  428. break;
  429. case ESPEAKNG_TOKEN_NEWLINE:
  430. printf("newline : ");
  431. escape_newline(tokenizer_get_token_text(tokenizer));
  432. putc('\n', stdout);
  433. break;
  434. case ESPEAKNG_TOKEN_PARAGRAPH:
  435. printf("paragraph : %s\n", tokenizer_get_token_text(tokenizer));
  436. break;
  437. case ESPEAKNG_TOKEN_WHITESPACE:
  438. printf("whitespace : %s\n", tokenizer_get_token_text(tokenizer));
  439. break;
  440. case ESPEAKNG_TOKEN_WORD_UPPERCASE:
  441. printf("word (upper case) : %s\n", tokenizer_get_token_text(tokenizer));
  442. break;
  443. case ESPEAKNG_TOKEN_WORD_LOWERCASE:
  444. printf("word (lower case) : %s\n", tokenizer_get_token_text(tokenizer));
  445. break;
  446. case ESPEAKNG_TOKEN_WORD_MIXEDCASE:
  447. printf("word (mixed case) : %s\n", tokenizer_get_token_text(tokenizer));
  448. break;
  449. case ESPEAKNG_TOKEN_WORD_CAPITALIZED:
  450. printf("word (capitalized) : %s\n", tokenizer_get_token_text(tokenizer));
  451. break;
  452. case ESPEAKNG_TOKEN_FULL_STOP:
  453. printf("full stop : %s\n", tokenizer_get_token_text(tokenizer));
  454. break;
  455. case ESPEAKNG_TOKEN_QUESTION_MARK:
  456. printf("question mark : %s\n", tokenizer_get_token_text(tokenizer));
  457. break;
  458. }
  459. }
  460. void
  461. print_tokens_from_file(const char *filename, const char *encoding_name)
  462. {
  463. espeak_ng_ENCODING encoding = espeak_ng_EncodingFromName(encoding_name);
  464. if (encoding == ESPEAKNG_ENCODING_UNKNOWN) {
  465. printf("Unknown encoding \"%s\".\n", encoding_name);
  466. return;
  467. }
  468. int length = GetFileLength(filename);
  469. FILE *f = (length > 0) ? fopen(filename, "rb") : NULL;
  470. if (!f) {
  471. printf("Cannot open file: %s\n", filename);
  472. return;
  473. }
  474. char *buffer = malloc(length);
  475. if (!buffer) {
  476. fclose(f);
  477. printf("Out of memory!\n");
  478. return;
  479. }
  480. fread(buffer, 1, length, f);
  481. fclose(f);
  482. espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
  483. if (text_decoder_decode_string(decoder, buffer, length, encoding) == ENS_OK)
  484. print_tokens(decoder);
  485. destroy_text_decoder(decoder);
  486. }
  487. void
  488. usage(const char *program)
  489. {
  490. printf("%s -- Run the tokenizer tests.\n", program);
  491. printf("%s ENCODING FILENAME -- Print the tokens for FILENAME.\n", program);
  492. }
  493. int
  494. main(int argc, char **argv)
  495. {
  496. switch (argc)
  497. {
  498. case 1: run_tests(); break;
  499. case 3: print_tokens_from_file(argv[2], argv[1]); break;
  500. default: usage(argv[0]); return EXIT_FAILURE;
  501. }
  502. return EXIT_SUCCESS;
  503. }