eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.

tokenizer.c

/*
 * Copyright (C) 2017 Reece H. Dunn
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see:
 * <http://www.gnu.org/licenses/>.
 */
#include "config.h"

#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <sys/stat.h>

#include <espeak-ng/espeak_ng.h>
#include <espeak-ng/encoding.h>
#include <espeak-ng/tokenizer.h>

#include "speech.h"
#include "phoneme.h"
#include "synthesize.h"
#include "translate.h"

// TODO: Find a better place for this than speech.c, so it can be implemented
// in one place without having to include all of speech.c.
int GetFileLength(const char *filename)
{
	struct stat statbuf;

	if (stat(filename, &statbuf) != 0)
		return -errno;

	if (S_ISDIR(statbuf.st_mode))
		return -EISDIR;

	return statbuf.st_size;
}
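
// Note: GetFileLength folds errors into its return value -- a non-negative
// result is the file size in bytes, while a negative result is -errno (for
// example -ENOENT for a missing file, or -EISDIR for a directory). The caller
// below (print_tokens_from_file) treats any value <= 0 as "not a readable
// file". A minimal sketch of the convention (illustrative only, not part of
// the test suite; the file name is a placeholder):
//
//     int length = GetFileLength("example.txt");
//     if (length < 0)
//         fprintf(stderr, "error: %s\n", strerror(-length));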

void
test_latin_common()
{
	printf("testing Latin/Common (Latn/Zyyy) script classification\n");

	assert(clause_type_from_codepoint('a') == CLAUSE_NONE);
	assert(clause_type_from_codepoint('.') == CLAUSE_PERIOD);
	assert(clause_type_from_codepoint('?') == CLAUSE_QUESTION);
	assert(clause_type_from_codepoint('!') == CLAUSE_EXCLAMATION);
	assert(clause_type_from_codepoint(',') == CLAUSE_COMMA);
	assert(clause_type_from_codepoint(':') == CLAUSE_COLON);
	assert(clause_type_from_codepoint(';') == CLAUSE_SEMICOLON);

	assert(clause_type_from_codepoint(0x00A1) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER)); // INVERTED EXCLAMATION MARK
	assert(clause_type_from_codepoint(0x00BF) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER)); // INVERTED QUESTION MARK
	assert(clause_type_from_codepoint(0x2013) == CLAUSE_SEMICOLON); // EN DASH
	assert(clause_type_from_codepoint(0x2014) == CLAUSE_SEMICOLON); // EM DASH
	assert(clause_type_from_codepoint(0x2026) == (CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER)); // HORIZONTAL ELLIPSIS
}
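
// clause_type_from_codepoint returns a bitmask: a base clause type
// (CLAUSE_PERIOD, CLAUSE_COMMA, CLAUSE_QUESTION, ...) optionally OR'd with
// the behaviour flags seen above -- CLAUSE_OPTIONAL_SPACE_AFTER,
// CLAUSE_PUNCTUATION_IN_WORD and CLAUSE_SPEAK_PUNCTUATION_NAME. The
// script-specific tests below assert the full combined value, so they pin
// down both the clause type and its flags for each codepoint.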

void
test_greek()
{
	printf("testing Greek (Grek) script classification\n");

	assert(clause_type_from_codepoint(0x037E) == CLAUSE_QUESTION); // GREEK QUESTION MARK
	assert(clause_type_from_codepoint(0x0387) == CLAUSE_SEMICOLON); // GREEK ANO TELEIA
}

void
test_armenian()
{
	printf("testing Armenian (Armn) script classification\n");

	assert(clause_type_from_codepoint(0x055B) == (CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD)); // ARMENIAN EMPHASIS MARK
	assert(clause_type_from_codepoint(0x055C) == (CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD)); // ARMENIAN EXCLAMATION MARK
	assert(clause_type_from_codepoint(0x055D) == CLAUSE_COMMA); // ARMENIAN COMMA
	assert(clause_type_from_codepoint(0x055E) == (CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD)); // ARMENIAN QUESTION MARK
	assert(clause_type_from_codepoint(0x0589) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); // ARMENIAN FULL STOP
}

void
test_arabic()
{
	printf("testing Arabic (Arab) script classification\n");

	assert(clause_type_from_codepoint(0x060C) == CLAUSE_COMMA); // ARABIC COMMA
	assert(clause_type_from_codepoint(0x061B) == CLAUSE_SEMICOLON); // ARABIC SEMICOLON
	assert(clause_type_from_codepoint(0x061F) == CLAUSE_QUESTION); // ARABIC QUESTION MARK
	assert(clause_type_from_codepoint(0x06D4) == CLAUSE_PERIOD); // ARABIC FULL STOP
}

void
test_devanagari()
{
	printf("testing Devanagari (Deva) script classification\n");

	assert(clause_type_from_codepoint(0x0964) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); // DEVANAGARI DANDA
}

void
test_tibetan()
{
	printf("testing Tibetan (Tibt) script classification\n");

	assert(clause_type_from_codepoint(0x0F0D) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); // TIBETAN MARK SHAD
	assert(clause_type_from_codepoint(0x0F0E) == CLAUSE_PARAGRAPH); // TIBETAN MARK NYIS SHAD
}

void
test_sinhala()
{
	printf("testing Sinhala (Sinh) script classification\n");

	assert(clause_type_from_codepoint(0x0DF4) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); // SINHALA PUNCTUATION KUNDDALIYA
}

void
test_georgian()
{
	printf("testing Georgian (Geor) script classification\n");

	assert(clause_type_from_codepoint(0x10FB) == CLAUSE_PARAGRAPH); // GEORGIAN PARAGRAPH SEPARATOR
}

void
test_ethiopic()
{
	printf("testing Ethiopic (Ethi) script classification\n");

	assert(clause_type_from_codepoint(0x1362) == CLAUSE_PERIOD); // ETHIOPIC FULL STOP
	assert(clause_type_from_codepoint(0x1363) == CLAUSE_COMMA); // ETHIOPIC COMMA
	assert(clause_type_from_codepoint(0x1364) == CLAUSE_SEMICOLON); // ETHIOPIC SEMICOLON
	assert(clause_type_from_codepoint(0x1365) == CLAUSE_COLON); // ETHIOPIC COLON
	assert(clause_type_from_codepoint(0x1366) == CLAUSE_COLON); // ETHIOPIC PREFACE COLON
	assert(clause_type_from_codepoint(0x1367) == CLAUSE_QUESTION); // ETHIOPIC QUESTION MARK
	assert(clause_type_from_codepoint(0x1368) == CLAUSE_PARAGRAPH); // ETHIOPIC PARAGRAPH SEPARATOR
}

void
test_ideographic()
{
	printf("testing Ideographic (Hani) script classification\n");

	assert(clause_type_from_codepoint(0x3001) == (CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER)); // IDEOGRAPHIC COMMA
	assert(clause_type_from_codepoint(0x3002) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); // IDEOGRAPHIC FULL STOP
}

void
test_fullwidth()
{
	printf("testing Full Width/Common (Zyyy) script classification\n");

	assert(clause_type_from_codepoint(0xFF01) == (CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER)); // FULLWIDTH EXCLAMATION MARK
	assert(clause_type_from_codepoint(0xFF0C) == (CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER)); // FULLWIDTH COMMA
	assert(clause_type_from_codepoint(0xFF0E) == (CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER)); // FULLWIDTH FULL STOP
	assert(clause_type_from_codepoint(0xFF1A) == (CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER)); // FULLWIDTH COLON
	assert(clause_type_from_codepoint(0xFF1B) == (CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER)); // FULLWIDTH SEMICOLON
	assert(clause_type_from_codepoint(0xFF1F) == (CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER)); // FULLWIDTH QUESTION MARK
}

void
test_unbound_tokenizer()
{
	printf("testing unbound tokenizer\n");

	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	assert(tokenizer != NULL);

	// An unbound tokenizer has no text: the token text is empty and reading
	// only ever reports end-of-buffer.
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(*tokenizer_get_token_text(tokenizer) == '\0');

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(*tokenizer_get_token_text(tokenizer) == '\0');

	// Resetting with a NULL decoder succeeds, but leaves the tokenizer unbound.
	assert(tokenizer_reset(tokenizer, NULL, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(*tokenizer_get_token_text(tokenizer) == '\0');

	destroy_tokenizer(tokenizer);
}
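
// The tests below all follow the same lifecycle, which is the usage pattern
// for the tokenizer API as exercised in this file: create a tokenizer and a
// text decoder, decode a string into the decoder, bind the decoder to the
// tokenizer with tokenizer_reset, then pull tokens until END_OF_BUFFER.
// A minimal sketch (illustrative only; error handling omitted):
//
//     espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
//     espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
//     text_decoder_decode_string(decoder, "text", -1, ESPEAKNG_ENCODING_US_ASCII);
//     tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT);
//     while (tokenizer_read_next_token(tokenizer) != ESPEAKNG_TOKEN_END_OF_BUFFER)
//         puts(tokenizer_get_token_text(tokenizer));
//     destroy_text_decoder(decoder);
//     destroy_tokenizer(tokenizer);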

void
test_linux_newline_tokens()
{
	printf("testing linux newline tokens\n");

	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

	assert(text_decoder_decode_string(decoder, "\n\n", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
	assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(*tokenizer_get_token_text(tokenizer) == '\0');

	destroy_text_decoder(decoder);
	destroy_tokenizer(tokenizer);
}

void
test_mac_newline_tokens()
{
	printf("testing mac newline tokens\n");

	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

	assert(text_decoder_decode_string(decoder, "\r\r", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
	assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\r") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\r") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(*tokenizer_get_token_text(tokenizer) == '\0');

	destroy_text_decoder(decoder);
	destroy_tokenizer(tokenizer);
}

void
test_windows_newline_tokens()
{
	printf("testing windows newline tokens\n");

	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

	assert(text_decoder_decode_string(decoder, "\r\n\r\n", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
	assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\r\n") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\r\n") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(*tokenizer_get_token_text(tokenizer) == '\0');

	destroy_text_decoder(decoder);
	destroy_tokenizer(tokenizer);
}

void
test_unicode_newline_tokens()
{
	printf("testing unicode newline tokens\n");

	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

	assert(text_decoder_decode_string(decoder, "\x0C\x0C\xC2\x85\xC2\x85\xE2\x80\xA8\xE2\x80\xA8", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
	assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

	// FORM FEED (FF) -- Used as a page (not paragraph) break.
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0C") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0C") == 0);

	// NEXT LINE (NEL) [U+0085] -- Used in EBCDIC systems as a combined CR+LF character.
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\x85") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\x85") == 0);

	// General Category: Zl -- LINE SEPARATOR [U+2028]
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA8") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA8") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(*tokenizer_get_token_text(tokenizer) == '\0');

	destroy_text_decoder(decoder);
	destroy_tokenizer(tokenizer);
}
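
// Taken together, the newline tests above show which sequences produce an
// ESPEAKNG_TOKEN_NEWLINE token: LF ("\n"), CR ("\r"), the Windows pair CR+LF
// (consumed as a single two-character token), FORM FEED (U+000C), NEXT LINE
// (U+0085), and LINE SEPARATOR (U+2028).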

void
test_paragraph_tokens()
{
	printf("testing paragraph tokens\n");

	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

	assert(text_decoder_decode_string(decoder, "\xE2\x80\xA9\xE2\x80\xA9", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
	assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

	// General Category: Zp, PARAGRAPH SEPARATOR [U+2029]
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PARAGRAPH);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA9") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PARAGRAPH);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA9") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(*tokenizer_get_token_text(tokenizer) == '\0');

	destroy_text_decoder(decoder);
	destroy_tokenizer(tokenizer);
}

void
test_whitespace_tokens()
{
	printf("testing whitespace tokens\n");

	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

	assert(text_decoder_decode_string(decoder, "\t\t\n\x0B\x0B\n \xE3\x80\x80 \n\xC2\xA0\xC2\xA0", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
	assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

	// General Category: Cc, Property: White_Space
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\t\t") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

	// General Category: Cc, Property: White_Space, VERTICAL TAB (VT) -- Not treated as newline tokens.
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0B\x0B") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

	// General Category: Zs, Property: White_Space
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " \xE3\x80\x80 ") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

	// General Category: Zs, Property: White_Space, Decomposition: <noBreak>, NO-BREAK SPACE [U+00A0]
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xA0\xC2\xA0") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(*tokenizer_get_token_text(tokenizer) == '\0');

	destroy_text_decoder(decoder);
	destroy_tokenizer(tokenizer);
}
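
// Note the grouping in the whitespace test above: each run of consecutive
// white-space characters between newlines ("\t\t", "\x0B\x0B", the
// space/ideographic-space run, the two no-break spaces) comes back as a
// single ESPEAKNG_TOKEN_WHITESPACE token, while every "\n" is reported
// separately as a newline token. Vertical tab (U+000B) counts as plain
// whitespace here, not as a newline.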

void
test_Latn_word_tokens()
{
	printf("testing Latin (Latn) script word tokens\n");

	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

	assert(text_decoder_decode_string(decoder, "One one ONE OneTwo ONETwo", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
	assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_CAPITALIZED);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "One") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_LOWERCASE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "one") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_UPPERCASE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "ONE") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_MIXEDCASE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "OneTwo") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_MIXEDCASE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "ONETwo") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(*tokenizer_get_token_text(tokenizer) == '\0');

	destroy_text_decoder(decoder);
	destroy_tokenizer(tokenizer);
}
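
// The word tokens above are classified purely by letter case: "One" is
// WORD_CAPITALIZED (initial capital only), "one" is WORD_LOWERCASE, "ONE" is
// WORD_UPPERCASE, and any other mix of cases ("OneTwo", "ONETwo") is
// WORD_MIXEDCASE.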

void
test_Latn_punctuation_tokens()
{
	printf("testing Latin (Latn) script punctuation tokens\n");

	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

	assert(text_decoder_decode_string(decoder, ". ? .. ! ... , .... : ; \xE2\x80\xA6", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
	assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_QUESTION_MARK);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "?") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_EXCLAMATION_MARK);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "!") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_ELLIPSIS);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "...") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_COMMA);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), ",") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_ELLIPSIS);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "...") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_COLON);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), ":") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_SEMICOLON);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), ";") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	// HORIZONTAL ELLIPSIS [U+2026]
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_ELLIPSIS);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA6") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(*tokenizer_get_token_text(tokenizer) == '\0');

	destroy_text_decoder(decoder);
	destroy_tokenizer(tokenizer);
}
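
// Runs of full stops are worth spelling out: ".." is tokenized as two
// separate ESPEAKNG_TOKEN_FULL_STOP tokens, "..." as a single
// ESPEAKNG_TOKEN_ELLIPSIS, and "...." as an ellipsis followed by a full
// stop. U+2026 HORIZONTAL ELLIPSIS is likewise reported as an ellipsis token.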

void
test_Latn_general_punctuation_tokens()
{
	printf("testing Latin (Latn) script general punctuation tokens\n");

	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

	assert(text_decoder_decode_string(decoder, "\" () - _ \xC2\xAB\xC2\xBB", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
	assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

	// General Category: Po
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\"") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	// General Category: Ps
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "(") == 0);

	// General Category: Pe
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), ")") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	// General Category: Pd
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "-") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	// General Category: Pc
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "_") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	// General Category: Pi, LEFT-POINTING DOUBLE ANGLE QUOTATION MARK [U+00AB]
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xAB") == 0);

	// General Category: Pf, RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK [U+00BB]
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xBB") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(*tokenizer_get_token_text(tokenizer) == '\0');

	destroy_text_decoder(decoder);
	destroy_tokenizer(tokenizer);
}

void
test_Latn_symbol_tokens()
{
	printf("testing Latin (Latn) script symbol tokens\n");

	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

	assert(text_decoder_decode_string(decoder, "$ ^ + \xC2\xA9", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
	assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

	// General Category: Sc
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_SYMBOL);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "$") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	// General Category: Sk
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_SYMBOL);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "^") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	// General Category: Sm
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_SYMBOL);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "+") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	// General Category: So, COPYRIGHT SIGN [U+00A9]
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_SYMBOL);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xA9") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(*tokenizer_get_token_text(tokenizer) == '\0');

	destroy_text_decoder(decoder);
	destroy_tokenizer(tokenizer);
}
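
// Summarizing the two tests above: every Unicode punctuation general
// category exercised here (Po, Ps, Pe, Pd, Pc, Pi, Pf) maps to
// ESPEAKNG_TOKEN_PUNCTUATION, and every symbol category (Sc, Sk, Sm, So)
// maps to ESPEAKNG_TOKEN_SYMBOL; the specific character is recovered via
// tokenizer_get_token_text.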

void
run_tests()
{
	test_latin_common();
	test_greek();
	test_armenian();
	test_arabic();
	test_devanagari();
	test_tibetan();
	test_sinhala();
	test_georgian();
	test_ethiopic();
	test_ideographic();
	test_fullwidth();

	test_unbound_tokenizer();

	test_linux_newline_tokens();
	test_mac_newline_tokens();
	test_windows_newline_tokens();
	test_unicode_newline_tokens();
	test_paragraph_tokens();
	test_whitespace_tokens();

	test_Latn_word_tokens();
	test_Latn_punctuation_tokens();
	test_Latn_general_punctuation_tokens();
	test_Latn_symbol_tokens();

	printf("done\n");
}

void
escape_newline(const char *s)
{
	for ( ; *s; ++s) switch (*s)
	{
	case '\r': printf("\\r"); break;
	case '\n': printf("\\n"); break;
	default:   putc(*s, stdout); break;
	}
}
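
// For example, escape_newline("\r\n") writes the four characters "\r\n" to
// stdout instead of an actual line break, which keeps the newline entries in
// the token trace below on a single line each.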

void
print_tokens(espeak_ng_TEXT_DECODER *decoder)
{
	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	if (!tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT)) {
		destroy_tokenizer(tokenizer);
		return;
	}

	while (1) switch (tokenizer_read_next_token(tokenizer))
	{
	case ESPEAKNG_TOKEN_END_OF_BUFFER:
		destroy_tokenizer(tokenizer);
		return;
	case ESPEAKNG_TOKEN_UNKNOWN:
		printf("unknown : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_NEWLINE:
		printf("newline : ");
		escape_newline(tokenizer_get_token_text(tokenizer));
		putc('\n', stdout);
		break;
	case ESPEAKNG_TOKEN_PARAGRAPH:
		printf("paragraph : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_WHITESPACE:
		printf("whitespace : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_WORD_UPPERCASE:
		printf("word (upper case) : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_WORD_LOWERCASE:
		printf("word (lower case) : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_WORD_MIXEDCASE:
		printf("word (mixed case) : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_WORD_CAPITALIZED:
		printf("word (capitalized) : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_FULL_STOP:
		printf("full stop : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_QUESTION_MARK:
		printf("question mark : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_EXCLAMATION_MARK:
		printf("exclamation mark : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_COMMA:
		printf("comma : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_COLON:
		printf("colon : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_SEMICOLON:
		printf("semicolon : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	// These token types are produced by the tests above but were previously
	// missing from this switch, so they would be dropped from the trace.
	case ESPEAKNG_TOKEN_ELLIPSIS:
		printf("ellipsis : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_PUNCTUATION:
		printf("punctuation : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	case ESPEAKNG_TOKEN_SYMBOL:
		printf("symbol : %s\n", tokenizer_get_token_text(tokenizer));
		break;
	}
}

void
print_tokens_from_file(const char *filename, const char *encoding_name)
{
	espeak_ng_ENCODING encoding = espeak_ng_EncodingFromName(encoding_name);
	if (encoding == ESPEAKNG_ENCODING_UNKNOWN) {
		printf("Unknown encoding \"%s\".\n", encoding_name);
		return;
	}

	int length = GetFileLength(filename);
	FILE *f = (length > 0) ? fopen(filename, "rb") : NULL;
	if (!f) {
		printf("Cannot open file: %s\n", filename);
		return;
	}

	char *buffer = malloc(length);
	if (!buffer) {
		fclose(f);
		printf("Out of memory!\n");
		return;
	}

	// Check that the whole file was read; the result of fread was previously
	// ignored.
	size_t bytes_read = fread(buffer, 1, length, f);
	fclose(f);
	if (bytes_read != (size_t)length) {
		free(buffer);
		printf("Error reading file: %s\n", filename);
		return;
	}

	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
	if (text_decoder_decode_string(decoder, buffer, length, encoding) == ENS_OK)
		print_tokens(decoder);
	destroy_text_decoder(decoder);
	free(buffer); // the buffer was previously leaked
}

void
usage(const char *program)
{
	printf("%s -- Run the tokenizer tests.\n", program);
	printf("%s ENCODING FILENAME -- Print the tokens for FILENAME.\n", program);
}
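
// Example invocations (the encoding is looked up via
// espeak_ng_EncodingFromName, so it must be a name that function accepts,
// e.g. "UTF-8"; the binary and file names below are illustrative):
//
//     tokenizer
//     tokenizer UTF-8 sample.txt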

int
main(int argc, char **argv)
{
	switch (argc)
	{
	case 1:  run_tests(); break;
	case 3:  print_tokens_from_file(argv[2], argv[1]); break;
	default: usage(argv[0]); return EXIT_FAILURE;
	}
	return EXIT_SUCCESS;
}