eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.

tokenizer.c 26KB

/*
 * Copyright (C) 2017 Reece H. Dunn
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see:
 * <http://www.gnu.org/licenses/>.
 */
#include "config.h"

#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#include <sys/stat.h>

#include <espeak-ng/espeak_ng.h>
#include <espeak-ng/encoding.h>
#include <espeak-ng/tokenizer.h>

#include "speech.h"
#include "phoneme.h"
#include "synthesize.h"
#include "translate.h"

// TODO: Find a better place for this than speech.c, so it can be implemented
// in one place without having to include all of speech.c.
int GetFileLength(const char *filename)
{
    struct stat statbuf;

    if (stat(filename, &statbuf) != 0)
        return -errno;

    if (S_ISDIR(statbuf.st_mode))
        return -EISDIR;

    return statbuf.st_size;
}
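// A tokenizer that is not bound to a text decoder should behave as an empty
// stream: reads report END_OF_BUFFER and the token text stays empty, both
// before and after a reset with a NULL decoder.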
void
test_unbound_tokenizer()
{
    printf("testing unbound tokenizer\n");

    espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
    assert(tokenizer != NULL);

    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(*tokenizer_get_token_text(tokenizer) == '\0');

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(*tokenizer_get_token_text(tokenizer) == '\0');

    assert(tokenizer_reset(tokenizer, NULL, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(*tokenizer_get_token_text(tokenizer) == '\0');

    destroy_tokenizer(tokenizer);
}
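// The next three tests check that each platform's line-ending convention --
// "\n" (Linux), "\r" (classic Mac), and "\r\n" (Windows) -- is reported as a
// single NEWLINE token.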
void
test_linux_newline_tokens()
{
    printf("testing linux newline tokens\n");

    espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
    espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

    assert(text_decoder_decode_string(decoder, "\n\n", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
    assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(*tokenizer_get_token_text(tokenizer) == '\0');

    destroy_text_decoder(decoder);
    destroy_tokenizer(tokenizer);
}

void
test_mac_newline_tokens()
{
    printf("testing mac newline tokens\n");

    espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
    espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

    assert(text_decoder_decode_string(decoder, "\r\r", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
    assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\r") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\r") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(*tokenizer_get_token_text(tokenizer) == '\0');

    destroy_text_decoder(decoder);
    destroy_tokenizer(tokenizer);
}

void
test_windows_newline_tokens()
{
    printf("testing windows newline tokens\n");

    espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
    espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

    assert(text_decoder_decode_string(decoder, "\r\n\r\n", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
    assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\r\n") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\r\n") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(*tokenizer_get_token_text(tokenizer) == '\0');

    destroy_text_decoder(decoder);
    destroy_tokenizer(tokenizer);
}
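// Unicode defines additional line-break characters beyond CR and LF; each of
// these should also produce a NEWLINE token.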
void
test_unicode_newline_tokens()
{
    printf("testing unicode newline tokens\n");

    espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
    espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

    assert(text_decoder_decode_string(decoder, "\x0C\x0C\xC2\x85\xC2\x85\xE2\x80\xA8\xE2\x80\xA8", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
    assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

    // FORM FEED (FF) -- Used as a page (not paragraph) break.
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0C") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0C") == 0);

    // NEXT LINE (NEL) [U+0085] -- Used in EBCDIC systems as a combined CR+LF character.
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\x85") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\x85") == 0);

    // General Category: Zl -- LINE SEPARATOR [U+2028]
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA8") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA8") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(*tokenizer_get_token_text(tokenizer) == '\0');

    destroy_text_decoder(decoder);
    destroy_tokenizer(tokenizer);
}

void
test_paragraph_tokens()
{
    printf("testing paragraph tokens\n");

    espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
    espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

    assert(text_decoder_decode_string(decoder, "\xE2\x80\xA9\xE2\x80\xA9", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
    assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

    // General Category: Zp, PARAGRAPH SEPARATOR [U+2029]
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PARAGRAPH);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA9") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PARAGRAPH);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA9") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(*tokenizer_get_token_text(tokenizer) == '\0');

    destroy_text_decoder(decoder);
    destroy_tokenizer(tokenizer);
}

void
test_whitespace_tokens()
{
    printf("testing whitespace tokens\n");

    espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
    espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

    assert(text_decoder_decode_string(decoder, "\t\t\n\x0B\x0B\n \xE3\x80\x80 \n\xC2\xA0\xC2\xA0", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
    assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

    // General Category: Cc, Property: White_Space
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\t\t") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

    // General Category: Cc, Property: White_Space, VERTICAL TAB (VT) -- Not treated as newline tokens.
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0B\x0B") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

    // General Category: Zs, Property: White_Space
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " \xE3\x80\x80 ") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

    // General Category: Zs, Property: White_Space, Decomposition: <noBreak>, NO-BREAK SPACE [U+00A0]
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xA0\xC2\xA0") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(*tokenizer_get_token_text(tokenizer) == '\0');

    destroy_text_decoder(decoder);
    destroy_tokenizer(tokenizer);
}
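// Word tokens carry case information: the tokenizer distinguishes lower-case,
// capitalized, upper-case, and mixed-case words.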
void
test_Latn_word_tokens()
{
    printf("testing Latin (Latn) script word tokens\n");

    espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
    espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

    assert(text_decoder_decode_string(decoder, "One one ONE OneTwo ONETwo", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
    assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_CAPITALIZED);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "One") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_LOWERCASE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "one") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_UPPERCASE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "ONE") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_MIXEDCASE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "OneTwo") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_MIXEDCASE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "ONETwo") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(*tokenizer_get_token_text(tokenizer) == '\0');

    destroy_text_decoder(decoder);
    destroy_tokenizer(tokenizer);
}

void
test_Latn_punctuation_tokens()
{
    printf("testing Latin (Latn) script punctuation tokens\n");

    espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
    espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

    assert(text_decoder_decode_string(decoder, ". ? .. ! ... , .... : ; \xE2\x80\xA6", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
    assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_QUESTION_MARK);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "?") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_EXCLAMATION_MARK);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "!") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_ELLIPSIS);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "...") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_COMMA);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), ",") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_ELLIPSIS);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "...") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_COLON);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), ":") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_SEMICOLON);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), ";") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    // HORIZONTAL ELLIPSIS [U+2026]
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_ELLIPSIS);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA6") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(*tokenizer_get_token_text(tokenizer) == '\0');

    destroy_text_decoder(decoder);
    destroy_tokenizer(tokenizer);
}

void
test_Latn_general_punctuation_tokens()
{
    printf("testing Latin (Latn) script general punctuation tokens\n");

    espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
    espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

    assert(text_decoder_decode_string(decoder, "\" () - _ \xC2\xAB\xC2\xBB", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
    assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

    // General Category: Po
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\"") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    // General Category: Ps
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "(") == 0);

    // General Category: Pe
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), ")") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    // General Category: Pd
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "-") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    // General Category: Pc
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "_") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    // General Category: Pi, LEFT-POINTING DOUBLE ANGLE QUOTATION MARK [U+00AB]
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xAB") == 0);

    // General Category: Pf, RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK [U+00BB]
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PUNCTUATION);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xBB") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(*tokenizer_get_token_text(tokenizer) == '\0');

    destroy_text_decoder(decoder);
    destroy_tokenizer(tokenizer);
}

void
test_Latn_symbol_tokens()
{
    printf("testing Latin (Latn) script symbol tokens\n");

    espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
    espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

    assert(text_decoder_decode_string(decoder, "$ ^ + \xC2\xA9", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
    assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

    // General Category: Sc
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_SYMBOL);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "$") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    // General Category: Sk
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_SYMBOL);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "^") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    // General Category: Sm
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_SYMBOL);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "+") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

    // General Category: So, COPYRIGHT SIGN [U+00A9]
    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_SYMBOL);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xA9") == 0);

    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
    assert(tokenizer_get_token_text(tokenizer) != NULL);
    assert(*tokenizer_get_token_text(tokenizer) == '\0');

    destroy_text_decoder(decoder);
    destroy_tokenizer(tokenizer);
}
void
run_tests()
{
    test_unbound_tokenizer();

    test_linux_newline_tokens();
    test_mac_newline_tokens();
    test_windows_newline_tokens();
    test_unicode_newline_tokens();

    test_paragraph_tokens();
    test_whitespace_tokens();

    test_Latn_word_tokens();
    test_Latn_punctuation_tokens();
    test_Latn_general_punctuation_tokens();
    test_Latn_symbol_tokens();
}
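// Print a string with carriage return and line feed shown as the escape
// sequences "\r" and "\n", so that newline tokens are visible in the output.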
void
escape_newline(const char *s)
{
    for ( ; *s; ++s) switch (*s)
    {
    case '\r': printf("\\r"); break;
    case '\n': printf("\\n"); break;
    default:   putc(*s, stdout); break;
    }
}
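// Read tokens from the decoder until the buffer is exhausted, printing one
// labelled line per token.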
void
print_tokens(espeak_ng_TEXT_DECODER *decoder)
{
    espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
    if (!tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT)) {
        destroy_tokenizer(tokenizer);
        return;
    }

    while (1) switch (tokenizer_read_next_token(tokenizer))
    {
    case ESPEAKNG_TOKEN_END_OF_BUFFER:
        destroy_tokenizer(tokenizer);
        return;
    case ESPEAKNG_TOKEN_UNKNOWN:
        printf("unknown : %s\n", tokenizer_get_token_text(tokenizer));
        break;
    case ESPEAKNG_TOKEN_NEWLINE:
        printf("newline : ");
        escape_newline(tokenizer_get_token_text(tokenizer));
        putc('\n', stdout);
        break;
    case ESPEAKNG_TOKEN_PARAGRAPH:
        printf("paragraph : %s\n", tokenizer_get_token_text(tokenizer));
        break;
    case ESPEAKNG_TOKEN_WHITESPACE:
        printf("whitespace : %s\n", tokenizer_get_token_text(tokenizer));
        break;
    case ESPEAKNG_TOKEN_WORD_UPPERCASE:
        printf("word (upper case) : %s\n", tokenizer_get_token_text(tokenizer));
        break;
    case ESPEAKNG_TOKEN_WORD_LOWERCASE:
        printf("word (lower case) : %s\n", tokenizer_get_token_text(tokenizer));
        break;
    case ESPEAKNG_TOKEN_WORD_MIXEDCASE:
        printf("word (mixed case) : %s\n", tokenizer_get_token_text(tokenizer));
        break;
    case ESPEAKNG_TOKEN_WORD_CAPITALIZED:
        printf("word (capitalized) : %s\n", tokenizer_get_token_text(tokenizer));
        break;
    case ESPEAKNG_TOKEN_FULL_STOP:
        printf("full stop : %s\n", tokenizer_get_token_text(tokenizer));
        break;
    case ESPEAKNG_TOKEN_QUESTION_MARK:
        printf("question mark : %s\n", tokenizer_get_token_text(tokenizer));
        break;
    case ESPEAKNG_TOKEN_EXCLAMATION_MARK:
        printf("exclamation mark : %s\n", tokenizer_get_token_text(tokenizer));
        break;
    case ESPEAKNG_TOKEN_COMMA:
        printf("comma : %s\n", tokenizer_get_token_text(tokenizer));
        break;
    case ESPEAKNG_TOKEN_COLON:
        printf("colon : %s\n", tokenizer_get_token_text(tokenizer));
        break;
    case ESPEAKNG_TOKEN_SEMICOLON:
        printf("semicolon : %s\n", tokenizer_get_token_text(tokenizer));
        break;
    }
}
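// Load FILENAME into memory and print its tokens, decoding the bytes with the
// named character encoding.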
void
print_tokens_from_file(const char *filename, const char *encoding_name)
{
    espeak_ng_ENCODING encoding = espeak_ng_EncodingFromName(encoding_name);
    if (encoding == ESPEAKNG_ENCODING_UNKNOWN) {
        printf("Unknown encoding \"%s\".\n", encoding_name);
        return;
    }

    int length = GetFileLength(filename);
    FILE *f = (length > 0) ? fopen(filename, "rb") : NULL;
    if (!f) {
        printf("Cannot open file: %s\n", filename);
        return;
    }

    char *buffer = malloc(length);
    if (!buffer) {
        fclose(f);
        printf("Out of memory!\n");
        return;
    }

    size_t read = fread(buffer, 1, length, f);
    fclose(f);
    if (read != (size_t)length) {
        free(buffer);
        printf("Cannot read file: %s\n", filename);
        return;
    }

    espeak_ng_TEXT_DECODER *decoder = create_text_decoder();
    if (text_decoder_decode_string(decoder, buffer, length, encoding) == ENS_OK)
        print_tokens(decoder);

    destroy_text_decoder(decoder);
    free(buffer);
}
void
usage(const char *program)
{
    printf("%s -- Run the tokenizer tests.\n", program);
    printf("%s ENCODING FILENAME -- Print the tokens for FILENAME.\n", program);
}

int
main(int argc, char **argv)
{
    switch (argc)
    {
    case 1:  run_tests(); break;
    case 3:  print_tokens_from_file(argv[2], argv[1]); break;
    default: usage(argv[0]); return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}