eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

tokenizer.c 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380
  1. /* Tokenizer APIs.
  2. *
  3. * Copyright (C) 2017 Reece H. Dunn
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 3 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, see: <http://www.gnu.org/licenses/>.
  17. */
  18. #include "config.h"
  19. #include <errno.h>
  20. #include <stdint.h>
  21. #include <stdio.h>
  22. #include <stdlib.h>
  23. #include <string.h>
  24. #include <espeak-ng/espeak_ng.h>
  25. #include <ucd/ucd.h>
  26. #include "encoding.h"
  27. #include "tokenizer.h"
  28. #include "speech.h"
  29. #include "phoneme.h"
  30. #include "synthesize.h"
  31. #include "translate.h"
  32. #define ESPEAKNG_CLAUSE_TYPE_PROPERTY_MASK 0xFFF0000000000000ull
  33. int clause_type_from_codepoint(uint32_t c)
  34. {
  35. ucd_category cat = ucd_lookup_category(c);
  36. ucd_property props = ucd_properties(c, cat);
  37. switch (props & ESPEAKNG_CLAUSE_TYPE_PROPERTY_MASK)
  38. {
  39. case ESPEAKNG_PROPERTY_FULL_STOP:
  40. return CLAUSE_PERIOD;
  41. case ESPEAKNG_PROPERTY_FULL_STOP | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
  42. return CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER;
  43. case ESPEAKNG_PROPERTY_QUESTION_MARK:
  44. return CLAUSE_QUESTION;
  45. case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
  46. return CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER;
  47. case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_PUNCTUATION_IN_WORD:
  48. return CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD;
  49. case ESPEAKNG_PROPERTY_EXCLAMATION_MARK:
  50. return CLAUSE_EXCLAMATION;
  51. case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
  52. return CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER;
  53. case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_PUNCTUATION_IN_WORD:
  54. return CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD;
  55. case ESPEAKNG_PROPERTY_COMMA:
  56. return CLAUSE_COMMA;
  57. case ESPEAKNG_PROPERTY_COMMA | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
  58. return CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER;
  59. case ESPEAKNG_PROPERTY_COLON:
  60. return CLAUSE_COLON;
  61. case ESPEAKNG_PROPERTY_COLON | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
  62. return CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER;
  63. case ESPEAKNG_PROPERTY_SEMI_COLON:
  64. case ESPEAKNG_PROPERTY_EXTENDED_DASH:
  65. return CLAUSE_SEMICOLON;
  66. case ESPEAKNG_PROPERTY_SEMI_COLON | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
  67. case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER | ESPEAKNG_PROPERTY_INVERTED_TERMINAL_PUNCTUATION:
  68. case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER | ESPEAKNG_PROPERTY_INVERTED_TERMINAL_PUNCTUATION:
  69. return CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER;
  70. case ESPEAKNG_PROPERTY_ELLIPSIS:
  71. return CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER;
  72. case ESPEAKNG_PROPERTY_PARAGRAPH_SEPARATOR:
  73. return CLAUSE_PARAGRAPH;
  74. }
  75. return CLAUSE_NONE;
  76. }
  /*
   * Character classes used by the tokenizer to decide which kind of token a
   * codepoint starts or continues.
   */
  typedef enum {
      ESPEAKNG_CTYPE_OTHER            = 0,  // anything not classified below
      ESPEAKNG_CTYPE_CARRIAGE_RETURN  = 1,  // U+000D
      ESPEAKNG_CTYPE_NEWLINE          = 2,  // line-breaking codepoints
      ESPEAKNG_CTYPE_END_OF_STRING    = 3,  // U+0000
      ESPEAKNG_CTYPE_PARAGRAPH        = 4,  // paragraph separators
      ESPEAKNG_CTYPE_WHITESPACE       = 5,  // spaces and other white space
      ESPEAKNG_CTYPE_LOWERCASE        = 6,  // lowercase letters
      ESPEAKNG_CTYPE_UPPERCASE        = 7,  // uppercase letters
      ESPEAKNG_CTYPE_FULL_STOP        = 8,
      ESPEAKNG_CTYPE_QUESTION_MARK    = 9,
      ESPEAKNG_CTYPE_EXCLAMATION_MARK = 10,
      ESPEAKNG_CTYPE_COMMA            = 11,
      ESPEAKNG_CTYPE_COLON            = 12,
      ESPEAKNG_CTYPE_SEMICOLON        = 13,
      ESPEAKNG_CTYPE_ELLIPSIS         = 14,
  } espeakng_CTYPE;
  94. #define ESPEAKNG_CTYPE_PROPERTY_MASK 0xFE0000000000C001ull
  95. // Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm
  96. static espeakng_CTYPE codepoint_type(uint32_t c)
  97. {
  98. // 1. Detect and classify specific codepoints.
  99. switch (c)
  100. {
  101. case 0x0000: return ESPEAKNG_CTYPE_END_OF_STRING; // NULL
  102. case 0x000A: return ESPEAKNG_CTYPE_NEWLINE; // LINE FEED (LF)
  103. case 0x000C: return ESPEAKNG_CTYPE_NEWLINE; // FORM FEED (FF)
  104. case 0x000D: return ESPEAKNG_CTYPE_CARRIAGE_RETURN; // CARRIAGE RETURN (CR)
  105. case 0x0085: return ESPEAKNG_CTYPE_NEWLINE; // NEW LINE (NEL)
  106. }
  107. // 2. Classify codepoints by their Unicode General Category.
  108. ucd_category cat = ucd_lookup_category(c);
  109. switch (cat)
  110. {
  111. case UCD_CATEGORY_Lu: return ESPEAKNG_CTYPE_UPPERCASE;
  112. case UCD_CATEGORY_Ll: return ESPEAKNG_CTYPE_LOWERCASE;
  113. case UCD_CATEGORY_Zl: return ESPEAKNG_CTYPE_NEWLINE;
  114. case UCD_CATEGORY_Zp: return ESPEAKNG_CTYPE_PARAGRAPH;
  115. case UCD_CATEGORY_Zs: return ESPEAKNG_CTYPE_WHITESPACE;
  116. }
  117. // 3. Classify codepoints by their Unicode properties.
  118. ucd_property props = ucd_properties(c, cat);
  119. switch (props & ESPEAKNG_CTYPE_PROPERTY_MASK)
  120. {
  121. case UCD_PROPERTY_WHITE_SPACE:
  122. return ESPEAKNG_CTYPE_WHITESPACE;
  123. case UCD_PROPERTY_OTHER_LOWERCASE:
  124. return ESPEAKNG_CTYPE_LOWERCASE;
  125. case UCD_PROPERTY_OTHER_UPPERCASE:
  126. return ESPEAKNG_CTYPE_UPPERCASE;
  127. case ESPEAKNG_PROPERTY_FULL_STOP:
  128. return ESPEAKNG_CTYPE_FULL_STOP;
  129. case ESPEAKNG_PROPERTY_QUESTION_MARK:
  130. return ESPEAKNG_CTYPE_QUESTION_MARK;
  131. case ESPEAKNG_PROPERTY_EXCLAMATION_MARK:
  132. return ESPEAKNG_CTYPE_EXCLAMATION_MARK;
  133. case ESPEAKNG_PROPERTY_COMMA:
  134. return ESPEAKNG_CTYPE_COMMA;
  135. case ESPEAKNG_PROPERTY_COLON:
  136. return ESPEAKNG_CTYPE_COLON;
  137. case ESPEAKNG_PROPERTY_SEMI_COLON:
  138. return ESPEAKNG_CTYPE_SEMICOLON;
  139. case ESPEAKNG_PROPERTY_ELLIPSIS:
  140. return ESPEAKNG_CTYPE_ELLIPSIS;
  141. }
  142. // 4. Classify the remaining codepoints.
  143. return ESPEAKNG_CTYPE_OTHER;
  144. }
  145. #define ESPEAKNG_CODEPOINT_INVALID 0xFFFFFFFF
  146. struct espeak_ng_TOKENIZER_
  147. {
  148. espeak_ng_TEXT_DECODER *decoder;
  149. char token[256];
  150. uint32_t keepc;
  151. espeak_ng_TOKEN_TYPE (*read)(espeak_ng_TOKENIZER *tokenizer);
  152. };
  153. static espeak_ng_TOKEN_TYPE
  154. tokenizer_state_end_of_buffer(espeak_ng_TOKENIZER *tokenizer)
  155. {
  156. *tokenizer->token = '\0';
  157. return ESPEAKNG_TOKEN_END_OF_BUFFER;
  158. }
  159. static espeak_ng_TOKEN_TYPE
  160. tokenizer_read_word_token(espeak_ng_TOKENIZER *tokenizer, char *current, espeak_ng_TOKEN_TYPE type)
  161. {
  162. char *end = tokenizer->token + sizeof(tokenizer->token) - 5; // allow for UTF-8 trailing bytes
  163. int initial_state = 1;
  164. while (current < end && !text_decoder_eof(tokenizer->decoder)) {
  165. uint32_t c = text_decoder_getc(tokenizer->decoder);
  166. switch (codepoint_type(c))
  167. {
  168. case ESPEAKNG_CTYPE_LOWERCASE:
  169. current += utf8_out(c, current);
  170. switch (type)
  171. {
  172. case ESPEAKNG_TOKEN_WORD_LOWERCASE:
  173. case ESPEAKNG_TOKEN_WORD_MIXEDCASE:
  174. case ESPEAKNG_TOKEN_WORD_CAPITALIZED:
  175. break;
  176. case ESPEAKNG_TOKEN_WORD_UPPERCASE:
  177. type = initial_state
  178. ? ESPEAKNG_TOKEN_WORD_CAPITALIZED
  179. : ESPEAKNG_TOKEN_WORD_MIXEDCASE;
  180. break;
  181. }
  182. initial_state = 0;
  183. break;
  184. case ESPEAKNG_CTYPE_UPPERCASE:
  185. current += utf8_out(c, current);
  186. switch (type)
  187. {
  188. case ESPEAKNG_TOKEN_WORD_UPPERCASE:
  189. case ESPEAKNG_TOKEN_WORD_MIXEDCASE:
  190. break;
  191. case ESPEAKNG_TOKEN_WORD_LOWERCASE:
  192. case ESPEAKNG_TOKEN_WORD_CAPITALIZED:
  193. type = ESPEAKNG_TOKEN_WORD_MIXEDCASE;
  194. break;
  195. }
  196. initial_state = 0;
  197. break;
  198. default:
  199. tokenizer->keepc = c;
  200. *current = '\0';
  201. return type;
  202. }
  203. }
  204. *current = '\0';
  205. return type;
  206. }
  207. static espeak_ng_TOKEN_TYPE
  208. tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer)
  209. {
  210. if (text_decoder_eof(tokenizer->decoder)) {
  211. tokenizer->read = tokenizer_state_end_of_buffer;
  212. return tokenizer_state_end_of_buffer(tokenizer);
  213. }
  214. char *current = tokenizer->token;
  215. char *end = tokenizer->token + sizeof(tokenizer->token) - 5; // allow for UTF-8 trailing bytes
  216. uint32_t c;
  217. if (tokenizer->keepc != ESPEAKNG_CODEPOINT_INVALID) {
  218. c = tokenizer->keepc;
  219. tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
  220. } else {
  221. c = text_decoder_getc(tokenizer->decoder);
  222. }
  223. switch (codepoint_type(c))
  224. {
  225. case ESPEAKNG_CTYPE_END_OF_STRING: // '\0'
  226. tokenizer->read = tokenizer_state_end_of_buffer;
  227. return tokenizer_state_end_of_buffer(tokenizer);
  228. case ESPEAKNG_CTYPE_CARRIAGE_RETURN: // '\r'
  229. if (text_decoder_peekc(tokenizer->decoder) == '\n') {
  230. current += utf8_out(c, current);
  231. c = text_decoder_getc(tokenizer->decoder);
  232. }
  233. // fallthrough
  234. case ESPEAKNG_CTYPE_NEWLINE:
  235. current += utf8_out(c, current);
  236. *current = '\0';
  237. return ESPEAKNG_TOKEN_NEWLINE;
  238. case ESPEAKNG_CTYPE_PARAGRAPH:
  239. current += utf8_out(c, current);
  240. *current = '\0';
  241. return ESPEAKNG_TOKEN_PARAGRAPH;
  242. case ESPEAKNG_CTYPE_WHITESPACE:
  243. current += utf8_out(c, current);
  244. while (!text_decoder_eof(tokenizer->decoder) &&
  245. current < end &&
  246. codepoint_type(c = text_decoder_getc(tokenizer->decoder)) == ESPEAKNG_CTYPE_WHITESPACE)
  247. {
  248. current += utf8_out(c, current);
  249. }
  250. tokenizer->keepc = c;
  251. *current = '\0';
  252. return ESPEAKNG_TOKEN_WHITESPACE;
  253. case ESPEAKNG_CTYPE_LOWERCASE:
  254. current += utf8_out(c, current);
  255. return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_LOWERCASE);
  256. case ESPEAKNG_CTYPE_UPPERCASE:
  257. current += utf8_out(c, current);
  258. return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_UPPERCASE);
  259. case ESPEAKNG_CTYPE_FULL_STOP:
  260. current += utf8_out(c, current);
  261. if (c == '.' && text_decoder_peekc(tokenizer->decoder) == '.') {
  262. c = text_decoder_getc(tokenizer->decoder);
  263. if (text_decoder_peekc(tokenizer->decoder) == '.') {
  264. c = text_decoder_getc(tokenizer->decoder);
  265. current += utf8_out('.', current);
  266. current += utf8_out('.', current);
  267. *current = '\0';
  268. return ESPEAKNG_TOKEN_ELLIPSIS;
  269. } else {
  270. tokenizer->keepc = c;
  271. }
  272. }
  273. *current = '\0';
  274. return ESPEAKNG_TOKEN_FULL_STOP;
  275. case ESPEAKNG_CTYPE_QUESTION_MARK:
  276. current += utf8_out(c, current);
  277. *current = '\0';
  278. return ESPEAKNG_TOKEN_QUESTION_MARK;
  279. case ESPEAKNG_CTYPE_EXCLAMATION_MARK:
  280. current += utf8_out(c, current);
  281. *current = '\0';
  282. return ESPEAKNG_TOKEN_EXCLAMATION_MARK;
  283. case ESPEAKNG_CTYPE_COMMA:
  284. current += utf8_out(c, current);
  285. *current = '\0';
  286. return ESPEAKNG_TOKEN_COMMA;
  287. case ESPEAKNG_CTYPE_COLON:
  288. current += utf8_out(c, current);
  289. *current = '\0';
  290. return ESPEAKNG_TOKEN_COLON;
  291. case ESPEAKNG_CTYPE_SEMICOLON:
  292. current += utf8_out(c, current);
  293. *current = '\0';
  294. return ESPEAKNG_TOKEN_SEMICOLON;
  295. case ESPEAKNG_CTYPE_ELLIPSIS:
  296. current += utf8_out(c, current);
  297. *current = '\0';
  298. return ESPEAKNG_TOKEN_ELLIPSIS;
  299. default:
  300. current += utf8_out(c, current);
  301. *current = '\0';
  302. return ESPEAKNG_TOKEN_UNKNOWN;
  303. }
  304. return ESPEAKNG_TOKEN_END_OF_BUFFER;
  305. }
  306. espeak_ng_TOKENIZER *
  307. create_tokenizer(void)
  308. {
  309. espeak_ng_TOKENIZER *tokenizer = malloc(sizeof(espeak_ng_TOKENIZER));
  310. if (!tokenizer) return NULL;
  311. tokenizer->decoder = NULL;
  312. tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
  313. tokenizer->read = tokenizer_state_end_of_buffer;
  314. *tokenizer->token = '\0';
  315. return tokenizer;
  316. }
  317. void
  318. destroy_tokenizer(espeak_ng_TOKENIZER *tokenizer)
  319. {
  320. if (tokenizer) free(tokenizer);
  321. }
  322. int
  323. tokenizer_reset(espeak_ng_TOKENIZER *tokenizer,
  324. espeak_ng_TEXT_DECODER *decoder,
  325. espeak_ng_TOKENIZER_OPTIONS options)
  326. {
  327. if (!tokenizer) return 0;
  328. tokenizer->decoder = decoder;
  329. tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
  330. tokenizer->read = decoder ? tokenizer_state_default : tokenizer_state_end_of_buffer;
  331. return 1;
  332. }
  333. espeak_ng_TOKEN_TYPE
  334. tokenizer_read_next_token(espeak_ng_TOKENIZER *tokenizer)
  335. {
  336. return tokenizer->read(tokenizer);
  337. }
  338. const char *
  339. tokenizer_get_token_text(espeak_ng_TOKENIZER *tokenizer)
  340. {
  341. return tokenizer->token;
  342. }