eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

tokenizer.c 12KB


  1. /* Tokenizer APIs.
  2. *
  3. * Copyright (C) 2017 Reece H. Dunn
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 3 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, see: <http://www.gnu.org/licenses/>.
  17. */
  18. #include "config.h"
  19. #include <errno.h>
  20. #include <stdint.h>
  21. #include <stdio.h>
  22. #include <stdlib.h>
  23. #include <string.h>
  24. #include <espeak-ng/espeak_ng.h>
  25. #include <espeak-ng/encoding.h>
  26. #include <espeak-ng/tokenizer.h>
  27. #include <ucd/ucd.h>
  28. #include "speech.h"
  29. #include "phoneme.h"
  30. #include "synthesize.h"
  31. #include "translate.h"
  32. #define ESPEAKNG_CLAUSE_TYPE_PROPERTY_MASK 0xFFF0000000000000ull
  33. int clause_type_from_codepoint(uint32_t c)
  34. {
  35. ucd_category cat = ucd_lookup_category(c);
  36. ucd_property props = ucd_properties(c, cat);
  37. switch (props & ESPEAKNG_CLAUSE_TYPE_PROPERTY_MASK)
  38. {
  39. case ESPEAKNG_PROPERTY_FULL_STOP:
  40. return CLAUSE_PERIOD;
  41. case ESPEAKNG_PROPERTY_FULL_STOP | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
  42. return CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER;
  43. case ESPEAKNG_PROPERTY_QUESTION_MARK:
  44. return CLAUSE_QUESTION;
  45. case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
  46. return CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER;
  47. case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_PUNCTUATION_IN_WORD:
  48. return CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD;
  49. case ESPEAKNG_PROPERTY_EXCLAMATION_MARK:
  50. return CLAUSE_EXCLAMATION;
  51. case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
  52. return CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER;
  53. case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_PUNCTUATION_IN_WORD:
  54. return CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD;
  55. case ESPEAKNG_PROPERTY_COMMA:
  56. return CLAUSE_COMMA;
  57. case ESPEAKNG_PROPERTY_COMMA | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
  58. return CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER;
  59. case ESPEAKNG_PROPERTY_COLON:
  60. return CLAUSE_COLON;
  61. case ESPEAKNG_PROPERTY_COLON | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
  62. return CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER;
  63. case ESPEAKNG_PROPERTY_SEMI_COLON:
  64. case ESPEAKNG_PROPERTY_EXTENDED_DASH:
  65. return CLAUSE_SEMICOLON;
  66. case ESPEAKNG_PROPERTY_SEMI_COLON | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
  67. case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER | ESPEAKNG_PROPERTY_INVERTED_TERMINAL_PUNCTUATION:
  68. case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER | ESPEAKNG_PROPERTY_INVERTED_TERMINAL_PUNCTUATION:
  69. return CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER;
  70. case ESPEAKNG_PROPERTY_ELLIPSIS:
  71. return CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER;
  72. case ESPEAKNG_PROPERTY_PARAGRAPH_SEPARATOR:
  73. return CLAUSE_PARAGRAPH;
  74. }
  75. return CLAUSE_NONE;
  76. }
  77. typedef enum {
  78. ESPEAKNG_CTYPE_OTHER,
  79. ESPEAKNG_CTYPE_CARRIAGE_RETURN,
  80. ESPEAKNG_CTYPE_NEWLINE,
  81. ESPEAKNG_CTYPE_END_OF_STRING,
  82. ESPEAKNG_CTYPE_PARAGRAPH,
  83. ESPEAKNG_CTYPE_WHITESPACE,
  84. ESPEAKNG_CTYPE_LOWERCASE,
  85. ESPEAKNG_CTYPE_UPPERCASE,
  86. ESPEAKNG_CTYPE_FULL_STOP,
  87. ESPEAKNG_CTYPE_QUESTION_MARK,
  88. ESPEAKNG_CTYPE_EXCLAMATION_MARK,
  89. ESPEAKNG_CTYPE_COMMA,
  90. ESPEAKNG_CTYPE_COLON,
  91. ESPEAKNG_CTYPE_SEMICOLON,
  92. ESPEAKNG_CTYPE_ELLIPSIS,
  93. ESPEAKNG_CTYPE_PUNCTUATION,
  94. ESPEAKNG_CTYPE_SYMBOL,
  95. } espeakng_CTYPE;
  96. #define ESPEAKNG_CTYPE_PROPERTY_MASK 0xFE0000000000C001ull
  97. // Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm
  98. static espeakng_CTYPE codepoint_type(uint32_t c)
  99. {
  100. // 1. Detect and classify specific codepoints.
  101. switch (c)
  102. {
  103. case 0x0000: return ESPEAKNG_CTYPE_END_OF_STRING; // NULL
  104. case 0x000A: return ESPEAKNG_CTYPE_NEWLINE; // LINE FEED (LF)
  105. case 0x000C: return ESPEAKNG_CTYPE_NEWLINE; // FORM FEED (FF)
  106. case 0x000D: return ESPEAKNG_CTYPE_CARRIAGE_RETURN; // CARRIAGE RETURN (CR)
  107. case 0x0085: return ESPEAKNG_CTYPE_NEWLINE; // NEW LINE (NEL)
  108. }
  109. // 2. Override property types for codepoints by their Unicode General Category.
  110. ucd_category cat = ucd_lookup_category(c);
  111. switch (cat)
  112. {
  113. case UCD_CATEGORY_Zl: return ESPEAKNG_CTYPE_NEWLINE;
  114. case UCD_CATEGORY_Zp: return ESPEAKNG_CTYPE_PARAGRAPH;
  115. }
  116. // 3. Classify codepoints by their Unicode properties.
  117. ucd_property props = ucd_properties(c, cat);
  118. switch (props & ESPEAKNG_CTYPE_PROPERTY_MASK)
  119. {
  120. case UCD_PROPERTY_WHITE_SPACE:
  121. return ESPEAKNG_CTYPE_WHITESPACE;
  122. case UCD_PROPERTY_OTHER_LOWERCASE:
  123. return ESPEAKNG_CTYPE_LOWERCASE;
  124. case UCD_PROPERTY_OTHER_UPPERCASE:
  125. return ESPEAKNG_CTYPE_UPPERCASE;
  126. case ESPEAKNG_PROPERTY_FULL_STOP:
  127. return ESPEAKNG_CTYPE_FULL_STOP;
  128. case ESPEAKNG_PROPERTY_QUESTION_MARK:
  129. return ESPEAKNG_CTYPE_QUESTION_MARK;
  130. case ESPEAKNG_PROPERTY_EXCLAMATION_MARK:
  131. return ESPEAKNG_CTYPE_EXCLAMATION_MARK;
  132. case ESPEAKNG_PROPERTY_COMMA:
  133. return ESPEAKNG_CTYPE_COMMA;
  134. case ESPEAKNG_PROPERTY_COLON:
  135. return ESPEAKNG_CTYPE_COLON;
  136. case ESPEAKNG_PROPERTY_SEMI_COLON:
  137. return ESPEAKNG_CTYPE_SEMICOLON;
  138. case ESPEAKNG_PROPERTY_ELLIPSIS:
  139. return ESPEAKNG_CTYPE_ELLIPSIS;
  140. }
  141. // 4. Classify codepoints by their Unicode General Category.
  142. switch (cat)
  143. {
  144. case UCD_CATEGORY_Lu: return ESPEAKNG_CTYPE_UPPERCASE;
  145. case UCD_CATEGORY_Ll: return ESPEAKNG_CTYPE_LOWERCASE;
  146. case UCD_CATEGORY_Pc: return ESPEAKNG_CTYPE_PUNCTUATION;
  147. case UCD_CATEGORY_Pd: return ESPEAKNG_CTYPE_PUNCTUATION;
  148. case UCD_CATEGORY_Pe: return ESPEAKNG_CTYPE_PUNCTUATION;
  149. case UCD_CATEGORY_Pf: return ESPEAKNG_CTYPE_PUNCTUATION;
  150. case UCD_CATEGORY_Pi: return ESPEAKNG_CTYPE_PUNCTUATION;
  151. case UCD_CATEGORY_Po: return ESPEAKNG_CTYPE_PUNCTUATION;
  152. case UCD_CATEGORY_Ps: return ESPEAKNG_CTYPE_PUNCTUATION;
  153. case UCD_CATEGORY_Sc: return ESPEAKNG_CTYPE_SYMBOL;
  154. case UCD_CATEGORY_Sk: return ESPEAKNG_CTYPE_SYMBOL;
  155. case UCD_CATEGORY_Sm: return ESPEAKNG_CTYPE_SYMBOL;
  156. case UCD_CATEGORY_So: return ESPEAKNG_CTYPE_SYMBOL;
  157. }
  158. // 5. Classify the remaining codepoints.
  159. return ESPEAKNG_CTYPE_OTHER;
  160. }
  161. #define ESPEAKNG_CODEPOINT_INVALID 0xFFFFFFFF
  162. struct espeak_ng_TOKENIZER_
  163. {
  164. espeak_ng_TEXT_DECODER *decoder;
  165. char token[256];
  166. uint32_t keepc;
  167. espeak_ng_TOKEN_TYPE (*read)(espeak_ng_TOKENIZER *tokenizer);
  168. };
  169. static espeak_ng_TOKEN_TYPE
  170. tokenizer_state_end_of_buffer(espeak_ng_TOKENIZER *tokenizer)
  171. {
  172. *tokenizer->token = '\0';
  173. return ESPEAKNG_TOKEN_END_OF_BUFFER;
  174. }
  175. static espeak_ng_TOKEN_TYPE
  176. tokenizer_read_word_token(espeak_ng_TOKENIZER *tokenizer, char *current, espeak_ng_TOKEN_TYPE type)
  177. {
  178. char *end = tokenizer->token + sizeof(tokenizer->token) - 5; // allow for UTF-8 trailing bytes
  179. int initial_state = 1;
  180. while (current < end && !text_decoder_eof(tokenizer->decoder)) {
  181. uint32_t c = text_decoder_getc(tokenizer->decoder);
  182. switch (codepoint_type(c))
  183. {
  184. case ESPEAKNG_CTYPE_LOWERCASE:
  185. current += utf8_out(c, current);
  186. switch (type)
  187. {
  188. case ESPEAKNG_TOKEN_WORD_LOWERCASE:
  189. case ESPEAKNG_TOKEN_WORD_MIXEDCASE:
  190. case ESPEAKNG_TOKEN_WORD_CAPITALIZED:
  191. break;
  192. case ESPEAKNG_TOKEN_WORD_UPPERCASE:
  193. type = initial_state
  194. ? ESPEAKNG_TOKEN_WORD_CAPITALIZED
  195. : ESPEAKNG_TOKEN_WORD_MIXEDCASE;
  196. break;
  197. }
  198. initial_state = 0;
  199. break;
  200. case ESPEAKNG_CTYPE_UPPERCASE:
  201. current += utf8_out(c, current);
  202. switch (type)
  203. {
  204. case ESPEAKNG_TOKEN_WORD_UPPERCASE:
  205. case ESPEAKNG_TOKEN_WORD_MIXEDCASE:
  206. break;
  207. case ESPEAKNG_TOKEN_WORD_LOWERCASE:
  208. case ESPEAKNG_TOKEN_WORD_CAPITALIZED:
  209. type = ESPEAKNG_TOKEN_WORD_MIXEDCASE;
  210. break;
  211. }
  212. initial_state = 0;
  213. break;
  214. default:
  215. tokenizer->keepc = c;
  216. *current = '\0';
  217. return type;
  218. }
  219. }
  220. *current = '\0';
  221. return type;
  222. }
  223. static espeak_ng_TOKEN_TYPE
  224. tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer)
  225. {
  226. if (text_decoder_eof(tokenizer->decoder)) {
  227. tokenizer->read = tokenizer_state_end_of_buffer;
  228. return tokenizer_state_end_of_buffer(tokenizer);
  229. }
  230. char *current = tokenizer->token;
  231. char *end = tokenizer->token + sizeof(tokenizer->token) - 5; // allow for UTF-8 trailing bytes
  232. uint32_t c;
  233. if (tokenizer->keepc != ESPEAKNG_CODEPOINT_INVALID) {
  234. c = tokenizer->keepc;
  235. tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
  236. } else {
  237. c = text_decoder_getc(tokenizer->decoder);
  238. }
  239. switch (codepoint_type(c))
  240. {
  241. case ESPEAKNG_CTYPE_END_OF_STRING: // '\0'
  242. tokenizer->read = tokenizer_state_end_of_buffer;
  243. return tokenizer_state_end_of_buffer(tokenizer);
  244. case ESPEAKNG_CTYPE_CARRIAGE_RETURN: // '\r'
  245. if (text_decoder_peekc(tokenizer->decoder) == '\n') {
  246. current += utf8_out(c, current);
  247. c = text_decoder_getc(tokenizer->decoder);
  248. }
  249. // fallthrough
  250. case ESPEAKNG_CTYPE_NEWLINE:
  251. current += utf8_out(c, current);
  252. *current = '\0';
  253. return ESPEAKNG_TOKEN_NEWLINE;
  254. case ESPEAKNG_CTYPE_PARAGRAPH:
  255. current += utf8_out(c, current);
  256. *current = '\0';
  257. return ESPEAKNG_TOKEN_PARAGRAPH;
  258. case ESPEAKNG_CTYPE_WHITESPACE:
  259. current += utf8_out(c, current);
  260. while (!text_decoder_eof(tokenizer->decoder) &&
  261. current < end &&
  262. codepoint_type(c = text_decoder_getc(tokenizer->decoder)) == ESPEAKNG_CTYPE_WHITESPACE)
  263. {
  264. current += utf8_out(c, current);
  265. }
  266. tokenizer->keepc = c;
  267. *current = '\0';
  268. return ESPEAKNG_TOKEN_WHITESPACE;
  269. case ESPEAKNG_CTYPE_LOWERCASE:
  270. current += utf8_out(c, current);
  271. return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_LOWERCASE);
  272. case ESPEAKNG_CTYPE_UPPERCASE:
  273. current += utf8_out(c, current);
  274. return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_UPPERCASE);
  275. case ESPEAKNG_CTYPE_FULL_STOP:
  276. current += utf8_out(c, current);
  277. if (c == '.' && text_decoder_peekc(tokenizer->decoder) == '.') {
  278. c = text_decoder_getc(tokenizer->decoder);
  279. if (text_decoder_peekc(tokenizer->decoder) == '.') {
  280. c = text_decoder_getc(tokenizer->decoder);
  281. current += utf8_out('.', current);
  282. current += utf8_out('.', current);
  283. *current = '\0';
  284. return ESPEAKNG_TOKEN_ELLIPSIS;
  285. } else {
  286. tokenizer->keepc = c;
  287. }
  288. }
  289. *current = '\0';
  290. return ESPEAKNG_TOKEN_FULL_STOP;
  291. case ESPEAKNG_CTYPE_QUESTION_MARK:
  292. current += utf8_out(c, current);
  293. *current = '\0';
  294. return ESPEAKNG_TOKEN_QUESTION_MARK;
  295. case ESPEAKNG_CTYPE_EXCLAMATION_MARK:
  296. current += utf8_out(c, current);
  297. *current = '\0';
  298. return ESPEAKNG_TOKEN_EXCLAMATION_MARK;
  299. case ESPEAKNG_CTYPE_COMMA:
  300. current += utf8_out(c, current);
  301. *current = '\0';
  302. return ESPEAKNG_TOKEN_COMMA;
  303. case ESPEAKNG_CTYPE_COLON:
  304. current += utf8_out(c, current);
  305. *current = '\0';
  306. return ESPEAKNG_TOKEN_COLON;
  307. case ESPEAKNG_CTYPE_SEMICOLON:
  308. current += utf8_out(c, current);
  309. *current = '\0';
  310. return ESPEAKNG_TOKEN_SEMICOLON;
  311. case ESPEAKNG_CTYPE_ELLIPSIS:
  312. current += utf8_out(c, current);
  313. *current = '\0';
  314. return ESPEAKNG_TOKEN_ELLIPSIS;
  315. case ESPEAKNG_CTYPE_PUNCTUATION:
  316. current += utf8_out(c, current);
  317. *current = '\0';
  318. return ESPEAKNG_TOKEN_PUNCTUATION;
  319. case ESPEAKNG_CTYPE_SYMBOL:
  320. current += utf8_out(c, current);
  321. *current = '\0';
  322. return ESPEAKNG_TOKEN_SYMBOL;
  323. default:
  324. current += utf8_out(c, current);
  325. *current = '\0';
  326. return ESPEAKNG_TOKEN_UNKNOWN;
  327. }
  328. return ESPEAKNG_TOKEN_END_OF_BUFFER;
  329. }
  330. #pragma GCC visibility push(default)
  331. espeak_ng_TOKENIZER *
  332. create_tokenizer(void)
  333. {
  334. espeak_ng_TOKENIZER *tokenizer = malloc(sizeof(espeak_ng_TOKENIZER));
  335. if (!tokenizer) return NULL;
  336. tokenizer->decoder = NULL;
  337. tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
  338. tokenizer->read = tokenizer_state_end_of_buffer;
  339. *tokenizer->token = '\0';
  340. return tokenizer;
  341. }
  342. void
  343. destroy_tokenizer(espeak_ng_TOKENIZER *tokenizer)
  344. {
  345. if (tokenizer) free(tokenizer);
  346. }
  347. int
  348. tokenizer_reset(espeak_ng_TOKENIZER *tokenizer,
  349. espeak_ng_TEXT_DECODER *decoder,
  350. espeak_ng_TOKENIZER_OPTIONS options)
  351. {
  352. if (!tokenizer) return 0;
  353. tokenizer->decoder = decoder;
  354. tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
  355. tokenizer->read = decoder ? tokenizer_state_default : tokenizer_state_end_of_buffer;
  356. return 1;
  357. }
  358. espeak_ng_TOKEN_TYPE
  359. tokenizer_read_next_token(espeak_ng_TOKENIZER *tokenizer)
  360. {
  361. return tokenizer->read(tokenizer);
  362. }
  363. const char *
  364. tokenizer_get_token_text(espeak_ng_TOKENIZER *tokenizer)
  365. {
  366. return tokenizer->token;
  367. }
  368. #pragma GCC visibility pop