123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410 |
- /* Tokenizer APIs.
- *
- * Copyright (C) 2017 Reece H. Dunn
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see: <http://www.gnu.org/licenses/>.
- */
-
- #include "config.h"
-
- #include <errno.h>
- #include <stdint.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
-
- #include <espeak-ng/espeak_ng.h>
- #include <espeak-ng/encoding.h>
- #include <espeak-ng/tokenizer.h>
- #include <ucd/ucd.h>
-
- #include "speech.h"
- #include "phoneme.h"
- #include "synthesize.h"
- #include "translate.h"
-
- // Selects the upper 12 bits of a ucd_property value. Only these bits are
- // compared against the ESPEAKNG_PROPERTY_* combinations listed below.
- // NOTE(review): presumably all clause-related ESPEAKNG_PROPERTY_* bits live
- // in this range -- confirm against ucd/ucd.h.
- #define ESPEAKNG_CLAUSE_TYPE_PROPERTY_MASK 0xFFF0000000000000ull
-
- /* Map a Unicode codepoint to its clause type.
-  *
-  * The codepoint's properties are looked up through ucd-tools, masked with
-  * ESPEAKNG_CLAUSE_TYPE_PROPERTY_MASK, and the masked value is matched
-  * exactly against the known punctuation property combinations.
-  *
-  * @param c The codepoint to classify.
-  * @return  The matching CLAUSE_* value (possibly OR-ed with CLAUSE_* flags),
-  *          or CLAUSE_NONE when the masked properties match none of the
-  *          combinations handled here.
-  */
- int clause_type_from_codepoint(uint32_t c)
- {
-     ucd_category category = ucd_lookup_category(c);
-     ucd_property properties = ucd_properties(c, category);
-     int clause = CLAUSE_NONE;
-
-     switch (properties & ESPEAKNG_CLAUSE_TYPE_PROPERTY_MASK)
-     {
-     // Sentence terminators.
-     case ESPEAKNG_PROPERTY_FULL_STOP:
-         clause = CLAUSE_PERIOD;
-         break;
-     case ESPEAKNG_PROPERTY_FULL_STOP | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
-         clause = CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER;
-         break;
-     case ESPEAKNG_PROPERTY_QUESTION_MARK:
-         clause = CLAUSE_QUESTION;
-         break;
-     case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
-         clause = CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER;
-         break;
-     case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_PUNCTUATION_IN_WORD:
-         clause = CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD;
-         break;
-     case ESPEAKNG_PROPERTY_EXCLAMATION_MARK:
-         clause = CLAUSE_EXCLAMATION;
-         break;
-     case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
-         clause = CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER;
-         break;
-     case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_PUNCTUATION_IN_WORD:
-         clause = CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD;
-         break;
-
-     // Pauses within a sentence.
-     case ESPEAKNG_PROPERTY_COMMA:
-         clause = CLAUSE_COMMA;
-         break;
-     case ESPEAKNG_PROPERTY_COMMA | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
-         clause = CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER;
-         break;
-     case ESPEAKNG_PROPERTY_COLON:
-         clause = CLAUSE_COLON;
-         break;
-     case ESPEAKNG_PROPERTY_COLON | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
-         clause = CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER;
-         break;
-
-     // Semi-colons and extended dashes share the semi-colon clause.
-     case ESPEAKNG_PROPERTY_SEMI_COLON:
-     case ESPEAKNG_PROPERTY_EXTENDED_DASH:
-         clause = CLAUSE_SEMICOLON;
-         break;
-
-     // Inverted question/exclamation marks are given the same clause as a
-     // semi-colon that allows an optional space after it.
-     case ESPEAKNG_PROPERTY_SEMI_COLON | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
-     case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER | ESPEAKNG_PROPERTY_INVERTED_TERMINAL_PUNCTUATION:
-     case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER | ESPEAKNG_PROPERTY_INVERTED_TERMINAL_PUNCTUATION:
-         clause = CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER;
-         break;
-
-     case ESPEAKNG_PROPERTY_ELLIPSIS:
-         clause = CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER;
-         break;
-     case ESPEAKNG_PROPERTY_PARAGRAPH_SEPARATOR:
-         clause = CLAUSE_PARAGRAPH;
-         break;
-     }
-
-     return clause;
- }
-
- // Coarse character classes produced by codepoint_type() and consumed by the
- // tokenizer state functions to decide which token a codepoint starts or
- // continues.
- typedef enum {
- ESPEAKNG_CTYPE_OTHER, // fallback when no other class applies
- ESPEAKNG_CTYPE_CARRIAGE_RETURN, // U+000D
- ESPEAKNG_CTYPE_NEWLINE, // U+000A, U+000C, U+0085, or General Category Zl
- ESPEAKNG_CTYPE_END_OF_STRING, // U+0000
- ESPEAKNG_CTYPE_PARAGRAPH, // General Category Zp
- ESPEAKNG_CTYPE_WHITESPACE, // UCD_PROPERTY_WHITE_SPACE
- ESPEAKNG_CTYPE_LOWERCASE, // UCD_PROPERTY_OTHER_LOWERCASE or General Category Ll
- ESPEAKNG_CTYPE_UPPERCASE, // UCD_PROPERTY_OTHER_UPPERCASE or General Category Lu
- ESPEAKNG_CTYPE_FULL_STOP, // ESPEAKNG_PROPERTY_FULL_STOP
- ESPEAKNG_CTYPE_QUESTION_MARK, // ESPEAKNG_PROPERTY_QUESTION_MARK
- ESPEAKNG_CTYPE_EXCLAMATION_MARK, // ESPEAKNG_PROPERTY_EXCLAMATION_MARK
- ESPEAKNG_CTYPE_COMMA, // ESPEAKNG_PROPERTY_COMMA
- ESPEAKNG_CTYPE_COLON, // ESPEAKNG_PROPERTY_COLON
- ESPEAKNG_CTYPE_SEMICOLON, // ESPEAKNG_PROPERTY_SEMI_COLON
- ESPEAKNG_CTYPE_ELLIPSIS, // ESPEAKNG_PROPERTY_ELLIPSIS
- ESPEAKNG_CTYPE_PUNCTUATION, // General Categories Pc, Pd, Pe, Pf, Pi, Po, Ps
- ESPEAKNG_CTYPE_SYMBOL, // General Categories Sc, Sk, Sm, So
- } espeakng_CTYPE;
-
- // Property bits inspected by codepoint_type(): the upper 7 bits plus bits 0,
- // 14 and 15 of a ucd_property value.
- // NOTE(review): the low bits presumably correspond to the WHITE_SPACE,
- // OTHER_LOWERCASE and OTHER_UPPERCASE properties -- confirm against ucd/ucd.h.
- #define ESPEAKNG_CTYPE_PROPERTY_MASK 0xFE0000000000C001ull
-
- // Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm
- //
- // Classify a codepoint into one of the espeakng_CTYPE character classes.
- // The checks are ordered from most to least specific; the first one that
- // matches determines the result.
- static espeakng_CTYPE codepoint_type(uint32_t c)
- {
-     // Stage 1: codepoints that are classified by value alone.
-
-     switch (c)
-     {
-     case 0x0000: // NULL
-         return ESPEAKNG_CTYPE_END_OF_STRING;
-     case 0x000D: // CARRIAGE RETURN (CR)
-         return ESPEAKNG_CTYPE_CARRIAGE_RETURN;
-     case 0x000A: // LINE FEED (LF)
-     case 0x000C: // FORM FEED (FF)
-     case 0x0085: // NEW LINE (NEL)
-         return ESPEAKNG_CTYPE_NEWLINE;
-     }
-
-     // Stage 2: General Categories that take precedence over any property.
-
-     ucd_category cat = ucd_lookup_category(c);
-     if (cat == UCD_CATEGORY_Zl)
-         return ESPEAKNG_CTYPE_NEWLINE;
-     if (cat == UCD_CATEGORY_Zp)
-         return ESPEAKNG_CTYPE_PARAGRAPH;
-
-     // Stage 3: Unicode / espeak-ng properties. The masked value has to equal
-     // one of the constants exactly; any other combination of bits is left
-     // for the General Category classification in stage 4.
-
-     ucd_property props = ucd_properties(c, cat);
-     switch (props & ESPEAKNG_CTYPE_PROPERTY_MASK)
-     {
-     case UCD_PROPERTY_WHITE_SPACE:         return ESPEAKNG_CTYPE_WHITESPACE;
-     case UCD_PROPERTY_OTHER_LOWERCASE:     return ESPEAKNG_CTYPE_LOWERCASE;
-     case UCD_PROPERTY_OTHER_UPPERCASE:     return ESPEAKNG_CTYPE_UPPERCASE;
-     case ESPEAKNG_PROPERTY_FULL_STOP:      return ESPEAKNG_CTYPE_FULL_STOP;
-     case ESPEAKNG_PROPERTY_QUESTION_MARK:  return ESPEAKNG_CTYPE_QUESTION_MARK;
-     case ESPEAKNG_PROPERTY_EXCLAMATION_MARK: return ESPEAKNG_CTYPE_EXCLAMATION_MARK;
-     case ESPEAKNG_PROPERTY_COMMA:          return ESPEAKNG_CTYPE_COMMA;
-     case ESPEAKNG_PROPERTY_COLON:          return ESPEAKNG_CTYPE_COLON;
-     case ESPEAKNG_PROPERTY_SEMI_COLON:     return ESPEAKNG_CTYPE_SEMICOLON;
-     case ESPEAKNG_PROPERTY_ELLIPSIS:       return ESPEAKNG_CTYPE_ELLIPSIS;
-     }
-
-     // Stage 4: fall back to the General Category.
-
-     switch (cat)
-     {
-     case UCD_CATEGORY_Lu:
-         return ESPEAKNG_CTYPE_UPPERCASE;
-     case UCD_CATEGORY_Ll:
-         return ESPEAKNG_CTYPE_LOWERCASE;
-     case UCD_CATEGORY_Pc:
-     case UCD_CATEGORY_Pd:
-     case UCD_CATEGORY_Pe:
-     case UCD_CATEGORY_Pf:
-     case UCD_CATEGORY_Pi:
-     case UCD_CATEGORY_Po:
-     case UCD_CATEGORY_Ps:
-         return ESPEAKNG_CTYPE_PUNCTUATION;
-     case UCD_CATEGORY_Sc:
-     case UCD_CATEGORY_Sk:
-     case UCD_CATEGORY_Sm:
-     case UCD_CATEGORY_So:
-         return ESPEAKNG_CTYPE_SYMBOL;
-     }
-
-     // Stage 5: everything else.
-
-     return ESPEAKNG_CTYPE_OTHER;
- }
-
- // Sentinel stored in espeak_ng_TOKENIZER_::keepc when no lookahead codepoint
- // is pending.
- #define ESPEAKNG_CODEPOINT_INVALID 0xFFFFFFFF
-
- // State of a tokenizer instance.
- struct espeak_ng_TOKENIZER_
- {
- // Source of codepoints; assigned by tokenizer_reset(), NULL after creation.
- espeak_ng_TEXT_DECODER *decoder;
- // NUL-terminated UTF-8 text of the most recently read token.
- char token[256];
- // A codepoint that was read from the decoder while scanning one token but
- // belongs to the next one, or ESPEAKNG_CODEPOINT_INVALID if there is none.
- uint32_t keepc;
-
- // Current state function; invoked by tokenizer_read_next_token().
- espeak_ng_TOKEN_TYPE (*read)(espeak_ng_TOKENIZER *tokenizer);
- };
-
- // State function used once the input is exhausted (or no decoder is set):
- // clears the token text and reports ESPEAKNG_TOKEN_END_OF_BUFFER.
- static espeak_ng_TOKEN_TYPE
- tokenizer_state_end_of_buffer(espeak_ng_TOKENIZER *tokenizer)
- {
-     tokenizer->token[0] = '\0';
-     return ESPEAKNG_TOKEN_END_OF_BUFFER;
- }
-
- // Read the remainder of a word token.
- //
- // The caller has already written the first letter to tokenizer->token and
- // passes the write position after it in `current`, together with the word
- // type implied by that first letter (ESPEAKNG_TOKEN_WORD_LOWERCASE or
- // ESPEAKNG_TOKEN_WORD_UPPERCASE). Letters are appended until a non-letter is
- // read, the decoder is exhausted, or the token buffer is nearly full. A
- // non-letter that terminates the word is stored in tokenizer->keepc for the
- // next token.
- //
- // Returns the word type refined by the case of the letters that followed.
- static espeak_ng_TOKEN_TYPE
- tokenizer_read_word_token(espeak_ng_TOKENIZER *tokenizer, char *current, espeak_ng_TOKEN_TYPE type)
- {
-     // Stop early enough that one more UTF-8 sequence plus the terminating
-     // NUL always fits in the buffer.
-     char *limit = tokenizer->token + sizeof(tokenizer->token) - 5;
-     // True only while examining the second letter of the word.
-     int second_letter = 1;
-
-     for (;;) {
-         if (current >= limit || text_decoder_eof(tokenizer->decoder))
-             break;
-
-         uint32_t c = text_decoder_getc(tokenizer->decoder);
-         espeakng_CTYPE ctype = codepoint_type(c);
-
-         if (ctype == ESPEAKNG_CTYPE_LOWERCASE) {
-             current += utf8_out(c, current);
-             // An upper-case prefix followed by a lower-case letter is
-             // "Capitalized" if the prefix was a single letter, otherwise
-             // the word is mixed case.
-             if (type == ESPEAKNG_TOKEN_WORD_UPPERCASE) {
-                 if (second_letter)
-                     type = ESPEAKNG_TOKEN_WORD_CAPITALIZED;
-                 else
-                     type = ESPEAKNG_TOKEN_WORD_MIXEDCASE;
-             }
-         } else if (ctype == ESPEAKNG_CTYPE_UPPERCASE) {
-             current += utf8_out(c, current);
-             // An upper-case letter after any lower-case letter makes the
-             // word mixed case.
-             if (type == ESPEAKNG_TOKEN_WORD_LOWERCASE ||
-                 type == ESPEAKNG_TOKEN_WORD_CAPITALIZED)
-                 type = ESPEAKNG_TOKEN_WORD_MIXEDCASE;
-         } else {
-             // Not part of the word: hand it to the next token.
-             tokenizer->keepc = c;
-             break;
-         }
-
-         second_letter = 0;
-     }
-
-     *current = '\0';
-     return type;
- }
-
- // Append the UTF-8 encoding of `c` at `current`, NUL-terminate the token
- // text and return `type`.
- static espeak_ng_TOKEN_TYPE
- tokenizer_finish_token(char *current, uint32_t c, espeak_ng_TOKEN_TYPE type)
- {
-     current += utf8_out(c, current);
-     *current = '\0';
-     return type;
- }
-
- // Default state function: read the next token from the decoder.
- //
- // The first codepoint of the token is taken from the pending lookahead
- // (tokenizer->keepc) if there is one, otherwise from the decoder. The token
- // text is written to tokenizer->token as NUL-terminated UTF-8 and the token
- // type is returned. When the input is exhausted, or a U+0000 codepoint is
- // read, the tokenizer switches permanently to the end-of-buffer state.
- static espeak_ng_TOKEN_TYPE
- tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer)
- {
-     char *current = tokenizer->token;
-     char *end = tokenizer->token + sizeof(tokenizer->token) - 5; // allow for UTF-8 trailing bytes
-
-     // A pending lookahead codepoint must be consumed before testing for the
-     // end of the input: it has already been taken out of the decoder, so the
-     // decoder may report end-of-input while that codepoint is still waiting
-     // to be tokenized.
-     uint32_t c;
-     if (tokenizer->keepc != ESPEAKNG_CODEPOINT_INVALID) {
-         c = tokenizer->keepc;
-         tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
-     } else if (text_decoder_eof(tokenizer->decoder)) {
-         tokenizer->read = tokenizer_state_end_of_buffer;
-         return tokenizer_state_end_of_buffer(tokenizer);
-     } else {
-         c = text_decoder_getc(tokenizer->decoder);
-     }
-
-     switch (codepoint_type(c))
-     {
-     case ESPEAKNG_CTYPE_END_OF_STRING: // '\0'
-         tokenizer->read = tokenizer_state_end_of_buffer;
-         return tokenizer_state_end_of_buffer(tokenizer);
-     case ESPEAKNG_CTYPE_CARRIAGE_RETURN: // '\r'
-         // Treat "\r\n" as a single newline token.
-         if (text_decoder_peekc(tokenizer->decoder) == '\n') {
-             current += utf8_out(c, current);
-             c = text_decoder_getc(tokenizer->decoder);
-         }
-         // fallthrough
-     case ESPEAKNG_CTYPE_NEWLINE:
-         return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_NEWLINE);
-     case ESPEAKNG_CTYPE_PARAGRAPH:
-         return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_PARAGRAPH);
-     case ESPEAKNG_CTYPE_WHITESPACE:
-         current += utf8_out(c, current);
-         while (!text_decoder_eof(tokenizer->decoder) && current < end) {
-             uint32_t next = text_decoder_getc(tokenizer->decoder);
-             if (codepoint_type(next) != ESPEAKNG_CTYPE_WHITESPACE) {
-                 // Only a codepoint that was read but not appended is kept
-                 // for the next token. Nothing is kept when the run ends
-                 // because the buffer is full or the input is exhausted,
-                 // otherwise the last whitespace codepoint would be emitted
-                 // twice.
-                 tokenizer->keepc = next;
-                 break;
-             }
-             current += utf8_out(next, current);
-         }
-         *current = '\0';
-         return ESPEAKNG_TOKEN_WHITESPACE;
-     case ESPEAKNG_CTYPE_LOWERCASE:
-         current += utf8_out(c, current);
-         return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_LOWERCASE);
-     case ESPEAKNG_CTYPE_UPPERCASE:
-         current += utf8_out(c, current);
-         return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_UPPERCASE);
-     case ESPEAKNG_CTYPE_FULL_STOP:
-         current += utf8_out(c, current);
-         if (c == '.' && text_decoder_peekc(tokenizer->decoder) == '.') {
-             c = text_decoder_getc(tokenizer->decoder);
-             if (text_decoder_peekc(tokenizer->decoder) == '.') {
-                 // Three consecutive '.' form an ellipsis token.
-                 text_decoder_getc(tokenizer->decoder);
-                 current += utf8_out('.', current);
-                 current += utf8_out('.', current);
-                 *current = '\0';
-                 return ESPEAKNG_TOKEN_ELLIPSIS;
-             }
-             // Exactly two '.': the second one starts the next token.
-             tokenizer->keepc = c;
-         }
-         *current = '\0';
-         return ESPEAKNG_TOKEN_FULL_STOP;
-     case ESPEAKNG_CTYPE_QUESTION_MARK:
-         return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_QUESTION_MARK);
-     case ESPEAKNG_CTYPE_EXCLAMATION_MARK:
-         return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_EXCLAMATION_MARK);
-     case ESPEAKNG_CTYPE_COMMA:
-         return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_COMMA);
-     case ESPEAKNG_CTYPE_COLON:
-         return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_COLON);
-     case ESPEAKNG_CTYPE_SEMICOLON:
-         return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_SEMICOLON);
-     case ESPEAKNG_CTYPE_ELLIPSIS:
-         return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_ELLIPSIS);
-     case ESPEAKNG_CTYPE_PUNCTUATION:
-         return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_PUNCTUATION);
-     case ESPEAKNG_CTYPE_SYMBOL:
-         return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_SYMBOL);
-     default:
-         return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_UNKNOWN);
-     }
- }
-
- #pragma GCC visibility push(default)
-
- // Allocate a tokenizer with no decoder attached.
- //
- // The new tokenizer has an empty token text, no pending lookahead, and
- // reports ESPEAKNG_TOKEN_END_OF_BUFFER until tokenizer_reset() supplies a
- // decoder. Returns NULL if the allocation fails. Release the object with
- // destroy_tokenizer().
- espeak_ng_TOKENIZER *
- create_tokenizer(void)
- {
-     espeak_ng_TOKENIZER *tokenizer = malloc(sizeof *tokenizer);
-     if (tokenizer == NULL)
-         return NULL;
-
-     tokenizer->token[0] = '\0';
-     tokenizer->read = tokenizer_state_end_of_buffer;
-     tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
-     tokenizer->decoder = NULL;
-
-     return tokenizer;
- }
-
- // Release a tokenizer allocated by create_tokenizer().
- //
- // Passing NULL is allowed and does nothing. The decoder attached to the
- // tokenizer is not touched by this function.
- void
- destroy_tokenizer(espeak_ng_TOKENIZER *tokenizer)
- {
-     // free() is defined to be a no-op for a null pointer, so no guard is needed.
-     free(tokenizer);
- }
-
- // Attach `decoder` to `tokenizer` and restart tokenization.
- //
- // Any pending lookahead codepoint is discarded. With a non-NULL decoder the
- // tokenizer enters its default reading state; with a NULL decoder it
- // reports ESPEAKNG_TOKEN_END_OF_BUFFER on every read.
- //
- // Returns 1 on success, or 0 if `tokenizer` is NULL.
- int
- tokenizer_reset(espeak_ng_TOKENIZER *tokenizer,
-                 espeak_ng_TEXT_DECODER *decoder,
-                 espeak_ng_TOKENIZER_OPTIONS options)
- {
-     (void)options; // accepted but not consulted by this implementation
-
-     if (tokenizer == NULL)
-         return 0;
-
-     tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
-     tokenizer->decoder = decoder;
-     if (decoder != NULL)
-         tokenizer->read = tokenizer_state_default;
-     else
-         tokenizer->read = tokenizer_state_end_of_buffer;
-
-     return 1;
- }
-
- // Read the next token by running the tokenizer's current state function.
- //
- // Returns the type of the token that was read; its text is available from
- // tokenizer_get_token_text(). `tokenizer` must not be NULL.
- espeak_ng_TOKEN_TYPE
- tokenizer_read_next_token(espeak_ng_TOKENIZER *tokenizer)
- {
-     espeak_ng_TOKEN_TYPE (*state)(espeak_ng_TOKENIZER *) = tokenizer->read;
-     return state(tokenizer);
- }
-
- // Return the NUL-terminated UTF-8 text of the most recently read token.
- //
- // The pointer refers to storage inside `tokenizer`, so its contents are
- // overwritten by the next read. `tokenizer` must not be NULL.
- const char *
- tokenizer_get_token_text(espeak_ng_TOKENIZER *tokenizer)
- {
-     return &tokenizer->token[0];
- }
-
- #pragma GCC visibility pop
|