/* Tokenizer APIs.
 *
 * Copyright (C) 2017 Reece H. Dunn
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see: <http://www.gnu.org/licenses/>.
 */

#include "config.h"

// NOTE(review): the names of the nine system/library headers below were
// missing from the copy of this file that was reviewed and have been restored
// on a best-effort basis -- confirm the list against the build.
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>

#include <espeak-ng/espeak_ng.h>
#include <espeak-ng/speak_lib.h>
#include <espeak-ng/encoding.h>
#include <espeak-ng/tokenizer.h>

#include "speech.h"
#include "phoneme.h"
#include "synthesize.h"
#include "translate.h"

// Bits of a ucd_property value that take part in clause classification.
#define ESPEAKNG_CLAUSE_TYPE_PROPERTY_MASK 0xFFF0000000000000ull

/* Map a codepoint to a CLAUSE_* value.
 *
 * The codepoint's properties are masked with
 * ESPEAKNG_CLAUSE_TYPE_PROPERTY_MASK and the result is matched against the
 * exact property combinations listed below.
 *
 * @param c The codepoint to classify.
 * @return  The matching CLAUSE_* value (possibly with CLAUSE_* modifier
 *          flags or'ed in), or CLAUSE_NONE when no combination matches.
 */
int clause_type_from_codepoint(uint32_t c)
{
	ucd_category cat = ucd_lookup_category(c);
	ucd_property props = ucd_properties(c, cat);

	switch (props & ESPEAKNG_CLAUSE_TYPE_PROPERTY_MASK)
	{
	case ESPEAKNG_PROPERTY_FULL_STOP:
		return CLAUSE_PERIOD;
	case ESPEAKNG_PROPERTY_FULL_STOP | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
		return CLAUSE_PERIOD | CLAUSE_OPTIONAL_SPACE_AFTER;
	case ESPEAKNG_PROPERTY_QUESTION_MARK:
		return CLAUSE_QUESTION;
	case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
		return CLAUSE_QUESTION | CLAUSE_OPTIONAL_SPACE_AFTER;
	case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_PUNCTUATION_IN_WORD:
		return CLAUSE_QUESTION | CLAUSE_PUNCTUATION_IN_WORD;
	case ESPEAKNG_PROPERTY_EXCLAMATION_MARK:
		return CLAUSE_EXCLAMATION;
	case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
		return CLAUSE_EXCLAMATION | CLAUSE_OPTIONAL_SPACE_AFTER;
	case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_PUNCTUATION_IN_WORD:
		return CLAUSE_EXCLAMATION | CLAUSE_PUNCTUATION_IN_WORD;
	case ESPEAKNG_PROPERTY_COMMA:
		return CLAUSE_COMMA;
	case ESPEAKNG_PROPERTY_COMMA | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
		return CLAUSE_COMMA | CLAUSE_OPTIONAL_SPACE_AFTER;
	case ESPEAKNG_PROPERTY_COLON:
		return CLAUSE_COLON;
	case ESPEAKNG_PROPERTY_COLON | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
		return CLAUSE_COLON | CLAUSE_OPTIONAL_SPACE_AFTER;
	case ESPEAKNG_PROPERTY_SEMI_COLON:
	case ESPEAKNG_PROPERTY_EXTENDED_DASH:
		return CLAUSE_SEMICOLON;
	case ESPEAKNG_PROPERTY_SEMI_COLON | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER:
	case ESPEAKNG_PROPERTY_QUESTION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER | ESPEAKNG_PROPERTY_INVERTED_TERMINAL_PUNCTUATION:
	case ESPEAKNG_PROPERTY_EXCLAMATION_MARK | ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER | ESPEAKNG_PROPERTY_INVERTED_TERMINAL_PUNCTUATION:
		return CLAUSE_SEMICOLON | CLAUSE_OPTIONAL_SPACE_AFTER;
	case ESPEAKNG_PROPERTY_ELLIPSIS:
		return CLAUSE_SEMICOLON | CLAUSE_SPEAK_PUNCTUATION_NAME | CLAUSE_OPTIONAL_SPACE_AFTER;
	case ESPEAKNG_PROPERTY_PARAGRAPH_SEPARATOR:
		return CLAUSE_PARAGRAPH;
	}

	return CLAUSE_NONE;
}

// Internal character classes used to drive the tokenizer.
typedef enum {
	ESPEAKNG_CTYPE_OTHER,
	ESPEAKNG_CTYPE_CARRIAGE_RETURN,
	ESPEAKNG_CTYPE_NEWLINE,
	ESPEAKNG_CTYPE_END_OF_STRING,
	ESPEAKNG_CTYPE_PARAGRAPH,
	ESPEAKNG_CTYPE_WHITESPACE,
	ESPEAKNG_CTYPE_LOWERCASE,
	ESPEAKNG_CTYPE_UPPERCASE,
	ESPEAKNG_CTYPE_FULL_STOP,
	ESPEAKNG_CTYPE_QUESTION_MARK,
	ESPEAKNG_CTYPE_EXCLAMATION_MARK,
	ESPEAKNG_CTYPE_COMMA,
	ESPEAKNG_CTYPE_COLON,
	ESPEAKNG_CTYPE_SEMICOLON,
	ESPEAKNG_CTYPE_ELLIPSIS,
	ESPEAKNG_CTYPE_PUNCTUATION,
	ESPEAKNG_CTYPE_SYMBOL,
} espeakng_CTYPE;

// Bits of a ucd_property value that take part in character classification.
#define ESPEAKNG_CTYPE_PROPERTY_MASK 0xFE0000000000C001ull

// Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm

/* Classify a codepoint into one of the espeakng_CTYPE classes.
 *
 * The checks run in priority order: specific codepoints, then line/paragraph
 * separator categories, then (masked) properties, then the remaining general
 * categories. Anything left over is ESPEAKNG_CTYPE_OTHER.
 */
static espeakng_CTYPE codepoint_type(uint32_t c)
{
	// 1. Detect and classify specific codepoints.
	switch (c)
	{
	case 0x0000: return ESPEAKNG_CTYPE_END_OF_STRING; // NULL
	case 0x000A: return ESPEAKNG_CTYPE_NEWLINE; // LINE FEED (LF)
	case 0x000C: return ESPEAKNG_CTYPE_NEWLINE; // FORM FEED (FF)
	case 0x000D: return ESPEAKNG_CTYPE_CARRIAGE_RETURN; // CARRIAGE RETURN (CR)
	case 0x0085: return ESPEAKNG_CTYPE_NEWLINE; // NEW LINE (NEL)
	}

	// 2. Override property types for codepoints by their Unicode General Category.
	ucd_category cat = ucd_lookup_category(c);
	switch (cat)
	{
	case UCD_CATEGORY_Zl: return ESPEAKNG_CTYPE_NEWLINE;
	case UCD_CATEGORY_Zp: return ESPEAKNG_CTYPE_PARAGRAPH;
	default: break;
	}

	// 3. Classify codepoints by their Unicode properties.
	ucd_property props = ucd_properties(c, cat);
	switch (props & ESPEAKNG_CTYPE_PROPERTY_MASK)
	{
	case UCD_PROPERTY_WHITE_SPACE: return ESPEAKNG_CTYPE_WHITESPACE;
	case UCD_PROPERTY_OTHER_LOWERCASE: return ESPEAKNG_CTYPE_LOWERCASE;
	case UCD_PROPERTY_OTHER_UPPERCASE: return ESPEAKNG_CTYPE_UPPERCASE;
	case ESPEAKNG_PROPERTY_FULL_STOP: return ESPEAKNG_CTYPE_FULL_STOP;
	case ESPEAKNG_PROPERTY_QUESTION_MARK: return ESPEAKNG_CTYPE_QUESTION_MARK;
	case ESPEAKNG_PROPERTY_EXCLAMATION_MARK: return ESPEAKNG_CTYPE_EXCLAMATION_MARK;
	case ESPEAKNG_PROPERTY_COMMA: return ESPEAKNG_CTYPE_COMMA;
	case ESPEAKNG_PROPERTY_COLON: return ESPEAKNG_CTYPE_COLON;
	case ESPEAKNG_PROPERTY_SEMI_COLON: return ESPEAKNG_CTYPE_SEMICOLON;
	case ESPEAKNG_PROPERTY_ELLIPSIS: return ESPEAKNG_CTYPE_ELLIPSIS;
	}

	// 4. Classify codepoints by their Unicode General Category.
	switch (cat)
	{
	case UCD_CATEGORY_Lu:
		return ESPEAKNG_CTYPE_UPPERCASE;
	case UCD_CATEGORY_Ll:
		return ESPEAKNG_CTYPE_LOWERCASE;
	case UCD_CATEGORY_Pc:
	case UCD_CATEGORY_Pd:
	case UCD_CATEGORY_Pe:
	case UCD_CATEGORY_Pf:
	case UCD_CATEGORY_Pi:
	case UCD_CATEGORY_Po:
	case UCD_CATEGORY_Ps:
		return ESPEAKNG_CTYPE_PUNCTUATION;
	case UCD_CATEGORY_Sc:
	case UCD_CATEGORY_Sk:
	case UCD_CATEGORY_Sm:
	case UCD_CATEGORY_So:
		return ESPEAKNG_CTYPE_SYMBOL;
	default:
		break;
	}

	// 5. Classify the remaining codepoints.
	return ESPEAKNG_CTYPE_OTHER;
}

// Marker stored in `keepc` when no codepoint is pending.
#define ESPEAKNG_CODEPOINT_INVALID 0xFFFFFFFF

struct espeak_ng_TOKENIZER_
{
	espeak_ng_TEXT_DECODER *decoder; // source of codepoints; may be NULL (see tokenizer_reset)
	char token[256];                 // UTF-8 text of the most recently read token
	uint32_t keepc;                  // codepoint read ahead but not yet consumed, or ESPEAKNG_CODEPOINT_INVALID
	espeak_ng_TOKEN_TYPE (*read)(espeak_ng_TOKENIZER *tokenizer); // current state function
};

/* State function used once no more tokens are available: clears the token
 * text and reports ESPEAKNG_TOKEN_END_OF_BUFFER.
 */
static espeak_ng_TOKEN_TYPE
tokenizer_state_end_of_buffer(espeak_ng_TOKENIZER *tokenizer)
{
	*tokenizer->token = '\0';
	return ESPEAKNG_TOKEN_END_OF_BUFFER;
}

/* Write the final codepoint of a token at `current`, NUL-terminate the token
 * text and return `type`.
 */
static espeak_ng_TOKEN_TYPE
tokenizer_finish_token(char *current, uint32_t c, espeak_ng_TOKEN_TYPE type)
{
	current += utf8_out(c, current);
	*current = '\0';
	return type;
}

/* Read the remainder of a word whose first letter has already been written.
 *
 * Letters are appended at `current` until a non-letter is read (it is saved
 * in `keepc` for the next read), the decoder reports EOF, or the token buffer
 * is nearly full.
 *
 * @param tokenizer The tokenizer being read from.
 * @param current   Write position in tokenizer->token, just past the first letter.
 * @param type      ESPEAKNG_TOKEN_WORD_LOWERCASE or ESPEAKNG_TOKEN_WORD_UPPERCASE,
 *                  according to the case of the first letter.
 * @return          The word token type, refined by the case of the letters read.
 */
static espeak_ng_TOKEN_TYPE
tokenizer_read_word_token(espeak_ng_TOKENIZER *tokenizer, char *current, espeak_ng_TOKEN_TYPE type)
{
	char *end = tokenizer->token + sizeof(tokenizer->token) - 5; // allow for UTF-8 trailing bytes
	int initial_state = 1; // non-zero until a letter after the first one has been handled

	while (current < end && !text_decoder_eof(tokenizer->decoder)) {
		uint32_t c = text_decoder_getc(tokenizer->decoder);
		switch (codepoint_type(c))
		{
		case ESPEAKNG_CTYPE_LOWERCASE:
			current += utf8_out(c, current);
			switch (type)
			{
			case ESPEAKNG_TOKEN_WORD_LOWERCASE:
			case ESPEAKNG_TOKEN_WORD_MIXEDCASE:
			case ESPEAKNG_TOKEN_WORD_CAPITALIZED:
				break;
			case ESPEAKNG_TOKEN_WORD_UPPERCASE:
				// "Xy" is capitalized; "XYz" is mixed case.
				type = initial_state
				     ? ESPEAKNG_TOKEN_WORD_CAPITALIZED
				     : ESPEAKNG_TOKEN_WORD_MIXEDCASE;
				break;
			default:
				break;
			}
			initial_state = 0;
			break;
		case ESPEAKNG_CTYPE_UPPERCASE:
			current += utf8_out(c, current);
			switch (type)
			{
			case ESPEAKNG_TOKEN_WORD_UPPERCASE:
			case ESPEAKNG_TOKEN_WORD_MIXEDCASE:
				break;
			case ESPEAKNG_TOKEN_WORD_LOWERCASE:
			case ESPEAKNG_TOKEN_WORD_CAPITALIZED:
				type = ESPEAKNG_TOKEN_WORD_MIXEDCASE;
				break;
			default:
				break;
			}
			initial_state = 0;
			break;
		default:
			// Not a letter: the word ends here; keep the codepoint for the next read.
			tokenizer->keepc = c;
			*current = '\0';
			return type;
		}
	}

	*current = '\0';
	return type;
}

/* Default state function: read one token from the decoder.
 *
 * The token text is written to tokenizer->token and the token type is
 * returned. A codepoint left pending in `keepc` by the previous read is
 * consumed before any new input is taken from the decoder.
 */
static espeak_ng_TOKEN_TYPE
tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer)
{
	// Only stop at EOF when nothing is pending: a kept codepoint was already
	// taken from the decoder and must still be turned into a token.
	if (tokenizer->keepc == ESPEAKNG_CODEPOINT_INVALID && text_decoder_eof(tokenizer->decoder)) {
		tokenizer->read = tokenizer_state_end_of_buffer;
		return tokenizer_state_end_of_buffer(tokenizer);
	}

	char *current = tokenizer->token;
	char *end = tokenizer->token + sizeof(tokenizer->token) - 5; // allow for UTF-8 trailing bytes

	uint32_t c;
	if (tokenizer->keepc != ESPEAKNG_CODEPOINT_INVALID) {
		c = tokenizer->keepc;
		tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
	} else {
		c = text_decoder_getc(tokenizer->decoder);
	}

	switch (codepoint_type(c))
	{
	case ESPEAKNG_CTYPE_END_OF_STRING: // '\0'
		tokenizer->read = tokenizer_state_end_of_buffer;
		return tokenizer_state_end_of_buffer(tokenizer);
	case ESPEAKNG_CTYPE_CARRIAGE_RETURN: // '\r'
		if (text_decoder_peekc(tokenizer->decoder) == '\n') {
			// "\r\n" is reported as a single newline token.
			current += utf8_out(c, current);
			c = text_decoder_getc(tokenizer->decoder);
		}
		return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_NEWLINE);
	case ESPEAKNG_CTYPE_NEWLINE:
		return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_NEWLINE);
	case ESPEAKNG_CTYPE_PARAGRAPH:
		return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_PARAGRAPH);
	case ESPEAKNG_CTYPE_WHITESPACE:
		current += utf8_out(c, current);
		while (current < end && !text_decoder_eof(tokenizer->decoder)) {
			c = text_decoder_getc(tokenizer->decoder);
			if (codepoint_type(c) != ESPEAKNG_CTYPE_WHITESPACE) {
				// Only a codepoint that is NOT part of this run is kept; when
				// the loop stops on EOF or a full buffer, `c` has already been
				// written and must not be emitted a second time.
				tokenizer->keepc = c;
				break;
			}
			current += utf8_out(c, current);
		}
		*current = '\0';
		return ESPEAKNG_TOKEN_WHITESPACE;
	case ESPEAKNG_CTYPE_LOWERCASE:
		current += utf8_out(c, current);
		return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_LOWERCASE);
	case ESPEAKNG_CTYPE_UPPERCASE:
		current += utf8_out(c, current);
		return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_UPPERCASE);
	case ESPEAKNG_CTYPE_FULL_STOP:
		current += utf8_out(c, current);
		if (c == '.' && text_decoder_peekc(tokenizer->decoder) == '.') {
			c = text_decoder_getc(tokenizer->decoder); // second '.'
			if (text_decoder_peekc(tokenizer->decoder) == '.') {
				// "..." is reported as a single ellipsis token.
				(void)text_decoder_getc(tokenizer->decoder); // third '.'
				current += utf8_out('.', current);
				current += utf8_out('.', current);
				*current = '\0';
				return ESPEAKNG_TOKEN_ELLIPSIS;
			}
			// Only two dots: the second one becomes its own token on the next read.
			tokenizer->keepc = c;
		}
		*current = '\0';
		return ESPEAKNG_TOKEN_FULL_STOP;
	case ESPEAKNG_CTYPE_QUESTION_MARK:
		return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_QUESTION_MARK);
	case ESPEAKNG_CTYPE_EXCLAMATION_MARK:
		return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_EXCLAMATION_MARK);
	case ESPEAKNG_CTYPE_COMMA:
		return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_COMMA);
	case ESPEAKNG_CTYPE_COLON:
		return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_COLON);
	case ESPEAKNG_CTYPE_SEMICOLON:
		return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_SEMICOLON);
	case ESPEAKNG_CTYPE_ELLIPSIS:
		return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_ELLIPSIS);
	case ESPEAKNG_CTYPE_PUNCTUATION:
		return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_PUNCTUATION);
	case ESPEAKNG_CTYPE_SYMBOL:
		return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_SYMBOL);
	default:
		return tokenizer_finish_token(current, c, ESPEAKNG_TOKEN_UNKNOWN);
	}
}

#pragma GCC visibility push(default)

/* Allocate a tokenizer with no decoder attached.
 *
 * Until tokenizer_reset() attaches a decoder, every read reports
 * ESPEAKNG_TOKEN_END_OF_BUFFER.
 *
 * @return The new tokenizer, or NULL if allocation fails. Release it with
 *         destroy_tokenizer().
 */
espeak_ng_TOKENIZER *
create_tokenizer(void)
{
	espeak_ng_TOKENIZER *tokenizer = malloc(sizeof *tokenizer);
	if (!tokenizer) return NULL;

	tokenizer->decoder = NULL;
	tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
	tokenizer->read = tokenizer_state_end_of_buffer;

	*tokenizer->token = '\0';
	return tokenizer;
}

/* Free a tokenizer created by create_tokenizer(). NULL is accepted.
 * The attached decoder is not released here.
 */
void
destroy_tokenizer(espeak_ng_TOKENIZER *tokenizer)
{
	free(tokenizer); // free(NULL) is a no-op
}

/* Attach `decoder` to the tokenizer and restart tokenization.
 *
 * Any pending read-ahead codepoint is discarded. With a NULL decoder the
 * tokenizer reports ESPEAKNG_TOKEN_END_OF_BUFFER on every read.
 *
 * @param tokenizer The tokenizer to reset.
 * @param decoder   The text decoder to read codepoints from, or NULL.
 * @param options   Currently unused.
 * @return          1 on success, 0 if `tokenizer` is NULL.
 */
int
tokenizer_reset(espeak_ng_TOKENIZER *tokenizer,
                espeak_ng_TEXT_DECODER *decoder,
                espeak_ng_TOKENIZER_OPTIONS options)
{
	(void)options;

	if (!tokenizer) return 0;

	tokenizer->decoder = decoder;
	tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
	tokenizer->read = decoder ? tokenizer_state_default : tokenizer_state_end_of_buffer;
	return 1;
}

/* Read the next token by running the tokenizer's current state function.
 * `tokenizer` must not be NULL.
 */
espeak_ng_TOKEN_TYPE
tokenizer_read_next_token(espeak_ng_TOKENIZER *tokenizer)
{
	return tokenizer->read(tokenizer);
}

/* Return the NUL-terminated text of the most recently read token.
 *
 * The pointer refers to storage inside the tokenizer, which is overwritten
 * by the next read. `tokenizer` must not be NULL.
 */
const char *
tokenizer_get_token_text(espeak_ng_TOKENIZER *tokenizer)
{
	return tokenizer->token;
}

#pragma GCC visibility pop