| ESPEAKNG_CTYPE_WHITESPACE, | ESPEAKNG_CTYPE_WHITESPACE, | ||||
| ESPEAKNG_CTYPE_LOWERCASE, | ESPEAKNG_CTYPE_LOWERCASE, | ||||
| ESPEAKNG_CTYPE_UPPERCASE, | ESPEAKNG_CTYPE_UPPERCASE, | ||||
| ESPEAKNG_CTYPE_FULL_STOP, | |||||
| } espeakng_CTYPE; | } espeakng_CTYPE; | ||||
| #define ESPEAKNG_CTYPE_PROPERTY_MASK 0x000000000000C001ull | |||||
| #define ESPEAKNG_CTYPE_PROPERTY_MASK 0x800000000000C001ull | |||||
| // Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm | // Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm | ||||
| static espeakng_CTYPE codepoint_type(uint32_t c) | static espeakng_CTYPE codepoint_type(uint32_t c) | ||||
| return ESPEAKNG_CTYPE_LOWERCASE; | return ESPEAKNG_CTYPE_LOWERCASE; | ||||
| case UCD_PROPERTY_OTHER_UPPERCASE: | case UCD_PROPERTY_OTHER_UPPERCASE: | ||||
| return ESPEAKNG_CTYPE_UPPERCASE; | return ESPEAKNG_CTYPE_UPPERCASE; | ||||
| case ESPEAKNG_PROPERTY_FULL_STOP: | |||||
| return ESPEAKNG_CTYPE_FULL_STOP; | |||||
| } | } | ||||
| // 4. Classify the remaining codepoints. | // 4. Classify the remaining codepoints. | ||||
| case ESPEAKNG_CTYPE_UPPERCASE: | case ESPEAKNG_CTYPE_UPPERCASE: | ||||
| current += utf8_out(c, current); | current += utf8_out(c, current); | ||||
| return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_UPPERCASE); | return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_UPPERCASE); | ||||
| case ESPEAKNG_CTYPE_FULL_STOP: | |||||
| current += utf8_out(c, current); | |||||
| *current = '\0'; | |||||
| return ESPEAKNG_TOKEN_FULL_STOP; | |||||
| default: | default: | ||||
| current += utf8_out(c, current); | current += utf8_out(c, current); | ||||
| *current = '\0'; | *current = '\0'; |
| ESPEAKNG_TOKEN_WORD_LOWERCASE, | ESPEAKNG_TOKEN_WORD_LOWERCASE, | ||||
| ESPEAKNG_TOKEN_WORD_MIXEDCASE, | ESPEAKNG_TOKEN_WORD_MIXEDCASE, | ||||
| ESPEAKNG_TOKEN_WORD_CAPITALIZED, | ESPEAKNG_TOKEN_WORD_CAPITALIZED, | ||||
| ESPEAKNG_TOKEN_FULL_STOP, | |||||
| } espeak_ng_TOKEN_TYPE; | } espeak_ng_TOKEN_TYPE; | ||||
| espeak_ng_TOKEN_TYPE | espeak_ng_TOKEN_TYPE |
| destroy_tokenizer(tokenizer); | destroy_tokenizer(tokenizer); | ||||
| } | } | ||||
| void | |||||
| test_Latn_punctuation_tokens() | |||||
| { | |||||
| printf("testing Latin (Latn) script punctuation tokens\n"); | |||||
| espeak_ng_TOKENIZER *tokenizer = create_tokenizer(); | |||||
| espeak_ng_TEXT_DECODER *decoder = create_text_decoder(); | |||||
| assert(text_decoder_decode_string(decoder, ".", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK); | |||||
| assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1); | |||||
| assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP); | |||||
| assert(tokenizer_get_token_text(tokenizer) != NULL); | |||||
| assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0); | |||||
| assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER); | |||||
| assert(tokenizer_get_token_text(tokenizer) != NULL); | |||||
| assert(*tokenizer_get_token_text(tokenizer) == '\0'); | |||||
| destroy_text_decoder(decoder); | |||||
| destroy_tokenizer(tokenizer); | |||||
| } | |||||
| void | void | ||||
| run_tests() | run_tests() | ||||
| { | { | ||||
| test_whitespace_tokens(); | test_whitespace_tokens(); | ||||
| test_Latn_word_tokens(); | test_Latn_word_tokens(); | ||||
| test_Latn_punctuation_tokens(); | |||||
| printf("done\n"); | printf("done\n"); | ||||
| } | } | ||||
| case ESPEAKNG_TOKEN_WORD_CAPITALIZED: | case ESPEAKNG_TOKEN_WORD_CAPITALIZED: | ||||
| printf("word (capitalized) : %s\n", tokenizer_get_token_text(tokenizer)); | printf("word (capitalized) : %s\n", tokenizer_get_token_text(tokenizer)); | ||||
| break; | break; | ||||
| case ESPEAKNG_TOKEN_FULL_STOP: | |||||
| printf("full stop : %s\n", tokenizer_get_token_text(tokenizer)); | |||||
| break; | |||||
| } | } | ||||
| } | } | ||||