ESPEAKNG_CTYPE_WHITESPACE, | ESPEAKNG_CTYPE_WHITESPACE, | ||||
ESPEAKNG_CTYPE_LOWERCASE, | ESPEAKNG_CTYPE_LOWERCASE, | ||||
ESPEAKNG_CTYPE_UPPERCASE, | ESPEAKNG_CTYPE_UPPERCASE, | ||||
ESPEAKNG_CTYPE_FULL_STOP, | |||||
} espeakng_CTYPE; | } espeakng_CTYPE; | ||||
#define ESPEAKNG_CTYPE_PROPERTY_MASK 0x000000000000C001ull | |||||
#define ESPEAKNG_CTYPE_PROPERTY_MASK 0x800000000000C001ull | |||||
// Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm | // Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm | ||||
static espeakng_CTYPE codepoint_type(uint32_t c) | static espeakng_CTYPE codepoint_type(uint32_t c) | ||||
return ESPEAKNG_CTYPE_LOWERCASE; | return ESPEAKNG_CTYPE_LOWERCASE; | ||||
case UCD_PROPERTY_OTHER_UPPERCASE: | case UCD_PROPERTY_OTHER_UPPERCASE: | ||||
return ESPEAKNG_CTYPE_UPPERCASE; | return ESPEAKNG_CTYPE_UPPERCASE; | ||||
case ESPEAKNG_PROPERTY_FULL_STOP: | |||||
return ESPEAKNG_CTYPE_FULL_STOP; | |||||
} | } | ||||
// 4. Classify the remaining codepoints. | // 4. Classify the remaining codepoints. | ||||
case ESPEAKNG_CTYPE_UPPERCASE: | case ESPEAKNG_CTYPE_UPPERCASE: | ||||
current += utf8_out(c, current); | current += utf8_out(c, current); | ||||
return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_UPPERCASE); | return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_UPPERCASE); | ||||
case ESPEAKNG_CTYPE_FULL_STOP: | |||||
current += utf8_out(c, current); | |||||
*current = '\0'; | |||||
return ESPEAKNG_TOKEN_FULL_STOP; | |||||
default: | default: | ||||
current += utf8_out(c, current); | current += utf8_out(c, current); | ||||
*current = '\0'; | *current = '\0'; |
ESPEAKNG_TOKEN_WORD_LOWERCASE, | ESPEAKNG_TOKEN_WORD_LOWERCASE, | ||||
ESPEAKNG_TOKEN_WORD_MIXEDCASE, | ESPEAKNG_TOKEN_WORD_MIXEDCASE, | ||||
ESPEAKNG_TOKEN_WORD_CAPITALIZED, | ESPEAKNG_TOKEN_WORD_CAPITALIZED, | ||||
ESPEAKNG_TOKEN_FULL_STOP, | |||||
} espeak_ng_TOKEN_TYPE; | } espeak_ng_TOKEN_TYPE; | ||||
espeak_ng_TOKEN_TYPE | espeak_ng_TOKEN_TYPE |
destroy_tokenizer(tokenizer); | destroy_tokenizer(tokenizer); | ||||
} | } | ||||
void | |||||
test_Latn_punctuation_tokens() | |||||
{ | |||||
printf("testing Latin (Latn) script punctuation tokens\n"); | |||||
espeak_ng_TOKENIZER *tokenizer = create_tokenizer(); | |||||
espeak_ng_TEXT_DECODER *decoder = create_text_decoder(); | |||||
assert(text_decoder_decode_string(decoder, ".", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK); | |||||
assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1); | |||||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP); | |||||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||||
assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0); | |||||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER); | |||||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||||
assert(*tokenizer_get_token_text(tokenizer) == '\0'); | |||||
destroy_text_decoder(decoder); | |||||
destroy_tokenizer(tokenizer); | |||||
} | |||||
// Run each tokenizer test in order, then print "done".
void
run_tests(void)
{
	test_whitespace_tokens();
	test_Latn_word_tokens();
	test_Latn_punctuation_tokens();
	printf("done\n");
}
case ESPEAKNG_TOKEN_WORD_CAPITALIZED: | case ESPEAKNG_TOKEN_WORD_CAPITALIZED: | ||||
printf("word (capitalized) : %s\n", tokenizer_get_token_text(tokenizer)); | printf("word (capitalized) : %s\n", tokenizer_get_token_text(tokenizer)); | ||||
break; | break; | ||||
case ESPEAKNG_TOKEN_FULL_STOP: | |||||
printf("full stop : %s\n", tokenizer_get_token_text(tokenizer)); | |||||
break; | |||||
} | } | ||||
} | } | ||||