ESPEAKNG_CTYPE_END_OF_STRING, | ESPEAKNG_CTYPE_END_OF_STRING, | ||||
ESPEAKNG_CTYPE_PARAGRAPH, | ESPEAKNG_CTYPE_PARAGRAPH, | ||||
ESPEAKNG_CTYPE_WHITESPACE, | ESPEAKNG_CTYPE_WHITESPACE, | ||||
ESPEAKNG_CTYPE_LOWERCASE, | |||||
ESPEAKNG_CTYPE_UPPERCASE, | |||||
} espeakng_CTYPE; | } espeakng_CTYPE; | ||||
#define ESPEAKNG_CTYPE_PROPERTY_MASK 0x0000000000000001ull | |||||
#define ESPEAKNG_CTYPE_PROPERTY_MASK 0x000000000000C001ull | |||||
// Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm | // Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm | ||||
static espeakng_CTYPE codepoint_type(uint32_t c) | static espeakng_CTYPE codepoint_type(uint32_t c) | ||||
ucd_category cat = ucd_lookup_category(c); | ucd_category cat = ucd_lookup_category(c); | ||||
switch (cat) | switch (cat) | ||||
{ | { | ||||
case UCD_CATEGORY_Lu: return ESPEAKNG_CTYPE_UPPERCASE; | |||||
case UCD_CATEGORY_Ll: return ESPEAKNG_CTYPE_LOWERCASE; | |||||
case UCD_CATEGORY_Zl: return ESPEAKNG_CTYPE_NEWLINE; | case UCD_CATEGORY_Zl: return ESPEAKNG_CTYPE_NEWLINE; | ||||
case UCD_CATEGORY_Zp: return ESPEAKNG_CTYPE_PARAGRAPH; | case UCD_CATEGORY_Zp: return ESPEAKNG_CTYPE_PARAGRAPH; | ||||
case UCD_CATEGORY_Zs: return ESPEAKNG_CTYPE_WHITESPACE; | case UCD_CATEGORY_Zs: return ESPEAKNG_CTYPE_WHITESPACE; | ||||
{ | { | ||||
case UCD_PROPERTY_WHITE_SPACE: | case UCD_PROPERTY_WHITE_SPACE: | ||||
return ESPEAKNG_CTYPE_WHITESPACE; | return ESPEAKNG_CTYPE_WHITESPACE; | ||||
case UCD_PROPERTY_OTHER_LOWERCASE: | |||||
return ESPEAKNG_CTYPE_LOWERCASE; | |||||
case UCD_PROPERTY_OTHER_UPPERCASE: | |||||
return ESPEAKNG_CTYPE_UPPERCASE; | |||||
} | } | ||||
// 4. Classify the remaining codepoints. | // 4. Classify the remaining codepoints. | ||||
return ESPEAKNG_TOKEN_END_OF_BUFFER; | return ESPEAKNG_TOKEN_END_OF_BUFFER; | ||||
} | } | ||||
static espeak_ng_TOKEN_TYPE | |||||
tokenizer_read_word_token(espeak_ng_TOKENIZER *tokenizer, char *current, espeak_ng_TOKEN_TYPE type) | |||||
{ | |||||
char *end = tokenizer->token + sizeof(tokenizer->token) - 5; // allow for UTF-8 trailing bytes | |||||
int initial_state = 1; | |||||
while (current < end && !text_decoder_eof(tokenizer->decoder)) { | |||||
uint32_t c = text_decoder_getc(tokenizer->decoder); | |||||
switch (codepoint_type(c)) | |||||
{ | |||||
case ESPEAKNG_CTYPE_LOWERCASE: | |||||
current += utf8_out(c, current); | |||||
switch (type) | |||||
{ | |||||
case ESPEAKNG_TOKEN_WORD_LOWERCASE: | |||||
case ESPEAKNG_TOKEN_WORD_MIXEDCASE: | |||||
case ESPEAKNG_TOKEN_WORD_CAPITALIZED: | |||||
break; | |||||
case ESPEAKNG_TOKEN_WORD_UPPERCASE: | |||||
type = initial_state | |||||
? ESPEAKNG_TOKEN_WORD_CAPITALIZED | |||||
: ESPEAKNG_TOKEN_WORD_MIXEDCASE; | |||||
break; | |||||
} | |||||
initial_state = 0; | |||||
break; | |||||
case ESPEAKNG_CTYPE_UPPERCASE: | |||||
current += utf8_out(c, current); | |||||
switch (type) | |||||
{ | |||||
case ESPEAKNG_TOKEN_WORD_UPPERCASE: | |||||
case ESPEAKNG_TOKEN_WORD_MIXEDCASE: | |||||
break; | |||||
case ESPEAKNG_TOKEN_WORD_LOWERCASE: | |||||
case ESPEAKNG_TOKEN_WORD_CAPITALIZED: | |||||
type = ESPEAKNG_TOKEN_WORD_MIXEDCASE; | |||||
break; | |||||
} | |||||
initial_state = 0; | |||||
break; | |||||
default: | |||||
tokenizer->keepc = c; | |||||
*current = '\0'; | |||||
return type; | |||||
} | |||||
} | |||||
*current = '\0'; | |||||
return type; | |||||
} | |||||
static espeak_ng_TOKEN_TYPE | static espeak_ng_TOKEN_TYPE | ||||
tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer) | tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer) | ||||
{ | { | ||||
tokenizer->keepc = c; | tokenizer->keepc = c; | ||||
*current = '\0'; | *current = '\0'; | ||||
return ESPEAKNG_TOKEN_WHITESPACE; | return ESPEAKNG_TOKEN_WHITESPACE; | ||||
case ESPEAKNG_CTYPE_LOWERCASE: | |||||
current += utf8_out(c, current); | |||||
return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_LOWERCASE); | |||||
case ESPEAKNG_CTYPE_UPPERCASE: | |||||
current += utf8_out(c, current); | |||||
return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_UPPERCASE); | |||||
default: | default: | ||||
current += utf8_out(c, current); | current += utf8_out(c, current); | ||||
*current = '\0'; | *current = '\0'; |
ESPEAKNG_TOKEN_NEWLINE, | ESPEAKNG_TOKEN_NEWLINE, | ||||
ESPEAKNG_TOKEN_PARAGRAPH, | ESPEAKNG_TOKEN_PARAGRAPH, | ||||
ESPEAKNG_TOKEN_WHITESPACE, | ESPEAKNG_TOKEN_WHITESPACE, | ||||
ESPEAKNG_TOKEN_WORD_UPPERCASE, | |||||
ESPEAKNG_TOKEN_WORD_LOWERCASE, | |||||
ESPEAKNG_TOKEN_WORD_MIXEDCASE, | |||||
ESPEAKNG_TOKEN_WORD_CAPITALIZED, | |||||
} espeak_ng_TOKEN_TYPE; | } espeak_ng_TOKEN_TYPE; | ||||
espeak_ng_TOKEN_TYPE | espeak_ng_TOKEN_TYPE |
destroy_tokenizer(tokenizer); | destroy_tokenizer(tokenizer); | ||||
} | } | ||||
void | |||||
test_Latn_word_tokens() | |||||
{ | |||||
printf("testing Latin (Latn) script word tokens\n"); | |||||
espeak_ng_TOKENIZER *tokenizer = create_tokenizer(); | |||||
espeak_ng_TEXT_DECODER *decoder = create_text_decoder(); | |||||
assert(text_decoder_decode_string(decoder, "One one ONE OneTwo ONETwo", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK); | |||||
assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1); | |||||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_CAPITALIZED); | |||||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||||
assert(strcmp(tokenizer_get_token_text(tokenizer), "One") == 0); | |||||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE); | |||||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||||
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0); | |||||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_LOWERCASE); | |||||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||||
assert(strcmp(tokenizer_get_token_text(tokenizer), "one") == 0); | |||||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE); | |||||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||||
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0); | |||||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_UPPERCASE); | |||||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||||
assert(strcmp(tokenizer_get_token_text(tokenizer), "ONE") == 0); | |||||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE); | |||||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||||
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0); | |||||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_MIXEDCASE); | |||||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||||
assert(strcmp(tokenizer_get_token_text(tokenizer), "OneTwo") == 0); | |||||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE); | |||||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||||
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0); | |||||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_MIXEDCASE); | |||||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||||
assert(strcmp(tokenizer_get_token_text(tokenizer), "ONETwo") == 0); | |||||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER); | |||||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||||
assert(*tokenizer_get_token_text(tokenizer) == '\0'); | |||||
destroy_text_decoder(decoder); | |||||
destroy_tokenizer(tokenizer); | |||||
} | |||||
void | void | ||||
run_tests() | run_tests() | ||||
{ | { | ||||
test_paragraph_tokens(); | test_paragraph_tokens(); | ||||
test_whitespace_tokens(); | test_whitespace_tokens(); | ||||
test_Latn_word_tokens(); | |||||
printf("done\n"); | printf("done\n"); | ||||
} | } | ||||
destroy_tokenizer(tokenizer); | destroy_tokenizer(tokenizer); | ||||
return; | return; | ||||
case ESPEAKNG_TOKEN_UNKNOWN: | case ESPEAKNG_TOKEN_UNKNOWN: | ||||
printf("unknown : %s\n", tokenizer_get_token_text(tokenizer)); | |||||
printf("unknown : %s\n", tokenizer_get_token_text(tokenizer)); | |||||
break; | break; | ||||
case ESPEAKNG_TOKEN_NEWLINE: | case ESPEAKNG_TOKEN_NEWLINE: | ||||
printf("newline : "); | |||||
printf("newline : "); | |||||
escape_newline(tokenizer_get_token_text(tokenizer)); | escape_newline(tokenizer_get_token_text(tokenizer)); | ||||
putc('\n', stdout); | putc('\n', stdout); | ||||
break; | break; | ||||
case ESPEAKNG_TOKEN_PARAGRAPH: | case ESPEAKNG_TOKEN_PARAGRAPH: | ||||
printf("paragraph : %s\n", tokenizer_get_token_text(tokenizer)); | |||||
printf("paragraph : %s\n", tokenizer_get_token_text(tokenizer)); | |||||
break; | break; | ||||
case ESPEAKNG_TOKEN_WHITESPACE: | case ESPEAKNG_TOKEN_WHITESPACE: | ||||
printf("whitespace : %s\n", tokenizer_get_token_text(tokenizer)); | |||||
printf("whitespace : %s\n", tokenizer_get_token_text(tokenizer)); | |||||
break; | |||||
case ESPEAKNG_TOKEN_WORD_UPPERCASE: | |||||
printf("word (upper case) : %s\n", tokenizer_get_token_text(tokenizer)); | |||||
break; | |||||
case ESPEAKNG_TOKEN_WORD_LOWERCASE: | |||||
printf("word (lower case) : %s\n", tokenizer_get_token_text(tokenizer)); | |||||
break; | |||||
case ESPEAKNG_TOKEN_WORD_MIXEDCASE: | |||||
printf("word (mixed case) : %s\n", tokenizer_get_token_text(tokenizer)); | |||||
break; | |||||
case ESPEAKNG_TOKEN_WORD_CAPITALIZED: | |||||
printf("word (capitalized) : %s\n", tokenizer_get_token_text(tokenizer)); | |||||
break; | break; | ||||
} | } | ||||
} | } |