@@ -88,23 +88,53 @@ typedef enum { | |||
ESPEAKNG_CTYPE_CARRIAGE_RETURN, | |||
ESPEAKNG_CTYPE_NEWLINE, | |||
ESPEAKNG_CTYPE_END_OF_STRING, | |||
ESPEAKNG_CTYPE_WHITESPACE, | |||
} espeakng_CTYPE; | |||
#define ESPEAKNG_CTYPE_PROPERTY_MASK 0x0000000000000001ull | |||
static espeakng_CTYPE codepoint_type(uint32_t c) | |||
{ | |||
// 1. Detect and classify specific codepoints. | |||
switch (c) | |||
{ | |||
case '\r': return ESPEAKNG_CTYPE_CARRIAGE_RETURN; | |||
case '\n': return ESPEAKNG_CTYPE_NEWLINE; | |||
case '\0': return ESPEAKNG_CTYPE_END_OF_STRING; | |||
} | |||
// 2. Classify codepoints by their Unicode General Category. | |||
ucd_category cat = ucd_lookup_category(c); | |||
switch (cat) | |||
{ | |||
case UCD_CATEGORY_Zl: return ESPEAKNG_CTYPE_WHITESPACE; | |||
case UCD_CATEGORY_Zp: return ESPEAKNG_CTYPE_WHITESPACE; | |||
case UCD_CATEGORY_Zs: return ESPEAKNG_CTYPE_WHITESPACE; | |||
} | |||
// 3. Classify codepoints by their Unicode properties. | |||
ucd_property props = ucd_properties(c, cat); | |||
switch (props & ESPEAKNG_CTYPE_PROPERTY_MASK) | |||
{ | |||
case UCD_PROPERTY_WHITE_SPACE: | |||
return ESPEAKNG_CTYPE_WHITESPACE; | |||
} | |||
// 4. Classify the remaining codepoints. | |||
return ESPEAKNG_CTYPE_OTHER; | |||
} | |||
#define ESPEAKNG_CODEPOINT_INVALID 0xFFFFFFFF | |||
struct espeak_ng_TOKENIZER_ | |||
{ | |||
espeak_ng_TEXT_DECODER *decoder; | |||
char token[256]; | |||
uint32_t keepc; | |||
espeak_ng_TOKEN_TYPE (*read)(espeak_ng_TOKENIZER *tokenizer); | |||
}; | |||
@@ -124,10 +154,18 @@ tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer) | |||
return tokenizer_state_end_of_buffer(tokenizer); | |||
} | |||
uint32_t c; | |||
char *current = tokenizer->token; | |||
char *end = tokenizer->token + sizeof(tokenizer->token) - 5; // allow for UTF-8 trailing bytes | |||
switch (codepoint_type(c = text_decoder_getc(tokenizer->decoder))) | |||
uint32_t c; | |||
if (tokenizer->keepc != ESPEAKNG_CODEPOINT_INVALID) { | |||
c = tokenizer->keepc; | |||
tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID; | |||
} else { | |||
c = text_decoder_getc(tokenizer->decoder); | |||
} | |||
switch (codepoint_type(c)) | |||
{ | |||
case ESPEAKNG_CTYPE_CARRIAGE_RETURN: // '\r' | |||
if (text_decoder_peekc(tokenizer->decoder) == '\n') { | |||
@@ -142,6 +180,17 @@ tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer) | |||
case ESPEAKNG_CTYPE_END_OF_STRING: // '\0' | |||
tokenizer->read = tokenizer_state_end_of_buffer; | |||
return tokenizer_state_end_of_buffer(tokenizer); | |||
case ESPEAKNG_CTYPE_WHITESPACE: | |||
current += utf8_out(c, current); | |||
while (!text_decoder_eof(tokenizer->decoder) && | |||
current < end && | |||
codepoint_type(c = text_decoder_getc(tokenizer->decoder)) == ESPEAKNG_CTYPE_WHITESPACE) | |||
{ | |||
current += utf8_out(c, current); | |||
} | |||
tokenizer->keepc = c; | |||
*current = '\0'; | |||
return ESPEAKNG_TOKEN_WHITESPACE; | |||
default: | |||
current += utf8_out(c, current); | |||
*current = '\0'; | |||
@@ -158,6 +207,7 @@ create_tokenizer(void) | |||
if (!tokenizer) return NULL; | |||
tokenizer->decoder = NULL; | |||
tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID; | |||
tokenizer->read = tokenizer_state_end_of_buffer; | |||
*tokenizer->token = '\0'; | |||
@@ -177,6 +227,7 @@ tokenizer_reset(espeak_ng_TOKENIZER *tokenizer, | |||
if (!tokenizer) return 0; | |||
tokenizer->decoder = decoder; | |||
tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID; | |||
tokenizer->read = decoder ? tokenizer_state_default : tokenizer_state_end_of_buffer; | |||
return 1; | |||
} |
@@ -39,6 +39,7 @@ typedef enum | |||
ESPEAKNG_TOKEN_END_OF_BUFFER, | |||
ESPEAKNG_TOKEN_UNKNOWN, | |||
ESPEAKNG_TOKEN_NEWLINE, | |||
ESPEAKNG_TOKEN_WHITESPACE, | |||
} espeak_ng_TOKEN_TYPE; | |||
espeak_ng_TOKEN_TYPE |
@@ -260,6 +260,66 @@ test_windows_newline_tokens() | |||
destroy_tokenizer(tokenizer); | |||
} | |||
void | |||
test_whitespace_tokens() | |||
{ | |||
printf("testing whitespace tokens\n"); | |||
espeak_ng_TOKENIZER *tokenizer = create_tokenizer(); | |||
espeak_ng_TEXT_DECODER *decoder = create_text_decoder(); | |||
assert(text_decoder_decode_string(decoder, "\t\t\n \xE3\x80\x80 \n\xC2\xA0\xC2\xA0\n\xE2\x80\xA8\n\xE2\x80\xA9", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK); | |||
assert(tokenizer_reset(tokenizer, decoder) == 1); | |||
// General Category: Cc, Property: White_Space | |||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE); | |||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
assert(strcmp(tokenizer_get_token_text(tokenizer), "\t\t") == 0); | |||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE); | |||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0); | |||
// General Category: Zs, Property: White_Space | |||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE); | |||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
assert(strcmp(tokenizer_get_token_text(tokenizer), " \xE3\x80\x80 ") == 0); | |||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE); | |||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0); | |||
// General Category: Zs, Property: White_Space, Decomposition: <noBreak> | |||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE); | |||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xA0\xC2\xA0") == 0); | |||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE); | |||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0); | |||
// General Category: Zl -- LINE SEPARATOR | |||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE); | |||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA8") == 0); | |||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE); | |||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0); | |||
// General Category: Zp -- PARAGRAPH SEPARATOR | |||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE); | |||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA9") == 0); | |||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER); | |||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
assert(*tokenizer_get_token_text(tokenizer) == '\0'); | |||
destroy_text_decoder(decoder); | |||
destroy_tokenizer(tokenizer); | |||
} | |||
int | |||
main(int argc, char **argv) | |||
{ | |||
@@ -276,9 +336,11 @@ main(int argc, char **argv) | |||
test_fullwidth(); | |||
test_unbound_tokenizer(); | |||
test_linux_newline_tokens(); | |||
test_mac_newline_tokens(); | |||
test_windows_newline_tokens(); | |||
test_whitespace_tokens(); | |||
printf("done\n"); | |||