|
|
|
|
|
|
|
|
ESPEAKNG_CTYPE_CARRIAGE_RETURN, |
|
|
ESPEAKNG_CTYPE_CARRIAGE_RETURN, |
|
|
ESPEAKNG_CTYPE_NEWLINE, |
|
|
ESPEAKNG_CTYPE_NEWLINE, |
|
|
ESPEAKNG_CTYPE_END_OF_STRING, |
|
|
ESPEAKNG_CTYPE_END_OF_STRING, |
|
|
|
|
|
ESPEAKNG_CTYPE_WHITESPACE, |
|
|
} espeakng_CTYPE; |
|
|
} espeakng_CTYPE; |
|
|
|
|
|
|
|
|
|
|
|
// Mask applied to the value returned by ucd_properties() before it is matched
// against the property cases in codepoint_type(); only the bits set here take
// part in that comparison.
#define ESPEAKNG_CTYPE_PROPERTY_MASK 0x0000000000000001ull
|
|
|
|
|
|
|
|
static espeakng_CTYPE codepoint_type(uint32_t c) |
|
|
static espeakng_CTYPE codepoint_type(uint32_t c) |
|
|
{ |
|
|
{ |
|
|
|
|
|
// 1. Detect and classify specific codepoints. |
|
|
|
|
|
|
|
|
switch (c) |
|
|
switch (c) |
|
|
{ |
|
|
{ |
|
|
case '\r': return ESPEAKNG_CTYPE_CARRIAGE_RETURN; |
|
|
case '\r': return ESPEAKNG_CTYPE_CARRIAGE_RETURN; |
|
|
case '\n': return ESPEAKNG_CTYPE_NEWLINE; |
|
|
case '\n': return ESPEAKNG_CTYPE_NEWLINE; |
|
|
case '\0': return ESPEAKNG_CTYPE_END_OF_STRING; |
|
|
case '\0': return ESPEAKNG_CTYPE_END_OF_STRING; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// 2. Classify codepoints by their Unicode General Category. |
|
|
|
|
|
|
|
|
|
|
|
ucd_category cat = ucd_lookup_category(c); |
|
|
|
|
|
switch (cat) |
|
|
|
|
|
{ |
|
|
|
|
|
case UCD_CATEGORY_Zl: return ESPEAKNG_CTYPE_WHITESPACE; |
|
|
|
|
|
case UCD_CATEGORY_Zp: return ESPEAKNG_CTYPE_WHITESPACE; |
|
|
|
|
|
case UCD_CATEGORY_Zs: return ESPEAKNG_CTYPE_WHITESPACE; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// 3. Classify codepoints by their Unicode properties. |
|
|
|
|
|
|
|
|
|
|
|
ucd_property props = ucd_properties(c, cat); |
|
|
|
|
|
switch (props & ESPEAKNG_CTYPE_PROPERTY_MASK) |
|
|
|
|
|
{ |
|
|
|
|
|
case UCD_PROPERTY_WHITE_SPACE: |
|
|
|
|
|
return ESPEAKNG_CTYPE_WHITESPACE; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// 4. Classify the remaining codepoints. |
|
|
|
|
|
|
|
|
return ESPEAKNG_CTYPE_OTHER; |
|
|
return ESPEAKNG_CTYPE_OTHER; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Sentinel stored in the tokenizer's keepc field when no look-ahead codepoint
// is pending; any other value is consumed by the next read instead of the
// decoder being asked for a new codepoint.
#define ESPEAKNG_CODEPOINT_INVALID 0xFFFFFFFF
|
|
|
|
|
|
|
|
struct espeak_ng_TOKENIZER_ |
|
|
struct espeak_ng_TOKENIZER_ |
|
|
{ |
|
|
{ |
|
|
espeak_ng_TEXT_DECODER *decoder; |
|
|
espeak_ng_TEXT_DECODER *decoder; |
|
|
char token[256]; |
|
|
char token[256]; |
|
|
|
|
|
uint32_t keepc; |
|
|
|
|
|
|
|
|
espeak_ng_TOKEN_TYPE (*read)(espeak_ng_TOKENIZER *tokenizer); |
|
|
espeak_ng_TOKEN_TYPE (*read)(espeak_ng_TOKENIZER *tokenizer); |
|
|
}; |
|
|
}; |
|
|
|
|
|
|
|
|
return tokenizer_state_end_of_buffer(tokenizer); |
|
|
return tokenizer_state_end_of_buffer(tokenizer); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
uint32_t c; |
|
|
|
|
|
char *current = tokenizer->token; |
|
|
char *current = tokenizer->token; |
|
|
|
|
|
char *end = tokenizer->token + sizeof(tokenizer->token) - 5; // allow for UTF-8 trailing bytes |
|
|
|
|
|
|
|
|
switch (codepoint_type(c = text_decoder_getc(tokenizer->decoder))) |
|
|
|
|
|
|
|
|
uint32_t c; |
|
|
|
|
|
if (tokenizer->keepc != ESPEAKNG_CODEPOINT_INVALID) { |
|
|
|
|
|
c = tokenizer->keepc; |
|
|
|
|
|
tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID; |
|
|
|
|
|
} else { |
|
|
|
|
|
c = text_decoder_getc(tokenizer->decoder); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
switch (codepoint_type(c)) |
|
|
{ |
|
|
{ |
|
|
case ESPEAKNG_CTYPE_CARRIAGE_RETURN: // '\r' |
|
|
case ESPEAKNG_CTYPE_CARRIAGE_RETURN: // '\r' |
|
|
if (text_decoder_peekc(tokenizer->decoder) == '\n') { |
|
|
if (text_decoder_peekc(tokenizer->decoder) == '\n') { |
|
|
|
|
|
|
|
|
case ESPEAKNG_CTYPE_END_OF_STRING: // '\0' |
|
|
case ESPEAKNG_CTYPE_END_OF_STRING: // '\0' |
|
|
tokenizer->read = tokenizer_state_end_of_buffer; |
|
|
tokenizer->read = tokenizer_state_end_of_buffer; |
|
|
return tokenizer_state_end_of_buffer(tokenizer); |
|
|
return tokenizer_state_end_of_buffer(tokenizer); |
|
|
|
|
|
case ESPEAKNG_CTYPE_WHITESPACE: |
|
|
|
|
|
current += utf8_out(c, current); |
|
|
|
|
|
while (!text_decoder_eof(tokenizer->decoder) && |
|
|
|
|
|
current < end && |
|
|
|
|
|
codepoint_type(c = text_decoder_getc(tokenizer->decoder)) == ESPEAKNG_CTYPE_WHITESPACE) |
|
|
|
|
|
{ |
|
|
|
|
|
current += utf8_out(c, current); |
|
|
|
|
|
} |
|
|
|
|
|
tokenizer->keepc = c; |
|
|
|
|
|
*current = '\0'; |
|
|
|
|
|
return ESPEAKNG_TOKEN_WHITESPACE; |
|
|
default: |
|
|
default: |
|
|
current += utf8_out(c, current); |
|
|
current += utf8_out(c, current); |
|
|
*current = '\0'; |
|
|
*current = '\0'; |
|
|
|
|
|
|
|
|
if (!tokenizer) return NULL; |
|
|
if (!tokenizer) return NULL; |
|
|
|
|
|
|
|
|
tokenizer->decoder = NULL; |
|
|
tokenizer->decoder = NULL; |
|
|
|
|
|
tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID; |
|
|
tokenizer->read = tokenizer_state_end_of_buffer; |
|
|
tokenizer->read = tokenizer_state_end_of_buffer; |
|
|
|
|
|
|
|
|
*tokenizer->token = '\0'; |
|
|
*tokenizer->token = '\0'; |
|
|
|
|
|
|
|
|
if (!tokenizer) return 0; |
|
|
if (!tokenizer) return 0; |
|
|
|
|
|
|
|
|
tokenizer->decoder = decoder; |
|
|
tokenizer->decoder = decoder; |
|
|
|
|
|
tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID; |
|
|
tokenizer->read = decoder ? tokenizer_state_default : tokenizer_state_end_of_buffer; |
|
|
tokenizer->read = decoder ? tokenizer_state_default : tokenizer_state_end_of_buffer; |
|
|
return 1; |
|
|
return 1; |
|
|
} |
|
|
} |