@@ -92,9 +92,10 @@ typedef enum { | |||
ESPEAKNG_CTYPE_WHITESPACE, | |||
ESPEAKNG_CTYPE_LOWERCASE, | |||
ESPEAKNG_CTYPE_UPPERCASE, | |||
ESPEAKNG_CTYPE_FULL_STOP, | |||
} espeakng_CTYPE; | |||
// Mask of the codepoint property bits used when classifying a codepoint.
// Defined exactly once: C does not permit redefining an object-like macro
// with a different replacement list, so only the wider value (low bits 0xC001
// plus bit 63) is kept.
// NOTE(review): bit 63 is presumably the ESPEAKNG_PROPERTY_FULL_STOP bit that
// codepoint_type() switches on -- confirm against that property's definition.
#define ESPEAKNG_CTYPE_PROPERTY_MASK 0x800000000000C001ull
// Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm | |||
static espeakng_CTYPE codepoint_type(uint32_t c) | |||
@@ -133,6 +134,8 @@ static espeakng_CTYPE codepoint_type(uint32_t c) | |||
return ESPEAKNG_CTYPE_LOWERCASE; | |||
case UCD_PROPERTY_OTHER_UPPERCASE: | |||
return ESPEAKNG_CTYPE_UPPERCASE; | |||
case ESPEAKNG_PROPERTY_FULL_STOP: | |||
return ESPEAKNG_CTYPE_FULL_STOP; | |||
} | |||
// 4. Classify the remaining codepoints. | |||
@@ -264,6 +267,10 @@ tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer) | |||
case ESPEAKNG_CTYPE_UPPERCASE: | |||
current += utf8_out(c, current); | |||
return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_UPPERCASE); | |||
case ESPEAKNG_CTYPE_FULL_STOP: | |||
current += utf8_out(c, current); | |||
*current = '\0'; | |||
return ESPEAKNG_TOKEN_FULL_STOP; | |||
default: | |||
current += utf8_out(c, current); | |||
*current = '\0'; |
@@ -51,6 +51,7 @@ typedef enum | |||
ESPEAKNG_TOKEN_WORD_LOWERCASE, | |||
ESPEAKNG_TOKEN_WORD_MIXEDCASE, | |||
ESPEAKNG_TOKEN_WORD_CAPITALIZED, | |||
ESPEAKNG_TOKEN_FULL_STOP, | |||
} espeak_ng_TOKEN_TYPE; | |||
espeak_ng_TOKEN_TYPE |
@@ -457,6 +457,29 @@ test_Latn_word_tokens() | |||
destroy_tokenizer(tokenizer); | |||
} | |||
void | |||
test_Latn_punctuation_tokens() | |||
{ | |||
printf("testing Latin (Latn) script punctuation tokens\n"); | |||
espeak_ng_TOKENIZER *tokenizer = create_tokenizer(); | |||
espeak_ng_TEXT_DECODER *decoder = create_text_decoder(); | |||
assert(text_decoder_decode_string(decoder, ".", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK); | |||
assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1); | |||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP); | |||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0); | |||
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER); | |||
assert(tokenizer_get_token_text(tokenizer) != NULL); | |||
assert(*tokenizer_get_token_text(tokenizer) == '\0'); | |||
destroy_text_decoder(decoder); | |||
destroy_tokenizer(tokenizer); | |||
} | |||
void | |||
run_tests() | |||
{ | |||
@@ -482,6 +505,7 @@ run_tests() | |||
test_whitespace_tokens(); | |||
test_Latn_word_tokens(); | |||
test_Latn_punctuation_tokens(); | |||
printf("done\n"); | |||
} | |||
@@ -537,6 +561,9 @@ print_tokens(espeak_ng_TEXT_DECODER *decoder) | |||
case ESPEAKNG_TOKEN_WORD_CAPITALIZED: | |||
printf("word (capitalized) : %s\n", tokenizer_get_token_text(tokenizer)); | |||
break; | |||
case ESPEAKNG_TOKEN_FULL_STOP: | |||
printf("full stop : %s\n", tokenizer_get_token_text(tokenizer)); | |||
break; | |||
} | |||
} | |||