
tokenizer.c: Support ellipsis tokens.

Commit 55bfbb4754 on branch master, by Reece H. Dunn, 8 years ago.

3 changed files with 62 additions and 2 deletions:

  1. src/libespeak-ng/tokenizer.c (+20, -1)
  2. src/libespeak-ng/tokenizer.h (+1, -0)
  3. tests/tokenizer.c (+41, -1)
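
In short, the commit adds an ESPEAKNG_CTYPE_ELLIPSIS character class and a matching ESPEAKNG_TOKEN_ELLIPSIS token type, widens ESPEAKNG_CTYPE_PROPERTY_MASK by one bit to cover the new ESPEAKNG_PROPERTY_ELLIPSIS property (which U+2026 carries, judging by the test), and teaches the full-stop path to recognise the ASCII spelling "...". A minimal driver sketch, using only the internal APIs that tests/tokenizer.c exercises (the include paths are assumptions):

    /* Sketch: feeding "..." and U+2026 through the tokenizer. All calls below
     * appear in tests/tokenizer.c; the include paths and the availability of
     * these internal APIs outside the test suite are assumptions. */
    #include <assert.h>
    #include <string.h>

    #include <espeak-ng/espeak_ng.h> /* public header, for ENS_OK */
    #include "encoding.h"            /* assumed: text decoder API */
    #include "tokenizer.h"           /* assumed: tokenizer API */

    int main(void)
    {
        espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
        espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

        /* "..." (ASCII) and U+2026 should both produce an ellipsis token. */
        assert(text_decoder_decode_string(decoder, "... \xE2\x80\xA6", -1,
                                          ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
        assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

        assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_ELLIPSIS);
        assert(strcmp(tokenizer_get_token_text(tokenizer), "...") == 0);

        assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);

        assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_ELLIPSIS);
        assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA6") == 0);

        return 0;
    }

Note that ".." deliberately stays two ESPEAKNG_TOKEN_FULL_STOP tokens, and "...." reads as an ellipsis followed by a full stop, as the updated test below asserts.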

src/libespeak-ng/tokenizer.c (+20, -1)

@@ ... @@
     ESPEAKNG_CTYPE_COMMA,
     ESPEAKNG_CTYPE_COLON,
     ESPEAKNG_CTYPE_SEMICOLON,
+    ESPEAKNG_CTYPE_ELLIPSIS,
 } espeakng_CTYPE;

-#define ESPEAKNG_CTYPE_PROPERTY_MASK 0xFC0000000000C001ull
+#define ESPEAKNG_CTYPE_PROPERTY_MASK 0xFE0000000000C001ull

 // Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm
 static espeakng_CTYPE codepoint_type(uint32_t c)
@@ ... @@
         return ESPEAKNG_CTYPE_COLON;
     case ESPEAKNG_PROPERTY_SEMI_COLON:
         return ESPEAKNG_CTYPE_SEMICOLON;
+    case ESPEAKNG_PROPERTY_ELLIPSIS:
+        return ESPEAKNG_CTYPE_ELLIPSIS;
     }

     // 4. Classify the remaining codepoints.
@@ ... @@
         return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_UPPERCASE);
     case ESPEAKNG_CTYPE_FULL_STOP:
         current += utf8_out(c, current);
+        if (c == '.' && text_decoder_peekc(tokenizer->decoder) == '.') {
+            c = text_decoder_getc(tokenizer->decoder);
+            if (text_decoder_peekc(tokenizer->decoder) == '.') {
+                c = text_decoder_getc(tokenizer->decoder);
+                current += utf8_out('.', current);
+                current += utf8_out('.', current);
+                *current = '\0';
+                return ESPEAKNG_TOKEN_ELLIPSIS;
+            } else {
+                tokenizer->keepc = c;
+            }
+        }
         *current = '\0';
         return ESPEAKNG_TOKEN_FULL_STOP;
     case ESPEAKNG_CTYPE_QUESTION_MARK:
@@ ... @@
         current += utf8_out(c, current);
         *current = '\0';
         return ESPEAKNG_TOKEN_SEMICOLON;
+    case ESPEAKNG_CTYPE_ELLIPSIS:
+        current += utf8_out(c, current);
+        *current = '\0';
+        return ESPEAKNG_TOKEN_ELLIPSIS;
     default:
         current += utf8_out(c, current);
         *current = '\0';
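
The interesting part is the FULL_STOP branch: after emitting the first '.', it peeks at the decoder; a second dot is consumed tentatively, and only a third promotes the token to ESPEAKNG_TOKEN_ELLIPSIS, otherwise the consumed dot is parked in tokenizer->keepc so the next read sees it again. A self-contained toy version of that peek/consume/push-back pattern, with invented names (demo_reader, demo_getc, demo_peekc) rather than espeak-ng's decoder API:

    #include <stdio.h>

    /* Toy reader with one-character push-back, mirroring the control flow above. */
    typedef struct {
        const char *text;
        size_t pos;
        int keepc; /* one pushed-back character, 0 if none */
    } demo_reader;

    static int demo_getc(demo_reader *r)
    {
        if (r->keepc) { int c = r->keepc; r->keepc = 0; return c; }
        return r->text[r->pos] ? (unsigned char)r->text[r->pos++] : EOF;
    }

    static int demo_peekc(demo_reader *r)
    {
        if (r->keepc) return r->keepc;
        return r->text[r->pos] ? (unsigned char)r->text[r->pos] : EOF;
    }

    /* "..." => ellipsis; "." => full stop; ".." => full stop now, and the
     * second dot is pushed back so the next call reports another full stop. */
    static const char *demo_read_dot(demo_reader *r)
    {
        int c = demo_getc(r); /* the initial '.' */
        if (c == '.' && demo_peekc(r) == '.') {
            c = demo_getc(r); /* tentatively consume the second dot */
            if (demo_peekc(r) == '.') {
                demo_getc(r); /* third dot: this is an ellipsis */
                return "ellipsis";
            }
            r->keepc = c; /* only two dots: give the second one back */
        }
        return "full stop";
    }

    int main(void)
    {
        demo_reader dots = { "..", 0, 0 };
        printf("%s\n", demo_read_dot(&dots)); /* full stop */
        printf("%s\n", demo_read_dot(&dots)); /* full stop (pushed-back dot) */

        demo_reader ellipsis = { "...", 0, 0 };
        printf("%s\n", demo_read_dot(&ellipsis)); /* ellipsis */
        return 0;
    }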

src/libespeak-ng/tokenizer.h (+1, -0)

@@ ... @@
     ESPEAKNG_TOKEN_COMMA,
     ESPEAKNG_TOKEN_COLON,
     ESPEAKNG_TOKEN_SEMICOLON,
+    ESPEAKNG_TOKEN_ELLIPSIS,
 } espeak_ng_TOKEN_TYPE;

 espeak_ng_TOKEN_TYPE
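
Any downstream code that switches over espeak_ng_TOKEN_TYPE now needs to decide what the new token means. A hypothetical consumer (the function and its pause values are invented for illustration, not part of this commit):

    #include "tokenizer.h" /* assumed include path */

    /* Hypothetical: map token types to pause lengths in milliseconds. */
    static int pause_ms_for(espeak_ng_TOKEN_TYPE type)
    {
        switch (type) {
        case ESPEAKNG_TOKEN_COMMA:     return 120; /* short clause break */
        case ESPEAKNG_TOKEN_SEMICOLON: return 200;
        case ESPEAKNG_TOKEN_ELLIPSIS:  return 350; /* trailing-off pause */
        default:                       return 0;
        }
    }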

tests/tokenizer.c (+41, -1)

@@ ... @@
     espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
     espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

-    assert(text_decoder_decode_string(decoder, ". ? ! , : ;", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
+    assert(text_decoder_decode_string(decoder, ". ? .. ! ... , .... : ; \xE2\x80\xA6", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
     assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

     assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
@@ ... @@
     assert(tokenizer_get_token_text(tokenizer) != NULL);
     assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);
+
+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);
+
+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
+
     assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_EXCLAMATION_MARK);
     assert(tokenizer_get_token_text(tokenizer) != NULL);
     assert(strcmp(tokenizer_get_token_text(tokenizer), "!") == 0);
@@ ... @@
     assert(tokenizer_get_token_text(tokenizer) != NULL);
     assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_ELLIPSIS);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), "...") == 0);
+
+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
+
     assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_COMMA);
     assert(tokenizer_get_token_text(tokenizer) != NULL);
     assert(strcmp(tokenizer_get_token_text(tokenizer), ",") == 0);
@@ ... @@
     assert(tokenizer_get_token_text(tokenizer) != NULL);
     assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_ELLIPSIS);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), "...") == 0);
+
+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), ".") == 0);
+
+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
+
     assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_COLON);
     assert(tokenizer_get_token_text(tokenizer) != NULL);
     assert(strcmp(tokenizer_get_token_text(tokenizer), ":") == 0);
@@ ... @@
     assert(tokenizer_get_token_text(tokenizer) != NULL);
     assert(strcmp(tokenizer_get_token_text(tokenizer), ";") == 0);

+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);
+
+    assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_ELLIPSIS);
+    assert(tokenizer_get_token_text(tokenizer) != NULL);
+    assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA6") == 0);
+
     assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
     assert(tokenizer_get_token_text(tokenizer) != NULL);
     assert(*tokenizer_get_token_text(tokenizer) == '\0');
