Browse Source

tokenizer.c: Support semicolon tokens.

master
Reece H. Dunn 8 years ago
parent
commit
b847df63b5
3 changed files with 21 additions and 2 deletions
  1. +8 -1 src/libespeak-ng/tokenizer.c
  2. +1 -0 src/libespeak-ng/tokenizer.h
  3. +12 -1 tests/tokenizer.c

+ 8
- 1
src/libespeak-ng/tokenizer.c View File

@@ -97,9 +97,10 @@ typedef enum {
ESPEAKNG_CTYPE_EXCLAMATION_MARK,
ESPEAKNG_CTYPE_COMMA,
ESPEAKNG_CTYPE_COLON,
ESPEAKNG_CTYPE_SEMICOLON,
} espeakng_CTYPE;

-#define ESPEAKNG_CTYPE_PROPERTY_MASK 0xF80000000000C001ull
+#define ESPEAKNG_CTYPE_PROPERTY_MASK 0xFC0000000000C001ull

// Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm
static espeakng_CTYPE codepoint_type(uint32_t c)
@@ -148,6 +149,8 @@ static espeakng_CTYPE codepoint_type(uint32_t c)
return ESPEAKNG_CTYPE_COMMA;
case ESPEAKNG_PROPERTY_COLON:
return ESPEAKNG_CTYPE_COLON;
case ESPEAKNG_PROPERTY_SEMI_COLON:
return ESPEAKNG_CTYPE_SEMICOLON;
}

// 4. Classify the remaining codepoints.
@@ -299,6 +302,10 @@ tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer)
current += utf8_out(c, current);
*current = '\0';
return ESPEAKNG_TOKEN_COLON;
case ESPEAKNG_CTYPE_SEMICOLON:
current += utf8_out(c, current);
*current = '\0';
return ESPEAKNG_TOKEN_SEMICOLON;
default:
current += utf8_out(c, current);
*current = '\0';

+ 1
- 0
src/libespeak-ng/tokenizer.h View File

@@ -56,6 +56,7 @@ typedef enum
ESPEAKNG_TOKEN_EXCLAMATION_MARK,
ESPEAKNG_TOKEN_COMMA,
ESPEAKNG_TOKEN_COLON,
ESPEAKNG_TOKEN_SEMICOLON,
} espeak_ng_TOKEN_TYPE;

espeak_ng_TOKEN_TYPE

+ 12
- 1
tests/tokenizer.c View File

@@ -465,7 +465,7 @@ test_Latn_punctuation_tokens()
espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

-assert(text_decoder_decode_string(decoder, ". ? ! , :", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
+assert(text_decoder_decode_string(decoder, ". ? ! , : ;", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_FULL_STOP);
@@ -504,6 +504,14 @@ test_Latn_punctuation_tokens()
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), ":") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_SEMICOLON);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), ";") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0');
@@ -608,6 +616,9 @@ print_tokens(espeak_ng_TEXT_DECODER *decoder)
case ESPEAKNG_TOKEN_COLON:
printf("colon : %s\n", tokenizer_get_token_text(tokenizer));
break;
case ESPEAKNG_TOKEN_SEMICOLON:
printf("semicolon : %s\n", tokenizer_get_token_text(tokenizer));
break;
}
}


Loading…
Cancel
Save