Browse Source

tokenizer.c: Support whitespace tokens.

master
Reece H. Dunn 8 years ago
parent
commit
df6ca7a22c
3 changed files with 116 additions and 2 deletions
  1. 53
    2
      src/libespeak-ng/tokenizer.c
  2. 1
    0
      src/libespeak-ng/tokenizer.h
  3. 62
    0
      tests/tokenizer.c

+ 53
- 2
src/libespeak-ng/tokenizer.c View File

ESPEAKNG_CTYPE_CARRIAGE_RETURN, ESPEAKNG_CTYPE_CARRIAGE_RETURN,
ESPEAKNG_CTYPE_NEWLINE, ESPEAKNG_CTYPE_NEWLINE,
ESPEAKNG_CTYPE_END_OF_STRING, ESPEAKNG_CTYPE_END_OF_STRING,
ESPEAKNG_CTYPE_WHITESPACE,
} espeakng_CTYPE; } espeakng_CTYPE;


#define ESPEAKNG_CTYPE_PROPERTY_MASK 0x0000000000000001ull

static espeakng_CTYPE codepoint_type(uint32_t c) static espeakng_CTYPE codepoint_type(uint32_t c)
{ {
// 1. Detect and classify specific codepoints.

switch (c) switch (c)
{ {
case '\r': return ESPEAKNG_CTYPE_CARRIAGE_RETURN; case '\r': return ESPEAKNG_CTYPE_CARRIAGE_RETURN;
case '\n': return ESPEAKNG_CTYPE_NEWLINE; case '\n': return ESPEAKNG_CTYPE_NEWLINE;
case '\0': return ESPEAKNG_CTYPE_END_OF_STRING; case '\0': return ESPEAKNG_CTYPE_END_OF_STRING;
} }

// 2. Classify codepoints by their Unicode General Category.

ucd_category cat = ucd_lookup_category(c);
switch (cat)
{
case UCD_CATEGORY_Zl: return ESPEAKNG_CTYPE_WHITESPACE;
case UCD_CATEGORY_Zp: return ESPEAKNG_CTYPE_WHITESPACE;
case UCD_CATEGORY_Zs: return ESPEAKNG_CTYPE_WHITESPACE;
}

// 3. Classify codepoints by their Unicode properties.

ucd_property props = ucd_properties(c, cat);
switch (props & ESPEAKNG_CTYPE_PROPERTY_MASK)
{
case UCD_PROPERTY_WHITE_SPACE:
return ESPEAKNG_CTYPE_WHITESPACE;
}

// 4. Classify the remaining codepoints.

return ESPEAKNG_CTYPE_OTHER; return ESPEAKNG_CTYPE_OTHER;
} }


#define ESPEAKNG_CODEPOINT_INVALID 0xFFFFFFFF

struct espeak_ng_TOKENIZER_ struct espeak_ng_TOKENIZER_
{ {
espeak_ng_TEXT_DECODER *decoder; espeak_ng_TEXT_DECODER *decoder;
char token[256]; char token[256];
uint32_t keepc;


espeak_ng_TOKEN_TYPE (*read)(espeak_ng_TOKENIZER *tokenizer); espeak_ng_TOKEN_TYPE (*read)(espeak_ng_TOKENIZER *tokenizer);
}; };
return tokenizer_state_end_of_buffer(tokenizer); return tokenizer_state_end_of_buffer(tokenizer);
} }


uint32_t c;
char *current = tokenizer->token; char *current = tokenizer->token;
char *end = tokenizer->token + sizeof(tokenizer->token) - 5; // allow for UTF-8 trailing bytes


switch (codepoint_type(c = text_decoder_getc(tokenizer->decoder)))
uint32_t c;
if (tokenizer->keepc != ESPEAKNG_CODEPOINT_INVALID) {
c = tokenizer->keepc;
tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
} else {
c = text_decoder_getc(tokenizer->decoder);
}

switch (codepoint_type(c))
{ {
case ESPEAKNG_CTYPE_CARRIAGE_RETURN: // '\r' case ESPEAKNG_CTYPE_CARRIAGE_RETURN: // '\r'
if (text_decoder_peekc(tokenizer->decoder) == '\n') { if (text_decoder_peekc(tokenizer->decoder) == '\n') {
case ESPEAKNG_CTYPE_END_OF_STRING: // '\0' case ESPEAKNG_CTYPE_END_OF_STRING: // '\0'
tokenizer->read = tokenizer_state_end_of_buffer; tokenizer->read = tokenizer_state_end_of_buffer;
return tokenizer_state_end_of_buffer(tokenizer); return tokenizer_state_end_of_buffer(tokenizer);
case ESPEAKNG_CTYPE_WHITESPACE:
current += utf8_out(c, current);
while (!text_decoder_eof(tokenizer->decoder) &&
current < end &&
codepoint_type(c = text_decoder_getc(tokenizer->decoder)) == ESPEAKNG_CTYPE_WHITESPACE)
{
current += utf8_out(c, current);
}
tokenizer->keepc = c;
*current = '\0';
return ESPEAKNG_TOKEN_WHITESPACE;
default: default:
current += utf8_out(c, current); current += utf8_out(c, current);
*current = '\0'; *current = '\0';
if (!tokenizer) return NULL; if (!tokenizer) return NULL;


tokenizer->decoder = NULL; tokenizer->decoder = NULL;
tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
tokenizer->read = tokenizer_state_end_of_buffer; tokenizer->read = tokenizer_state_end_of_buffer;


*tokenizer->token = '\0'; *tokenizer->token = '\0';
if (!tokenizer) return 0; if (!tokenizer) return 0;


tokenizer->decoder = decoder; tokenizer->decoder = decoder;
tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
tokenizer->read = decoder ? tokenizer_state_default : tokenizer_state_end_of_buffer; tokenizer->read = decoder ? tokenizer_state_default : tokenizer_state_end_of_buffer;
return 1; return 1;
} }

+ 1
- 0
src/libespeak-ng/tokenizer.h View File

ESPEAKNG_TOKEN_END_OF_BUFFER, ESPEAKNG_TOKEN_END_OF_BUFFER,
ESPEAKNG_TOKEN_UNKNOWN, ESPEAKNG_TOKEN_UNKNOWN,
ESPEAKNG_TOKEN_NEWLINE, ESPEAKNG_TOKEN_NEWLINE,
ESPEAKNG_TOKEN_WHITESPACE,
} espeak_ng_TOKEN_TYPE; } espeak_ng_TOKEN_TYPE;


espeak_ng_TOKEN_TYPE espeak_ng_TOKEN_TYPE

+ 62
- 0
tests/tokenizer.c View File

destroy_tokenizer(tokenizer); destroy_tokenizer(tokenizer);
} }


void
test_whitespace_tokens()
{
printf("testing whitespace tokens\n");

espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

assert(text_decoder_decode_string(decoder, "\t\t\n \xE3\x80\x80 \n\xC2\xA0\xC2\xA0\n\xE2\x80\xA8\n\xE2\x80\xA9", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
assert(tokenizer_reset(tokenizer, decoder) == 1);

// General Category: Cc, Property: White_Space
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\t\t") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

// General Category: Zs, Property: White_Space
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " \xE3\x80\x80 ") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

// General Category: Zs, Property: White_Space, Decomposition: <noBreak>
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xA0\xC2\xA0") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

// General Category: Zl -- LINE SEPARATOR
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA8") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

// General Category: Zp -- PARAGRAPH SEPARATOR
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA9") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0');

destroy_text_decoder(decoder);
destroy_tokenizer(tokenizer);
}

int int
main(int argc, char **argv) main(int argc, char **argv)
{ {
test_fullwidth(); test_fullwidth();


test_unbound_tokenizer(); test_unbound_tokenizer();

test_linux_newline_tokens(); test_linux_newline_tokens();
test_mac_newline_tokens(); test_mac_newline_tokens();
test_windows_newline_tokens(); test_windows_newline_tokens();
test_whitespace_tokens();


printf("done\n"); printf("done\n");



Loading…
Cancel
Save