Browse Source

tokenizer.c: Support whitespace tokens.

master
Reece H. Dunn 8 years ago
parent
commit
df6ca7a22c
3 changed files with 116 additions and 2 deletions
  1. 53
    2
      src/libespeak-ng/tokenizer.c
  2. 1
    0
      src/libespeak-ng/tokenizer.h
  3. 62
    0
      tests/tokenizer.c

+ 53
- 2
src/libespeak-ng/tokenizer.c View File

@@ -88,23 +88,53 @@ typedef enum {
ESPEAKNG_CTYPE_CARRIAGE_RETURN,
ESPEAKNG_CTYPE_NEWLINE,
ESPEAKNG_CTYPE_END_OF_STRING,
ESPEAKNG_CTYPE_WHITESPACE,
} espeakng_CTYPE;

#define ESPEAKNG_CTYPE_PROPERTY_MASK 0x0000000000000001ull

// Map a Unicode codepoint onto the character class used by the tokenizer.
//
// The checks run from most to least specific: codepoints that have their own
// class, then the separator General Categories, then the White_Space
// property. Anything that matches none of these is ESPEAKNG_CTYPE_OTHER.
static espeakng_CTYPE codepoint_type(uint32_t c)
{
	// Codepoints with a dedicated class.
	if (c == '\r') return ESPEAKNG_CTYPE_CARRIAGE_RETURN;
	if (c == '\n') return ESPEAKNG_CTYPE_NEWLINE;
	if (c == '\0') return ESPEAKNG_CTYPE_END_OF_STRING;

	// Line (Zl), paragraph (Zp) and space (Zs) separators are whitespace.
	ucd_category category = ucd_lookup_category(c);
	if (category == UCD_CATEGORY_Zl ||
	    category == UCD_CATEGORY_Zp ||
	    category == UCD_CATEGORY_Zs) {
		return ESPEAKNG_CTYPE_WHITESPACE;
	}

	// Codepoints outside the separator categories can still carry the
	// White_Space property; only the masked property bits are compared.
	ucd_property properties = ucd_properties(c, category);
	if ((properties & ESPEAKNG_CTYPE_PROPERTY_MASK) == UCD_PROPERTY_WHITE_SPACE) {
		return ESPEAKNG_CTYPE_WHITESPACE;
	}

	// Everything else.
	return ESPEAKNG_CTYPE_OTHER;
}

#define ESPEAKNG_CODEPOINT_INVALID 0xFFFFFFFF

// State carried by a tokenizer between reads.
struct espeak_ng_TOKENIZER_
{
// Source of codepoints (read with text_decoder_getc/text_decoder_peekc).
// NULL after create_tokenizer until tokenizer_reset binds a decoder.
espeak_ng_TEXT_DECODER *decoder;
// UTF-8 text of the token being built, written with utf8_out and
// NUL-terminated before the token type is returned.
char token[256];
// Codepoint left over from scanning a whitespace token, consumed first by
// the next read; ESPEAKNG_CODEPOINT_INVALID when nothing is held back.
uint32_t keepc;

// Current state handler: tokenizer_state_default while a decoder is bound,
// tokenizer_state_end_of_buffer otherwise or once the end of the string is hit.
espeak_ng_TOKEN_TYPE (*read)(espeak_ng_TOKENIZER *tokenizer);
};
@@ -124,10 +154,18 @@ tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer)
return tokenizer_state_end_of_buffer(tokenizer);
}

uint32_t c;
char *current = tokenizer->token;
char *end = tokenizer->token + sizeof(tokenizer->token) - 5; // allow for UTF-8 trailing bytes

switch (codepoint_type(c = text_decoder_getc(tokenizer->decoder)))
uint32_t c;
if (tokenizer->keepc != ESPEAKNG_CODEPOINT_INVALID) {
c = tokenizer->keepc;
tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
} else {
c = text_decoder_getc(tokenizer->decoder);
}

switch (codepoint_type(c))
{
case ESPEAKNG_CTYPE_CARRIAGE_RETURN: // '\r'
if (text_decoder_peekc(tokenizer->decoder) == '\n') {
@@ -142,6 +180,17 @@ tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer)
case ESPEAKNG_CTYPE_END_OF_STRING: // '\0'
tokenizer->read = tokenizer_state_end_of_buffer;
return tokenizer_state_end_of_buffer(tokenizer);
case ESPEAKNG_CTYPE_WHITESPACE:
current += utf8_out(c, current);
while (!text_decoder_eof(tokenizer->decoder) &&
current < end &&
codepoint_type(c = text_decoder_getc(tokenizer->decoder)) == ESPEAKNG_CTYPE_WHITESPACE)
{
current += utf8_out(c, current);
}
tokenizer->keepc = c;
*current = '\0';
return ESPEAKNG_TOKEN_WHITESPACE;
default:
current += utf8_out(c, current);
*current = '\0';
@@ -158,6 +207,7 @@ create_tokenizer(void)
if (!tokenizer) return NULL;

tokenizer->decoder = NULL;
tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
tokenizer->read = tokenizer_state_end_of_buffer;

*tokenizer->token = '\0';
@@ -177,6 +227,7 @@ tokenizer_reset(espeak_ng_TOKENIZER *tokenizer,
if (!tokenizer) return 0;

tokenizer->decoder = decoder;
tokenizer->keepc = ESPEAKNG_CODEPOINT_INVALID;
tokenizer->read = decoder ? tokenizer_state_default : tokenizer_state_end_of_buffer;
return 1;
}

+ 1
- 0
src/libespeak-ng/tokenizer.h View File

@@ -39,6 +39,7 @@ typedef enum
ESPEAKNG_TOKEN_END_OF_BUFFER,
ESPEAKNG_TOKEN_UNKNOWN,
ESPEAKNG_TOKEN_NEWLINE,
ESPEAKNG_TOKEN_WHITESPACE,
} espeak_ng_TOKEN_TYPE;

espeak_ng_TOKEN_TYPE

+ 62
- 0
tests/tokenizer.c View File

@@ -260,6 +260,66 @@ test_windows_newline_tokens()
destroy_tokenizer(tokenizer);
}

void
test_whitespace_tokens()
{
printf("testing whitespace tokens\n");

espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

assert(text_decoder_decode_string(decoder, "\t\t\n \xE3\x80\x80 \n\xC2\xA0\xC2\xA0\n\xE2\x80\xA8\n\xE2\x80\xA9", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
assert(tokenizer_reset(tokenizer, decoder) == 1);

// General Category: Cc, Property: White_Space
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\t\t") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

// General Category: Zs, Property: White_Space
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), " \xE3\x80\x80 ") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

// General Category: Zs, Property: White_Space, Decomposition: <noBreak>
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xA0\xC2\xA0") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

// General Category: Zl -- LINE SEPARATOR
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA8") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

// General Category: Zp -- PARAGRAPH SEPARATOR
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA9") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0');

destroy_text_decoder(decoder);
destroy_tokenizer(tokenizer);
}

int
main(int argc, char **argv)
{
@@ -276,9 +336,11 @@ main(int argc, char **argv)
test_fullwidth();

test_unbound_tokenizer();

test_linux_newline_tokens();
test_mac_newline_tokens();
test_windows_newline_tokens();
test_whitespace_tokens();

printf("done\n");


Loading…
Cancel
Save