Browse Source

tokenizer.c: Support word tokens.

master
Reece H. Dunn 8 years ago
parent
commit
d50f3f2fa5
3 changed files with 143 additions and 5 deletions
  1. 66
    1
      src/libespeak-ng/tokenizer.c
  2. 4
    0
      src/libespeak-ng/tokenizer.h
  3. 73
    4
      tests/tokenizer.c

+ 66
- 1
src/libespeak-ng/tokenizer.c View File

ESPEAKNG_CTYPE_END_OF_STRING, ESPEAKNG_CTYPE_END_OF_STRING,
ESPEAKNG_CTYPE_PARAGRAPH, ESPEAKNG_CTYPE_PARAGRAPH,
ESPEAKNG_CTYPE_WHITESPACE, ESPEAKNG_CTYPE_WHITESPACE,
ESPEAKNG_CTYPE_LOWERCASE,
ESPEAKNG_CTYPE_UPPERCASE,
} espeakng_CTYPE; } espeakng_CTYPE;


#define ESPEAKNG_CTYPE_PROPERTY_MASK 0x0000000000000001ull
#define ESPEAKNG_CTYPE_PROPERTY_MASK 0x000000000000C001ull


// Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm // Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm
static espeakng_CTYPE codepoint_type(uint32_t c) static espeakng_CTYPE codepoint_type(uint32_t c)
ucd_category cat = ucd_lookup_category(c); ucd_category cat = ucd_lookup_category(c);
switch (cat) switch (cat)
{ {
case UCD_CATEGORY_Lu: return ESPEAKNG_CTYPE_UPPERCASE;
case UCD_CATEGORY_Ll: return ESPEAKNG_CTYPE_LOWERCASE;
case UCD_CATEGORY_Zl: return ESPEAKNG_CTYPE_NEWLINE; case UCD_CATEGORY_Zl: return ESPEAKNG_CTYPE_NEWLINE;
case UCD_CATEGORY_Zp: return ESPEAKNG_CTYPE_PARAGRAPH; case UCD_CATEGORY_Zp: return ESPEAKNG_CTYPE_PARAGRAPH;
case UCD_CATEGORY_Zs: return ESPEAKNG_CTYPE_WHITESPACE; case UCD_CATEGORY_Zs: return ESPEAKNG_CTYPE_WHITESPACE;
{ {
case UCD_PROPERTY_WHITE_SPACE: case UCD_PROPERTY_WHITE_SPACE:
return ESPEAKNG_CTYPE_WHITESPACE; return ESPEAKNG_CTYPE_WHITESPACE;
case UCD_PROPERTY_OTHER_LOWERCASE:
return ESPEAKNG_CTYPE_LOWERCASE;
case UCD_PROPERTY_OTHER_UPPERCASE:
return ESPEAKNG_CTYPE_UPPERCASE;
} }


// 4. Classify the remaining codepoints. // 4. Classify the remaining codepoints.
return ESPEAKNG_TOKEN_END_OF_BUFFER; return ESPEAKNG_TOKEN_END_OF_BUFFER;
} }


static espeak_ng_TOKEN_TYPE
tokenizer_read_word_token(espeak_ng_TOKENIZER *tokenizer, char *current, espeak_ng_TOKEN_TYPE type)
{
	// Continue reading a word whose first codepoint was already written to the
	// token buffer, refining `type` (UPPERCASE/LOWERCASE/CAPITALIZED/MIXEDCASE)
	// as further cased letters arrive. Returns the final word token type.
	//
	// Reserve space for the NUL terminator plus up to 4 UTF-8 trailing bytes.
	char *limit = tokenizer->token + sizeof(tokenizer->token) - 5;
	int consumed = 0; // non-zero once this function has accepted a codepoint

	while (current < limit && !text_decoder_eof(tokenizer->decoder)) {
		uint32_t c = text_decoder_getc(tokenizer->decoder);
		espeakng_CTYPE ctype = codepoint_type(c);

		if (ctype == ESPEAKNG_CTYPE_LOWERCASE) {
			current += utf8_out(c, current);
			// A lower-case letter after an all-upper-case prefix means the
			// word is "Xx..." (capitalized) if only the first letter was
			// upper case, otherwise it is mixed case.
			if (type == ESPEAKNG_TOKEN_WORD_UPPERCASE) {
				type = consumed ? ESPEAKNG_TOKEN_WORD_MIXEDCASE
				                : ESPEAKNG_TOKEN_WORD_CAPITALIZED;
			}
			consumed = 1;
		} else if (ctype == ESPEAKNG_CTYPE_UPPERCASE) {
			current += utf8_out(c, current);
			// An upper-case letter after lower-case content demotes the
			// word to mixed case; an all-upper run stays UPPERCASE.
			if (type == ESPEAKNG_TOKEN_WORD_LOWERCASE ||
			    type == ESPEAKNG_TOKEN_WORD_CAPITALIZED) {
				type = ESPEAKNG_TOKEN_WORD_MIXEDCASE;
			}
			consumed = 1;
		} else {
			// Not a word character: push it back for the next token and stop.
			tokenizer->keepc = c;
			break;
		}
	}

	*current = '\0';
	return type;
}

static espeak_ng_TOKEN_TYPE static espeak_ng_TOKEN_TYPE
tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer) tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer)
{ {
tokenizer->keepc = c; tokenizer->keepc = c;
*current = '\0'; *current = '\0';
return ESPEAKNG_TOKEN_WHITESPACE; return ESPEAKNG_TOKEN_WHITESPACE;
case ESPEAKNG_CTYPE_LOWERCASE:
current += utf8_out(c, current);
return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_LOWERCASE);
case ESPEAKNG_CTYPE_UPPERCASE:
current += utf8_out(c, current);
return tokenizer_read_word_token(tokenizer, current, ESPEAKNG_TOKEN_WORD_UPPERCASE);
default: default:
current += utf8_out(c, current); current += utf8_out(c, current);
*current = '\0'; *current = '\0';

+ 4
- 0
src/libespeak-ng/tokenizer.h View File

ESPEAKNG_TOKEN_NEWLINE, ESPEAKNG_TOKEN_NEWLINE,
ESPEAKNG_TOKEN_PARAGRAPH, ESPEAKNG_TOKEN_PARAGRAPH,
ESPEAKNG_TOKEN_WHITESPACE, ESPEAKNG_TOKEN_WHITESPACE,
ESPEAKNG_TOKEN_WORD_UPPERCASE,
ESPEAKNG_TOKEN_WORD_LOWERCASE,
ESPEAKNG_TOKEN_WORD_MIXEDCASE,
ESPEAKNG_TOKEN_WORD_CAPITALIZED,
} espeak_ng_TOKEN_TYPE; } espeak_ng_TOKEN_TYPE;


espeak_ng_TOKEN_TYPE espeak_ng_TOKEN_TYPE

+ 73
- 4
tests/tokenizer.c View File

destroy_tokenizer(tokenizer); destroy_tokenizer(tokenizer);
} }


void
test_Latn_word_tokens()
{
	// Verify word-token classification for Latin-script text: the tokenizer
	// should distinguish capitalized ("One"), lower-case ("one"), upper-case
	// ("ONE"), and mixed-case ("OneTwo", "ONETwo") words, with single-space
	// whitespace tokens in between and END_OF_BUFFER at the end.
	printf("testing Latin (Latn) script word tokens\n");

	espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
	espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

	assert(text_decoder_decode_string(decoder, "One one ONE OneTwo ONETwo", -1, ESPEAKNG_ENCODING_US_ASCII) == ENS_OK);
	assert(tokenizer_reset(tokenizer, decoder, ESPEAKNG_TOKENIZER_OPTION_TEXT) == 1);

	// "One" -- upper-case first letter, lower-case rest.
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_CAPITALIZED);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "One") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	// "one" -- all lower case.
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_LOWERCASE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "one") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	// "ONE" -- all upper case.
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_UPPERCASE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "ONE") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	// "OneTwo" -- upper case appearing after lower case => mixed case.
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_MIXEDCASE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "OneTwo") == 0);

	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), " ") == 0);

	// "ONETwo" -- lower case after a multi-letter upper-case run => mixed case
	// (not capitalized, since more than the first letter was upper case).
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WORD_MIXEDCASE);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(strcmp(tokenizer_get_token_text(tokenizer), "ONETwo") == 0);

	// Input exhausted: END_OF_BUFFER with an empty token text.
	assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tokenizer) != NULL);
	assert(*tokenizer_get_token_text(tokenizer) == '\0');

	destroy_text_decoder(decoder);
	destroy_tokenizer(tokenizer);
}

void void
run_tests() run_tests()
{ {
test_paragraph_tokens(); test_paragraph_tokens();
test_whitespace_tokens(); test_whitespace_tokens();


test_Latn_word_tokens();

printf("done\n"); printf("done\n");
} }


destroy_tokenizer(tokenizer); destroy_tokenizer(tokenizer);
return; return;
case ESPEAKNG_TOKEN_UNKNOWN: case ESPEAKNG_TOKEN_UNKNOWN:
printf("unknown : %s\n", tokenizer_get_token_text(tokenizer));
printf("unknown : %s\n", tokenizer_get_token_text(tokenizer));
break; break;
case ESPEAKNG_TOKEN_NEWLINE: case ESPEAKNG_TOKEN_NEWLINE:
printf("newline : ");
printf("newline : ");
escape_newline(tokenizer_get_token_text(tokenizer)); escape_newline(tokenizer_get_token_text(tokenizer));
putc('\n', stdout); putc('\n', stdout);
break; break;
case ESPEAKNG_TOKEN_PARAGRAPH: case ESPEAKNG_TOKEN_PARAGRAPH:
printf("paragraph : %s\n", tokenizer_get_token_text(tokenizer));
printf("paragraph : %s\n", tokenizer_get_token_text(tokenizer));
break; break;
case ESPEAKNG_TOKEN_WHITESPACE: case ESPEAKNG_TOKEN_WHITESPACE:
printf("whitespace : %s\n", tokenizer_get_token_text(tokenizer));
printf("whitespace : %s\n", tokenizer_get_token_text(tokenizer));
break;
case ESPEAKNG_TOKEN_WORD_UPPERCASE:
printf("word (upper case) : %s\n", tokenizer_get_token_text(tokenizer));
break;
case ESPEAKNG_TOKEN_WORD_LOWERCASE:
printf("word (lower case) : %s\n", tokenizer_get_token_text(tokenizer));
break;
case ESPEAKNG_TOKEN_WORD_MIXEDCASE:
printf("word (mixed case) : %s\n", tokenizer_get_token_text(tokenizer));
break;
case ESPEAKNG_TOKEN_WORD_CAPITALIZED:
printf("word (capitalized) : %s\n", tokenizer_get_token_text(tokenizer));
break; break;
} }
} }

Loading…
Cancel
Save