Browse Source

tokenizer.c: Tokenise Zp codepoints as paragraphs.

master
Reece H. Dunn 8 years ago
parent
commit
c41ac642fa
3 changed files with 40 additions and 14 deletions
  1. 9
    4
      src/libespeak-ng/tokenizer.c
  2. 1
    0
      src/libespeak-ng/tokenizer.h
  3. 30
    10
      tests/tokenizer.c

+ 9
- 4
src/libespeak-ng/tokenizer.c View File

ESPEAKNG_CTYPE_CARRIAGE_RETURN, ESPEAKNG_CTYPE_CARRIAGE_RETURN,
ESPEAKNG_CTYPE_NEWLINE, ESPEAKNG_CTYPE_NEWLINE,
ESPEAKNG_CTYPE_END_OF_STRING, ESPEAKNG_CTYPE_END_OF_STRING,
ESPEAKNG_CTYPE_PARAGRAPH,
ESPEAKNG_CTYPE_WHITESPACE, ESPEAKNG_CTYPE_WHITESPACE,
} espeakng_CTYPE; } espeakng_CTYPE;


switch (cat) switch (cat)
{ {
case UCD_CATEGORY_Zl: return ESPEAKNG_CTYPE_NEWLINE; case UCD_CATEGORY_Zl: return ESPEAKNG_CTYPE_NEWLINE;
case UCD_CATEGORY_Zp: return ESPEAKNG_CTYPE_WHITESPACE;
case UCD_CATEGORY_Zp: return ESPEAKNG_CTYPE_PARAGRAPH;
case UCD_CATEGORY_Zs: return ESPEAKNG_CTYPE_WHITESPACE; case UCD_CATEGORY_Zs: return ESPEAKNG_CTYPE_WHITESPACE;
} }




switch (codepoint_type(c)) switch (codepoint_type(c))
{ {
case ESPEAKNG_CTYPE_END_OF_STRING: // '\0'
tokenizer->read = tokenizer_state_end_of_buffer;
return tokenizer_state_end_of_buffer(tokenizer);
case ESPEAKNG_CTYPE_CARRIAGE_RETURN: // '\r' case ESPEAKNG_CTYPE_CARRIAGE_RETURN: // '\r'
if (text_decoder_peekc(tokenizer->decoder) == '\n') { if (text_decoder_peekc(tokenizer->decoder) == '\n') {
current += utf8_out(c, current); current += utf8_out(c, current);
current += utf8_out(c, current); current += utf8_out(c, current);
*current = '\0'; *current = '\0';
return ESPEAKNG_TOKEN_NEWLINE; return ESPEAKNG_TOKEN_NEWLINE;
case ESPEAKNG_CTYPE_END_OF_STRING: // '\0'
tokenizer->read = tokenizer_state_end_of_buffer;
return tokenizer_state_end_of_buffer(tokenizer);
case ESPEAKNG_CTYPE_PARAGRAPH:
current += utf8_out(c, current);
*current = '\0';
return ESPEAKNG_TOKEN_PARAGRAPH;
case ESPEAKNG_CTYPE_WHITESPACE: case ESPEAKNG_CTYPE_WHITESPACE:
current += utf8_out(c, current); current += utf8_out(c, current);
while (!text_decoder_eof(tokenizer->decoder) && while (!text_decoder_eof(tokenizer->decoder) &&

+ 1
- 0
src/libespeak-ng/tokenizer.h View File

ESPEAKNG_TOKEN_END_OF_BUFFER, ESPEAKNG_TOKEN_END_OF_BUFFER,
ESPEAKNG_TOKEN_UNKNOWN, ESPEAKNG_TOKEN_UNKNOWN,
ESPEAKNG_TOKEN_NEWLINE, ESPEAKNG_TOKEN_NEWLINE,
ESPEAKNG_TOKEN_PARAGRAPH,
ESPEAKNG_TOKEN_WHITESPACE, ESPEAKNG_TOKEN_WHITESPACE,
} espeak_ng_TOKEN_TYPE; } espeak_ng_TOKEN_TYPE;



+ 30
- 10
tests/tokenizer.c View File

destroy_tokenizer(tokenizer); destroy_tokenizer(tokenizer);
} }


void
test_paragraph_tokens()
{
printf("testing paragraph tokens\n");

espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

assert(text_decoder_decode_string(decoder, "\xE2\x80\xA9\xE2\x80\xA9", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
assert(tokenizer_reset(tokenizer, decoder) == 1);

// General Category: Zp -- PARAGRAPH SEPARATOR
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PARAGRAPH);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA9") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_PARAGRAPH);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA9") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0');

destroy_text_decoder(decoder);
destroy_tokenizer(tokenizer);
}

void void
test_whitespace_tokens() test_whitespace_tokens()
{ {
espeak_ng_TOKENIZER *tokenizer = create_tokenizer(); espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder(); espeak_ng_TEXT_DECODER *decoder = create_text_decoder();


assert(text_decoder_decode_string(decoder, "\t\t\n\x0B\x0B\n \xE3\x80\x80 \n\xC2\xA0\xC2\xA0\n\xE2\x80\xA9", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
assert(text_decoder_decode_string(decoder, "\t\t\n\x0B\x0B\n \xE3\x80\x80 \n\xC2\xA0\xC2\xA0", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
assert(tokenizer_reset(tokenizer, decoder) == 1); assert(tokenizer_reset(tokenizer, decoder) == 1);


// General Category: Cc, Property: White_Space // General Category: Cc, Property: White_Space
assert(tokenizer_get_token_text(tokenizer) != NULL); assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xA0\xC2\xA0") == 0); assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\xA0\xC2\xA0") == 0);


assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

// General Category: Zp -- PARAGRAPH SEPARATOR
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA9") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER); assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL); assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0'); assert(*tokenizer_get_token_text(tokenizer) == '\0');
test_mac_newline_tokens(); test_mac_newline_tokens();
test_windows_newline_tokens(); test_windows_newline_tokens();
test_unicode_newline_tokens(); test_unicode_newline_tokens();
test_paragraph_tokens();
test_whitespace_tokens(); test_whitespace_tokens();


printf("done\n"); printf("done\n");

Loading…
Cancel
Save