Browse Source

tokenizer.c: Tokenize line separator codepoints as newline tokens.

master
Reece H. Dunn 8 years ago
parent
commit
d2d718d700
2 changed files with 13 additions and 12 deletions
  1. src/libespeak-ng/tokenizer.c (+2 −1)
  2. tests/tokenizer.c (+11 −11)

+ 2
- 1
src/libespeak-ng/tokenizer.c View File



#define ESPEAKNG_CTYPE_PROPERTY_MASK 0x0000000000000001ull #define ESPEAKNG_CTYPE_PROPERTY_MASK 0x0000000000000001ull


// Reference: http://www.unicode.org/reports/tr14/tr14-32.html -- Unicode Line Breaking Algorithm
static espeakng_CTYPE codepoint_type(uint32_t c) static espeakng_CTYPE codepoint_type(uint32_t c)
{ {
// 1. Detect and classify specific codepoints. // 1. Detect and classify specific codepoints.
ucd_category cat = ucd_lookup_category(c); ucd_category cat = ucd_lookup_category(c);
switch (cat) switch (cat)
{ {
case UCD_CATEGORY_Zl: return ESPEAKNG_CTYPE_WHITESPACE;
case UCD_CATEGORY_Zl: return ESPEAKNG_CTYPE_NEWLINE;
case UCD_CATEGORY_Zp: return ESPEAKNG_CTYPE_WHITESPACE; case UCD_CATEGORY_Zp: return ESPEAKNG_CTYPE_WHITESPACE;
case UCD_CATEGORY_Zs: return ESPEAKNG_CTYPE_WHITESPACE; case UCD_CATEGORY_Zs: return ESPEAKNG_CTYPE_WHITESPACE;
} }

+ 11
- 11
tests/tokenizer.c View File

espeak_ng_TOKENIZER *tokenizer = create_tokenizer(); espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder(); espeak_ng_TEXT_DECODER *decoder = create_text_decoder();


assert(text_decoder_decode_string(decoder, "\xC2\x85\xC2\x85", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
assert(text_decoder_decode_string(decoder, "\xC2\x85\xC2\x85\xE2\x80\xA8\xE2\x80\xA8", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
assert(tokenizer_reset(tokenizer, decoder) == 1); assert(tokenizer_reset(tokenizer, decoder) == 1);


// U+0085 : NEXT LINE (NEL) -- Used in EBCDIC systems as a combined CR+LF character. // U+0085 : NEXT LINE (NEL) -- Used in EBCDIC systems as a combined CR+LF character.
assert(tokenizer_get_token_text(tokenizer) != NULL); assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\x85") == 0); assert(strcmp(tokenizer_get_token_text(tokenizer), "\xC2\x85") == 0);


// General Category: Zl -- LINE SEPARATOR
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA8") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA8") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER); assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_END_OF_BUFFER);
assert(tokenizer_get_token_text(tokenizer) != NULL); assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(*tokenizer_get_token_text(tokenizer) == '\0'); assert(*tokenizer_get_token_text(tokenizer) == '\0');
espeak_ng_TOKENIZER *tokenizer = create_tokenizer(); espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder(); espeak_ng_TEXT_DECODER *decoder = create_text_decoder();


assert(text_decoder_decode_string(decoder, "\t\t\n \xE3\x80\x80 \n\xC2\xA0\xC2\xA0\n\xE2\x80\xA8\n\xE2\x80\xA9", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
assert(text_decoder_decode_string(decoder, "\t\t\n \xE3\x80\x80 \n\xC2\xA0\xC2\xA0\n\xE2\x80\xA9", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
assert(tokenizer_reset(tokenizer, decoder) == 1); assert(tokenizer_reset(tokenizer, decoder) == 1);


// General Category: Cc, Property: White_Space // General Category: Cc, Property: White_Space
assert(tokenizer_get_token_text(tokenizer) != NULL); assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0); assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);


// General Category: Zl -- LINE SEPARATOR
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\xE2\x80\xA8") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\n") == 0);

// General Category: Zp -- PARAGRAPH SEPARATOR // General Category: Zp -- PARAGRAPH SEPARATOR
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE); assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_WHITESPACE);
assert(tokenizer_get_token_text(tokenizer) != NULL); assert(tokenizer_get_token_text(tokenizer) != NULL);

Loading…
Cancel
Save