Browse Source

tokenizer.c: Recognise U+0085 [NEW LINE (NEL)] as a newline codepoint.

master
Reece H. Dunn 8 years ago
parent
commit
bf45e7ce36
2 changed files with 34 additions and 4 deletions
  1. 5
    4
      src/libespeak-ng/tokenizer.c
  2. 29
    0
      tests/tokenizer.c

+ 5
- 4
src/libespeak-ng/tokenizer.c View File

@@ -99,9 +99,10 @@ static espeakng_CTYPE codepoint_type(uint32_t c)

switch (c)
{
case '\r': return ESPEAKNG_CTYPE_CARRIAGE_RETURN;
case '\n': return ESPEAKNG_CTYPE_NEWLINE;
case '\0': return ESPEAKNG_CTYPE_END_OF_STRING;
case 0x0000: return ESPEAKNG_CTYPE_END_OF_STRING; // NULL
case 0x000A: return ESPEAKNG_CTYPE_NEWLINE; // LINE FEED (LF)
case 0x000D: return ESPEAKNG_CTYPE_CARRIAGE_RETURN; // CARRIAGE RETURN (CR)
case 0x0085: return ESPEAKNG_CTYPE_NEWLINE; // NEXT LINE (NEL)
}

// 2. Classify codepoints by their Unicode General Category.
@@ -173,7 +174,7 @@ tokenizer_state_default(espeak_ng_TOKENIZER *tokenizer)
c = text_decoder_getc(tokenizer->decoder);
}
// fallthrough
case ESPEAKNG_CTYPE_NEWLINE: // '\n'
case ESPEAKNG_CTYPE_NEWLINE:
current += utf8_out(c, current);
*current = '\0';
return ESPEAKNG_TOKEN_NEWLINE;

+ 29
- 0
tests/tokenizer.c View File

@@ -260,6 +260,34 @@ test_windows_newline_tokens()
destroy_tokenizer(tokenizer);
}

// Checks that U+0085 is tokenized as a newline: a buffer holding two
// consecutive NEL codepoints must yield two newline tokens, each carrying
// the codepoint's UTF-8 bytes, followed by an end-of-buffer token.
void
test_unicode_newline_tokens()
{
	printf("testing unicode newline tokens\n");

	espeak_ng_TOKENIZER *tok = create_tokenizer();
	espeak_ng_TEXT_DECODER *dec = create_text_decoder();

	// Input: U+0085 twice, each encoded in UTF-8 as the byte pair C2 85.
	assert(text_decoder_decode_string(dec, "\xC2\x85\xC2\x85", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
	assert(tokenizer_reset(tok, dec) == 1);

	// U+0085 : NEXT LINE (NEL) -- Used in EBCDIC systems as a combined CR+LF character.
	// Each occurrence is expected as its own newline token, not merged with its neighbour.
	for (int nth = 0; nth < 2; ++nth) {
		assert(tokenizer_read_next_token(tok) == ESPEAKNG_TOKEN_NEWLINE);
		assert(tokenizer_get_token_text(tok) != NULL);
		assert(strcmp(tokenizer_get_token_text(tok), "\xC2\x85") == 0);
	}

	// Once the input is exhausted the token text must be an empty string.
	assert(tokenizer_read_next_token(tok) == ESPEAKNG_TOKEN_END_OF_BUFFER);
	assert(tokenizer_get_token_text(tok) != NULL);
	assert(*tokenizer_get_token_text(tok) == '\0');

	destroy_text_decoder(dec);
	destroy_tokenizer(tok);
}

void
test_whitespace_tokens()
{
@@ -340,6 +368,7 @@ main(int argc, char **argv)
test_linux_newline_tokens();
test_mac_newline_tokens();
test_windows_newline_tokens();
test_unicode_newline_tokens();
test_whitespace_tokens();

printf("done\n");

Loading…
Cancel
Save