Browse Source

tokenizer.c: Recognise U+000C [FORM FEED (FF)] as a newline codepoint.

master
Reece H. Dunn 8 years ago
parent
commit
fc7a4e6701
2 changed files with 11 additions and 1 deletion
  1. 1
    0
      src/libespeak-ng/tokenizer.c
  2. 10
    1
      tests/tokenizer.c

+ 1
- 0
src/libespeak-ng/tokenizer.c View File

@@ -102,6 +102,7 @@ static espeakng_CTYPE codepoint_type(uint32_t c)
{
case 0x0000: return ESPEAKNG_CTYPE_END_OF_STRING; // NULL
case 0x000A: return ESPEAKNG_CTYPE_NEWLINE; // LINE FEED (LF)
case 0x000C: return ESPEAKNG_CTYPE_NEWLINE; // FORM FEED (FF)
case 0x000D: return ESPEAKNG_CTYPE_CARRIAGE_RETURN; // CARRIAGE RETURN (CR)
case 0x0085: return ESPEAKNG_CTYPE_NEWLINE; // NEXT LINE (NEL)
}

+ 10
- 1
tests/tokenizer.c View File

@@ -268,9 +268,18 @@ test_unicode_newline_tokens()
espeak_ng_TOKENIZER *tokenizer = create_tokenizer();
espeak_ng_TEXT_DECODER *decoder = create_text_decoder();

-assert(text_decoder_decode_string(decoder, "\xC2\x85\xC2\x85\xE2\x80\xA8\xE2\x80\xA8", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
+assert(text_decoder_decode_string(decoder, "\x0C\x0C\xC2\x85\xC2\x85\xE2\x80\xA8\xE2\x80\xA8", -1, ESPEAKNG_ENCODING_UTF_8) == ENS_OK);
assert(tokenizer_reset(tokenizer, decoder) == 1);

// U+000C : FORM FEED (FF) -- Used as a page (not paragraph) break.
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0C") == 0);

assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);
assert(strcmp(tokenizer_get_token_text(tokenizer), "\x0C") == 0);

// U+0085 : NEXT LINE (NEL) -- Used in EBCDIC systems as a combined CR+LF character.
assert(tokenizer_read_next_token(tokenizer) == ESPEAKNG_TOKEN_NEWLINE);
assert(tokenizer_get_token_text(tokenizer) != NULL);

Loading…
Cancel
Save